mirror of https://github.com/Zygo/bees.git (synced 2025-08-03 22:33:28 +02:00)

Compare commits (20 commits):
124507232f, 3c5e13c885, a6ca2fa2f6, 3f23a0c73f, d6732c58e2, 75b2067cef, da3ef216b1, b7665d49d9, 717bdf5eb5, 9b60f2b94d, 8978d63e75, 82474b4ef4, 73834beb5a, c92ba117d8, c354e77634, f21569e88c, 3d5ebe4d40, 3430f16998, 7c764a73c8, a9a5cd03a5
@@ -17,7 +17,6 @@ Strengths
 * Space-efficient hash table and matching algorithms - can use as little as 1 GB hash table per 10 TB unique data (0.1GB/TB)
 * Daemon incrementally dedupes new data using btrfs tree search
 * Works with btrfs compression - dedupe any combination of compressed and uncompressed files
-* **NEW** [Works around `btrfs send` problems with dedupe and incremental parent snapshots](docs/options.md)
 * Works around btrfs filesystem structure to free more disk space
 * Persistent hash table for rapid restart after shutdown
 * Whole-filesystem dedupe - including snapshots
@@ -70,6 +69,6 @@ You can also use Github:
 Copyright & License
 -------------------
 
-Copyright 2015-2022 Zygo Blaxell <bees@furryterror.org>.
+Copyright 2015-2023 Zygo Blaxell <bees@furryterror.org>.
 
 GPL (version 3 or later).
@@ -7,23 +7,24 @@ First, a warning that is not specific to bees:
 severe regression that can lead to fatal metadata corruption.**
 This issue is fixed in kernel 5.4.14 and later.
 
-**Recommended kernel versions for bees are 4.19, 5.4, 5.10, 5.11, or 5.12,
-with recent LTS and -stable updates.** The latest released kernel as
-of this writing is 5.18.18.
+**Recommended kernel versions for bees are 4.19, 5.4, 5.10, 5.11, 5.15,
+6.0, or 6.1, with recent LTS and -stable updates.** The latest released
+kernel as of this writing is 6.4.1.
 
-4.14, 4.9, and 4.4 LTS kernels with recent updates are OK with
-some issues. Older kernels will be slower (a little slower or a lot
-slower depending on which issues are triggered). Not all fixes are
-backported.
+4.14, 4.9, and 4.4 LTS kernels with recent updates are OK with some
+issues. Older kernels will be slower (a little slower or a lot slower
+depending on which issues are triggered). Not all fixes are backported.
 
 Obsolete non-LTS kernels have a variety of unfixed issues and should
 not be used with btrfs. For details see the table below.
 
 bees requires btrfs kernel API version 4.2 or higher, and does not work
-on older kernels.
+at all on older kernels.
 
-bees will detect and use btrfs kernel API up to version 4.15 if present.
-In some future bees release, this API version may become mandatory.
+Some bees features rely on kernel 4.15 to work, and these features will
+not be available on older kernels. Currently, bees is still usable on
+older kernels with degraded performance or with options disabled, but
+support for older kernels may be removed.
 
@@ -58,14 +59,17 @@ These bugs are particularly popular among bees users, though not all are specifi
 | - | 5.8 | deadlock in `TREE_SEARCH` ioctl (core component of bees filesystem scanner), followed by regression in deadlock fix | 4.4.237, 4.9.237, 4.14.199, 4.19.146, 5.4.66, 5.8.10 and later | a48b73eca4ce btrfs: fix potential deadlock in the search ioctl, 1c78544eaa46 btrfs: fix wrong address when faulting in pages in the search ioctl
 | 5.7 | 5.10 | kernel crash if balance receives fatal signal e.g. Ctrl-C | 5.4.93, 5.10.11, 5.11 and later | 18d3bff411c8 btrfs: don't get an EINTR during drop_snapshot for reloc
 | 5.10 | 5.10 | 20x write performance regression | 5.10.8, 5.11 and later | e076ab2a2ca7 btrfs: shrink delalloc pages instead of full inodes
-| 5.4 | 5.11 | spurious tree checker failures on extent ref hash | 5.11.5, 5.12 and later | 1119a72e223f btrfs: tree-checker: do not error out if extent ref hash doesn't match
+| 5.4 | 5.11 | spurious tree checker failures on extent ref hash | 5.4.125, 5.10.43, 5.11.5, 5.12 and later | 1119a72e223f btrfs: tree-checker: do not error out if extent ref hash doesn't match
 | - | 5.11 | tree mod log issue #5 | 4.4.263, 4.9.263, 4.14.227, 4.19.183, 5.4.108, 5.10.26, 5.11.9, 5.12 and later | dbcc7d57bffc btrfs: fix race when cloning extent buffer during rewind of an old root
 | - | 5.12 | tree mod log issue #6 | 4.14.233, 4.19.191, 5.4.118, 5.10.36, 5.11.20, 5.12.3, 5.13 and later | f9690f426b21 btrfs: fix race when picking most recent mod log operation for an old root
 | 4.15 | 5.16 | spurious warnings from `fs/fs-writeback.c` when `flushoncommit` is enabled | 5.15.27, 5.16.13, 5.17 and later | a0f0cf8341e3 btrfs: get rid of warning on transaction commit when using flushoncommit
 | - | 5.17 | crash during device removal can make filesystem unmountable | 5.15.54, 5.16.20, 5.17.3, 5.18 and later | bbac58698a55 btrfs: remove device item and update super block in the same transaction
 | - | 5.18 | wrong superblock num_devices makes filesystem unmountable | 4.14.283, 4.19.247, 5.4.198, 5.10.121, 5.15.46, 5.17.14, 5.18.3, 5.19 and later | d201238ccd2f btrfs: repair super block num_devices automatically
 | 5.18 | 5.19 | parent transid verify failed during log tree replay after a crash during a rename operation | 5.18.18, 5.19.2, 6.0 and later | 723df2bcc9e1 btrfs: join running log transaction when logging new name
-| 5.4 | - | kernel hang when multiple threads are running `LOGICAL_INO` and dedupe ioctl | - | workaround: reduce bees thread count to 1 with `-c1`
+| 5.12 | 6.0 | space cache corruption and potential double allocations | 5.15.65, 5.19.6, 6.0 and later | ced8ecf026fd btrfs: fix space cache corruption and potential double allocations
+| 6.3, backported to 5.15.107, 6.1.24, 6.2.11 | 6.3 | vmalloc error, failed to allocate pages | 6.3.10, 6.4 and later. Bug (f349b15e183d "mm: vmalloc: avoid warn_alloc noise caused by fatal signal" in v6.3-rc6) backported to 6.1.24, 6.2.11, and 5.15.107. | 95a301eefa82 mm/vmalloc: do not output a spurious warning when huge vmalloc() fails
+| 6.2 | 6.3 | `IGNORE_OFFSET` flag ignored in `LOGICAL_INO` ioctl | 6.2.16, 6.3.3, 6.4 and later | 0cad8f14d70c btrfs: fix backref walking not returning all inode refs
+| 5.4 | - | kernel hang when multiple threads are running `LOGICAL_INO` and dedupe ioctl on the same extent | - | workaround: avoid doing that
 
 "Last bad kernel" refers to that version's last stable update from
 kernel.org. Distro kernels may backport additional fixes. Consult
@@ -80,21 +84,45 @@ through 5.4.13 inclusive.
 A "-" for "first bad kernel" indicates the bug has been present since
 the relevant feature first appeared in btrfs.
 
-A "-" for "last bad kernel" indicates the bug has not yet been fixed as
-of 5.18.18.
+A "-" for "last bad kernel" indicates the bug has not yet been fixed in
+current kernels (see top of this page for which kernel version that is).
 
 In cases where issues are fixed by commits spread out over multiple
 kernel versions, "fixed kernel version" refers to the version that
-contains all components of the fix.
+contains the last committed component of the fix.
 
 
 Workarounds for known kernel bugs
 ---------------------------------
 
-* **Hangs with high worker thread counts**: On kernels newer than
-5.4, multiple threads running `LOGICAL_INO` and dedupe ioctls
-at the same time can lead to a kernel hang. The workaround is
-to reduce the thread count to 1 with `-c1`.
+* **Hangs with concurrent `LOGICAL_INO` and dedupe**: on all
+kernel versions so far, multiple threads running `LOGICAL_INO`
+and dedupe ioctls at the same time on the same inodes or extents
+can lead to a kernel hang. The kernel enters an infinite loop in
+`add_all_parents`, where `count` is 0, `ref->count` is 1, and
+`btrfs_next_item` or `btrfs_next_old_item` never find a matching ref.
+
+bees has two workarounds for this bug: 1. schedule work so that multiple
+threads do not simultaneously access the same inode or the same extent,
+and 2. use a brute-force global lock within bees that prevents any
+thread from running `LOGICAL_INO` while any other thread is running
+dedupe.
+
+Workaround #1 isn't really a workaround, since we want to do the same
+thing for unrelated performance reasons. If multiple threads try to
+perform dedupe operations on the same extent or inode, btrfs will make
+all the threads wait for the same locks anyway, so it's better to have
+bees find some other inode or extent to work on while waiting for btrfs
+to finish.
+
+Workaround #2 doesn't seem to be needed after implementing workaround
+#1, but it's better to be slightly slower than to hang one CPU core
+and the filesystem until the kernel is rebooted.
+
+It is still theoretically possible to trigger the kernel bug when
+running bees at the same time as other dedupers, or other programs
+that use `LOGICAL_INO` like `btdu`; however, it's extremely difficult
+to reproduce the bug without closely cooperating threads.
 
 * **Slow backrefs** (aka toxic extents): Under certain conditions,
 if the number of references to a single shared extent grows too
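As background on workaround #2 above: the exclusion it describes can be pictured as a lock that allows any number of concurrent operations of one kind, but never lets the two kinds overlap. The sketch below only illustrates that idea; it is not code from bees, and the class and names are invented for the example.

```cpp
// Illustrative sketch only -- not bees code. Allows concurrency within a
// class of operations (LOGICAL_INO callers, dedupe callers) but never
// lets the two classes run at the same time.
#include <condition_variable>
#include <mutex>

class TwoClassLock {
	std::mutex m_mutex;
	std::condition_variable m_cv;
	int m_active[2] = { 0, 0 };   // [0] = LOGICAL_INO callers, [1] = dedupe callers
public:
	void enter(int cls) {
		std::unique_lock<std::mutex> lock(m_mutex);
		// Wait until no operation of the *other* class is running.
		m_cv.wait(lock, [this, cls] { return m_active[1 - cls] == 0; });
		++m_active[cls];
	}
	void leave(int cls) {
		std::lock_guard<std::mutex> lock(m_mutex);
		if (--m_active[cls] == 0) {
			m_cv.notify_all();
		}
	}
};
```

A real implementation would also have to consider fairness, so that a steady stream of one class of operation cannot starve the other indefinitely.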
@@ -110,8 +138,8 @@ Workarounds for known kernel bugs
 at this time of writing only bees has a workaround for this bug.
 
 This workaround is less necessary for kernels 5.4.96, 5.7 and later,
-though it can still take 2 ms of CPU to resolve each extent ref on a
-fast machine on a large, heavily fragmented file.
+though the bees workaround can still be triggered on newer kernels
+by changes in btrfs since kernel version 5.1.
 
 * **dedupe breaks `btrfs send` in old kernels**. The bees option
 `--workaround-btrfs-send` prevents any modification of read-only subvols
@@ -127,8 +155,6 @@ Workarounds for known kernel bugs
 Unfixed kernel bugs
 -------------------
 
-As of 5.18.18:
-
 * **The kernel does not permit `btrfs send` and dedupe to run at the
 same time**. Recent kernels no longer crash, but now refuse one
 operation with an error if the other operation was already running.
@@ -8,44 +8,35 @@ bees has been tested in combination with the following:
 * HOLE extents and btrfs no-holes feature
 * Other deduplicators, reflink copies (though bees may decide to redo their work)
 * btrfs snapshots and non-snapshot subvols (RW and RO)
-* Concurrent file modification (e.g. PostgreSQL and sqlite databases, build daemons)
-* all btrfs RAID profiles
+* Concurrent file modification (e.g. PostgreSQL and sqlite databases, VMs, build daemons)
+* All btrfs RAID profiles
 * IO errors during dedupe (read errors will throw exceptions, bees will catch them and skip over the affected extent)
-* Filesystems mounted *with* the flushoncommit option ([lots of harmless kernel log warnings on 4.15 and later](btrfs-kernel.md))
-* Filesystems mounted *without* the flushoncommit option
+* Filesystems mounted with or without the `flushoncommit` option
 * 4K filesystem data block size / clone alignment
 * 64-bit and 32-bit LE host CPUs (amd64, x86, arm)
-* Huge files (>1TB--although Btrfs performance on such files isn't great in general)
-* filesystems up to 30T+ bytes, 100M+ files
+* Large files (kernel 5.4 or later strongly recommended)
+* Filesystems up to 90T+ bytes, 1000M+ files
 * btrfs receive
 * btrfs nodatacow/nodatasum inode attribute or mount option (bees skips all nodatasum files)
 * open(O_DIRECT) (seems to work as well--or as poorly--with bees as with any other btrfs feature)
-* lvmcache: no problems observed in testing with recent kernels or reported by users in the last year.
+* lvm dm-cache, writecache
 
 Bad Btrfs Feature Interactions
 ------------------------------
 
 bees has been tested in combination with the following, and various problems are known:
 
-* bcache: no data-losing problems observed in testing with recent kernels
-or reported by users in the last year. Some issues observed with
-bcache interacting badly with some SSD models' firmware, but so far
-this only causes temporary loss of service, not filesystem damage.
-This behavior does not seem to be specific to bees (ordinary filesystem
-tests with rsync and snapshots will reproduce it), but it does prevent
-any significant testing of bees on bcache.
-
-* btrfs send: there are bugs in `btrfs send` that can be triggered by bees.
-The [`--workaround-btrfs-send` option](options.md) works around this issue
-by preventing bees from modifying read-only snapshots.
+* btrfs send: there are bugs in `btrfs send` that can be triggered by
+bees on old kernels. The [`--workaround-btrfs-send` option](options.md)
+works around this issue by preventing bees from modifying read-only
+snapshots.
 
 * btrfs qgroups: very slow, sometimes hangs...and it's even worse when
 bees is running.
 
-* btrfs autodefrag mount option: hangs and high CPU usage problems
-reported by users. bees cannot distinguish autodefrag activity from
-normal filesystem activity and will likely try to undo the autodefrag
-if duplicate copies of the defragmented data exist.
+* btrfs autodefrag mount option: bees cannot distinguish autodefrag
+activity from normal filesystem activity, and may try to undo the
+autodefrag if duplicate copies of the defragmented data exist.
 
 Untested Btrfs Feature Interactions
 -----------------------------------
@@ -54,9 +45,10 @@ bees has not been tested with the following, and undesirable interactions may oc
 
 * Non-4K filesystem data block size (should work if recompiled)
 * Non-equal hash (SUM) and filesystem data block (CLONE) sizes (need to fix that eventually)
-* btrfs seed filesystems (does anyone even use those?)
-* btrfs out-of-tree kernel patches (e.g. in-kernel dedupe or encryption)
+* btrfs seed filesystems (no particular reason it wouldn't work, but no one has reported trying)
+* btrfs out-of-tree kernel patches (e.g. in-kernel dedupe, encryption, extent tree v2)
 * btrfs-convert from ext2/3/4 (never tested, might run out of space or ignore significant portions of the filesystem due to sanity checks)
 * btrfs mixed block groups (don't know a reason why it would *not* work, but never tested)
-* flashcache: an out-of-tree cache-HDD-on-SSD block layer helper.
 * Host CPUs with exotic page sizes, alignment requirements, or endianness (ppc, alpha, sparc, strongarm, s390, mips, m68k...)
+* bcache: used to be in the "bad" list, now in the "untested" list because nobody is rigorously testing, and bcache bugs come and go
+* flashcache: an out-of-tree cache-HDD-on-SSD block layer helper
@@ -8,9 +8,10 @@ are reasonable in most cases.
 Hash Table Sizing
 -----------------
 
-Hash table entries are 16 bytes per data block. The hash table stores
-the most recently read unique hashes. Once the hash table is full,
-each new entry in the table evicts an old entry.
+Hash table entries are 16 bytes per data block. The hash table stores the
+most recently read unique hashes. Once the hash table is full, each new
+entry added to the table evicts an old entry. This makes the hash table
+a sliding window over the most recently scanned data from the filesystem.
 
 Here are some numbers to estimate appropriate hash table sizes:
@@ -25,9 +26,11 @@ Here are some numbers to estimate appropriate hash table sizes:
 Notes:
 
 * If the hash table is too large, no extra dedupe efficiency is
-obtained, and the extra space just wastes RAM. Extra space can also slow
-bees down by preventing old data from being evicted, so bees wastes time
-looking for matching data that is no longer present on the filesystem.
+obtained, and the extra space wastes RAM. If the hash table contains
+more block records than there are blocks in the filesystem, the extra
+space can slow bees down. A table that is too large prevents obsolete
+data from being evicted, so bees wastes time looking for matching data
+that is no longer present on the filesystem.
 
 * If the hash table is too small, bees extrapolates from matching
 blocks to find matching adjacent blocks in the filesystem that have been
@@ -36,6 +39,10 @@ one block in common between two extents in order to be able to dedupe
 the entire extents. This provides significantly more dedupe hit rate
 per hash table byte than other dedupe tools.
 
+* There is a fairly wide range of usable hash table sizes, and performance
+degrades according to a smooth probabilistic curve in both directions.
+Double or half the optimum size usually works just as well.
+
 * When counting unique data in compressed data blocks to estimate
 optimum hash table size, count the *uncompressed* size of the data.
@@ -66,11 +73,11 @@ data on an uncompressed filesystem. Dedupe efficiency falls dramatically
 with hash tables smaller than 128MB/TB as the average dedupe extent size
 is larger than the largest possible compressed extent size (128KB).
 
-* **Short writes** also shorten the average extent length and increase
-optimum hash table size. If a database writes to files randomly using
-4K page writes, all of these extents will be 4K in length, and the hash
-table size must be increased to retain each one (or the user must accept
-a lower dedupe hit rate).
+* **Short writes or fragmentation** also shorten the average extent
+length and increase optimum hash table size. If a database writes to
+files randomly using 4K page writes, all of these extents will be 4K
+in length, and the hash table size must be increased to retain each one
+(or the user must accept a lower dedupe hit rate).
 
 Defragmenting files that have had many short writes increases the
 extent length and therefore reduces the optimum hash table size.
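As a back-of-the-envelope check on the sizing figures above (16 bytes per 4K block, and the 1 GB per 10 TB rule of thumb), the stand-alone calculation below compares full hash-table coverage against the rule-of-thumb size. It is plain arithmetic, not part of bees.

```cpp
// Stand-alone sizing arithmetic -- not part of bees.
// One hash table entry is 16 bytes and covers one 4K data block.
#include <cstdint>
#include <iostream>

int main()
{
	const uint64_t TiB = 1ULL << 40;
	const uint64_t block_size = 4096;   // filesystem data block size
	const uint64_t entry_size = 16;     // bytes per hash table entry

	// Full coverage: one entry for every unique data block.
	const uint64_t unique_data = 10 * TiB;
	const uint64_t full_table = unique_data / block_size * entry_size;
	std::cout << "full coverage of 10 TiB unique data: "
	          << full_table / (1ULL << 30) << " GiB\n";   // prints 40 GiB

	// The 1 GB-per-10 TB rule of thumb is roughly 1/40 of full coverage,
	// which works because bees only needs one matching block per extent
	// pair and extrapolates the rest.
	return 0;
}
```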
@@ -296,6 +296,7 @@ resolve
 
 The `resolve` event group consists of operations related to translating a btrfs virtual block address (i.e. physical block address) to a `(root, inode, offset)` tuple (i.e. locating and opening the file containing a matching block). `resolve` is the top level, `chase` and `adjust` are the lower two levels.
 
+* `resolve_empty`: The `LOGICAL_INO` ioctl returned successfully with an empty reference list (0 items).
 * `resolve_fail`: The `LOGICAL_INO` ioctl returned an error.
 * `resolve_large`: The `LOGICAL_INO` ioctl returned more than 2730 results (the limit of the v1 ioctl).
 * `resolve_ms`: Total time spent in the `LOGICAL_INO` ioctl (i.e. wallclock time, not kernel CPU time).
@@ -51,81 +51,40 @@ loops early. The exception text in this case is:
 Terminating bees with SIGTERM
 -----------------------------
 
-bees is designed to survive host crashes, so it is safe to terminate
-bees using SIGKILL; however, when bees next starts up, it will repeat
-some work that was performed between the last bees crawl state save point
-and the SIGKILL (up to 15 minutes). If bees is stopped and started less
-than once per day, then this is not a problem as the proportional impact
-is quite small; however, users who stop and start bees daily or even
-more often may prefer to have a clean shutdown with SIGTERM so bees can
-restart faster.
+bees is designed to survive host crashes, so it is safe to terminate bees
+using SIGKILL; however, when bees next starts up, it will repeat some
+work that was performed between the last bees crawl state save point
+and the SIGKILL (up to 15 minutes), and a large hash table may not be
+completely written back to disk, so some duplicate matches will be lost.
 
-bees handling of SIGTERM can take a long time on machines with some or
-all of:
+If bees is stopped and started less than once per week, then this is not
+a problem as the proportional impact is quite small; however, users who
+stop and start bees daily or even more often may prefer to have a clean
+shutdown with SIGTERM so bees can restart faster.
 
-* Large RAM and `vm.dirty_ratio`
-* Large number of active bees worker threads
-* Large number of bees temporary files (proportional to thread count)
-* Large hash table size
-* Large filesystem size
-* High IO latency, especially "low power" spinning disks
-* High filesystem activity, especially duplicate data writes
+The shutdown procedure performs these steps:
 
-Each of these factors individually increases the total time required
-to perform a clean bees shutdown. When combined, the factors can
-multiply with each other, dramatically increasing the time required to
-flush bees state to disk.
-
-On a large system with many of the above factors present, a "clean"
-bees shutdown can take more than 20 minutes. Even a small machine
-(16GB RAM, 1GB hash table, 1TB NVME disk) can take several seconds to
-complete a SIGTERM shutdown.
-
-The shutdown procedure performs potentially long-running tasks in
-this order:
-
-1. Worker threads finish executing their current Task and exit.
-Threads executing `LOGICAL_INO` ioctl calls usually finish quickly,
-but btrfs imposes no limit on the ioctl's running time, so it
-can take several minutes in rare bad cases. If there is a btrfs
-commit already in progress on the filesystem, then most worker
-threads will be blocked until the btrfs commit is finished.
-
-2. Crawl state is saved to `$BEESHOME`. This normally completes
-relatively quickly (a few seconds at most). This is the most
+1. Crawl state is saved to `$BEESHOME`. This is the most
 important bees state to save to disk as it directly impacts
-restart time, so it is done as early as possible (but no earlier).
+restart time, so it is done as early as possible
 
-3. Hash table is written to disk. Normally the hash table is
-trickled back to disk at a rate of about 2GB per hour;
+2. Hash table is written to disk. Normally the hash table is
+trickled back to disk at a rate of about 128KiB per second;
 however, SIGTERM causes bees to attempt to flush the whole table
-immediately. If bees has recently been idle then the hash table is
-likely already flushed to disk, so this step will finish quickly;
-however, if bees has recently been active and the hash table is
-large relative to RAM size, the blast of rapidly written data
-can force the Linux VFS to block all writes to the filesystem
-for sufficient time to complete all pending btrfs metadata
-writes which accumulated during the btrfs commit before bees
-received SIGTERM...and _then_ let bees write out the hash table.
-The time spent here depends on the size of RAM, speed of disks,
-and aggressiveness of competing filesystem workloads.
+immediately. The time spent here depends on the size of RAM, speed
+of disks, and aggressiveness of competing filesystem workloads.
+It can trigger `vm.dirty_bytes` limits and block other processes
+writing to the filesystem for a while.
 
-4. bees temporary files are closed, which implies deletion of their
-inodes. These are files which consist entirely of shared extent
-structures, and btrfs takes an unusually long time to delete such
-files (up to a few minutes for each on slow spinning disks).
-
-If bees is terminated with SIGKILL, only step #1 and #4 are performed (the
-kernel performs these automatically if bees exits). This reduces the
-shutdown time at the cost of increased startup time.
+3. The bees process calls `_exit`, which terminates all running
+worker threads, closes and deletes all temporary files. This
+can take a while _after_ the bees process exits, especially on
+slow spinning disks.
 
 Balances
 --------
 
-First, read [`LOGICAL_INO` and btrfs balance WARNING](btrfs-kernel.md).
-bees will suspend operations during a btrfs balance to work around
-kernel bugs.
-
 A btrfs balance relocates data on disk by making a new copy of the
 data, replacing all references to the old data with references to the
 new copy, and deleting the old copy. To bees, this is the same as any
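The interesting part of the shutdown description above is the ordering: crawl state first, hash table next, then `_exit`. The sketch below shows the general pattern of an ordered SIGTERM shutdown in a stand-alone program; it is an illustration only, not bees source, and `save_crawl_state` and `flush_hash_table` are hypothetical stand-ins for the persistence steps.

```cpp
// Illustration of an ordered SIGTERM shutdown -- not bees source code.
#include <atomic>
#include <cstdlib>
#include <signal.h>
#include <unistd.h>

static std::atomic<bool> stop_requested{false};

extern "C" void handle_sigterm(int) { stop_requested = true; }

// Hypothetical stand-ins for the persistence steps described above.
static void save_crawl_state() { /* write the scan position; cheap */ }
static void flush_hash_table() { /* write the whole table; possibly slow */ }

int main()
{
	struct sigaction sa = {};
	sa.sa_handler = handle_sigterm;
	sigaction(SIGTERM, &sa, nullptr);

	while (!stop_requested) {
		// ... scan and dedupe ...
		sleep(1);
	}

	save_crawl_state();    // most valuable at restart, so done first
	flush_hash_table();    // the potentially long step
	_exit(EXIT_SUCCESS);   // skip destructors; the kernel closes FDs and deletes temp files
}
```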
@@ -175,7 +134,9 @@ the beginning.
 
 Each time bees dedupes an extent that is referenced by a snapshot,
 the entire metadata page in the snapshot subvol (16KB by default) must
-be CoWed in btrfs. This can result in a substantial increase in btrfs
+be CoWed in btrfs. Since all references must be removed at the same
+time, this CoW operation is repeated in every snapshot containing the
+duplicate data. This can result in a substantial increase in btrfs
 metadata size if there are many snapshots on a filesystem.
 
 Normally, metadata is small (less than 1% of the filesystem) and dedupe
@@ -252,17 +213,18 @@ Other Gotchas
 filesystem while `LOGICAL_INO` is running. Generally the CPU spends
 most of the runtime of the `LOGICAL_INO` ioctl running the kernel,
 so on a single-core CPU the entire system can freeze up for a second
-during operations on toxic extents.
+during operations on toxic extents. Note this only occurs on older
+kernels. See [the slow backrefs kernel bug section](btrfs-kernel.md).
 
 * If a process holds a directory FD open, the subvol containing the
 directory cannot be deleted (`btrfs sub del` will start the deletion
 process, but it will not proceed past the first open directory FD).
 `btrfs-cleaner` will simply skip over the directory *and all of its
 children* until the FD is closed. bees avoids this gotcha by closing
-all of the FDs in its directory FD cache every 10 btrfs transactions.
+all of the FDs in its directory FD cache every btrfs transaction.
 
 * If a file is deleted while bees is caching an open FD to the file,
 bees continues to scan the file. For very large files (e.g. VM
 images), the deletion of the file can be delayed indefinitely.
 To limit this delay, bees closes all FDs in its file FD cache every
-10 btrfs transactions.
+btrfs transaction.
@@ -8,10 +8,12 @@ bees uses checkpoints for persistence to eliminate the IO overhead of a
 transactional data store. On restart, bees will dedupe any data that
 was added to the filesystem since the last checkpoint. Checkpoints
 occur every 15 minutes for scan progress, stored in `beescrawl.dat`.
-The hash table trickle-writes to disk at 4GB/hour to `beeshash.dat`.
-An hourly performance report is written to `beesstats.txt`. There are
-no special requirements for bees hash table storage--`.beeshome` could
-be stored on a different btrfs filesystem, ext4, or even CIFS.
+The hash table trickle-writes to disk at 128KiB/s to `beeshash.dat`,
+but will flush immediately if bees is terminated by SIGTERM.
+
+There are no special requirements for bees hash table storage--`.beeshome`
+could be stored on a different btrfs filesystem, ext4, or even CIFS (but
+not MS-DOS--beeshome does need filenames longer than 8.3).
 
 bees uses a persistent dedupe hash table with a fixed size configured
 by the user. Any size of hash table can be dedicated to dedupe. If a
@@ -20,7 +22,7 @@ small as 128KB.
 
 The bees hash table is loaded into RAM at startup and `mlock`ed so it
 will not be swapped out by the kernel (if swap is permitted, performance
-degrades to nearly zero).
+degrades to nearly zero, for both bees and the swap device).
 
 bees scans the filesystem in a single pass which removes duplicate
 extents immediately after they are detected. There are no distinct
@@ -83,12 +85,12 @@ of these functions in userspace, at the expense of encountering [some
 kernel bugs in `LOGICAL_INO` performance](btrfs-kernel.md).
 
 bees uses only the data-safe `FILE_EXTENT_SAME` (aka `FIDEDUPERANGE`)
-kernel operations to manipulate user data, so it can dedupe live data
-(e.g. build servers, sqlite databases, VM disk images). It does not
-modify file attributes or timestamps.
+kernel ioctl to manipulate user data, so it can dedupe live data
+(e.g. build servers, sqlite databases, VM disk images). bees does not
+modify file attributes or timestamps in deduplicated files.
 
-When bees has scanned all of the data, bees will pause until 10
-transactions have been completed in the btrfs filesystem. bees tracks
+When bees has scanned all of the data, bees will pause until a new
+transaction has completed in the btrfs filesystem. bees tracks
 the current btrfs transaction ID over time so that it polls less often
 on quiescent filesystems and more often on busy filesystems.
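For readers who have not used the dedupe interface mentioned above, here is a minimal stand-alone `FIDEDUPERANGE` call. It is not bees code; the file names and length are made up, and the ranges must be block-aligned and byte-for-byte identical for the kernel to accept the request.

```cpp
// Minimal FIDEDUPERANGE example -- not bees code.
#include <cstdio>
#include <fcntl.h>
#include <linux/fs.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main()
{
	const int src = open("a.dat", O_RDONLY);   // file with the "good" copy
	const int dst = open("b.dat", O_RDWR);     // file whose range becomes a shared reference
	if (src < 0 || dst < 0) { perror("open"); return 1; }

	const __u64 len = 128 * 1024;   // hypothetical length, block-aligned

	// Room for the fixed header plus one destination record.
	alignas(file_dedupe_range) char buf[sizeof(file_dedupe_range) + sizeof(file_dedupe_range_info)] = {};
	auto *args = reinterpret_cast<file_dedupe_range *>(buf);
	args->src_offset = 0;
	args->src_length = len;
	args->dest_count = 1;
	args->info[0].dest_fd = dst;
	args->info[0].dest_offset = 0;

	// The kernel compares the two ranges and only dedupes them if identical.
	if (ioctl(src, FIDEDUPERANGE, args) < 0) { perror("FIDEDUPERANGE"); return 1; }

	if (args->info[0].status == FILE_DEDUPE_RANGE_SAME)
		printf("deduped %llu bytes\n", (unsigned long long)args->info[0].bytes_deduped);
	else
		printf("not deduped, status %d\n", args->info[0].status);

	close(src);
	close(dst);
	return 0;
}
```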
@@ -17,7 +17,6 @@ Strengths
 * Space-efficient hash table and matching algorithms - can use as little as 1 GB hash table per 10 TB unique data (0.1GB/TB)
 * Daemon incrementally dedupes new data using btrfs tree search
 * Works with btrfs compression - dedupe any combination of compressed and uncompressed files
-* **NEW** [Works around `btrfs send` problems with dedupe and incremental parent snapshots](options.md)
 * Works around btrfs filesystem structure to free more disk space
 * Persistent hash table for rapid restart after shutdown
 * Whole-filesystem dedupe - including snapshots
@@ -70,6 +69,6 @@ You can also use Github:
 Copyright & License
 -------------------
 
-Copyright 2015-2022 Zygo Blaxell <bees@furryterror.org>.
+Copyright 2015-2023 Zygo Blaxell <bees@furryterror.org>.
 
 GPL (version 3 or later).
@@ -4,7 +4,7 @@ Building bees
 Dependencies
 ------------
 
-* C++11 compiler (tested with GCC 4.9, 6.3.0, 8.1.0)
+* C++11 compiler (tested with GCC 8.1.0, 12.2.0)
 
 Sorry. I really like closures and shared_ptr, so support
 for earlier compiler versions is unlikely.
@@ -19,7 +19,7 @@ Dependencies
 
 * [Linux kernel version](btrfs-kernel.md) gets its own page.
 
-* markdown for documentation
+* markdown to build the documentation
 
 * util-linux version that provides `blkid` command for the helper
 script `scripts/beesd` to work
@@ -2,8 +2,8 @@ Features You Might Expect That bees Doesn't Have
 ------------------------------------------------
 
 * There's no configuration file (patches welcome!). There are
-some tunables hardcoded in the source that could eventually become
-configuration options. There's also an incomplete option parser
+some tunables hardcoded in the source (`src/bees.h`) that could eventually
+become configuration options. There's also an incomplete option parser
 (patches welcome!).
 
 * The bees process doesn't fork and writes its log to stdout/stderr.
@@ -43,3 +43,6 @@ compression method or not compress the data (patches welcome!).
 * It is theoretically possible to resize the hash table without starting
 over with a new full-filesystem scan; however, this feature has not been
 implemented yet.
+
+* btrfs maintains csums of data blocks which bees could use to improve
+scan speeds, but bees doesn't use them yet.
@@ -69,9 +69,11 @@ namespace crucible {
 
 	uint64_t get_flags() const;
 	void set_flags(uint64_t new_flags);
+	void set_logical(uint64_t new_logical);
+	void set_size(uint64_t new_size);
 
-	virtual void do_ioctl(int fd);
-	virtual bool do_ioctl_nothrow(int fd);
+	void do_ioctl(int fd);
+	bool do_ioctl_nothrow(int fd);
 
 	struct BtrfsInodeOffsetRootSpan {
 		using iterator = BtrfsInodeOffsetRoot*;
@@ -4,13 +4,20 @@
 #include "crucible/error.h"
 
 #include <functional>
-#include <map>
 #include <memory>
 #include <mutex>
+#include <set>
+
+#include <cassert>
 
 namespace crucible {
 	using namespace std;
 
+	/// A class to track progress of multiple workers using only two points:
+	/// the first and last incomplete state. The first incomplete
+	/// state can be recorded as a checkpoint to resume later on.
+	/// The last completed state is the starting point for workers that
+	/// need something to do.
 	template <class T>
 	class ProgressTracker {
 		struct ProgressTrackerState;
@@ -19,8 +26,16 @@ namespace crucible {
 		using value_type = T;
 		using ProgressHolder = shared_ptr<ProgressHolderState>;
 
+		/// Create ProgressTracker with initial begin and end state 'v'.
 		ProgressTracker(const value_type &v);
 
+		/// The first incomplete state. This is not "sticky",
+		/// it will revert to the end state if there are no
+		/// items in progress.
 		value_type begin() const;
 
+		/// The last incomplete state. This is "sticky",
+		/// it can only increase and never decrease.
 		value_type end() const;
 
 		ProgressHolder hold(const value_type &v);
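Putting the doc comments above together with the updated test/progress.cc later in this compare, a typical use of `ProgressTracker` looks roughly like the sketch below; the numeric states are arbitrary.

```cpp
// Sketch of ProgressTracker usage, following the comments above and the
// updated test/progress.cc in this compare. The values are arbitrary.
#include "crucible/progress.h"

using namespace crucible;

void example()
{
	ProgressTracker<uint64_t> pt(100);   // begin() == end() == 100

	auto a = pt.hold(110);               // worker A takes item 110
	auto b = pt.hold(120);               // worker B takes item 120
	// pt.begin() == 110: first incomplete item, safe checkpoint
	// pt.end()   == 120: last item handed out so far

	b.reset();                           // B finishes out of order
	// pt.begin() is still 110; pt.end() stays at 120 ("sticky")

	a.reset();                           // A finishes too
	// nothing in progress: begin() reverts to end() == 120
}
```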
@@ -31,7 +46,7 @@ namespace crucible {
 		struct ProgressTrackerState {
 			using key_type = pair<value_type, ProgressHolderState *>;
 			mutex m_mutex;
-			map<key_type, bool> m_in_progress;
+			set<key_type> m_in_progress;
 			value_type m_begin;
 			value_type m_end;
 		};
@@ -39,6 +54,7 @@ namespace crucible {
 		class ProgressHolderState {
 			shared_ptr<ProgressTrackerState> m_state;
 			const value_type m_value;
+			using key_type = typename ProgressTrackerState::key_type;
 		public:
 			ProgressHolderState(shared_ptr<ProgressTrackerState> state, const value_type &v);
 			~ProgressHolderState();
@@ -86,7 +102,11 @@ namespace crucible {
 		m_value(v)
 	{
 		unique_lock<mutex> lock(m_state->m_mutex);
-		m_state->m_in_progress[make_pair(m_value, this)] = true;
+		const auto rv = m_state->m_in_progress.insert(key_type(m_value, this));
+		THROW_CHECK1(runtime_error, m_value, rv.second);
+		// Set the beginning to the first existing in-progress item
+		m_state->m_begin = m_state->m_in_progress.begin()->first;
+		// If this value is past the end, move the end, but don't go backwards
 		if (m_state->m_end < m_value) {
 			m_state->m_end = m_value;
 		}
@@ -96,17 +116,15 @@ namespace crucible {
 	ProgressTracker<T>::ProgressHolderState::~ProgressHolderState()
 	{
 		unique_lock<mutex> lock(m_state->m_mutex);
-		m_state->m_in_progress[make_pair(m_value, this)] = false;
-		auto p = m_state->m_in_progress.begin();
-		while (p != m_state->m_in_progress.end()) {
-			if (p->second) {
-				break;
-			}
-			if (m_state->m_begin < p->first.first) {
-				m_state->m_begin = p->first.first;
-			}
-			m_state->m_in_progress.erase(p);
-			p = m_state->m_in_progress.begin();
+		const auto rv = m_state->m_in_progress.erase(key_type(m_value, this));
+		// THROW_CHECK2(runtime_error, m_value, rv, rv == 1);
+		assert(rv == 1);
+		if (m_state->m_in_progress.empty()) {
+			// If we made the list empty, then m_begin == m_end
+			m_state->m_begin = m_state->m_end;
+		} else {
+			// If we deleted the first element, then m_begin = current first element
+			m_state->m_begin = m_state->m_in_progress.begin()->first;
 		}
 	}
 
@@ -548,7 +548,7 @@ namespace crucible {
 #endif
 		const uint64_t logical_end = logical + count * block_size();
 		BtrfsTreeItem bti = rlower_bound(logical);
-		size_t loops = 0;
+		size_t __attribute__((unused)) loops = 0;
 		BCTFGS_DEBUG("get_sums " << to_hex(logical) << ".." << to_hex(logical_end) << endl);
 		while (!!bti) {
 			BCTFGS_DEBUG("get_sums[" << loops << "]: " << bti << endl);
lib/fs.cc (12 changed lines)
@@ -315,6 +315,18 @@ namespace crucible {
 		return m_flags;
 	}
 
+	void
+	BtrfsIoctlLogicalInoArgs::set_logical(uint64_t new_logical)
+	{
+		m_logical = new_logical;
+	}
+
+	void
+	BtrfsIoctlLogicalInoArgs::set_size(uint64_t new_size)
+	{
+		m_container_size = new_size;
+	}
+
 	bool
 	BtrfsIoctlLogicalInoArgs::do_ioctl_nothrow(int fd)
 	{
@@ -757,6 +757,15 @@ BeesResolveAddrResult::BeesResolveAddrResult()
 {
 }
 
+shared_ptr<BtrfsIoctlLogicalInoArgs>
+BeesContext::logical_ino(const uint64_t logical, const bool all_refs)
+{
+	const auto rv = m_logical_ino_pool();
+	rv->set_logical(logical);
+	rv->set_flags(all_refs ? BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET : 0);
+	return rv;
+}
+
 BeesResolveAddrResult
 BeesContext::resolve_addr_uncached(BeesAddress addr)
 {
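For context, `BtrfsIoctlLogicalInoArgs` wraps the kernel's `LOGICAL_INO` ioctl, and the `IGNORE_OFFSET` flag set above corresponds to asking for every reference to the extent. A bare-bones call without the crucible wrapper looks roughly like this; the buffer size is arbitrary and the sketch assumes a kernel and headers with `LOGICAL_INO_V2` support.

```cpp
// Bare-bones LOGICAL_INO_V2 call -- what the crucible wrapper encapsulates.
// Not bees code; error handling is minimal.
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <fcntl.h>
#include <linux/btrfs.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	if (argc < 3) { fprintf(stderr, "usage: %s <btrfs-path> <logical-byte-address>\n", argv[0]); return 1; }

	const int fd = open(argv[1], O_RDONLY);
	if (fd < 0) { perror("open"); return 1; }

	// Output buffer for (inode, offset, root) tuples; size is arbitrary.
	const size_t buf_size = 64 * 1024;
	auto *container = static_cast<btrfs_data_container *>(calloc(1, buf_size));

	btrfs_ioctl_logical_ino_args args = {};
	args.logical = strtoull(argv[2], nullptr, 0);        // virtual (logical) address
	args.size = buf_size;                                // v2 honors the buffer size
	args.flags = BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET;   // all refs, like all_refs=true above
	args.inodes = reinterpret_cast<uintptr_t>(container);

	if (ioctl(fd, BTRFS_IOC_LOGICAL_INO_V2, &args) < 0) { perror("LOGICAL_INO_V2"); return 1; }

	// Each reference is three u64 values in the container.
	printf("%u extent references\n", container->elem_cnt / 3);

	free(container);
	close(fd);
	return 0;
}
```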
@@ -768,7 +777,8 @@ BeesContext::resolve_addr_uncached(BeesAddress addr)
 	// transaction latency, competing threads, and freeze/SIGSTOP
 	// pausing the bees process.
 
-	BtrfsIoctlLogicalInoArgs log_ino(addr.get_physical_or_zero());
+	const auto log_ino_ptr = logical_ino(addr.get_physical_or_zero(), false);
+	auto &log_ino = *log_ino_ptr;
 
 	// Time how long this takes
 	Timer resolve_timer;
@@ -811,6 +821,10 @@ BeesContext::resolve_addr_uncached(BeesAddress addr)
 
 	// Avoid performance problems - pretend resolve failed if there are too many refs
 	const size_t rv_count = log_ino.m_iors.size();
+	if (!rv_count) {
+		BEESLOGDEBUG("LOGICAL_INO returned 0 refs at " << to_hex(addr));
+		BEESCOUNT(resolve_empty);
+	}
 	if (rv_count < BEES_MAX_EXTENT_REF_COUNT) {
 		rv.m_biors = vector<BtrfsInodeOffsetRoot>(log_ino.m_iors.begin(), log_ino.m_iors.end());
 	} else {
@@ -822,7 +836,7 @@ BeesContext::resolve_addr_uncached(BeesAddress addr)
 	if (sys_usage_delta < BEES_TOXIC_SYS_DURATION) {
 		rv.m_is_toxic = false;
 	} else {
-		BEESLOGNOTICE("WORKAROUND: toxic address: addr = " << addr << ", sys_usage_delta = " << round(sys_usage_delta* 1000.0) / 1000.0 << ", user_usage_delta = " << round(user_usage_delta * 1000.0) / 1000.0 << ", rt_age = " << rt_age << ", refs " << rv_count);
+		BEESLOGDEBUG("WORKAROUND: toxic address: addr = " << addr << ", sys_usage_delta = " << round(sys_usage_delta* 1000.0) / 1000.0 << ", user_usage_delta = " << round(user_usage_delta * 1000.0) / 1000.0 << ", rt_age = " << rt_age << ", refs " << rv_count);
 		BEESCOUNT(resolve_toxic);
 		rv.m_is_toxic = true;
 	}
@@ -910,6 +924,9 @@ BeesContext::start()
 	m_tmpfile_pool.generator([=]() -> shared_ptr<BeesTempFile> {
 		return make_shared<BeesTempFile>(shared_from_this());
 	});
+	m_logical_ino_pool.generator([]() {
+		return make_shared<BtrfsIoctlLogicalInoArgs>(0);
+	});
 	m_tmpfile_pool.checkin([](const shared_ptr<BeesTempFile> &btf) {
 		catch_all([&](){
 			btf->reset();
@@ -515,7 +515,12 @@ BeesRoots::transid_max_nocache()
 uint64_t
 BeesRoots::transid_max()
 {
-	return m_transid_re.count();
+	const auto rv = m_transid_re.count();
+	// transid must be greater than zero, or we did something very wrong
+	THROW_CHECK1(runtime_error, rv, rv > 0);
+	// transid must be less than max, or we did something very wrong
+	THROW_CHECK1(runtime_error, rv, rv < numeric_limits<uint64_t>::max());
+	return rv;
 }
 
 struct BeesFileCrawl {
@@ -714,6 +714,7 @@ class BeesContext : public enable_shared_from_this<BeesContext> {
 	shared_ptr<BeesHashTable> m_hash_table;
 	shared_ptr<BeesRoots> m_roots;
 	Pool<BeesTempFile> m_tmpfile_pool;
+	Pool<BtrfsIoctlLogicalInoArgs> m_logical_ino_pool;
 
 	LRUCache<BeesResolveAddrResult, BeesAddress> m_resolve_cache;
 
@@ -753,6 +754,8 @@ public:
 
 	bool scan_forward(const BeesFileRange &bfr);
 
+	shared_ptr<BtrfsIoctlLogicalInoArgs> logical_ino(uint64_t bytenr, bool all_refs);
+
 	bool is_root_ro(uint64_t root);
 	BeesRangePair dup_extent(const BeesFileRange &src, const shared_ptr<BeesTempFile> &tmpfile);
 	bool dedup(const BeesRangePair &brp);
@@ -3,6 +3,7 @@
 #include "crucible/limits.h"
 
 #include <cassert>
+#include <cstdint>
 
 using namespace crucible;
 
@@ -12,23 +12,49 @@ using namespace std;
 void
 test_progress()
 {
+	// On create, begin == end == constructor argument
 	ProgressTracker<uint64_t> pt(123);
-	auto hold = pt.hold(234);
-	auto hold2 = pt.hold(345);
 	assert(pt.begin() == 123);
-	assert(pt.end() == 345);
-	auto hold3 = pt.hold(456);
-	assert(pt.begin() == 123);
-	assert(pt.end() == 456);
-	hold2.reset();
-	assert(pt.begin() == 123);
-	assert(pt.end() == 456);
-	hold.reset();
+	assert(pt.end() == 123);
+
+	// Holding a position past the end increases the end (and moves begin to match)
+	auto hold345 = pt.hold(345);
 	assert(pt.begin() == 345);
+	assert(pt.end() == 345);
+
+	// Holding a position before begin reduces begin, without changing end
+	auto hold234 = pt.hold(234);
+	assert(pt.begin() == 234);
+	assert(pt.end() == 345);
+
+	// Holding a position past the end increases the end, without affecting begin
+	auto hold456 = pt.hold(456);
+	assert(pt.begin() == 234);
 	assert(pt.end() == 456);
-	hold3.reset();
+
+	// Releasing a position in the middle affects neither begin nor end
+	hold345.reset();
+	assert(pt.begin() == 234);
+	assert(pt.end() == 456);
+
+	// Hold another position in the middle to test begin moving forward
+	auto hold400 = pt.hold(400);
+
+	// Releasing a position at the beginning moves begin forward
+	hold234.reset();
+	assert(pt.begin() == 400);
+	assert(pt.end() == 456);
+
+	// Releasing a position at the end doesn't move end backward
+	hold456.reset();
+	assert(pt.begin() == 400);
+	assert(pt.end() == 456);
+
+	// Releasing a position in the middle doesn't move end backward but does move begin forward
+	hold400.reset();
 	assert(pt.begin() == 456);
 	assert(pt.end() == 456);
 }
 
 int