mirror of
https://github.com/Zygo/bees.git
synced 2025-08-04 14:53:28 +02:00
Compare commits
54 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
c53fa04a2f | ||
|
d4a681c8a2 | ||
|
a819d623f7 | ||
|
de9d72da80 | ||
|
74d8bdd60f | ||
|
a5d078d48b | ||
|
e2587cae9b | ||
|
ac581273d3 | ||
|
7fcde97b70 | ||
|
e457f502b7 | ||
|
46815f1a9d | ||
|
0d251d30f4 | ||
|
b8dd9a2db0 | ||
|
8bc90b743b | ||
|
2f2a68be3d | ||
|
82f1fd8054 | ||
|
a9b07d7684 | ||
|
613ddc3c71 | ||
|
c3a39b7691 | ||
|
58db4071de | ||
|
0d3e13cc5f | ||
|
1af5fcdf34 | ||
|
87472b6086 | ||
|
ca351d389f | ||
|
1f0b8c623c | ||
|
74296c644a | ||
|
231593bfbc | ||
|
d4900cc5d5 | ||
|
81bbf7e1d4 | ||
|
bd9dc0229b | ||
|
2a1ed0b455 | ||
|
d160edc15a | ||
|
e79b242ce2 | ||
|
ea45982293 | ||
|
f209cafcd8 | ||
|
c4b31bdd5c | ||
|
08fe145988 | ||
|
bb09b1ab0e | ||
|
94d9945d04 | ||
|
a02588b16f | ||
|
21cedfb13e | ||
|
b9abcceacb | ||
|
31f3a8d67d | ||
|
9beb602b16 | ||
|
0580c10082 | ||
|
1cbc894e6f | ||
|
d74862f1fc | ||
|
e40339856f | ||
|
1dd96f20c6 | ||
|
cd7a71aba3 | ||
|
e99a505b3b | ||
|
3e89fe34ed | ||
|
dc74766179 | ||
|
3a33a5386b |
26
README.md
26
README.md
@@ -6,30 +6,30 @@ Best-Effort Extent-Same, a btrfs deduplication agent.
|
|||||||
About bees
|
About bees
|
||||||
----------
|
----------
|
||||||
|
|
||||||
bees is a block-oriented userspace deduplication agent designed for large
|
bees is a block-oriented userspace deduplication agent designed to scale
|
||||||
btrfs filesystems. It is an offline dedupe combined with an incremental
|
up to large btrfs filesystems. It is an offline dedupe combined with
|
||||||
data scan capability to minimize time data spends on disk from write
|
an incremental data scan capability to minimize time data spends on disk
|
||||||
to dedupe.
|
from write to dedupe.
|
||||||
|
|
||||||
Strengths
|
Strengths
|
||||||
---------
|
---------
|
||||||
|
|
||||||
* Space-efficient hash table and matching algorithms - can use as little as 1 GB hash table per 10 TB unique data (0.1GB/TB)
|
* Space-efficient hash table - can use as little as 1 GB hash table per 10 TB unique data (0.1GB/TB)
|
||||||
* Daemon incrementally dedupes new data using btrfs tree search
|
* Daemon mode - incrementally dedupes new data as it appears
|
||||||
|
* Largest extents first - recover more free space during fixed maintenance windows
|
||||||
* Works with btrfs compression - dedupe any combination of compressed and uncompressed files
|
* Works with btrfs compression - dedupe any combination of compressed and uncompressed files
|
||||||
* Works around btrfs filesystem structure to free more disk space
|
* Whole-filesystem dedupe - scans data only once, even with snapshots and reflinks
|
||||||
* Persistent hash table for rapid restart after shutdown
|
* Persistent hash table for rapid restart after shutdown
|
||||||
* Whole-filesystem dedupe - including snapshots
|
|
||||||
* Constant hash table size - no increased RAM usage if data set becomes larger
|
* Constant hash table size - no increased RAM usage if data set becomes larger
|
||||||
* Works on live data - no scheduled downtime required
|
* Works on live data - no scheduled downtime required
|
||||||
* Automatic self-throttling based on system load
|
* Automatic self-throttling - reduces system load
|
||||||
|
* btrfs support - recovers more free space from btrfs than naive dedupers
|
||||||
|
|
||||||
Weaknesses
|
Weaknesses
|
||||||
----------
|
----------
|
||||||
|
|
||||||
* Whole-filesystem dedupe - has no include/exclude filters, does not accept file lists
|
* Whole-filesystem dedupe - has no include/exclude filters, does not accept file lists
|
||||||
* Requires root privilege (or `CAP_SYS_ADMIN`)
|
* Requires root privilege (`CAP_SYS_ADMIN` plus the usual filesystem read/modify caps)
|
||||||
* First run may require temporary disk space for extent reorganization
|
|
||||||
* [First run may increase metadata space usage if many snapshots exist](docs/gotchas.md)
|
* [First run may increase metadata space usage if many snapshots exist](docs/gotchas.md)
|
||||||
* Constant hash table size - no decreased RAM usage if data set becomes smaller
|
* Constant hash table size - no decreased RAM usage if data set becomes smaller
|
||||||
* btrfs only
|
* btrfs only
|
||||||
@@ -46,7 +46,7 @@ Recommended Reading
|
|||||||
-------------------
|
-------------------
|
||||||
|
|
||||||
* [bees Gotchas](docs/gotchas.md)
|
* [bees Gotchas](docs/gotchas.md)
|
||||||
* [btrfs kernel bugs](docs/btrfs-kernel.md) - especially DATA CORRUPTION WARNING
|
* [btrfs kernel bugs](docs/btrfs-kernel.md) - especially DATA CORRUPTION WARNING for old kernels
|
||||||
* [bees vs. other btrfs features](docs/btrfs-other.md)
|
* [bees vs. other btrfs features](docs/btrfs-other.md)
|
||||||
* [What to do when something goes wrong](docs/wrong.md)
|
* [What to do when something goes wrong](docs/wrong.md)
|
||||||
|
|
||||||
@@ -69,6 +69,6 @@ You can also use Github:
|
|||||||
Copyright & License
|
Copyright & License
|
||||||
-------------------
|
-------------------
|
||||||
|
|
||||||
Copyright 2015-2023 Zygo Blaxell <bees@furryterror.org>.
|
Copyright 2015-2025 Zygo Blaxell <bees@furryterror.org>.
|
||||||
|
|
||||||
GPL (version 3 or later).
|
GPL (version 3 or later).
|
||||||
|
@@ -1,31 +1,24 @@
|
|||||||
Recommended Kernel Version for bees
|
Recommended Linux Kernel Version for bees
|
||||||
===================================
|
=========================================
|
||||||
|
|
||||||
First, a warning that is not specific to bees:
|
First, a warning about old Linux kernel versions:
|
||||||
|
|
||||||
> **Kernel 5.1, 5.2, and 5.3 should not be used with btrfs due to a
|
> **Linux kernel version 5.1, 5.2, and 5.3 should not be used with btrfs
|
||||||
severe regression that can lead to fatal metadata corruption.**
|
due to a severe regression that can lead to fatal metadata corruption.**
|
||||||
This issue is fixed in kernel 5.4.14 and later.
|
This issue is fixed in version 5.4.14 and later.
|
||||||
|
|
||||||
**Recommended kernel versions for bees are 4.19, 5.4, 5.10, 5.11, 5.15,
|
**Recommended Linux kernel versions for bees are 5.4, 5.10, 5.15, 6.1,
|
||||||
6.0, or 6.1, with recent LTS and -stable updates.** The latest released
|
6.6, or 6.12 with recent LTS and -stable updates.** The latest released
|
||||||
kernel as of this writing is 6.4.1.
|
kernel as of this writing is 6.12.9, and the earliest supported LTS
|
||||||
|
kernel is 5.4.
|
||||||
|
|
||||||
4.14, 4.9, and 4.4 LTS kernels with recent updates are OK with some
|
Some optional bees features use kernel APIs introduced in kernel 4.15
|
||||||
issues. Older kernels will be slower (a little slower or a lot slower
|
(extent scan) and 5.6 (`openat2` support). These bees features are not
|
||||||
depending on which issues are triggered). Not all fixes are backported.
|
available on older kernels. Support for older kernels may be removed
|
||||||
|
in a future bees release.
|
||||||
Obsolete non-LTS kernels have a variety of unfixed issues and should
|
|
||||||
not be used with btrfs. For details see the table below.
|
|
||||||
|
|
||||||
bees requires btrfs kernel API version 4.2 or higher, and does not work
|
|
||||||
at all on older kernels.
|
|
||||||
|
|
||||||
Some bees features rely on kernel 4.15 to work, and these features will
|
|
||||||
not be available on older kernels. Currently, bees is still usable on
|
|
||||||
older kernels with degraded performance or with options disabled, but
|
|
||||||
support for older kernels may be removed.
|
|
||||||
|
|
||||||
|
bees will not run at all on kernels before 4.2 due to lack of minimal
|
||||||
|
API support.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -71,7 +64,7 @@ These bugs are particularly popular among bees users, though not all are specifi
|
|||||||
| 6.3, backported to 5.15.107, 6.1.24, 6.2.11 | 6.3 | vmalloc error, failed to allocate pages | 6.3.10, 6.4 and later. Bug (f349b15e183d "mm: vmalloc: avoid warn_alloc noise caused by fatal signal" in v6.3-rc6) backported to 6.1.24, 6.2.11, and 5.15.107. | 95a301eefa82 mm/vmalloc: do not output a spurious warning when huge vmalloc() fails
|
| 6.3, backported to 5.15.107, 6.1.24, 6.2.11 | 6.3 | vmalloc error, failed to allocate pages | 6.3.10, 6.4 and later. Bug (f349b15e183d "mm: vmalloc: avoid warn_alloc noise caused by fatal signal" in v6.3-rc6) backported to 6.1.24, 6.2.11, and 5.15.107. | 95a301eefa82 mm/vmalloc: do not output a spurious warning when huge vmalloc() fails
|
||||||
| 6.2 | 6.3 | `IGNORE_OFFSET` flag ignored in `LOGICAL_INO` ioctl | 6.2.16, 6.3.3, 6.4 and later | 0cad8f14d70c btrfs: fix backref walking not returning all inode refs
|
| 6.2 | 6.3 | `IGNORE_OFFSET` flag ignored in `LOGICAL_INO` ioctl | 6.2.16, 6.3.3, 6.4 and later | 0cad8f14d70c btrfs: fix backref walking not returning all inode refs
|
||||||
| 6.10 | 6.11 | `adding refs to an existing tree ref`, `failed to run delayed ref`, then read-only | 6.11.10, 6.12 and later | 7d493a5ecc26 btrfs: fix incorrect comparison for delayed refs
|
| 6.10 | 6.11 | `adding refs to an existing tree ref`, `failed to run delayed ref`, then read-only | 6.11.10, 6.12 and later | 7d493a5ecc26 btrfs: fix incorrect comparison for delayed refs
|
||||||
| 5.4 | - | kernel hang when multiple threads are running `LOGICAL_INO` and dedupe ioctl on the same extent | - | workaround: avoid doing that
|
| 5.4 | - | kernel hang when multiple threads are running `LOGICAL_INO` and dedupe/clone ioctl on the same extent | - | workaround: avoid doing that
|
||||||
|
|
||||||
"Last bad kernel" refers to that version's last stable update from
|
"Last bad kernel" refers to that version's last stable update from
|
||||||
kernel.org. Distro kernels may backport additional fixes. Consult
|
kernel.org. Distro kernels may backport additional fixes. Consult
|
||||||
@@ -97,12 +90,12 @@ contains the last committed component of the fix.
|
|||||||
Workarounds for known kernel bugs
|
Workarounds for known kernel bugs
|
||||||
---------------------------------
|
---------------------------------
|
||||||
|
|
||||||
* **Hangs with concurrent `LOGICAL_INO` and dedupe**: on all
|
* **Hangs with concurrent `LOGICAL_INO` and dedupe/clone**: on all
|
||||||
kernel versions so far, multiple threads running `LOGICAL_INO`
|
kernel versions so far, multiple threads running `LOGICAL_INO` and
|
||||||
and dedupe ioctls at the same time on the same inodes or extents
|
dedupe/clone ioctls at the same time on the same inodes or extents
|
||||||
can lead to a kernel hang. The kernel enters an infinite loop in
|
can lead to a kernel hang. The kernel enters an infinite loop in
|
||||||
`add_all_parents`, where `count` is 0, `ref->count` is 1, and
|
`add_all_parents`, where `count` is 0, `ref->count` is 1, and
|
||||||
`btrfs_next_item` or `btrfs_next_old_item` never find a matching ref).
|
`btrfs_next_item` or `btrfs_next_old_item` never find a matching ref.
|
||||||
|
|
||||||
bees has two workarounds for this bug: 1. schedule work so that multiple
|
bees has two workarounds for this bug: 1. schedule work so that multiple
|
||||||
threads do not simultaneously access the same inode or the same extent,
|
threads do not simultaneously access the same inode or the same extent,
|
||||||
@@ -123,58 +116,32 @@ Workarounds for known kernel bugs
|
|||||||
|
|
||||||
It is still theoretically possible to trigger the kernel bug when
|
It is still theoretically possible to trigger the kernel bug when
|
||||||
running bees at the same time as other dedupers, or other programs
|
running bees at the same time as other dedupers, or other programs
|
||||||
that use `LOGICAL_INO` like `btdu`; however, it's extremely difficult
|
that use `LOGICAL_INO` like `btdu`, or when performing a reflink clone
|
||||||
to reproduce the bug without closely cooperating threads.
|
operation such as `cp` or `mv`; however, it's extremely difficult to
|
||||||
|
reproduce the bug without closely cooperating threads.
|
||||||
|
|
||||||
* **Slow backrefs** (aka toxic extents): Under certain conditions,
|
* **Slow backrefs** (aka toxic extents): On older kernels, under certain
|
||||||
if the number of references to a single shared extent grows too
|
conditions, if the number of references to a single shared extent grows
|
||||||
high, the kernel consumes more and more CPU while also holding locks
|
too high, the kernel consumes more and more CPU while also holding
|
||||||
that delay write access to the filesystem. bees avoids this bug
|
locks that delay write access to the filesystem. This is no longer
|
||||||
by measuring the time the kernel spends performing `LOGICAL_INO`
|
a concern on kernels after 5.7 (or an up-to-date 5.4 LTS version),
|
||||||
operations and permanently blacklisting any extent or hash involved
|
but there are still some remains of earlier workarounds for this issue
|
||||||
where the kernel starts to get slow. In the bees log, such blocks
|
in bees that have not been fully removed.
|
||||||
are labelled as 'toxic' hash/block addresses. Toxic extents are
|
|
||||||
rare (about 1 in 100,000 extents become toxic), but toxic extents can
|
|
||||||
become 8 orders of magnitude more expensive to process than the fastest
|
|
||||||
non-toxic extents. This seems to affect all dedupe agents on btrfs;
|
|
||||||
at this time of writing only bees has a workaround for this bug.
|
|
||||||
|
|
||||||
This workaround is less necessary for kernels 5.4.96, 5.7 and later,
|
bees avoided this bug by measuring the time the kernel spends performing
|
||||||
though the bees workaround can still be triggered on newer kernels
|
`LOGICAL_INO` operations and permanently blacklisting any extent or
|
||||||
by changes in btrfs since kernel version 5.1.
|
hash involved where the kernel starts to get slow. In the bees log,
|
||||||
|
such blocks are labelled as 'toxic' hash/block addresses.
|
||||||
|
|
||||||
|
Future bees releases will remove toxic extent detection (it only detects
|
||||||
|
false positives now) and clear all previously saved toxic extent bits.
|
||||||
|
|
||||||
* **dedupe breaks `btrfs send` in old kernels**. The bees option
|
* **dedupe breaks `btrfs send` in old kernels**. The bees option
|
||||||
`--workaround-btrfs-send` prevents any modification of read-only subvols
|
`--workaround-btrfs-send` prevents any modification of read-only subvols
|
||||||
in order to avoid breaking `btrfs send`.
|
in order to avoid breaking `btrfs send` on kernels before 5.2.
|
||||||
|
|
||||||
This workaround is no longer necessary to avoid kernel crashes
|
This workaround is no longer necessary to avoid kernel crashes and
|
||||||
and send performance failure on kernel 4.9.207, 4.14.159, 4.19.90,
|
send performance failure on kernel 5.4.4 and later. bees will pause
|
||||||
5.3.17, 5.4.4, 5.5 and later; however, some conflict between send
|
dedupe until the send is finished on current kernels.
|
||||||
and dedupe still remains, so the workaround is still useful.
|
|
||||||
|
|
||||||
`btrfs receive` is not and has never been affected by this issue.
|
`btrfs receive` is not and has never been affected by this issue.
|
||||||
|
|
||||||
Unfixed kernel bugs
|
|
||||||
-------------------
|
|
||||||
|
|
||||||
* **The kernel does not permit `btrfs send` and dedupe to run at the
|
|
||||||
same time**. Recent kernels no longer crash, but now refuse one
|
|
||||||
operation with an error if the other operation was already running.
|
|
||||||
|
|
||||||
bees has not been updated to handle the new dedupe behavior optimally.
|
|
||||||
Optimal behavior is to defer dedupe operations when send is detected,
|
|
||||||
and resume after the send is finished. Current bees behavior is to
|
|
||||||
complain loudly about each individual dedupe failure in log messages,
|
|
||||||
and abandon duplicate data references in the snapshot that send is
|
|
||||||
processing. A future bees version shall have better handling for
|
|
||||||
this situation.
|
|
||||||
|
|
||||||
Workaround: send `SIGSTOP` to bees, or terminate the bees process,
|
|
||||||
before running `btrfs send`.
|
|
||||||
|
|
||||||
This workaround is not strictly required if snapshot is deleted after
|
|
||||||
sending. In that case, any duplicate data blocks that were not removed
|
|
||||||
by dedupe will be removed by snapshot delete instead. The workaround
|
|
||||||
still saves some IO.
|
|
||||||
|
|
||||||
`btrfs receive` is not affected by this issue.
|
|
||||||
|
@@ -3,40 +3,34 @@ Good Btrfs Feature Interactions
|
|||||||
|
|
||||||
bees has been tested in combination with the following:
|
bees has been tested in combination with the following:
|
||||||
|
|
||||||
* btrfs compression (zlib, lzo, zstd), mixtures of compressed and uncompressed extents
|
* btrfs compression (zlib, lzo, zstd)
|
||||||
* PREALLOC extents (unconditionally replaced with holes)
|
* PREALLOC extents (unconditionally replaced with holes)
|
||||||
* HOLE extents and btrfs no-holes feature
|
* HOLE extents and btrfs no-holes feature
|
||||||
* Other deduplicators, reflink copies (though bees may decide to redo their work)
|
* Other deduplicators (`duperemove`, `jdupes`)
|
||||||
* btrfs snapshots and non-snapshot subvols (RW and RO)
|
* Reflink copies (modern coreutils `cp` and `mv`)
|
||||||
* Concurrent file modification (e.g. PostgreSQL and sqlite databases, VMs, build daemons)
|
* Concurrent file modification (e.g. PostgreSQL and sqlite databases, VMs, build daemons)
|
||||||
* All btrfs RAID profiles
|
* All btrfs RAID profiles: single, dup, raid0, raid1, raid10, raid1c3, raid1c4, raid5, raid6
|
||||||
* IO errors during dedupe (read errors will throw exceptions, bees will catch them and skip over the affected extent)
|
* IO errors during dedupe (affected extents are skipped)
|
||||||
* Filesystems mounted with or without the `flushoncommit` option
|
|
||||||
* 4K filesystem data block size / clone alignment
|
* 4K filesystem data block size / clone alignment
|
||||||
* 64-bit and 32-bit LE host CPUs (amd64, x86, arm)
|
* 64-bit and 32-bit LE host CPUs (amd64, x86, arm)
|
||||||
* Large files (kernel 5.4 or later strongly recommended)
|
* Large files (kernel 5.4 or later strongly recommended)
|
||||||
* Filesystems up to 90T+ bytes, 1000M+ files
|
* Filesystem data sizes up to 100T+ bytes, 1000M+ files
|
||||||
|
* `open(O_DIRECT)` (seems to work as well--or as poorly--with bees as with any other btrfs feature)
|
||||||
|
* btrfs-convert from ext2/3/4
|
||||||
|
* btrfs `autodefrag` mount option
|
||||||
|
* btrfs balance (data balances cause rescan of relocated data)
|
||||||
|
* btrfs block-group-tree
|
||||||
|
* btrfs `flushoncommit` and `noflushoncommit` mount options
|
||||||
|
* btrfs mixed block groups
|
||||||
|
* btrfs `nodatacow`/`nodatasum` inode attribute or mount option (bees skips all nodatasum files)
|
||||||
|
* btrfs qgroups and quota support (_not_ squotas)
|
||||||
* btrfs receive
|
* btrfs receive
|
||||||
* btrfs nodatacow/nodatasum inode attribute or mount option (bees skips all nodatasum files)
|
* btrfs scrub
|
||||||
* open(O_DIRECT) (seems to work as well--or as poorly--with bees as with any other btrfs feature)
|
* btrfs send (dedupe pauses automatically, kernel 5.4 or later required)
|
||||||
* lvm dm-cache, writecache
|
* btrfs snapshot, non-snapshot subvols (RW and RO), snapshot delete
|
||||||
|
|
||||||
Bad Btrfs Feature Interactions
|
**Note:** some btrfs features have minimum kernel versions which are
|
||||||
------------------------------
|
higher than the minimum kernel version for bees.
|
||||||
|
|
||||||
bees has been tested in combination with the following, and various problems are known:
|
|
||||||
|
|
||||||
* btrfs send: there are bugs in `btrfs send` that can be triggered by
|
|
||||||
bees on old kernels. The [`--workaround-btrfs-send` option](options.md)
|
|
||||||
works around this issue by preventing bees from modifying read-only
|
|
||||||
snapshots.
|
|
||||||
|
|
||||||
* btrfs qgroups: very slow, sometimes hangs...and it's even worse when
|
|
||||||
bees is running.
|
|
||||||
|
|
||||||
* btrfs autodefrag mount option: bees cannot distinguish autodefrag
|
|
||||||
activity from normal filesystem activity, and may try to undo the
|
|
||||||
autodefrag if duplicate copies of the defragmented data exist.
|
|
||||||
|
|
||||||
Untested Btrfs Feature Interactions
|
Untested Btrfs Feature Interactions
|
||||||
-----------------------------------
|
-----------------------------------
|
||||||
@@ -45,10 +39,6 @@ bees has not been tested with the following, and undesirable interactions may oc
|
|||||||
|
|
||||||
* Non-4K filesystem data block size (should work if recompiled)
|
* Non-4K filesystem data block size (should work if recompiled)
|
||||||
* Non-equal hash (SUM) and filesystem data block (CLONE) sizes (need to fix that eventually)
|
* Non-equal hash (SUM) and filesystem data block (CLONE) sizes (need to fix that eventually)
|
||||||
* btrfs seed filesystems (no particular reason it wouldn't work, but no one has reported trying)
|
* btrfs seed filesystems, raid-stripe-tree, squotas (no particular reason these wouldn't work, but no one has reported trying)
|
||||||
* btrfs out-of-tree kernel patches (e.g. in-kernel dedupe, encryption, extent tree v2)
|
* btrfs out-of-tree kernel patches (e.g. encryption, extent tree v2)
|
||||||
* btrfs-convert from ext2/3/4 (never tested, might run out of space or ignore significant portions of the filesystem due to sanity checks)
|
|
||||||
* btrfs mixed block groups (don't know a reason why it would *not* work, but never tested)
|
|
||||||
* Host CPUs with exotic page sizes, alignment requirements, or endianness (ppc, alpha, sparc, strongarm, s390, mips, m68k...)
|
* Host CPUs with exotic page sizes, alignment requirements, or endianness (ppc, alpha, sparc, strongarm, s390, mips, m68k...)
|
||||||
* bcache: used to be in the "bad" list, now in the "untested" list because nobody is rigorously testing, and bcache bugs come and go
|
|
||||||
* flashcache: an out-of-tree cache-HDD-on-SSD block layer helper
|
|
||||||
|
@@ -26,11 +26,7 @@ Here are some numbers to estimate appropriate hash table sizes:
|
|||||||
Notes:
|
Notes:
|
||||||
|
|
||||||
* If the hash table is too large, no extra dedupe efficiency is
|
* If the hash table is too large, no extra dedupe efficiency is
|
||||||
obtained, and the extra space wastes RAM. If the hash table contains
|
obtained, and the extra space wastes RAM.
|
||||||
more block records than there are blocks in the filesystem, the extra
|
|
||||||
space can slow bees down. A table that is too large prevents obsolete
|
|
||||||
data from being evicted, so bees wastes time looking for matching data
|
|
||||||
that is no longer present on the filesystem.
|
|
||||||
|
|
||||||
* If the hash table is too small, bees extrapolates from matching
|
* If the hash table is too small, bees extrapolates from matching
|
||||||
blocks to find matching adjacent blocks in the filesystem that have been
|
blocks to find matching adjacent blocks in the filesystem that have been
|
||||||
@@ -59,19 +55,19 @@ patterns on dedupe effectiveness without performing deep inspection of
|
|||||||
both the filesystem data and its structure--a task that is as expensive
|
both the filesystem data and its structure--a task that is as expensive
|
||||||
as performing the deduplication.
|
as performing the deduplication.
|
||||||
|
|
||||||
* **Compression** on the filesystem reduces the average extent length
|
* **Compression** in files reduces the average extent length compared
|
||||||
compared to uncompressed filesystems. The maximum compressed extent
|
to uncompressed files. The maximum compressed extent length on
|
||||||
length on btrfs is 128KB, while the maximum uncompressed extent length
|
btrfs is 128KB, while the maximum uncompressed extent length is 128MB.
|
||||||
is 128MB. Longer extents decrease the optimum hash table size while
|
Longer extents decrease the optimum hash table size while shorter extents
|
||||||
shorter extents increase the optimum hash table size because the
|
increase the optimum hash table size, because the probability of a hash
|
||||||
probability of a hash table entry being present (i.e. unevicted) in
|
table entry being present (i.e. unevicted) in each extent is proportional
|
||||||
each extent is proportional to the extent length.
|
to the extent length.
|
||||||
|
|
||||||
As a rule of thumb, the optimal hash table size for a compressed
|
As a rule of thumb, the optimal hash table size for a compressed
|
||||||
filesystem is 2-4x larger than the optimal hash table size for the same
|
filesystem is 2-4x larger than the optimal hash table size for the same
|
||||||
data on an uncompressed filesystem. Dedupe efficiency falls dramatically
|
data on an uncompressed filesystem. Dedupe efficiency falls rapidly with
|
||||||
with hash tables smaller than 128MB/TB as the average dedupe extent size
|
hash tables smaller than 128MB/TB as the average dedupe extent size is
|
||||||
is larger than the largest possible compressed extent size (128KB).
|
larger than the largest possible compressed extent size (128KB).
|
||||||
|
|
||||||
* **Short writes or fragmentation** also shorten the average extent
|
* **Short writes or fragmentation** also shorten the average extent
|
||||||
length and increase optimum hash table size. If a database writes to
|
length and increase optimum hash table size. If a database writes to
|
||||||
@@ -115,7 +111,6 @@ Extent scan mode:
|
|||||||
* Works with 4.15 and later kernels.
|
* Works with 4.15 and later kernels.
|
||||||
* Can estimate progress and provide an ETA.
|
* Can estimate progress and provide an ETA.
|
||||||
* Can optimize scanning order to dedupe large extents first.
|
* Can optimize scanning order to dedupe large extents first.
|
||||||
* Cannot avoid modifying read-only subvols.
|
|
||||||
* Can keep up with frequent creation and deletion of snapshots.
|
* Can keep up with frequent creation and deletion of snapshots.
|
||||||
|
|
||||||
Subvol scan modes:
|
Subvol scan modes:
|
||||||
@@ -123,8 +118,7 @@ Subvol scan modes:
|
|||||||
* Work with 4.14 and earlier kernels.
|
* Work with 4.14 and earlier kernels.
|
||||||
* Cannot estimate or report progress.
|
* Cannot estimate or report progress.
|
||||||
* Cannot optimize scanning order by extent size.
|
* Cannot optimize scanning order by extent size.
|
||||||
* Can avoid modifying read-only subvols (for `btrfs send` workaround).
|
* Have problems keeping up with multiple snapshots created during a scan.
|
||||||
* Have problems keeping up with snapshots created during a scan.
|
|
||||||
|
|
||||||
The default scan mode is 4, "extent".
|
The default scan mode is 4, "extent".
|
||||||
|
|
||||||
@@ -212,7 +206,7 @@ Extent scan mode
|
|||||||
Scan mode 4, "extent", scans the extent tree instead of the subvol trees.
|
Scan mode 4, "extent", scans the extent tree instead of the subvol trees.
|
||||||
Extent scan mode reads each extent once, regardless of the number of
|
Extent scan mode reads each extent once, regardless of the number of
|
||||||
reflinks or snapshots. It adapts to the creation of new snapshots
|
reflinks or snapshots. It adapts to the creation of new snapshots
|
||||||
immediately, without having to revisit old data.
|
and reflinks immediately, without having to revisit old data.
|
||||||
|
|
||||||
In the extent scan mode, extents are separated into multiple size tiers
|
In the extent scan mode, extents are separated into multiple size tiers
|
||||||
to prioritize large extents over small ones. Deduping large extents
|
to prioritize large extents over small ones. Deduping large extents
|
||||||
@@ -268,17 +262,54 @@ send` in extent scan mode, and restart bees after the `send` is complete.
|
|||||||
Threads and load management
|
Threads and load management
|
||||||
---------------------------
|
---------------------------
|
||||||
|
|
||||||
By default, bees creates one worker thread for each CPU detected.
|
By default, bees creates one worker thread for each CPU detected. These
|
||||||
These threads then perform scanning and dedupe operations. The number of
|
threads then perform scanning and dedupe operations. bees attempts to
|
||||||
worker threads can be set with the [`--thread-count` and `--thread-factor`
|
maximize the amount of productive work each thread does, until either the
|
||||||
options](options.md).
|
threads are all continuously busy, or there is no remaining work to do.
|
||||||
|
|
||||||
If desired, bees can automatically increase or decrease the number
|
In many cases it is not desirable to continually run bees at maximum
|
||||||
of worker threads in response to system load. This reduces impact on
|
performance. Maximum performance is not necessary if bees can dedupe
|
||||||
the rest of the system by pausing bees when other CPU and IO intensive
|
new data faster than it appears on the filesystem. If it only takes
|
||||||
loads are active on the system, and resumes bees when the other loads
|
bees 10 minutes per day to dedupe all new data on a filesystem, then
|
||||||
are inactive. This is configured with the [`--loadavg-target` and
|
bees doesn't need to run for more than 10 minutes per day.
|
||||||
`--thread-min` options](options.md).
|
|
||||||
|
bees supports a number of options for reducing system load:
|
||||||
|
|
||||||
|
* Run bees for a few hours per day, at an off-peak time (i.e. during
|
||||||
|
a maintenance window), instead of running bees continuously. Any data
|
||||||
|
added to the filesystem while bees is not running will be scanned when
|
||||||
|
bees restarts. At the end of the maintenance window, terminate the
|
||||||
|
bees process with SIGTERM to write the hash table and scan position
|
||||||
|
for the next maintenance window.
|
||||||
|
|
||||||
|
* Temporarily pause bees operation by sending the bees process SIGUSR1,
|
||||||
|
and resume operation with SIGUSR2. This is preferable to freezing
|
||||||
|
and thawing the process, e.g. with freezer cgroups or SIGSTOP/SIGCONT
|
||||||
|
signals, because it allows bees to close open file handles that would
|
||||||
|
otherwise prevent those files from being deleted while bees is frozen.
|
||||||
|
|
||||||
|
* Reduce the number of worker threads with the [`--thread-count` or
|
||||||
|
`--thread-factor` options](options.md). This simply leaves CPU cores
|
||||||
|
idle so that other applications on the host can use them, or to save
|
||||||
|
power.
|
||||||
|
|
||||||
|
* Allow bees to automatically track system load and increase or decrease
|
||||||
|
the number of threads to reach a target system load. This reduces
|
||||||
|
impact on the rest of the system by pausing bees when other CPU and IO
|
||||||
|
intensive loads are active on the system, and resumes bees when the other
|
||||||
|
loads are inactive. This is configured with the [`--loadavg-target`
|
||||||
|
and `--thread-min` options](options.md).
|
||||||
|
|
||||||
|
* Allow bees to self-throttle operations that enqueue delayed work
|
||||||
|
within btrfs. These operations are not well controlled by Linux
|
||||||
|
features such as process priority or IO priority or IO rate-limiting,
|
||||||
|
because the enqueued work is submitted to btrfs several seconds before
|
||||||
|
btrfs performs the work. By the time btrfs performs the work, it's too
|
||||||
|
late for external throttling to be effective. The [`--throttle-factor`
|
||||||
|
option](options.md) tracks how long it takes btrfs to complete queued
|
||||||
|
operations, and reduces bees's queued work submission rate to match
|
||||||
|
btrfs's queued work completion rate (or a fraction thereof, to reduce
|
||||||
|
system load).
|
||||||
|
|
||||||
Log verbosity
|
Log verbosity
|
||||||
-------------
|
-------------
|
||||||
|
@@ -6,30 +6,30 @@ Best-Effort Extent-Same, a btrfs deduplication agent.
|
|||||||
About bees
|
About bees
|
||||||
----------
|
----------
|
||||||
|
|
||||||
bees is a block-oriented userspace deduplication agent designed for large
|
bees is a block-oriented userspace deduplication agent designed to scale
|
||||||
btrfs filesystems. It is an offline dedupe combined with an incremental
|
up to large btrfs filesystems. It is an offline dedupe combined with
|
||||||
data scan capability to minimize time data spends on disk from write
|
an incremental data scan capability to minimize time data spends on disk
|
||||||
to dedupe.
|
from write to dedupe.
|
||||||
|
|
||||||
Strengths
|
Strengths
|
||||||
---------
|
---------
|
||||||
|
|
||||||
* Space-efficient hash table and matching algorithms - can use as little as 1 GB hash table per 10 TB unique data (0.1GB/TB)
|
* Space-efficient hash table - can use as little as 1 GB hash table per 10 TB unique data (0.1GB/TB)
|
||||||
* Daemon incrementally dedupes new data using btrfs tree search
|
* Daemon mode - incrementally dedupes new data as it appears
|
||||||
|
* Largest extents first - recover more free space during fixed maintenance windows
|
||||||
* Works with btrfs compression - dedupe any combination of compressed and uncompressed files
|
* Works with btrfs compression - dedupe any combination of compressed and uncompressed files
|
||||||
* Works around btrfs filesystem structure to free more disk space
|
* Whole-filesystem dedupe - scans data only once, even with snapshots and reflinks
|
||||||
* Persistent hash table for rapid restart after shutdown
|
* Persistent hash table for rapid restart after shutdown
|
||||||
* Whole-filesystem dedupe - including snapshots
|
|
||||||
* Constant hash table size - no increased RAM usage if data set becomes larger
|
* Constant hash table size - no increased RAM usage if data set becomes larger
|
||||||
* Works on live data - no scheduled downtime required
|
* Works on live data - no scheduled downtime required
|
||||||
* Automatic self-throttling based on system load
|
* Automatic self-throttling - reduces system load
|
||||||
|
* btrfs support - recovers more free space from btrfs than naive dedupers
|
||||||
|
|
||||||
Weaknesses
|
Weaknesses
|
||||||
----------
|
----------
|
||||||
|
|
||||||
* Whole-filesystem dedupe - has no include/exclude filters, does not accept file lists
|
* Whole-filesystem dedupe - has no include/exclude filters, does not accept file lists
|
||||||
* Requires root privilege (or `CAP_SYS_ADMIN`)
|
* Requires root privilege (`CAP_SYS_ADMIN` plus the usual filesystem read/modify caps)
|
||||||
* First run may require temporary disk space for extent reorganization
|
|
||||||
* [First run may increase metadata space usage if many snapshots exist](gotchas.md)
|
* [First run may increase metadata space usage if many snapshots exist](gotchas.md)
|
||||||
* Constant hash table size - no decreased RAM usage if data set becomes smaller
|
* Constant hash table size - no decreased RAM usage if data set becomes smaller
|
||||||
* btrfs only
|
* btrfs only
|
||||||
@@ -46,7 +46,7 @@ Recommended Reading
|
|||||||
-------------------
|
-------------------
|
||||||
|
|
||||||
* [bees Gotchas](gotchas.md)
|
* [bees Gotchas](gotchas.md)
|
||||||
* [btrfs kernel bugs](btrfs-kernel.md) - especially DATA CORRUPTION WARNING
|
* [btrfs kernel bugs](btrfs-kernel.md) - especially DATA CORRUPTION WARNING for old kernels
|
||||||
* [bees vs. other btrfs features](btrfs-other.md)
|
* [bees vs. other btrfs features](btrfs-other.md)
|
||||||
* [What to do when something goes wrong](wrong.md)
|
* [What to do when something goes wrong](wrong.md)
|
||||||
|
|
||||||
@@ -69,6 +69,6 @@ You can also use Github:
|
|||||||
Copyright & License
|
Copyright & License
|
||||||
-------------------
|
-------------------
|
||||||
|
|
||||||
Copyright 2015-2023 Zygo Blaxell <bees@furryterror.org>.
|
Copyright 2015-2025 Zygo Blaxell <bees@furryterror.org>.
|
||||||
|
|
||||||
GPL (version 3 or later).
|
GPL (version 3 or later).
|
||||||
|
@@ -36,6 +36,34 @@
|
|||||||
|
|
||||||
Has no effect unless `--loadavg-target` is used to specify a target load.
|
Has no effect unless `--loadavg-target` is used to specify a target load.
|
||||||
|
|
||||||
|
* `--throttle-factor FACTOR`
|
||||||
|
|
||||||
|
In order to avoid saturating btrfs deferred work queues, bees tracks
|
||||||
|
the time that operations with delayed effect (dedupe and tmpfile copy)
|
||||||
|
and operations with long run times (`LOGICAL_INO`) run. If an operation
|
||||||
|
finishes before the average run time for that operation, bees will
|
||||||
|
sleep for the remainder of the average run time, so that operations
|
||||||
|
are submitted to btrfs at a rate similar to the rate that btrfs can
|
||||||
|
complete them.
|
||||||
|
|
||||||
|
The `FACTOR` is multiplied by the average run time for each operation
|
||||||
|
to calculate the target delay time.
|
||||||
|
|
||||||
|
`FACTOR` 0 is the default, which adds no delays. bees will attempt
|
||||||
|
to saturate btrfs delayed work queues as quickly as possible, which
|
||||||
|
may impact other processes on the same filesystem, or even slow down
|
||||||
|
bees itself.
|
||||||
|
|
||||||
|
`FACTOR` 1.0 will attempt to keep btrfs delayed work queues filled at
|
||||||
|
a steady average rate.
|
||||||
|
|
||||||
|
`FACTOR` more than 1.0 will add delays longer than the average
|
||||||
|
run time (e.g. 10.0 will delay all operations that take less than 10x
|
||||||
|
the average run time). High values of `FACTOR` may be desirable when
|
||||||
|
using bees with other applications on the same filesystem.
|
||||||
|
|
||||||
|
The maximum delay per operation is 60 seconds.
|
||||||
|
|
||||||
## Filesystem tree traversal options
|
## Filesystem tree traversal options
|
||||||
|
|
||||||
* `--scan-mode MODE` or `-m`
|
* `--scan-mode MODE` or `-m`
|
||||||
@@ -56,19 +84,22 @@
|
|||||||
|
|
||||||
* `--workaround-btrfs-send` or `-a`
|
* `--workaround-btrfs-send` or `-a`
|
||||||
|
|
||||||
|
_This option is obsolete and should not be used any more._
|
||||||
|
|
||||||
Pretend that read-only snapshots are empty and silently discard any
|
Pretend that read-only snapshots are empty and silently discard any
|
||||||
request to dedupe files referenced through them. This is a workaround for
|
request to dedupe files referenced through them. This is a workaround
|
||||||
[problems with the kernel implementation of `btrfs send` and `btrfs send
|
for [problems with old kernels running `btrfs send` and `btrfs send
|
||||||
-p`](btrfs-kernel.md) which make these btrfs features unusable with bees.
|
-p`](btrfs-kernel.md) which make these btrfs features unusable with bees.
|
||||||
|
|
||||||
This option should be used to avoid breaking `btrfs send` on the same
|
This option was used to avoid breaking `btrfs send` on old kernels.
|
||||||
filesystem.
|
The affected kernels are now too old to be recommended for use with bees.
|
||||||
|
|
||||||
|
bees now waits for `btrfs send` to finish. There is no need for an
|
||||||
|
option to enable this.
|
||||||
|
|
||||||
**Note:** There is a _significant_ space tradeoff when using this option:
|
**Note:** There is a _significant_ space tradeoff when using this option:
|
||||||
it is likely no space will be recovered--and possibly significant extra
|
it is likely no space will be recovered--and possibly significant extra
|
||||||
space used--until the read-only snapshots are deleted. On the other
|
space used--until the read-only snapshots are deleted.
|
||||||
hand, if snapshots are rotated frequently then bees will spend less time
|
|
||||||
scanning them.
|
|
||||||
|
|
||||||
## Logging options
|
## Logging options
|
||||||
|
|
||||||
|
@@ -75,9 +75,8 @@ in the shell script that launches `bees`:
|
|||||||
schedtool -D -n20 $$
|
schedtool -D -n20 $$
|
||||||
ionice -c3 -p $$
|
ionice -c3 -p $$
|
||||||
|
|
||||||
You can also use the [`--loadavg-target` and `--thread-min`
|
You can also use the [load management options](options.md) to further
|
||||||
options](options.md) to further control the impact of bees on the rest
|
control the impact of bees on the rest of the system.
|
||||||
of the system.
|
|
||||||
|
|
||||||
Let the bees fly:
|
Let the bees fly:
|
||||||
|
|
||||||
|
@@ -4,16 +4,13 @@ What to do when something goes wrong with bees
|
|||||||
Hangs and excessive slowness
|
Hangs and excessive slowness
|
||||||
----------------------------
|
----------------------------
|
||||||
|
|
||||||
### Are you using qgroups or autodefrag?
|
|
||||||
|
|
||||||
Read about [bad btrfs feature interactions](btrfs-other.md).
|
|
||||||
|
|
||||||
### Use load-throttling options
|
### Use load-throttling options
|
||||||
|
|
||||||
If bees is just more aggressive than you would like, consider using
|
If bees is just more aggressive than you would like, consider using
|
||||||
[load throttling options](options.md). These are usually more effective
|
[load throttling options](options.md). These are usually more effective
|
||||||
than `ionice`, `schedtool`, and the `blkio` cgroup (though you can
|
than `ionice`, `schedtool`, and the `blkio` cgroup (though you can
|
||||||
certainly use those too).
|
certainly use those too) because they limit work that bees queues up
|
||||||
|
for later execution inside btrfs.
|
||||||
|
|
||||||
### Check `$BEESSTATUS`
|
### Check `$BEESSTATUS`
|
||||||
|
|
||||||
@@ -52,10 +49,6 @@ dst = 15 /run/bees/ede84fbd-cb59-0c60-9ea7-376fa4984887/data.new/home/builder/li
|
|||||||
|
|
||||||
Thread names of note:
|
Thread names of note:
|
||||||
|
|
||||||
* `crawl_12345`: scan/dedupe worker threads (the number is the subvol
|
|
||||||
ID which the thread is currently working on). These threads appear
|
|
||||||
and disappear from the status dynamically according to the requirements
|
|
||||||
of the work queue and loadavg throttling.
|
|
||||||
* `bees`: main thread (doesn't do anything after startup, but its task execution time is that of the whole bees process)
|
* `bees`: main thread (doesn't do anything after startup, but its task execution time is that of the whole bees process)
|
||||||
* `crawl_master`: task that finds new extents in the filesystem and populates the work queue
|
* `crawl_master`: task that finds new extents in the filesystem and populates the work queue
|
||||||
* `crawl_transid`: btrfs transid (generation number) tracker and polling thread
|
* `crawl_transid`: btrfs transid (generation number) tracker and polling thread
|
||||||
@@ -64,6 +57,13 @@ dst = 15 /run/bees/ede84fbd-cb59-0c60-9ea7-376fa4984887/data.new/home/builder/li
|
|||||||
* `hash_writeback`: trickle-writes the hash table back to `beeshash.dat`
|
* `hash_writeback`: trickle-writes the hash table back to `beeshash.dat`
|
||||||
* `hash_prefetch`: prefetches the hash table at startup and updates `beesstats.txt` hourly
|
* `hash_prefetch`: prefetches the hash table at startup and updates `beesstats.txt` hourly
|
||||||
|
|
||||||
|
Most other threads have names that are derived from the current dedupe
|
||||||
|
task that they are executing:
|
||||||
|
|
||||||
|
* `ref_205ad76b1000_24K_50`: extent scan performing dedupe of btrfs extent bytenr `205ad76b1000`, which is 24 KiB long and has 50 references
|
||||||
|
* `extent_250_32M_16E`: extent scan searching for extents between 32 MiB + 1 and 16 EiB bytes long, tracking scan position in virtual subvol `250`.
|
||||||
|
* `crawl_378_18916`: subvol scan searching for extent refs in subvol `378`, inode `18916`.
|
||||||
|
|
||||||
### Dump kernel stacks of hung processes
|
### Dump kernel stacks of hung processes
|
||||||
|
|
||||||
Check the kernel stacks of all blocked kernel processes:
|
Check the kernel stacks of all blocked kernel processes:
|
||||||
@@ -91,7 +91,7 @@ bees Crashes
|
|||||||
(gdb) thread apply all bt full
|
(gdb) thread apply all bt full
|
||||||
|
|
||||||
The last line generates megabytes of output and will often crash gdb.
|
The last line generates megabytes of output and will often crash gdb.
|
||||||
This is OK, submit whatever output gdb can produce.
|
Submit whatever output gdb can produce.
|
||||||
|
|
||||||
**Note that this output may include filenames or data from your
|
**Note that this output may include filenames or data from your
|
||||||
filesystem.**
|
filesystem.**
|
||||||
@@ -160,8 +160,7 @@ Kernel crashes, corruption, and filesystem damage
|
|||||||
-------------------------------------------------
|
-------------------------------------------------
|
||||||
|
|
||||||
bees doesn't do anything that _should_ cause corruption or data loss;
|
bees doesn't do anything that _should_ cause corruption or data loss;
|
||||||
however, [btrfs has kernel bugs](btrfs-kernel.md) and [interacts poorly
|
however, [btrfs has kernel bugs](btrfs-kernel.md), so corruption is
|
||||||
with some Linux block device layers](btrfs-other.md), so corruption is
|
|
||||||
not impossible.
|
not impossible.
|
||||||
|
|
||||||
Issues with the btrfs filesystem kernel code or other block device layers
|
Issues with the btrfs filesystem kernel code or other block device layers
|
||||||
|
@@ -55,7 +55,6 @@ namespace crucible {
|
|||||||
Pointer m_ptr;
|
Pointer m_ptr;
|
||||||
size_t m_size = 0;
|
size_t m_size = 0;
|
||||||
mutable mutex m_mutex;
|
mutable mutex m_mutex;
|
||||||
friend ostream & operator<<(ostream &os, const ByteVector &bv);
|
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class T>
|
template <class T>
|
||||||
@@ -74,6 +73,8 @@ namespace crucible {
|
|||||||
THROW_CHECK2(out_of_range, size(), sizeof(T), size() >= sizeof(T));
|
THROW_CHECK2(out_of_range, size(), sizeof(T), size() >= sizeof(T));
|
||||||
return reinterpret_cast<T*>(data());
|
return reinterpret_cast<T*>(data());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ostream& operator<<(ostream &os, const ByteVector &bv);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // _CRUCIBLE_BYTEVECTOR_H_
|
#endif // _CRUCIBLE_BYTEVECTOR_H_
|
||||||
|
@@ -243,8 +243,6 @@ namespace crucible {
|
|||||||
unsigned long available() const;
|
unsigned long available() const;
|
||||||
};
|
};
|
||||||
|
|
||||||
template<class V> ostream &hexdump(ostream &os, const V &v);
|
|
||||||
|
|
||||||
struct BtrfsIoctlFsInfoArgs : public btrfs_ioctl_fs_info_args_v3 {
|
struct BtrfsIoctlFsInfoArgs : public btrfs_ioctl_fs_info_args_v3 {
|
||||||
BtrfsIoctlFsInfoArgs();
|
BtrfsIoctlFsInfoArgs();
|
||||||
void do_ioctl(int fd);
|
void do_ioctl(int fd);
|
||||||
|
@@ -12,12 +12,14 @@ namespace crucible {
|
|||||||
ostream &
|
ostream &
|
||||||
hexdump(ostream &os, const V &v)
|
hexdump(ostream &os, const V &v)
|
||||||
{
|
{
|
||||||
os << "V { size = " << v.size() << ", data:\n";
|
const auto v_size = v.size();
|
||||||
for (size_t i = 0; i < v.size(); i += 8) {
|
const uint8_t* const v_data = reinterpret_cast<uint8_t*>(v.data());
|
||||||
|
os << "V { size = " << v_size << ", data:\n";
|
||||||
|
for (size_t i = 0; i < v_size; i += 8) {
|
||||||
string hex, ascii;
|
string hex, ascii;
|
||||||
for (size_t j = i; j < i + 8; ++j) {
|
for (size_t j = i; j < i + 8; ++j) {
|
||||||
if (j < v.size()) {
|
if (j < v_size) {
|
||||||
uint8_t c = v[j];
|
const uint8_t c = v_data[j];
|
||||||
char buf[8];
|
char buf[8];
|
||||||
sprintf(buf, "%02x ", c);
|
sprintf(buf, "%02x ", c);
|
||||||
hex += buf;
|
hex += buf;
|
||||||
|
@@ -117,7 +117,7 @@ namespace crucible {
|
|||||||
while (full() || locked(name)) {
|
while (full() || locked(name)) {
|
||||||
m_condvar.wait(lock);
|
m_condvar.wait(lock);
|
||||||
}
|
}
|
||||||
auto rv = m_set.insert(make_pair(name, crucible::gettid()));
|
auto rv = m_set.insert(make_pair(name, gettid()));
|
||||||
THROW_CHECK0(runtime_error, rv.second);
|
THROW_CHECK0(runtime_error, rv.second);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -129,7 +129,7 @@ namespace crucible {
|
|||||||
if (full() || locked(name)) {
|
if (full() || locked(name)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
auto rv = m_set.insert(make_pair(name, crucible::gettid()));
|
auto rv = m_set.insert(make_pair(name, gettid()));
|
||||||
THROW_CHECK1(runtime_error, name, rv.second);
|
THROW_CHECK1(runtime_error, name, rv.second);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
17
include/crucible/openat2.h
Normal file
17
include/crucible/openat2.h
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
#ifndef CRUCIBLE_OPENAT2_H
|
||||||
|
#define CRUCIBLE_OPENAT2_H
|
||||||
|
|
||||||
|
#include <linux/openat2.h>
|
||||||
|
|
||||||
|
#include <fcntl.h>
|
||||||
|
#include <sys/syscall.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
|
||||||
|
extern "C" {
|
||||||
|
|
||||||
|
/// Weak symbol to support libc with no syscall wrapper
|
||||||
|
int openat2(int dirfd, const char *pathname, struct open_how *how, size_t size) throw();
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif // CRUCIBLE_OPENAT2_H
|
@@ -10,6 +10,10 @@
|
|||||||
#include <sys/wait.h>
|
#include <sys/wait.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
|
|
||||||
|
extern "C" {
|
||||||
|
pid_t gettid() throw();
|
||||||
|
};
|
||||||
|
|
||||||
namespace crucible {
|
namespace crucible {
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
@@ -73,7 +77,6 @@ namespace crucible {
|
|||||||
|
|
||||||
typedef ResourceHandle<Process::id, Process> Pid;
|
typedef ResourceHandle<Process::id, Process> Pid;
|
||||||
|
|
||||||
pid_t gettid();
|
|
||||||
double getloadavg1();
|
double getloadavg1();
|
||||||
double getloadavg5();
|
double getloadavg5();
|
||||||
double getloadavg15();
|
double getloadavg15();
|
||||||
|
@@ -47,6 +47,10 @@ namespace crucible {
|
|||||||
/// been destroyed.
|
/// been destroyed.
|
||||||
void append(const Task &task) const;
|
void append(const Task &task) const;
|
||||||
|
|
||||||
|
/// Schedule Task to run after this Task has run or
|
||||||
|
/// been destroyed, in Task ID order.
|
||||||
|
void insert(const Task &task) const;
|
||||||
|
|
||||||
/// Describe Task as text.
|
/// Describe Task as text.
|
||||||
string title() const;
|
string title() const;
|
||||||
|
|
||||||
@@ -172,9 +176,6 @@ namespace crucible {
|
|||||||
/// objects it holds, and exit its Task function.
|
/// objects it holds, and exit its Task function.
|
||||||
ExclusionLock try_lock(const Task &task);
|
ExclusionLock try_lock(const Task &task);
|
||||||
|
|
||||||
/// Execute Task when Exclusion is unlocked (possibly
|
|
||||||
/// immediately).
|
|
||||||
void insert_task(const Task &t);
|
|
||||||
};
|
};
|
||||||
|
|
||||||
/// Wrapper around pthread_setname_np which handles length limits
|
/// Wrapper around pthread_setname_np which handles length limits
|
||||||
|
@@ -34,7 +34,7 @@ namespace crucible {
|
|||||||
double m_rate;
|
double m_rate;
|
||||||
double m_burst;
|
double m_burst;
|
||||||
double m_tokens = 0.0;
|
double m_tokens = 0.0;
|
||||||
mutex m_mutex;
|
mutable mutex m_mutex;
|
||||||
|
|
||||||
void update_tokens();
|
void update_tokens();
|
||||||
RateLimiter() = delete;
|
RateLimiter() = delete;
|
||||||
@@ -45,6 +45,8 @@ namespace crucible {
|
|||||||
double sleep_time(double cost = 1.0);
|
double sleep_time(double cost = 1.0);
|
||||||
bool is_ready();
|
bool is_ready();
|
||||||
void borrow(double cost = 1.0);
|
void borrow(double cost = 1.0);
|
||||||
|
void rate(double new_rate);
|
||||||
|
double rate() const;
|
||||||
};
|
};
|
||||||
|
|
||||||
class RateEstimator {
|
class RateEstimator {
|
||||||
@@ -88,6 +90,9 @@ namespace crucible {
|
|||||||
// Read count
|
// Read count
|
||||||
uint64_t count() const;
|
uint64_t count() const;
|
||||||
|
|
||||||
|
/// Increment count (like update(count() + more), but atomic)
|
||||||
|
void increment(uint64_t more = 1);
|
||||||
|
|
||||||
// Convert counts to chrono types
|
// Convert counts to chrono types
|
||||||
chrono::high_resolution_clock::time_point time_point(uint64_t absolute_count) const;
|
chrono::high_resolution_clock::time_point time_point(uint64_t absolute_count) const;
|
||||||
chrono::duration<double> duration(uint64_t relative_count) const;
|
chrono::duration<double> duration(uint64_t relative_count) const;
|
||||||
|
@@ -14,6 +14,7 @@ CRUCIBLE_OBJS = \
|
|||||||
fs.o \
|
fs.o \
|
||||||
multilock.o \
|
multilock.o \
|
||||||
ntoa.o \
|
ntoa.o \
|
||||||
|
openat2.o \
|
||||||
path.o \
|
path.o \
|
||||||
process.o \
|
process.o \
|
||||||
string.o \
|
string.o \
|
||||||
|
@@ -44,10 +44,10 @@ namespace crucible {
|
|||||||
}
|
}
|
||||||
|
|
||||||
ByteVector::value_type&
|
ByteVector::value_type&
|
||||||
ByteVector::operator[](size_t size) const
|
ByteVector::operator[](size_t index) const
|
||||||
{
|
{
|
||||||
unique_lock<mutex> lock(m_mutex);
|
unique_lock<mutex> lock(m_mutex);
|
||||||
return m_ptr.get()[size];
|
return m_ptr.get()[index];
|
||||||
}
|
}
|
||||||
|
|
||||||
ByteVector::ByteVector(const ByteVector &that)
|
ByteVector::ByteVector(const ByteVector &that)
|
||||||
@@ -183,7 +183,6 @@ namespace crucible {
|
|||||||
|
|
||||||
ostream&
|
ostream&
|
||||||
operator<<(ostream &os, const ByteVector &bv) {
|
operator<<(ostream &os, const ByteVector &bv) {
|
||||||
unique_lock<mutex> lock(bv.m_mutex);
|
|
||||||
hexdump(os, bv);
|
hexdump(os, bv);
|
||||||
return os;
|
return os;
|
||||||
}
|
}
|
||||||
|
@@ -76,7 +76,7 @@ namespace crucible {
|
|||||||
DIE_IF_ZERO(strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", <m));
|
DIE_IF_ZERO(strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", <m));
|
||||||
|
|
||||||
header_stream << buf;
|
header_stream << buf;
|
||||||
header_stream << " " << getpid() << "." << crucible::gettid();
|
header_stream << " " << getpid() << "." << gettid();
|
||||||
if (add_prefix_level) {
|
if (add_prefix_level) {
|
||||||
header_stream << "<" << m_loglevel << ">";
|
header_stream << "<" << m_loglevel << ">";
|
||||||
}
|
}
|
||||||
@@ -88,7 +88,7 @@ namespace crucible {
|
|||||||
header_stream << "<" << m_loglevel << ">";
|
header_stream << "<" << m_loglevel << ">";
|
||||||
}
|
}
|
||||||
header_stream << (m_name.empty() ? "thread" : m_name);
|
header_stream << (m_name.empty() ? "thread" : m_name);
|
||||||
header_stream << "[" << crucible::gettid() << "]";
|
header_stream << "[" << gettid() << "]";
|
||||||
}
|
}
|
||||||
|
|
||||||
header_stream << ": ";
|
header_stream << ": ";
|
||||||
|
@@ -781,7 +781,7 @@ namespace crucible {
|
|||||||
++s_calls;
|
++s_calls;
|
||||||
if (rv != 0 && errno == ENOENT) {
|
if (rv != 0 && errno == ENOENT) {
|
||||||
// If we are searching a tree that is deleted or no longer exists, just return an empty list
|
// If we are searching a tree that is deleted or no longer exists, just return an empty list
|
||||||
nr_items = 0;
|
ioctl_ptr->key.nr_items = 0;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (rv != 0 && errno != EOVERFLOW) {
|
if (rv != 0 && errno != EOVERFLOW) {
|
||||||
|
13
lib/openat2.cc
Normal file
13
lib/openat2.cc
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
#include "crucible/openat2.h"
|
||||||
|
|
||||||
|
extern "C" {
|
||||||
|
|
||||||
|
int
|
||||||
|
__attribute__((weak))
|
||||||
|
openat2(int const dirfd, const char *const pathname, struct open_how *const how, size_t const size)
|
||||||
|
throw()
|
||||||
|
{
|
||||||
|
return syscall(SYS_openat2, dirfd, pathname, how, size);
|
||||||
|
}
|
||||||
|
|
||||||
|
};
|
@@ -7,13 +7,18 @@
|
|||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
|
|
||||||
// for gettid()
|
|
||||||
#ifndef _GNU_SOURCE
|
|
||||||
#define _GNU_SOURCE
|
|
||||||
#endif
|
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#include <sys/syscall.h>
|
#include <sys/syscall.h>
|
||||||
|
|
||||||
|
extern "C" {
|
||||||
|
pid_t
|
||||||
|
__attribute__((weak))
|
||||||
|
gettid() throw()
|
||||||
|
{
|
||||||
|
return syscall(SYS_gettid);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
namespace crucible {
|
namespace crucible {
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
@@ -111,12 +116,6 @@ namespace crucible {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pid_t
|
|
||||||
gettid()
|
|
||||||
{
|
|
||||||
return syscall(SYS_gettid);
|
|
||||||
}
|
|
||||||
|
|
||||||
double
|
double
|
||||||
getloadavg1()
|
getloadavg1()
|
||||||
{
|
{
|
||||||
|
124
lib/task.cc
124
lib/task.cc
@@ -76,13 +76,24 @@ namespace crucible {
|
|||||||
/// Tasks to be executed after the current task is executed
|
/// Tasks to be executed after the current task is executed
|
||||||
list<TaskStatePtr> m_post_exec_queue;
|
list<TaskStatePtr> m_post_exec_queue;
|
||||||
|
|
||||||
/// Set by run() and append(). Cleared by exec().
|
/// Set by run(), append(), and insert(). Cleared by exec().
|
||||||
bool m_run_now = false;
|
bool m_run_now = false;
|
||||||
|
|
||||||
|
/// Set by insert(). Cleared by exec() and destructor.
|
||||||
|
bool m_sort_queue = false;
|
||||||
|
|
||||||
/// Set when task starts execution by exec().
|
/// Set when task starts execution by exec().
|
||||||
/// Cleared when exec() ends.
|
/// Cleared when exec() ends.
|
||||||
bool m_is_running = false;
|
bool m_is_running = false;
|
||||||
|
|
||||||
|
/// Set when task is queued while already running.
|
||||||
|
/// Cleared when task is requeued.
|
||||||
|
bool m_run_again = false;
|
||||||
|
|
||||||
|
/// Set when task is queued as idle task while already running.
|
||||||
|
/// Cleared when task is queued as non-idle task.
|
||||||
|
bool m_idle = false;
|
||||||
|
|
||||||
/// Sequential identifier for next task
|
/// Sequential identifier for next task
|
||||||
static atomic<TaskId> s_next_id;
|
static atomic<TaskId> s_next_id;
|
||||||
|
|
||||||
@@ -107,7 +118,7 @@ namespace crucible {
|
|||||||
static void clear_queue(TaskQueue &tq);
|
static void clear_queue(TaskQueue &tq);
|
||||||
|
|
||||||
/// Rescue any TaskQueue, not just this one.
|
/// Rescue any TaskQueue, not just this one.
|
||||||
static void rescue_queue(TaskQueue &tq);
|
static void rescue_queue(TaskQueue &tq, const bool sort_queue);
|
||||||
|
|
||||||
TaskState &operator=(const TaskState &) = delete;
|
TaskState &operator=(const TaskState &) = delete;
|
||||||
TaskState(const TaskState &) = delete;
|
TaskState(const TaskState &) = delete;
|
||||||
@@ -142,6 +153,10 @@ namespace crucible {
|
|||||||
/// or is destroyed.
|
/// or is destroyed.
|
||||||
void append(const TaskStatePtr &task);
|
void append(const TaskStatePtr &task);
|
||||||
|
|
||||||
|
/// Queue task to execute after current task finishes executing
|
||||||
|
/// or is destroyed, in task ID order.
|
||||||
|
void insert(const TaskStatePtr &task);
|
||||||
|
|
||||||
/// How masy Tasks are there? Good for catching leaks
|
/// How masy Tasks are there? Good for catching leaks
|
||||||
static size_t instance_count();
|
static size_t instance_count();
|
||||||
};
|
};
|
||||||
@@ -219,16 +234,21 @@ namespace crucible {
|
|||||||
static auto s_tms = make_shared<TaskMasterState>();
|
static auto s_tms = make_shared<TaskMasterState>();
|
||||||
|
|
||||||
void
|
void
|
||||||
TaskState::rescue_queue(TaskQueue &queue)
|
TaskState::rescue_queue(TaskQueue &queue, const bool sort_queue)
|
||||||
{
|
{
|
||||||
if (queue.empty()) {
|
if (queue.empty()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
const auto tlcc = tl_current_consumer;
|
const auto &tlcc = tl_current_consumer;
|
||||||
if (tlcc) {
|
if (tlcc) {
|
||||||
// We are executing under a TaskConsumer, splice our post-exec queue at front.
|
// We are executing under a TaskConsumer, splice our post-exec queue at front.
|
||||||
// No locks needed because we are using only thread-local objects.
|
// No locks needed because we are using only thread-local objects.
|
||||||
tlcc->m_local_queue.splice(tlcc->m_local_queue.begin(), queue);
|
tlcc->m_local_queue.splice(tlcc->m_local_queue.begin(), queue);
|
||||||
|
if (sort_queue) {
|
||||||
|
tlcc->m_local_queue.sort([&](const TaskStatePtr &a, const TaskStatePtr &b) {
|
||||||
|
return a->m_id < b->m_id;
|
||||||
|
});
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
// We are not executing under a TaskConsumer.
|
// We are not executing under a TaskConsumer.
|
||||||
// If there is only one task, then just insert it at the front of the queue.
|
// If there is only one task, then just insert it at the front of the queue.
|
||||||
@@ -239,6 +259,8 @@ namespace crucible {
|
|||||||
// then push it to the front of the global queue using normal locking methods.
|
// then push it to the front of the global queue using normal locking methods.
|
||||||
TaskStatePtr rescue_task(make_shared<TaskState>("rescue_task", [](){}));
|
TaskStatePtr rescue_task(make_shared<TaskState>("rescue_task", [](){}));
|
||||||
swap(rescue_task->m_post_exec_queue, queue);
|
swap(rescue_task->m_post_exec_queue, queue);
|
||||||
|
// Do the sort--once--when a new Consumer has picked up the Task
|
||||||
|
rescue_task->m_sort_queue = sort_queue;
|
||||||
TaskQueue tq_one { rescue_task };
|
TaskQueue tq_one { rescue_task };
|
||||||
TaskMasterState::push_front(tq_one);
|
TaskMasterState::push_front(tq_one);
|
||||||
}
|
}
|
||||||
@@ -251,7 +273,8 @@ namespace crucible {
|
|||||||
--s_instance_count;
|
--s_instance_count;
|
||||||
unique_lock<mutex> lock(m_mutex);
|
unique_lock<mutex> lock(m_mutex);
|
||||||
// If any dependent Tasks were appended since the last exec, run them now
|
// If any dependent Tasks were appended since the last exec, run them now
|
||||||
TaskState::rescue_queue(m_post_exec_queue);
|
TaskState::rescue_queue(m_post_exec_queue, m_sort_queue);
|
||||||
|
// No need to clear m_sort_queue here, it won't exist soon
|
||||||
}
|
}
|
||||||
|
|
||||||
TaskState::TaskState(string title, function<void()> exec_fn) :
|
TaskState::TaskState(string title, function<void()> exec_fn) :
|
||||||
@@ -310,6 +333,24 @@ namespace crucible {
|
|||||||
task->m_run_now = true;
|
task->m_run_now = true;
|
||||||
append_nolock(task);
|
append_nolock(task);
|
||||||
}
|
}
|
||||||
|
task->m_idle = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
TaskState::insert(const TaskStatePtr &task)
|
||||||
|
{
|
||||||
|
THROW_CHECK0(invalid_argument, task);
|
||||||
|
THROW_CHECK2(invalid_argument, m_id, task->m_id, m_id != task->m_id);
|
||||||
|
PairLock lock(m_mutex, task->m_mutex);
|
||||||
|
if (!task->m_run_now) {
|
||||||
|
task->m_run_now = true;
|
||||||
|
// Move the task and its post-exec queue to follow this task,
|
||||||
|
// and request a sort of the flattened list.
|
||||||
|
m_sort_queue = true;
|
||||||
|
m_post_exec_queue.push_back(task);
|
||||||
|
m_post_exec_queue.splice(m_post_exec_queue.end(), task->m_post_exec_queue);
|
||||||
|
}
|
||||||
|
task->m_idle = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
@@ -320,7 +361,7 @@ namespace crucible {
|
|||||||
|
|
||||||
unique_lock<mutex> lock(m_mutex);
|
unique_lock<mutex> lock(m_mutex);
|
||||||
if (m_is_running) {
|
if (m_is_running) {
|
||||||
append_nolock(shared_from_this());
|
m_run_again = true;
|
||||||
return;
|
return;
|
||||||
} else {
|
} else {
|
||||||
m_run_now = false;
|
m_run_now = false;
|
||||||
@@ -344,8 +385,20 @@ namespace crucible {
|
|||||||
swap(this_task, tl_current_task);
|
swap(this_task, tl_current_task);
|
||||||
m_is_running = false;
|
m_is_running = false;
|
||||||
|
|
||||||
|
if (m_run_again) {
|
||||||
|
m_run_again = false;
|
||||||
|
if (m_idle) {
|
||||||
|
// All the way back to the end of the line
|
||||||
|
TaskMasterState::push_back_idle(shared_from_this());
|
||||||
|
} else {
|
||||||
|
// Insert after any dependents waiting for this Task
|
||||||
|
m_post_exec_queue.push_back(shared_from_this());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Splice task post_exec queue at front of local queue
|
// Splice task post_exec queue at front of local queue
|
||||||
TaskState::rescue_queue(m_post_exec_queue);
|
TaskState::rescue_queue(m_post_exec_queue, m_sort_queue);
|
||||||
|
m_sort_queue = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
string
|
string
|
||||||
@@ -365,22 +418,32 @@ namespace crucible {
|
|||||||
TaskState::run()
|
TaskState::run()
|
||||||
{
|
{
|
||||||
unique_lock<mutex> lock(m_mutex);
|
unique_lock<mutex> lock(m_mutex);
|
||||||
|
m_idle = false;
|
||||||
if (m_run_now) {
|
if (m_run_now) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
m_run_now = true;
|
m_run_now = true;
|
||||||
TaskMasterState::push_back(shared_from_this());
|
if (m_is_running) {
|
||||||
|
m_run_again = true;
|
||||||
|
} else {
|
||||||
|
TaskMasterState::push_back(shared_from_this());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
TaskState::idle()
|
TaskState::idle()
|
||||||
{
|
{
|
||||||
unique_lock<mutex> lock(m_mutex);
|
unique_lock<mutex> lock(m_mutex);
|
||||||
|
m_idle = true;
|
||||||
if (m_run_now) {
|
if (m_run_now) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
m_run_now = true;
|
m_run_now = true;
|
||||||
TaskMasterState::push_back_idle(shared_from_this());
|
if (m_is_running) {
|
||||||
|
m_run_again = true;
|
||||||
|
} else {
|
||||||
|
TaskMasterState::push_back_idle(shared_from_this());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
TaskMasterState::TaskMasterState(size_t thread_max) :
|
TaskMasterState::TaskMasterState(size_t thread_max) :
|
||||||
@@ -530,11 +593,6 @@ namespace crucible {
|
|||||||
size_t
|
size_t
|
||||||
TaskMasterState::calculate_thread_count_nolock()
|
TaskMasterState::calculate_thread_count_nolock()
|
||||||
{
|
{
|
||||||
if (m_paused) {
|
|
||||||
// No threads running while paused or cancelled
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (m_load_target == 0) {
|
if (m_load_target == 0) {
|
||||||
// No limits, no stats, use configured thread count
|
// No limits, no stats, use configured thread count
|
||||||
return m_configured_thread_max;
|
return m_configured_thread_max;
|
||||||
@@ -645,6 +703,9 @@ namespace crucible {
|
|||||||
unique_lock<mutex> lock(m_mutex);
|
unique_lock<mutex> lock(m_mutex);
|
||||||
m_paused = paused;
|
m_paused = paused;
|
||||||
m_condvar.notify_all();
|
m_condvar.notify_all();
|
||||||
|
if (!m_paused) {
|
||||||
|
start_threads_nolock();
|
||||||
|
}
|
||||||
lock.unlock();
|
lock.unlock();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -742,6 +803,14 @@ namespace crucible {
|
|||||||
m_task_state->append(that.m_task_state);
|
m_task_state->append(that.m_task_state);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
Task::insert(const Task &that) const
|
||||||
|
{
|
||||||
|
THROW_CHECK0(runtime_error, m_task_state);
|
||||||
|
THROW_CHECK0(runtime_error, that);
|
||||||
|
m_task_state->insert(that.m_task_state);
|
||||||
|
}
|
||||||
|
|
||||||
Task
|
Task
|
||||||
Task::current_task()
|
Task::current_task()
|
||||||
{
|
{
|
||||||
@@ -856,11 +925,13 @@ namespace crucible {
|
|||||||
swap(this_consumer, tl_current_consumer);
|
swap(this_consumer, tl_current_consumer);
|
||||||
assert(!tl_current_consumer);
|
assert(!tl_current_consumer);
|
||||||
|
|
||||||
// Release lock to rescue queue (may attempt to queue a new task at TaskMaster).
|
// Release lock to rescue queue (may attempt to queue a
|
||||||
// rescue_queue normally sends tasks to the local queue of the current TaskConsumer thread,
|
// new task at TaskMaster). rescue_queue normally sends
|
||||||
// but we just disconnected ourselves from that.
|
// tasks to the local queue of the current TaskConsumer
|
||||||
|
// thread, but we just disconnected ourselves from that.
|
||||||
|
// No sorting here because this is not a TaskState.
|
||||||
lock.unlock();
|
lock.unlock();
|
||||||
TaskState::rescue_queue(m_local_queue);
|
TaskState::rescue_queue(m_local_queue, false);
|
||||||
|
|
||||||
// Hold lock so we can erase ourselves
|
// Hold lock so we can erase ourselves
|
||||||
lock.lock();
|
lock.lock();
|
||||||
@@ -938,21 +1009,6 @@ namespace crucible {
|
|||||||
m_owner.reset();
|
m_owner.reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
|
||||||
Exclusion::insert_task(const Task &task)
|
|
||||||
{
|
|
||||||
unique_lock<mutex> lock(m_mutex);
|
|
||||||
const auto sp = m_owner.lock();
|
|
||||||
lock.unlock();
|
|
||||||
if (sp) {
|
|
||||||
// If Exclusion is locked then queue task for release;
|
|
||||||
sp->append(task);
|
|
||||||
} else {
|
|
||||||
// otherwise, run the inserted task immediately
|
|
||||||
task.run();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
ExclusionLock
|
ExclusionLock
|
||||||
Exclusion::try_lock(const Task &task)
|
Exclusion::try_lock(const Task &task)
|
||||||
{
|
{
|
||||||
@@ -960,7 +1016,7 @@ namespace crucible {
|
|||||||
const auto sp = m_owner.lock();
|
const auto sp = m_owner.lock();
|
||||||
if (sp) {
|
if (sp) {
|
||||||
if (task) {
|
if (task) {
|
||||||
sp->append(task);
|
sp->insert(task);
|
||||||
}
|
}
|
||||||
return ExclusionLock();
|
return ExclusionLock();
|
||||||
} else {
|
} else {
|
||||||
|
27
lib/time.cc
27
lib/time.cc
@@ -98,12 +98,16 @@ namespace crucible {
|
|||||||
m_rate(rate),
|
m_rate(rate),
|
||||||
m_burst(burst)
|
m_burst(burst)
|
||||||
{
|
{
|
||||||
|
THROW_CHECK1(invalid_argument, m_rate, m_rate > 0);
|
||||||
|
THROW_CHECK1(invalid_argument, m_burst, m_burst >= 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
RateLimiter::RateLimiter(double rate) :
|
RateLimiter::RateLimiter(double rate) :
|
||||||
m_rate(rate),
|
m_rate(rate),
|
||||||
m_burst(rate)
|
m_burst(rate)
|
||||||
{
|
{
|
||||||
|
THROW_CHECK1(invalid_argument, m_rate, m_rate > 0);
|
||||||
|
THROW_CHECK1(invalid_argument, m_burst, m_burst >= 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
@@ -119,6 +123,7 @@ namespace crucible {
|
|||||||
double
|
double
|
||||||
RateLimiter::sleep_time(double cost)
|
RateLimiter::sleep_time(double cost)
|
||||||
{
|
{
|
||||||
|
THROW_CHECK1(invalid_argument, m_rate, m_rate > 0);
|
||||||
borrow(cost);
|
borrow(cost);
|
||||||
unique_lock<mutex> lock(m_mutex);
|
unique_lock<mutex> lock(m_mutex);
|
||||||
update_tokens();
|
update_tokens();
|
||||||
@@ -154,6 +159,21 @@ namespace crucible {
|
|||||||
m_tokens -= cost;
|
m_tokens -= cost;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
RateLimiter::rate(double const new_rate)
|
||||||
|
{
|
||||||
|
THROW_CHECK1(invalid_argument, new_rate, new_rate > 0);
|
||||||
|
unique_lock<mutex> lock(m_mutex);
|
||||||
|
m_rate = new_rate;
|
||||||
|
}
|
||||||
|
|
||||||
|
double
|
||||||
|
RateLimiter::rate() const
|
||||||
|
{
|
||||||
|
unique_lock<mutex> lock(m_mutex);
|
||||||
|
return m_rate;
|
||||||
|
}
|
||||||
|
|
||||||
RateEstimator::RateEstimator(double min_delay, double max_delay) :
|
RateEstimator::RateEstimator(double min_delay, double max_delay) :
|
||||||
m_min_delay(min_delay),
|
m_min_delay(min_delay),
|
||||||
m_max_delay(max_delay)
|
m_max_delay(max_delay)
|
||||||
@@ -202,6 +222,13 @@ namespace crucible {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
RateEstimator::increment(const uint64_t more)
|
||||||
|
{
|
||||||
|
unique_lock<mutex> lock(m_mutex);
|
||||||
|
return update_unlocked(m_last_count + more);
|
||||||
|
}
|
||||||
|
|
||||||
uint64_t
|
uint64_t
|
||||||
RateEstimator::count() const
|
RateEstimator::count() const
|
||||||
{
|
{
|
||||||
|
@@ -20,7 +20,6 @@
|
|||||||
using namespace crucible;
|
using namespace crucible;
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
|
|
||||||
BeesFdCache::BeesFdCache(shared_ptr<BeesContext> ctx) :
|
BeesFdCache::BeesFdCache(shared_ptr<BeesContext> ctx) :
|
||||||
m_ctx(ctx)
|
m_ctx(ctx)
|
||||||
{
|
{
|
||||||
@@ -98,6 +97,7 @@ BeesContext::dump_status()
|
|||||||
TaskMaster::print_queue(ofs);
|
TaskMaster::print_queue(ofs);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
ofs << "PROGRESS:\n";
|
||||||
ofs << get_progress();
|
ofs << get_progress();
|
||||||
|
|
||||||
ofs.close();
|
ofs.close();
|
||||||
@@ -125,6 +125,9 @@ string
|
|||||||
BeesContext::get_progress()
|
BeesContext::get_progress()
|
||||||
{
|
{
|
||||||
unique_lock<mutex> lock(m_progress_mtx);
|
unique_lock<mutex> lock(m_progress_mtx);
|
||||||
|
if (m_progress_str.empty()) {
|
||||||
|
return "[No progress estimate available]\n";
|
||||||
|
}
|
||||||
return m_progress_str;
|
return m_progress_str;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -210,6 +213,7 @@ BeesContext::dedup(const BeesRangePair &brp_in)
|
|||||||
{
|
{
|
||||||
// TOOLONG and NOTE can retroactively fill in the filename details, but LOG can't
|
// TOOLONG and NOTE can retroactively fill in the filename details, but LOG can't
|
||||||
BEESNOTE("dedup " << brp_in);
|
BEESNOTE("dedup " << brp_in);
|
||||||
|
BEESTRACE("dedup " << brp_in);
|
||||||
|
|
||||||
if (is_root_ro(brp_in.second.fid().root())) {
|
if (is_root_ro(brp_in.second.fid().root())) {
|
||||||
// BEESLOGDEBUG("WORKAROUND: dst root " << (brp_in.second.fid().root()) << " is read-only);
|
// BEESLOGDEBUG("WORKAROUND: dst root " << (brp_in.second.fid().root()) << " is read-only);
|
||||||
@@ -237,27 +241,40 @@ BeesContext::dedup(const BeesRangePair &brp_in)
|
|||||||
BEESCOUNT(dedup_try);
|
BEESCOUNT(dedup_try);
|
||||||
|
|
||||||
BEESNOTE("waiting to dedup " << brp);
|
BEESNOTE("waiting to dedup " << brp);
|
||||||
const auto lock = MultiLocker::get_lock("dedupe");
|
auto lock = MultiLocker::get_lock("dedupe");
|
||||||
|
|
||||||
Timer dedup_timer;
|
|
||||||
|
|
||||||
BEESLOGINFO("dedup: src " << pretty(brp.first.size()) << " [" << to_hex(brp.first.begin()) << ".." << to_hex(brp.first.end()) << "] {" << first_addr << "} " << name_fd(brp.first.fd()) << "\n"
|
BEESLOGINFO("dedup: src " << pretty(brp.first.size()) << " [" << to_hex(brp.first.begin()) << ".." << to_hex(brp.first.end()) << "] {" << first_addr << "} " << name_fd(brp.first.fd()) << "\n"
|
||||||
<< " dst " << pretty(brp.second.size()) << " [" << to_hex(brp.second.begin()) << ".." << to_hex(brp.second.end()) << "] {" << second_addr << "} " << name_fd(brp.second.fd()));
|
<< " dst " << pretty(brp.second.size()) << " [" << to_hex(brp.second.begin()) << ".." << to_hex(brp.second.end()) << "] {" << second_addr << "} " << name_fd(brp.second.fd()));
|
||||||
BEESNOTE("dedup: src " << pretty(brp.first.size()) << " [" << to_hex(brp.first.begin()) << ".." << to_hex(brp.first.end()) << "] {" << first_addr << "} " << name_fd(brp.first.fd()) << "\n"
|
BEESNOTE("dedup: src " << pretty(brp.first.size()) << " [" << to_hex(brp.first.begin()) << ".." << to_hex(brp.first.end()) << "] {" << first_addr << "} " << name_fd(brp.first.fd()) << "\n"
|
||||||
<< " dst " << pretty(brp.second.size()) << " [" << to_hex(brp.second.begin()) << ".." << to_hex(brp.second.end()) << "] {" << second_addr << "} " << name_fd(brp.second.fd()));
|
<< " dst " << pretty(brp.second.size()) << " [" << to_hex(brp.second.begin()) << ".." << to_hex(brp.second.end()) << "] {" << second_addr << "} " << name_fd(brp.second.fd()));
|
||||||
|
|
||||||
const bool rv = btrfs_extent_same(brp.first.fd(), brp.first.begin(), brp.first.size(), brp.second.fd(), brp.second.begin());
|
while (true) {
|
||||||
BEESCOUNTADD(dedup_ms, dedup_timer.age() * 1000);
|
try {
|
||||||
|
Timer dedup_timer;
|
||||||
|
const bool rv = btrfs_extent_same(brp.first.fd(), brp.first.begin(), brp.first.size(), brp.second.fd(), brp.second.begin());
|
||||||
|
BEESCOUNTADD(dedup_ms, dedup_timer.age() * 1000);
|
||||||
|
|
||||||
if (rv) {
|
if (rv) {
|
||||||
BEESCOUNT(dedup_hit);
|
BEESCOUNT(dedup_hit);
|
||||||
BEESCOUNTADD(dedup_bytes, brp.first.size());
|
BEESCOUNTADD(dedup_bytes, brp.first.size());
|
||||||
} else {
|
} else {
|
||||||
BEESCOUNT(dedup_miss);
|
BEESCOUNT(dedup_miss);
|
||||||
BEESLOGWARN("NO Dedup! " << brp);
|
BEESLOGWARN("NO Dedup! " << brp);
|
||||||
|
}
|
||||||
|
|
||||||
|
lock.reset();
|
||||||
|
bees_throttle(dedup_timer.age(), "dedup");
|
||||||
|
return rv;
|
||||||
|
} catch (const std::system_error &e) {
|
||||||
|
if (e.code().value() == EAGAIN) {
|
||||||
|
BEESNOTE("dedup waiting for btrfs send on " << brp.second);
|
||||||
|
BEESLOGDEBUG("dedup waiting for btrfs send on " << brp.second);
|
||||||
|
roots()->wait_for_transid(1);
|
||||||
|
} else {
|
||||||
|
throw;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return rv;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
BeesRangePair
|
BeesRangePair
|
||||||
@@ -341,6 +358,8 @@ BeesContext::scan_one_extent(const BeesFileRange &bfr, const Extent &e)
|
|||||||
BEESTRACE("scan bfr " << bfr);
|
BEESTRACE("scan bfr " << bfr);
|
||||||
BEESCOUNT(scan_extent);
|
BEESCOUNT(scan_extent);
|
||||||
|
|
||||||
|
Timer one_timer;
|
||||||
|
|
||||||
// We keep moving this method around
|
// We keep moving this method around
|
||||||
auto m_ctx = shared_from_this();
|
auto m_ctx = shared_from_this();
|
||||||
|
|
||||||
@@ -837,7 +856,8 @@ BeesContext::scan_one_extent(const BeesFileRange &bfr, const Extent &e)
|
|||||||
<< pretty(e.size()) << " "
|
<< pretty(e.size()) << " "
|
||||||
<< dedupe_list.size() << "d" << copy_list.size() << "c"
|
<< dedupe_list.size() << "d" << copy_list.size() << "c"
|
||||||
<< ((bytes_zeroed + BLOCK_SIZE_SUMS - 1) / BLOCK_SIZE_SUMS) << "p"
|
<< ((bytes_zeroed + BLOCK_SIZE_SUMS - 1) / BLOCK_SIZE_SUMS) << "p"
|
||||||
<< (extent_compressed ? "z {" : " {")
|
<< (extent_compressed ? "z " : " ")
|
||||||
|
<< one_timer << "s {"
|
||||||
<< to_hex(e.bytenr()) << "+" << to_hex(e.offset()) << "} "
|
<< to_hex(e.bytenr()) << "+" << to_hex(e.offset()) << "} "
|
||||||
<< to_hex(e.begin()) << " [" << bar << "] " << to_hex(e.end())
|
<< to_hex(e.begin()) << " [" << bar << "] " << to_hex(e.end())
|
||||||
<< ' ' << name_fd(bfr.fd())
|
<< ' ' << name_fd(bfr.fd())
|
||||||
@@ -974,9 +994,10 @@ BeesContext::resolve_addr_uncached(BeesAddress addr)
|
|||||||
Timer resolve_timer;
|
Timer resolve_timer;
|
||||||
|
|
||||||
struct rusage usage_before;
|
struct rusage usage_before;
|
||||||
|
struct rusage usage_after;
|
||||||
{
|
{
|
||||||
BEESNOTE("waiting to resolve addr " << addr << " with LOGICAL_INO");
|
BEESNOTE("waiting to resolve addr " << addr << " with LOGICAL_INO");
|
||||||
const auto lock = MultiLocker::get_lock("logical_ino");
|
auto lock = MultiLocker::get_lock("logical_ino");
|
||||||
|
|
||||||
// Get this thread's system CPU usage
|
// Get this thread's system CPU usage
|
||||||
DIE_IF_MINUS_ONE(getrusage(RUSAGE_THREAD, &usage_before));
|
DIE_IF_MINUS_ONE(getrusage(RUSAGE_THREAD, &usage_before));
|
||||||
@@ -990,13 +1011,13 @@ BeesContext::resolve_addr_uncached(BeesAddress addr)
|
|||||||
} else {
|
} else {
|
||||||
BEESCOUNT(resolve_fail);
|
BEESCOUNT(resolve_fail);
|
||||||
}
|
}
|
||||||
BEESCOUNTADD(resolve_ms, resolve_timer.age() * 1000);
|
DIE_IF_MINUS_ONE(getrusage(RUSAGE_THREAD, &usage_after));
|
||||||
|
const auto resolve_timer_age = resolve_timer.age();
|
||||||
|
BEESCOUNTADD(resolve_ms, resolve_timer_age * 1000);
|
||||||
|
lock.reset();
|
||||||
|
bees_throttle(resolve_timer_age, "resolve_addr");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Again!
|
|
||||||
struct rusage usage_after;
|
|
||||||
DIE_IF_MINUS_ONE(getrusage(RUSAGE_THREAD, &usage_after));
|
|
||||||
|
|
||||||
const double sys_usage_delta =
|
const double sys_usage_delta =
|
||||||
(usage_after.ru_stime.tv_sec + usage_after.ru_stime.tv_usec / 1000000.0) -
|
(usage_after.ru_stime.tv_sec + usage_after.ru_stime.tv_usec / 1000000.0) -
|
||||||
(usage_before.ru_stime.tv_sec + usage_before.ru_stime.tv_usec / 1000000.0);
|
(usage_before.ru_stime.tv_sec + usage_before.ru_stime.tv_usec / 1000000.0);
|
||||||
|
@@ -3,6 +3,7 @@
|
|||||||
#include "crucible/btrfs-tree.h"
|
#include "crucible/btrfs-tree.h"
|
||||||
#include "crucible/cache.h"
|
#include "crucible/cache.h"
|
||||||
#include "crucible/ntoa.h"
|
#include "crucible/ntoa.h"
|
||||||
|
#include "crucible/openat2.h"
|
||||||
#include "crucible/string.h"
|
#include "crucible/string.h"
|
||||||
#include "crucible/table.h"
|
#include "crucible/table.h"
|
||||||
#include "crucible/task.h"
|
#include "crucible/task.h"
|
||||||
@@ -592,7 +593,7 @@ BeesScanModeExtent::create_extent_map(const uint64_t bytenr, const ProgressTrack
|
|||||||
|
|
||||||
{
|
{
|
||||||
BEESNOTE("waiting to create extent map for " << to_hex(bytenr) << " with LOGICAL_INO");
|
BEESNOTE("waiting to create extent map for " << to_hex(bytenr) << " with LOGICAL_INO");
|
||||||
const auto lock = MultiLocker::get_lock("logical_ino");
|
auto lock = MultiLocker::get_lock("logical_ino");
|
||||||
|
|
||||||
BEESNOTE("Resolving bytenr " << to_hex(bytenr) << " refs " << log_ino.m_iors.size());
|
BEESNOTE("Resolving bytenr " << to_hex(bytenr) << " refs " << log_ino.m_iors.size());
|
||||||
BEESTOOLONG("Resolving bytenr " << to_hex(bytenr) << " refs " << log_ino.m_iors.size());
|
BEESTOOLONG("Resolving bytenr " << to_hex(bytenr) << " refs " << log_ino.m_iors.size());
|
||||||
@@ -605,8 +606,11 @@ BeesScanModeExtent::create_extent_map(const uint64_t bytenr, const ProgressTrack
|
|||||||
} else {
|
} else {
|
||||||
BEESCOUNT(extent_fail);
|
BEESCOUNT(extent_fail);
|
||||||
}
|
}
|
||||||
|
const auto resolve_age = resolve_timer.age();
|
||||||
|
|
||||||
BEESCOUNTADD(extent_ms, resolve_timer.age() * 1000);
|
BEESCOUNTADD(extent_ms, resolve_age * 1000);
|
||||||
|
lock.reset();
|
||||||
|
bees_throttle(resolve_age, "extent_map");
|
||||||
}
|
}
|
||||||
|
|
||||||
const size_t rv_count = log_ino.m_iors.size();
|
const size_t rv_count = log_ino.m_iors.size();
|
||||||
@@ -645,7 +649,7 @@ BeesScanModeExtent::create_extent_map(const uint64_t bytenr, const ProgressTrack
|
|||||||
bedf.objectid(i.m_inum);
|
bedf.objectid(i.m_inum);
|
||||||
const auto bti = bedf.at(i.m_offset);
|
const auto bti = bedf.at(i.m_offset);
|
||||||
if (!bti) {
|
if (!bti) {
|
||||||
BEESLOGDEBUG("No ref for extent " << to_hex(bytenr) << " at root " << i.m_root << " ino " << i.m_inum << " offset " << to_hex(i.m_offset));
|
// BEESLOGDEBUG("No ref for extent " << to_hex(bytenr) << " at root " << i.m_root << " ino " << i.m_inum << " offset " << to_hex(i.m_offset));
|
||||||
BEESCOUNT(extent_ref_missing);
|
BEESCOUNT(extent_ref_missing);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@@ -907,6 +911,18 @@ BeesScanModeExtent::map_next_extent(uint64_t const subvol)
|
|||||||
BEESCOUNT(crawl_done);
|
BEESCOUNT(crawl_done);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static
|
||||||
|
string
|
||||||
|
strf_localtime(const time_t &when)
|
||||||
|
{
|
||||||
|
struct tm ltm = { 0 };
|
||||||
|
DIE_IF_ZERO(localtime_r(&when, <m));
|
||||||
|
|
||||||
|
char buf[100] = { 0 };
|
||||||
|
DIE_IF_ZERO(strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M", <m));
|
||||||
|
return buf;
|
||||||
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
BeesScanModeExtent::next_transid(const CrawlMap &crawl_map_unused)
|
BeesScanModeExtent::next_transid(const CrawlMap &crawl_map_unused)
|
||||||
{
|
{
|
||||||
@@ -963,13 +979,15 @@ BeesScanModeExtent::next_transid(const CrawlMap &crawl_map_unused)
|
|||||||
THROW_CHECK0(runtime_error, offset > 0);
|
THROW_CHECK0(runtime_error, offset > 0);
|
||||||
THROW_CHECK0(runtime_error, chunk_length > 0);
|
THROW_CHECK0(runtime_error, chunk_length > 0);
|
||||||
last_bgaddr = offset + chunk_length;
|
last_bgaddr = offset + chunk_length;
|
||||||
|
// Mixed-bg filesystems have block groups that are data _and_ metadata.
|
||||||
|
// Block groups that are _only_ metadata should be filtered out.
|
||||||
|
if (0 == (bti.chunk_type() & BTRFS_BLOCK_GROUP_DATA)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
bg_info_map[last_bgaddr] = (bg_info) {
|
bg_info_map[last_bgaddr] = (bg_info) {
|
||||||
.first_bytenr = offset,
|
.first_bytenr = offset,
|
||||||
.first_total = fs_size,
|
.first_total = fs_size,
|
||||||
};
|
};
|
||||||
if (bti.chunk_type() & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
fs_size += chunk_length;
|
fs_size += chunk_length;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -995,6 +1013,19 @@ BeesScanModeExtent::next_transid(const CrawlMap &crawl_map_unused)
|
|||||||
|
|
||||||
// Report on progress using extent bytenr map
|
// Report on progress using extent bytenr map
|
||||||
Table::Table eta;
|
Table::Table eta;
|
||||||
|
eta.insert_row(0, vector<Table::Content> {
|
||||||
|
Table::Text("extsz"),
|
||||||
|
Table::Text("datasz"),
|
||||||
|
Table::Text("point"),
|
||||||
|
Table::Text("gen_min"),
|
||||||
|
Table::Text("gen_max"),
|
||||||
|
Table::Text("this cycle start"),
|
||||||
|
Table::Text("tm_left"),
|
||||||
|
Table::Text("next cycle ETA"),
|
||||||
|
});
|
||||||
|
const auto dash_fill = Table::Fill('-');
|
||||||
|
eta.insert_row(1, vector<Table::Content>(eta.cols().size(), dash_fill));
|
||||||
|
const auto now = time(NULL);
|
||||||
for (const auto &i : s_magic_crawl_map) {
|
for (const auto &i : s_magic_crawl_map) {
|
||||||
const auto &subvol = i.first;
|
const auto &subvol = i.first;
|
||||||
const auto &magic = i.second;
|
const auto &magic = i.second;
|
||||||
@@ -1033,55 +1064,47 @@ BeesScanModeExtent::next_transid(const CrawlMap &crawl_map_unused)
|
|||||||
BEESCOUNT(progress_out_of_bg);
|
BEESCOUNT(progress_out_of_bg);
|
||||||
}
|
}
|
||||||
const auto bytenr_offset = min(bi_last_bytenr, max(bytenr, bi.first_bytenr)) - bi.first_bytenr + bi.first_total;
|
const auto bytenr_offset = min(bi_last_bytenr, max(bytenr, bi.first_bytenr)) - bi.first_bytenr + bi.first_total;
|
||||||
const auto bytenr_percent = bytenr_offset / (0.01 * fs_size);
|
const auto bytenr_norm = bytenr_offset / double(fs_size);
|
||||||
const auto now = time(NULL);
|
|
||||||
const auto time_so_far = now - min(now, this_state.m_started);
|
const auto time_so_far = now - min(now, this_state.m_started);
|
||||||
|
const string start_stamp = strf_localtime(this_state.m_started);
|
||||||
string eta_stamp = "-";
|
string eta_stamp = "-";
|
||||||
string eta_pretty = "-";
|
string eta_pretty = "-";
|
||||||
const auto &deferred_finished = deferred_map.at(subvol);
|
const auto &deferred_finished = deferred_map.at(subvol);
|
||||||
const bool deferred = deferred_finished.first;
|
|
||||||
const bool finished = deferred_finished.second;
|
const bool finished = deferred_finished.second;
|
||||||
if (time_so_far > 1 && bytenr_percent > 0 && !finished) {
|
if (finished) {
|
||||||
const time_t eta_duration = time_so_far / (bytenr_percent / 100);
|
// eta_stamp = "idle";
|
||||||
|
} else if (time_so_far > 1 && bytenr_norm > 0.01) {
|
||||||
|
const time_t eta_duration = time_so_far / bytenr_norm;
|
||||||
const time_t eta_time = eta_duration + now;
|
const time_t eta_time = eta_duration + now;
|
||||||
struct tm ltm = { 0 };
|
eta_stamp = strf_localtime(eta_time);
|
||||||
DIE_IF_ZERO(localtime_r(&eta_time, <m));
|
|
||||||
|
|
||||||
char buf[1024] = { 0 };
|
|
||||||
DIE_IF_ZERO(strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M", <m));
|
|
||||||
eta_stamp = string(buf);
|
|
||||||
eta_pretty = pretty_seconds(eta_duration);
|
eta_pretty = pretty_seconds(eta_duration);
|
||||||
}
|
}
|
||||||
const auto &mma = mes.m_map.at(subvol);
|
const auto &mma = mes.m_map.at(subvol);
|
||||||
const auto mma_ratio = mes_sample_size_ok ? (mma.m_bytes / double(mes.m_total)) : 1.0;
|
const auto mma_ratio = mes_sample_size_ok ? (mma.m_bytes / double(mes.m_total)) : 1.0;
|
||||||
const auto pos_scaled_text = mes_sample_size_ok ? pretty(bytenr_offset * mma_ratio) : "-";
|
const auto posn_text = Table::Text(finished ? "idle" : astringprintf("%06d", int(floor(bytenr_norm * 1000000))));
|
||||||
const auto pos_text = Table::Text(deferred ? "deferred" : pos_scaled_text);
|
|
||||||
const auto pct_text = Table::Text(finished ? "finished" : astringprintf("%.4f%%", bytenr_percent));
|
|
||||||
const auto size_text = Table::Text( mes_sample_size_ok ? pretty(fs_size * mma_ratio) : "-");
|
const auto size_text = Table::Text( mes_sample_size_ok ? pretty(fs_size * mma_ratio) : "-");
|
||||||
eta.insert_row(Table::endpos, vector<Table::Content> {
|
eta.insert_row(Table::endpos, vector<Table::Content> {
|
||||||
pos_text,
|
|
||||||
size_text,
|
|
||||||
pct_text,
|
|
||||||
Table::Text(magic.m_max_size == numeric_limits<uint64_t>::max() ? "max" : pretty(magic.m_max_size)),
|
Table::Text(magic.m_max_size == numeric_limits<uint64_t>::max() ? "max" : pretty(magic.m_max_size)),
|
||||||
|
size_text,
|
||||||
|
posn_text,
|
||||||
Table::Number(this_state.m_min_transid),
|
Table::Number(this_state.m_min_transid),
|
||||||
Table::Number(this_state.m_max_transid),
|
Table::Number(this_state.m_max_transid),
|
||||||
|
Table::Text(start_stamp),
|
||||||
Table::Text(eta_pretty),
|
Table::Text(eta_pretty),
|
||||||
Table::Text(eta_stamp),
|
Table::Text(eta_stamp),
|
||||||
});
|
});
|
||||||
BEESCOUNT(progress_ok);
|
BEESCOUNT(progress_ok);
|
||||||
}
|
}
|
||||||
eta.insert_row(0, vector<Table::Content> {
|
eta.insert_row(Table::endpos, vector<Table::Content> {
|
||||||
Table::Text("done"),
|
Table::Text("total"),
|
||||||
Table::Text(pretty(fs_size)),
|
Table::Text(pretty(fs_size)),
|
||||||
Table::Text("%done"),
|
Table::Text(""),
|
||||||
Table::Text("size"),
|
Table::Text("gen_now"),
|
||||||
Table::Text("transid"),
|
|
||||||
Table::Number(m_roots->transid_max()),
|
Table::Number(m_roots->transid_max()),
|
||||||
Table::Text("todo"),
|
Table::Text(""),
|
||||||
Table::Text("ETA"),
|
Table::Text("updated"),
|
||||||
|
Table::Text(strf_localtime(now)),
|
||||||
});
|
});
|
||||||
const auto dash_fill = Table::Fill('-');
|
|
||||||
eta.insert_row(1, vector<Table::Content>(eta.cols().size(), dash_fill));
|
|
||||||
eta.left("");
|
eta.left("");
|
||||||
eta.mid(" ");
|
eta.mid(" ");
|
||||||
eta.right("");
|
eta.right("");
|
||||||
@@ -1279,6 +1302,9 @@ BeesRoots::transid_max_nocache()
|
|||||||
THROW_CHECK1(runtime_error, rv, rv > 0);
|
THROW_CHECK1(runtime_error, rv, rv > 0);
|
||||||
// transid must be less than max, or we did something very wrong
|
// transid must be less than max, or we did something very wrong
|
||||||
THROW_CHECK1(runtime_error, rv, rv < numeric_limits<uint64_t>::max());
|
THROW_CHECK1(runtime_error, rv, rv < numeric_limits<uint64_t>::max());
|
||||||
|
|
||||||
|
// Update the rate estimator
|
||||||
|
m_transid_re.update(rv);
|
||||||
return rv;
|
return rv;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1303,8 +1329,6 @@ struct BeesFileCrawl {
|
|||||||
BeesCrawlState m_state;
|
BeesCrawlState m_state;
|
||||||
/// Currently processed offset in file
|
/// Currently processed offset in file
|
||||||
off_t m_offset;
|
off_t m_offset;
|
||||||
/// Btrfs file fetcher
|
|
||||||
BtrfsExtentDataFetcher m_bedf;
|
|
||||||
|
|
||||||
/// Method that does one unit of work for the Task
|
/// Method that does one unit of work for the Task
|
||||||
bool crawl_one_extent();
|
bool crawl_one_extent();
|
||||||
@@ -1316,10 +1340,15 @@ BeesFileCrawl::crawl_one_extent()
|
|||||||
BEESNOTE("crawl_one_extent m_offset " << to_hex(m_offset) << " state " << m_state);
|
BEESNOTE("crawl_one_extent m_offset " << to_hex(m_offset) << " state " << m_state);
|
||||||
BEESTRACE("crawl_one_extent m_offset " << to_hex(m_offset) << " state " << m_state);
|
BEESTRACE("crawl_one_extent m_offset " << to_hex(m_offset) << " state " << m_state);
|
||||||
|
|
||||||
|
BtrfsExtentDataFetcher bedf(m_ctx->root_fd());
|
||||||
|
bedf.tree(m_state.m_root);
|
||||||
|
bedf.objectid(m_state.m_objectid);
|
||||||
|
bedf.transid(m_state.m_min_transid);
|
||||||
|
|
||||||
// Only one thread can dedupe a file. btrfs will lock others out.
|
// Only one thread can dedupe a file. btrfs will lock others out.
|
||||||
// Inodes are usually full of shared extents, especially in the case of snapshots,
|
// Inodes are usually full of shared extents, especially in the case of snapshots,
|
||||||
// so when we lock an inode, we'll lock the same inode number in all subvols at once.
|
// so when we lock an inode, we'll lock the same inode number in all subvols at once.
|
||||||
auto inode_mutex = m_ctx->get_inode_mutex(m_bedf.objectid());
|
auto inode_mutex = m_ctx->get_inode_mutex(bedf.objectid());
|
||||||
auto inode_lock = inode_mutex->try_lock(Task::current_task());
|
auto inode_lock = inode_mutex->try_lock(Task::current_task());
|
||||||
if (!inode_lock) {
|
if (!inode_lock) {
|
||||||
BEESCOUNT(crawl_deferred_inode);
|
BEESCOUNT(crawl_deferred_inode);
|
||||||
@@ -1331,12 +1360,12 @@ BeesFileCrawl::crawl_one_extent()
|
|||||||
// It will mean the file or subvol was deleted or there's metadata corruption,
|
// It will mean the file or subvol was deleted or there's metadata corruption,
|
||||||
// and we should stop trying to scan the inode in that case.
|
// and we should stop trying to scan the inode in that case.
|
||||||
// The calling Task will be aborted.
|
// The calling Task will be aborted.
|
||||||
const auto bti = m_bedf.lower_bound(m_offset);
|
const auto bti = bedf.lower_bound(m_offset);
|
||||||
if (!bti) {
|
if (!bti) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
// Make sure we advance
|
// Make sure we advance
|
||||||
m_offset = max(bti.offset() + m_bedf.block_size(), bti.offset());
|
m_offset = max(bti.offset() + bedf.block_size(), bti.offset());
|
||||||
// Check extent item generation is in range
|
// Check extent item generation is in range
|
||||||
const auto gen = bti.file_extent_generation();
|
const auto gen = bti.file_extent_generation();
|
||||||
if (gen < m_state.m_min_transid) {
|
if (gen < m_state.m_min_transid) {
|
||||||
@@ -1446,11 +1475,7 @@ BeesRoots::crawl_batch(shared_ptr<BeesCrawl> this_crawl)
|
|||||||
.m_hold = this_crawl->hold_state(this_state),
|
.m_hold = this_crawl->hold_state(this_state),
|
||||||
.m_state = this_state,
|
.m_state = this_state,
|
||||||
.m_offset = this_range.begin(),
|
.m_offset = this_range.begin(),
|
||||||
.m_bedf = BtrfsExtentDataFetcher(m_ctx->root_fd()),
|
|
||||||
});
|
});
|
||||||
bfc->m_bedf.tree(subvol);
|
|
||||||
bfc->m_bedf.objectid(inode);
|
|
||||||
bfc->m_bedf.transid(this_state.m_min_transid);
|
|
||||||
BEESNOTE("Starting task " << this_range);
|
BEESNOTE("Starting task " << this_range);
|
||||||
Task(task_title, [bfc]() {
|
Task(task_title, [bfc]() {
|
||||||
BEESNOTE("crawl_batch " << bfc->m_hold->get());
|
BEESNOTE("crawl_batch " << bfc->m_hold->get());
|
||||||
@@ -1476,6 +1501,15 @@ BeesRoots::clear_caches()
|
|||||||
m_ctx->resolve_cache_clear();
|
m_ctx->resolve_cache_clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
BeesRoots::wait_for_transid(const uint64_t count)
|
||||||
|
{
|
||||||
|
const auto now_transid = transid_max_nocache();
|
||||||
|
const auto target_transid = now_transid + count;
|
||||||
|
BEESLOGDEBUG("Waiting for transid " << target_transid << ", current transid is " << now_transid);
|
||||||
|
m_transid_re.wait_until(target_transid);
|
||||||
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
BeesRoots::crawl_thread()
|
BeesRoots::crawl_thread()
|
||||||
{
|
{
|
||||||
@@ -1497,7 +1531,8 @@ BeesRoots::crawl_thread()
|
|||||||
BEESTRACE("Measure current transid");
|
BEESTRACE("Measure current transid");
|
||||||
catch_all([&]() {
|
catch_all([&]() {
|
||||||
BEESTRACE("calling transid_max_nocache");
|
BEESTRACE("calling transid_max_nocache");
|
||||||
m_transid_re.update(transid_max_nocache());
|
// Will update m_transid_re as side effect
|
||||||
|
transid_max_nocache();
|
||||||
});
|
});
|
||||||
|
|
||||||
const auto new_transid = m_transid_re.count();
|
const auto new_transid = m_transid_re.count();
|
||||||
@@ -1614,7 +1649,7 @@ BeesRoots::insert_new_crawl()
|
|||||||
lock.unlock();
|
lock.unlock();
|
||||||
|
|
||||||
// Nothing to crawl? Seems suspicious...
|
// Nothing to crawl? Seems suspicious...
|
||||||
if (m_root_crawl_map.empty()) {
|
if (crawl_map_copy.empty()) {
|
||||||
BEESLOGINFO("crawl map is empty!");
|
BEESLOGINFO("crawl map is empty!");
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1682,7 +1717,7 @@ BeesRoots::start()
|
|||||||
m_crawl_thread.exec([&]() {
|
m_crawl_thread.exec([&]() {
|
||||||
// Measure current transid before creating any crawlers
|
// Measure current transid before creating any crawlers
|
||||||
catch_all([&]() {
|
catch_all([&]() {
|
||||||
m_transid_re.update(transid_max_nocache());
|
transid_max_nocache();
|
||||||
});
|
});
|
||||||
|
|
||||||
// Make sure we have a full complement of crawlers
|
// Make sure we have a full complement of crawlers
|
||||||
@@ -1724,6 +1759,32 @@ BeesRoots::stop_wait()
|
|||||||
BEESLOGDEBUG("BeesRoots stopped");
|
BEESLOGDEBUG("BeesRoots stopped");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static
|
||||||
|
Fd
|
||||||
|
bees_openat(int const parent_fd, const char *const pathname, uint64_t const flags)
|
||||||
|
{
|
||||||
|
// Never O_CREAT so we don't need a mode argument
|
||||||
|
THROW_CHECK1(invalid_argument, flags, (flags & O_CREAT) == 0);
|
||||||
|
|
||||||
|
// Try openat2 if the kernel has it
|
||||||
|
static bool can_openat2 = true;
|
||||||
|
if (can_openat2) {
|
||||||
|
open_how how {
|
||||||
|
.flags = flags,
|
||||||
|
.resolve = RESOLVE_BENEATH | RESOLVE_NO_SYMLINKS | RESOLVE_NO_XDEV,
|
||||||
|
};
|
||||||
|
const auto rv = openat2(parent_fd, pathname, &how, sizeof(open_how));
|
||||||
|
if (rv == -1 && errno == ENOSYS) {
|
||||||
|
can_openat2 = false;
|
||||||
|
} else {
|
||||||
|
return Fd(rv);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// No kernel support, use openat instead
|
||||||
|
return Fd(openat(parent_fd, pathname, flags));
|
||||||
|
}
|
||||||
|
|
||||||
Fd
|
Fd
|
||||||
BeesRoots::open_root_nocache(uint64_t rootid)
|
BeesRoots::open_root_nocache(uint64_t rootid)
|
||||||
{
|
{
|
||||||
@@ -1786,7 +1847,7 @@ BeesRoots::open_root_nocache(uint64_t rootid)
|
|||||||
}
|
}
|
||||||
// Theoretically there is only one, so don't bother looping.
|
// Theoretically there is only one, so don't bother looping.
|
||||||
BEESTRACE("dirid " << dirid << " path " << ino.m_paths.at(0));
|
BEESTRACE("dirid " << dirid << " path " << ino.m_paths.at(0));
|
||||||
parent_fd = openat(parent_fd, ino.m_paths.at(0).c_str(), FLAGS_OPEN_DIR);
|
parent_fd = bees_openat(parent_fd, ino.m_paths.at(0).c_str(), FLAGS_OPEN_DIR);
|
||||||
if (!parent_fd) {
|
if (!parent_fd) {
|
||||||
BEESLOGTRACE("no parent_fd from dirid");
|
BEESLOGTRACE("no parent_fd from dirid");
|
||||||
BEESCOUNT(root_parent_path_open_fail);
|
BEESCOUNT(root_parent_path_open_fail);
|
||||||
@@ -1795,7 +1856,7 @@ BeesRoots::open_root_nocache(uint64_t rootid)
|
|||||||
}
|
}
|
||||||
// BEESLOG("openat(" << name_fd(parent_fd) << ", " << name << ")");
|
// BEESLOG("openat(" << name_fd(parent_fd) << ", " << name << ")");
|
||||||
BEESTRACE("openat(" << name_fd(parent_fd) << ", " << name << ")");
|
BEESTRACE("openat(" << name_fd(parent_fd) << ", " << name << ")");
|
||||||
Fd rv = openat(parent_fd, name.c_str(), FLAGS_OPEN_DIR);
|
Fd rv = bees_openat(parent_fd, name.c_str(), FLAGS_OPEN_DIR);
|
||||||
if (!rv) {
|
if (!rv) {
|
||||||
BEESLOGTRACE("open failed for name " << name << ": " << strerror(errno));
|
BEESLOGTRACE("open failed for name " << name << ": " << strerror(errno));
|
||||||
BEESCOUNT(root_open_fail);
|
BEESCOUNT(root_open_fail);
|
||||||
@@ -1941,7 +2002,7 @@ BeesRoots::open_root_ino_nocache(uint64_t root, uint64_t ino)
|
|||||||
// opening in write mode, and if we do open in write mode,
|
// opening in write mode, and if we do open in write mode,
|
||||||
// we can't exec the file while we have it open.
|
// we can't exec the file while we have it open.
|
||||||
const char *fp_cstr = file_path.c_str();
|
const char *fp_cstr = file_path.c_str();
|
||||||
rv = openat(root_fd, fp_cstr, FLAGS_OPEN_FILE);
|
rv = bees_openat(root_fd, fp_cstr, FLAGS_OPEN_FILE);
|
||||||
if (!rv) {
|
if (!rv) {
|
||||||
// errno == ENOENT is the most common error case.
|
// errno == ENOENT is the most common error case.
|
||||||
// No need to report it.
|
// No need to report it.
|
||||||
@@ -2026,12 +2087,6 @@ BeesRoots::open_root_ino(uint64_t root, uint64_t ino)
|
|||||||
return m_ctx->fd_cache()->open_root_ino(root, ino);
|
return m_ctx->fd_cache()->open_root_ino(root, ino);
|
||||||
}
|
}
|
||||||
|
|
||||||
RateEstimator &
|
|
||||||
BeesRoots::transid_re()
|
|
||||||
{
|
|
||||||
return m_transid_re;
|
|
||||||
}
|
|
||||||
|
|
||||||
void
|
void
|
||||||
BeesRoots::insert_tmpfile(Fd fd)
|
BeesRoots::insert_tmpfile(Fd fd)
|
||||||
{
|
{
|
||||||
|
@@ -91,9 +91,9 @@ BeesNote::~BeesNote()
|
|||||||
tl_next = m_prev;
|
tl_next = m_prev;
|
||||||
unique_lock<mutex> lock(s_mutex);
|
unique_lock<mutex> lock(s_mutex);
|
||||||
if (tl_next) {
|
if (tl_next) {
|
||||||
s_status[crucible::gettid()] = tl_next;
|
s_status[gettid()] = tl_next;
|
||||||
} else {
|
} else {
|
||||||
s_status.erase(crucible::gettid());
|
s_status.erase(gettid());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -104,7 +104,7 @@ BeesNote::BeesNote(function<void(ostream &os)> f) :
|
|||||||
m_prev = tl_next;
|
m_prev = tl_next;
|
||||||
tl_next = this;
|
tl_next = this;
|
||||||
unique_lock<mutex> lock(s_mutex);
|
unique_lock<mutex> lock(s_mutex);
|
||||||
s_status[crucible::gettid()] = tl_next;
|
s_status[gettid()] = tl_next;
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
|
@@ -12,6 +12,7 @@ Load management options:
|
|||||||
-C, --thread-factor Worker thread factor (default 1)
|
-C, --thread-factor Worker thread factor (default 1)
|
||||||
-G, --thread-min Minimum worker thread count (default 0)
|
-G, --thread-min Minimum worker thread count (default 0)
|
||||||
-g, --loadavg-target Target load average for worker threads (default none)
|
-g, --loadavg-target Target load average for worker threads (default none)
|
||||||
|
--throttle-factor Idle time between operations (default 1.0)
|
||||||
|
|
||||||
Filesystem tree traversal options:
|
Filesystem tree traversal options:
|
||||||
-m, --scan-mode Scanning mode (0..4, default 4)
|
-m, --scan-mode Scanning mode (0..4, default 4)
|
||||||
|
141
src/bees.cc
141
src/bees.cc
@@ -220,7 +220,7 @@ bees_readahead_check(int const fd, off_t const offset, size_t const size)
|
|||||||
{
|
{
|
||||||
// FIXME: the rest of the code calls this function more often than necessary,
|
// FIXME: the rest of the code calls this function more often than necessary,
|
||||||
// usually back-to-back calls on the same range in a loop.
|
// usually back-to-back calls on the same range in a loop.
|
||||||
// Simply discard requests that are identical to recent requests from the same thread.
|
// Simply discard requests that are identical to recent requests.
|
||||||
const Stat stat_rv(fd);
|
const Stat stat_rv(fd);
|
||||||
auto tup = make_tuple(offset, size, stat_rv.st_dev, stat_rv.st_ino);
|
auto tup = make_tuple(offset, size, stat_rv.st_dev, stat_rv.st_ino);
|
||||||
static mutex s_recent_mutex;
|
static mutex s_recent_mutex;
|
||||||
@@ -242,7 +242,7 @@ static
|
|||||||
void
|
void
|
||||||
bees_readahead_nolock(int const fd, const off_t offset, const size_t size)
|
bees_readahead_nolock(int const fd, const off_t offset, const size_t size)
|
||||||
{
|
{
|
||||||
if (!bees_readahead_check(fd, size, offset)) return;
|
if (!bees_readahead_check(fd, offset, size)) return;
|
||||||
Timer readahead_timer;
|
Timer readahead_timer;
|
||||||
BEESNOTE("readahead " << name_fd(fd) << " offset " << to_hex(offset) << " len " << pretty(size));
|
BEESNOTE("readahead " << name_fd(fd) << " offset " << to_hex(offset) << " len " << pretty(size));
|
||||||
BEESTOOLONG("readahead " << name_fd(fd) << " offset " << to_hex(offset) << " len " << pretty(size));
|
BEESTOOLONG("readahead " << name_fd(fd) << " offset " << to_hex(offset) << " len " << pretty(size));
|
||||||
@@ -278,7 +278,7 @@ static mutex s_only_one;
|
|||||||
void
|
void
|
||||||
bees_readahead_pair(int fd, off_t offset, size_t size, int fd2, off_t offset2, size_t size2)
|
bees_readahead_pair(int fd, off_t offset, size_t size, int fd2, off_t offset2, size_t size2)
|
||||||
{
|
{
|
||||||
if (!bees_readahead_check(fd, size, offset) && !bees_readahead_check(fd2, offset2, size2)) return;
|
if (!bees_readahead_check(fd, offset, size) && !bees_readahead_check(fd2, offset2, size2)) return;
|
||||||
BEESNOTE("waiting to readahead " << name_fd(fd) << " offset " << to_hex(offset) << " len " << pretty(size) << ","
|
BEESNOTE("waiting to readahead " << name_fd(fd) << " offset " << to_hex(offset) << " len " << pretty(size) << ","
|
||||||
<< "\n\t" << name_fd(fd2) << " offset " << to_hex(offset2) << " len " << pretty(size2));
|
<< "\n\t" << name_fd(fd2) << " offset " << to_hex(offset2) << " len " << pretty(size2));
|
||||||
unique_lock<mutex> m_lock(s_only_one);
|
unique_lock<mutex> m_lock(s_only_one);
|
||||||
@@ -289,7 +289,7 @@ bees_readahead_pair(int fd, off_t offset, size_t size, int fd2, off_t offset2, s
|
|||||||
void
|
void
|
||||||
bees_readahead(int const fd, const off_t offset, const size_t size)
|
bees_readahead(int const fd, const off_t offset, const size_t size)
|
||||||
{
|
{
|
||||||
if (!bees_readahead_check(fd, size, offset)) return;
|
if (!bees_readahead_check(fd, offset, size)) return;
|
||||||
BEESNOTE("waiting to readahead " << name_fd(fd) << " offset " << to_hex(offset) << " len " << pretty(size));
|
BEESNOTE("waiting to readahead " << name_fd(fd) << " offset " << to_hex(offset) << " len " << pretty(size));
|
||||||
unique_lock<mutex> m_lock(s_only_one);
|
unique_lock<mutex> m_lock(s_only_one);
|
||||||
bees_readahead_nolock(fd, offset, size);
|
bees_readahead_nolock(fd, offset, size);
|
||||||
@@ -305,6 +305,48 @@ bees_unreadahead(int const fd, off_t offset, size_t size)
|
|||||||
BEESCOUNTADD(readahead_unread_ms, unreadahead_timer.age() * 1000);
|
BEESCOUNTADD(readahead_unread_ms, unreadahead_timer.age() * 1000);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static double bees_throttle_factor = 0.0;
|
||||||
|
|
||||||
|
void
|
||||||
|
bees_throttle(const double time_used, const char *const context)
|
||||||
|
{
|
||||||
|
static mutex s_mutex;
|
||||||
|
unique_lock<mutex> throttle_lock(s_mutex);
|
||||||
|
struct time_pair {
|
||||||
|
double time_used = 0;
|
||||||
|
double time_count = 0;
|
||||||
|
double longest_sleep_time = 0;
|
||||||
|
};
|
||||||
|
static map<string, time_pair> s_time_map;
|
||||||
|
auto &this_time = s_time_map[context];
|
||||||
|
auto &this_time_used = this_time.time_used;
|
||||||
|
auto &this_time_count = this_time.time_count;
|
||||||
|
auto &longest_sleep_time = this_time.longest_sleep_time;
|
||||||
|
this_time_used += time_used;
|
||||||
|
++this_time_count;
|
||||||
|
// Keep the timing data fresh
|
||||||
|
static Timer s_fresh_timer;
|
||||||
|
if (s_fresh_timer.age() > 60) {
|
||||||
|
s_fresh_timer.reset();
|
||||||
|
this_time_count *= 0.9;
|
||||||
|
this_time_used *= 0.9;
|
||||||
|
}
|
||||||
|
// Wait for enough data to calculate rates
|
||||||
|
if (this_time_used < 1.0 || this_time_count < 1.0) return;
|
||||||
|
const auto avg_time = this_time_used / this_time_count;
|
||||||
|
const auto sleep_time = min(60.0, bees_throttle_factor * avg_time - time_used);
|
||||||
|
if (sleep_time <= 0) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (sleep_time > longest_sleep_time) {
|
||||||
|
BEESLOGDEBUG(context << ": throttle delay " << sleep_time << " s, time used " << time_used << " s, avg time " << avg_time << " s");
|
||||||
|
longest_sleep_time = sleep_time;
|
||||||
|
}
|
||||||
|
throttle_lock.unlock();
|
||||||
|
BEESNOTE(context << ": throttle delay " << sleep_time << " s, time used " << time_used << " s, avg time " << avg_time << " s");
|
||||||
|
nanosleep(sleep_time);
|
||||||
|
}
|
||||||
|
|
||||||
thread_local random_device bees_random_device;
|
thread_local random_device bees_random_device;
|
||||||
thread_local uniform_int_distribution<default_random_engine::result_type> bees_random_seed_dist(
|
thread_local uniform_int_distribution<default_random_engine::result_type> bees_random_seed_dist(
|
||||||
numeric_limits<default_random_engine::result_type>::min(),
|
numeric_limits<default_random_engine::result_type>::min(),
|
||||||
@@ -401,6 +443,8 @@ BeesTempFile::resize(off_t offset)
|
|||||||
|
|
||||||
// Count time spent here
|
// Count time spent here
|
||||||
BEESCOUNTADD(tmp_resize_ms, resize_timer.age() * 1000);
|
BEESCOUNTADD(tmp_resize_ms, resize_timer.age() * 1000);
|
||||||
|
|
||||||
|
bees_throttle(resize_timer.age(), "tmpfile_resize");
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
@@ -536,6 +580,8 @@ BeesTempFile::make_copy(const BeesFileRange &src)
|
|||||||
}
|
}
|
||||||
BEESCOUNTADD(tmp_copy_ms, copy_timer.age() * 1000);
|
BEESCOUNTADD(tmp_copy_ms, copy_timer.age() * 1000);
|
||||||
|
|
||||||
|
bees_throttle(copy_timer.age(), "tmpfile_copy");
|
||||||
|
|
||||||
BEESCOUNT(tmp_copy);
|
BEESCOUNT(tmp_copy);
|
||||||
return rv;
|
return rv;
|
||||||
}
|
}
|
||||||
@@ -574,19 +620,23 @@ operator<<(ostream &os, const siginfo_t &si)
|
|||||||
|
|
||||||
static sigset_t new_sigset, old_sigset;
|
static sigset_t new_sigset, old_sigset;
|
||||||
|
|
||||||
|
static
|
||||||
void
|
void
|
||||||
block_term_signal()
|
block_signals()
|
||||||
{
|
{
|
||||||
BEESLOGDEBUG("Masking signals");
|
BEESLOGDEBUG("Masking signals");
|
||||||
|
|
||||||
DIE_IF_NON_ZERO(sigemptyset(&new_sigset));
|
DIE_IF_NON_ZERO(sigemptyset(&new_sigset));
|
||||||
DIE_IF_NON_ZERO(sigaddset(&new_sigset, SIGTERM));
|
DIE_IF_NON_ZERO(sigaddset(&new_sigset, SIGTERM));
|
||||||
DIE_IF_NON_ZERO(sigaddset(&new_sigset, SIGINT));
|
DIE_IF_NON_ZERO(sigaddset(&new_sigset, SIGINT));
|
||||||
|
DIE_IF_NON_ZERO(sigaddset(&new_sigset, SIGUSR1));
|
||||||
|
DIE_IF_NON_ZERO(sigaddset(&new_sigset, SIGUSR2));
|
||||||
DIE_IF_NON_ZERO(sigprocmask(SIG_BLOCK, &new_sigset, &old_sigset));
|
DIE_IF_NON_ZERO(sigprocmask(SIG_BLOCK, &new_sigset, &old_sigset));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static
|
||||||
void
|
void
|
||||||
wait_for_term_signal()
|
wait_for_signals()
|
||||||
{
|
{
|
||||||
BEESNOTE("waiting for signals");
|
BEESNOTE("waiting for signals");
|
||||||
BEESLOGDEBUG("Waiting for signals...");
|
BEESLOGDEBUG("Waiting for signals...");
|
||||||
@@ -603,14 +653,28 @@ wait_for_term_signal()
|
|||||||
THROW_ERRNO("sigwaitinfo errno = " << errno);
|
THROW_ERRNO("sigwaitinfo errno = " << errno);
|
||||||
} else {
|
} else {
|
||||||
BEESLOGNOTICE("Received signal " << rv << " info " << info);
|
BEESLOGNOTICE("Received signal " << rv << " info " << info);
|
||||||
// Unblock so we die immediately if signalled again
|
// If SIGTERM or SIGINT, unblock so we die immediately if signalled again
|
||||||
DIE_IF_NON_ZERO(sigprocmask(SIG_BLOCK, &old_sigset, &new_sigset));
|
switch (info.si_signo) {
|
||||||
break;
|
case SIGUSR1:
|
||||||
|
BEESLOGNOTICE("Received SIGUSR1 - pausing workers");
|
||||||
|
TaskMaster::pause(true);
|
||||||
|
break;
|
||||||
|
case SIGUSR2:
|
||||||
|
BEESLOGNOTICE("Received SIGUSR2 - unpausing workers");
|
||||||
|
TaskMaster::pause(false);
|
||||||
|
break;
|
||||||
|
case SIGTERM:
|
||||||
|
case SIGINT:
|
||||||
|
default:
|
||||||
|
DIE_IF_NON_ZERO(sigprocmask(SIG_BLOCK, &old_sigset, &new_sigset));
|
||||||
|
BEESLOGDEBUG("Signal catcher exiting");
|
||||||
|
return;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
BEESLOGDEBUG("Signal catcher exiting");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static
|
||||||
int
|
int
|
||||||
bees_main(int argc, char *argv[])
|
bees_main(int argc, char *argv[])
|
||||||
{
|
{
|
||||||
@@ -634,7 +698,7 @@ bees_main(int argc, char *argv[])
|
|||||||
|
|
||||||
// Have to block signals now before we create a bunch of threads
|
// Have to block signals now before we create a bunch of threads
|
||||||
// so the threads will also have the signals blocked.
|
// so the threads will also have the signals blocked.
|
||||||
block_term_signal();
|
block_signals();
|
||||||
|
|
||||||
// Create a context so we can apply configuration to it
|
// Create a context so we can apply configuration to it
|
||||||
shared_ptr<BeesContext> bc = make_shared<BeesContext>();
|
shared_ptr<BeesContext> bc = make_shared<BeesContext>();
|
||||||
@@ -652,29 +716,34 @@ bees_main(int argc, char *argv[])
|
|||||||
BeesRoots::ScanMode root_scan_mode = BeesRoots::SCAN_MODE_EXTENT;
|
BeesRoots::ScanMode root_scan_mode = BeesRoots::SCAN_MODE_EXTENT;
|
||||||
|
|
||||||
// Configure getopt_long
|
// Configure getopt_long
|
||||||
|
// Options with no short form
|
||||||
|
enum {
|
||||||
|
BEES_OPT_THROTTLE_FACTOR = 256,
|
||||||
|
};
|
||||||
static const struct option long_options[] = {
|
static const struct option long_options[] = {
|
||||||
{ "thread-factor", required_argument, NULL, 'C' },
|
{ .name = "thread-factor", .has_arg = required_argument, .val = 'C' },
|
||||||
{ "thread-min", required_argument, NULL, 'G' },
|
{ .name = "throttle-factor", .has_arg = required_argument, .val = BEES_OPT_THROTTLE_FACTOR },
|
||||||
{ "strip-paths", no_argument, NULL, 'P' },
|
{ .name = "thread-min", .has_arg = required_argument, .val = 'G' },
|
||||||
{ "no-timestamps", no_argument, NULL, 'T' },
|
{ .name = "strip-paths", .has_arg = no_argument, .val = 'P' },
|
||||||
{ "workaround-btrfs-send", no_argument, NULL, 'a' },
|
{ .name = "no-timestamps", .has_arg = no_argument, .val = 'T' },
|
||||||
{ "thread-count", required_argument, NULL, 'c' },
|
{ .name = "workaround-btrfs-send", .has_arg = no_argument, .val = 'a' },
|
||||||
{ "loadavg-target", required_argument, NULL, 'g' },
|
{ .name = "thread-count", .has_arg = required_argument, .val = 'c' },
|
||||||
{ "help", no_argument, NULL, 'h' },
|
{ .name = "loadavg-target", .has_arg = required_argument, .val = 'g' },
|
||||||
{ "scan-mode", required_argument, NULL, 'm' },
|
{ .name = "help", .has_arg = no_argument, .val = 'h' },
|
||||||
{ "absolute-paths", no_argument, NULL, 'p' },
|
{ .name = "scan-mode", .has_arg = required_argument, .val = 'm' },
|
||||||
{ "timestamps", no_argument, NULL, 't' },
|
{ .name = "absolute-paths", .has_arg = no_argument, .val = 'p' },
|
||||||
{ "verbose", required_argument, NULL, 'v' },
|
{ .name = "timestamps", .has_arg = no_argument, .val = 't' },
|
||||||
{ 0, 0, 0, 0 },
|
{ .name = "verbose", .has_arg = required_argument, .val = 'v' },
|
||||||
|
{ 0 },
|
||||||
};
|
};
|
||||||
|
|
||||||
// Build getopt_long's short option list from the long_options table.
|
// Build getopt_long's short option list from the long_options table.
|
||||||
// While we're at it, make sure we didn't duplicate any options.
|
// While we're at it, make sure we didn't duplicate any options.
|
||||||
string getopt_list;
|
string getopt_list;
|
||||||
set<decltype(option::val)> option_vals;
|
map<decltype(option::val), string> option_vals;
|
||||||
for (const struct option *op = long_options; op->val; ++op) {
|
for (const struct option *op = long_options; op->val; ++op) {
|
||||||
THROW_CHECK1(runtime_error, op->val, !option_vals.count(op->val));
|
const auto ins_rv = option_vals.insert(make_pair(op->val, op->name));
|
||||||
option_vals.insert(op->val);
|
THROW_CHECK1(runtime_error, op->val, ins_rv.second);
|
||||||
if ((op->val & 0xff) != op->val) {
|
if ((op->val & 0xff) != op->val) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -685,22 +754,26 @@ bees_main(int argc, char *argv[])
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Parse options
|
// Parse options
|
||||||
int c;
|
|
||||||
while (true) {
|
while (true) {
|
||||||
int option_index = 0;
|
int option_index = 0;
|
||||||
|
|
||||||
c = getopt_long(argc, argv, getopt_list.c_str(), long_options, &option_index);
|
const auto c = getopt_long(argc, argv, getopt_list.c_str(), long_options, &option_index);
|
||||||
if (-1 == c) {
|
if (-1 == c) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
BEESLOGDEBUG("Parsing option '" << static_cast<char>(c) << "'");
|
// getopt_long should have weeded out any invalid options,
|
||||||
|
// so we can go ahead and throw here
|
||||||
|
BEESLOGDEBUG("Parsing option '" << option_vals.at(c) << "'");
|
||||||
|
|
||||||
switch (c) {
|
switch (c) {
|
||||||
|
|
||||||
case 'C':
|
case 'C':
|
||||||
thread_factor = stod(optarg);
|
thread_factor = stod(optarg);
|
||||||
break;
|
break;
|
||||||
|
case BEES_OPT_THROTTLE_FACTOR:
|
||||||
|
bees_throttle_factor = stod(optarg);
|
||||||
|
break;
|
||||||
case 'G':
|
case 'G':
|
||||||
thread_min = stoul(optarg);
|
thread_min = stoul(optarg);
|
||||||
break;
|
break;
|
||||||
@@ -741,12 +814,12 @@ bees_main(int argc, char *argv[])
|
|||||||
case 'h':
|
case 'h':
|
||||||
default:
|
default:
|
||||||
do_cmd_help(argv);
|
do_cmd_help(argv);
|
||||||
return EXIT_FAILURE;
|
return EXIT_SUCCESS;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (optind + 1 != argc) {
|
if (optind + 1 != argc) {
|
||||||
BEESLOGERR("Only one filesystem path per bees process");
|
BEESLOGERR("Exactly one filesystem path required");
|
||||||
return EXIT_FAILURE;
|
return EXIT_FAILURE;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -786,6 +859,8 @@ bees_main(int argc, char *argv[])
|
|||||||
BEESLOGNOTICE("setting worker thread pool maximum size to " << thread_count);
|
BEESLOGNOTICE("setting worker thread pool maximum size to " << thread_count);
|
||||||
TaskMaster::set_thread_count(thread_count);
|
TaskMaster::set_thread_count(thread_count);
|
||||||
|
|
||||||
|
BEESLOGNOTICE("setting throttle factor to " << bees_throttle_factor);
|
||||||
|
|
||||||
// Set root path
|
// Set root path
|
||||||
string root_path = argv[optind++];
|
string root_path = argv[optind++];
|
||||||
BEESLOGNOTICE("setting root path to '" << root_path << "'");
|
BEESLOGNOTICE("setting root path to '" << root_path << "'");
|
||||||
@@ -808,7 +883,7 @@ bees_main(int argc, char *argv[])
|
|||||||
bc->start();
|
bc->start();
|
||||||
|
|
||||||
// Now we just wait forever
|
// Now we just wait forever
|
||||||
wait_for_term_signal();
|
wait_for_signals();
|
||||||
|
|
||||||
// Shut it down
|
// Shut it down
|
||||||
bc->stop();
|
bc->stop();
|
||||||
|
@@ -78,10 +78,10 @@ const int BEES_PROGRESS_INTERVAL = BEES_STATS_INTERVAL;
|
|||||||
const int BEES_STATUS_INTERVAL = 1;
|
const int BEES_STATUS_INTERVAL = 1;
|
||||||
|
|
||||||
// Number of file FDs to cache when not in active use
|
// Number of file FDs to cache when not in active use
|
||||||
const size_t BEES_FILE_FD_CACHE_SIZE = 32768;
|
const size_t BEES_FILE_FD_CACHE_SIZE = 524288;
|
||||||
|
|
||||||
// Number of root FDs to cache when not in active use
|
// Number of root FDs to cache when not in active use
|
||||||
const size_t BEES_ROOT_FD_CACHE_SIZE = 4096;
|
const size_t BEES_ROOT_FD_CACHE_SIZE = 65536;
|
||||||
|
|
||||||
// Number of FDs to open (rlimit)
|
// Number of FDs to open (rlimit)
|
||||||
const size_t BEES_OPEN_FILE_LIMIT = BEES_FILE_FD_CACHE_SIZE + BEES_ROOT_FD_CACHE_SIZE + 100;
|
const size_t BEES_OPEN_FILE_LIMIT = BEES_FILE_FD_CACHE_SIZE + BEES_ROOT_FD_CACHE_SIZE + 100;
|
||||||
@@ -576,7 +576,6 @@ class BeesRoots : public enable_shared_from_this<BeesRoots> {
|
|||||||
void writeback_thread();
|
void writeback_thread();
|
||||||
uint64_t next_root(uint64_t root = 0);
|
uint64_t next_root(uint64_t root = 0);
|
||||||
void current_state_set(const BeesCrawlState &bcs);
|
void current_state_set(const BeesCrawlState &bcs);
|
||||||
RateEstimator& transid_re();
|
|
||||||
bool crawl_batch(shared_ptr<BeesCrawl> crawl);
|
bool crawl_batch(shared_ptr<BeesCrawl> crawl);
|
||||||
void clear_caches();
|
void clear_caches();
|
||||||
|
|
||||||
@@ -615,6 +614,8 @@ public:
|
|||||||
|
|
||||||
uint64_t transid_min();
|
uint64_t transid_min();
|
||||||
uint64_t transid_max();
|
uint64_t transid_max();
|
||||||
|
|
||||||
|
void wait_for_transid(const uint64_t count);
|
||||||
};
|
};
|
||||||
|
|
||||||
struct BeesHash {
|
struct BeesHash {
|
||||||
@@ -887,6 +888,7 @@ string pretty(double d);
|
|||||||
void bees_readahead(int fd, off_t offset, size_t size);
|
void bees_readahead(int fd, off_t offset, size_t size);
|
||||||
void bees_readahead_pair(int fd, off_t offset, size_t size, int fd2, off_t offset2, size_t size2);
|
void bees_readahead_pair(int fd, off_t offset, size_t size, int fd2, off_t offset2, size_t size2);
|
||||||
void bees_unreadahead(int fd, off_t offset, size_t size);
|
void bees_unreadahead(int fd, off_t offset, size_t size);
|
||||||
|
void bees_throttle(double time_used, const char *context);
|
||||||
string format_time(time_t t);
|
string format_time(time_t t);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
Reference in New Issue
Block a user