1
0
mirror of https://github.com/Zygo/bees.git synced 2025-08-02 13:53:28 +02:00

4 Commits

Author SHA1 Message Date
KhalilSantana
27857406f5 Fixes a bad grep pattern caused by dffd6e0
Fixes #233
2022-10-13 16:32:48 -04:00
Khalil Santana
b44ed287dd Get rid of errors by using grep -E
"egrep: warning: egrep is obsolescent; using grep -E"
2022-10-05 22:36:33 -03:00
Ayla Ounce
20c469245c Fix beesd script arg parsing to respect PREFIX
Without this, if you install to a different PREFIX such as /usr/local
it will fail to recognize any arguments and if you use the systemd unit,
that makes --no-timestamps the first NOT_SUPPORTED_ARG which will get
passed to uuidparse, which doesn't recognize it and errors.
2022-10-05 22:36:33 -03:00
Javi Vilarroig
77cf2d794e Minimal changes in beesd script to make it functional in my system 2022-10-05 22:36:33 -03:00
75 changed files with 2631 additions and 6625 deletions

3
.gitignore vendored
View File

@@ -1,8 +1,7 @@
*.[ao]
*.bak
*.dep
*.new
*.tmp
*.dep
*.so*
Doxyfile
README.html

View File

@@ -2,7 +2,6 @@ MAKE += PREFIX=$(PREFIX) LIBEXEC_PREFIX=$(LIBEXEC_PREFIX) ETC_PREFIX=$(ETC_PREFI
define TEMPLATE_COMPILER =
sed $< >$@ \
-e's#@DESTDIR@#$(DESTDIR)#' \
-e's#@PREFIX@#$(PREFIX)#' \
-e's#@ETC_PREFIX@#$(ETC_PREFIX)#' \
-e's#@LIBEXEC_PREFIX@#$(LIBEXEC_PREFIX)#'

View File

@@ -49,6 +49,11 @@ scripts/%: scripts/%.in
scripts: scripts/beesd scripts/beesd@.service
install_tools: ## Install support tools + libs
install_tools: src
install -Dm755 bin/fiemap $(DESTDIR)$(PREFIX)/bin/fiemap
install -Dm755 bin/fiewalk $(DESTDIR)$(PREFIX)/sbin/fiewalk
install_bees: ## Install bees + libs
install_bees: src $(RUN_INSTALL_TESTS)
install -Dm755 bin/bees $(DESTDIR)$(LIBEXEC_PREFIX)/bees
@@ -56,13 +61,13 @@ install_bees: src $(RUN_INSTALL_TESTS)
install_scripts: ## Install scripts
install_scripts: scripts
install -Dm755 scripts/beesd $(DESTDIR)$(PREFIX)/sbin/beesd
install -Dm644 scripts/beesd.conf.sample $(DESTDIR)$(ETC_PREFIX)/bees/beesd.conf.sample
install -Dm644 scripts/beesd.conf.sample $(DESTDIR)/$(ETC_PREFIX)/bees/beesd.conf.sample
ifneq ($(SYSTEMD_SYSTEM_UNIT_DIR),)
install -Dm644 scripts/beesd@.service $(DESTDIR)$(SYSTEMD_SYSTEM_UNIT_DIR)/beesd@.service
endif
install: ## Install distribution
install: install_bees install_scripts
install: install_bees install_scripts $(OPTIONAL_INSTALL_TARGETS)
help: ## Show help
@fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/\t/'

View File

@@ -6,30 +6,31 @@ Best-Effort Extent-Same, a btrfs deduplication agent.
About bees
----------
bees is a block-oriented userspace deduplication agent designed to scale
up to large btrfs filesystems. It is an offline dedupe combined with
an incremental data scan capability to minimize time data spends on disk
from write to dedupe.
bees is a block-oriented userspace deduplication agent designed for large
btrfs filesystems. It is an offline dedupe combined with an incremental
data scan capability to minimize time data spends on disk from write
to dedupe.
Strengths
---------
* Space-efficient hash table - can use as little as 1 GB hash table per 10 TB unique data (0.1GB/TB)
* Daemon mode - incrementally dedupes new data as it appears
* Largest extents first - recover more free space during fixed maintenance windows
* Space-efficient hash table and matching algorithms - can use as little as 1 GB hash table per 10 TB unique data (0.1GB/TB)
* Daemon incrementally dedupes new data using btrfs tree search
* Works with btrfs compression - dedupe any combination of compressed and uncompressed files
* Whole-filesystem dedupe - scans data only once, even with snapshots and reflinks
* **NEW** [Works around `btrfs send` problems with dedupe and incremental parent snapshots](docs/options.md)
* Works around btrfs filesystem structure to free more disk space
* Persistent hash table for rapid restart after shutdown
* Whole-filesystem dedupe - including snapshots
* Constant hash table size - no increased RAM usage if data set becomes larger
* Works on live data - no scheduled downtime required
* Automatic self-throttling - reduces system load
* btrfs support - recovers more free space from btrfs than naive dedupers
* Automatic self-throttling based on system load
Weaknesses
----------
* Whole-filesystem dedupe - has no include/exclude filters, does not accept file lists
* Requires root privilege (`CAP_SYS_ADMIN` plus the usual filesystem read/modify caps)
* Requires root privilege (or `CAP_SYS_ADMIN`)
* First run may require temporary disk space for extent reorganization
* [First run may increase metadata space usage if many snapshots exist](docs/gotchas.md)
* Constant hash table size - no decreased RAM usage if data set becomes smaller
* btrfs only
@@ -46,7 +47,7 @@ Recommended Reading
-------------------
* [bees Gotchas](docs/gotchas.md)
* [btrfs kernel bugs](docs/btrfs-kernel.md) - especially DATA CORRUPTION WARNING for old kernels
* [btrfs kernel bugs](docs/btrfs-kernel.md) - especially DATA CORRUPTION WARNING
* [bees vs. other btrfs features](docs/btrfs-other.md)
* [What to do when something goes wrong](docs/wrong.md)
@@ -69,6 +70,6 @@ You can also use Github:
Copyright & License
-------------------
Copyright 2015-2025 Zygo Blaxell <bees@furryterror.org>.
Copyright 2015-2018 Zygo Blaxell <bees@furryterror.org>.
GPL (version 3 or later).

View File

@@ -1,31 +1,37 @@
Recommended Linux Kernel Version for bees
=========================================
Recommended Kernel Version for bees
===================================
First, a warning about old Linux kernel versions:
First, a warning that is not specific to bees:
> **Linux kernel version 5.1, 5.2, and 5.3 should not be used with btrfs
due to a severe regression that can lead to fatal metadata corruption.**
This issue is fixed in version 5.4.14 and later.
> **Kernel 5.1, 5.2, and 5.3 should not be used with btrfs due to a
severe regression that can lead to fatal metadata corruption.**
This issue is fixed in kernel 5.4.14 and later.
**Recommended Linux kernel versions for bees are 5.4, 5.10, 5.15, 6.1,
6.6, or 6.12 with recent LTS and -stable updates.** The latest released
kernel as of this writing is 6.12.9, and the earliest supported LTS
kernel is 5.4.
**Recommended kernel versions for bees are 4.19, 5.4, 5.10, 5.11, or 5.12,
with recent LTS and -stable updates.** The latest released kernel as
of this writing is 5.12.3.
Some optional bees features use kernel APIs introduced in kernel 4.15
(extent scan) and 5.6 (`openat2` support). These bees features are not
available on older kernels. Support for older kernels may be removed
in a future bees release.
4.14, 4.9, and 4.4 LTS kernels with recent updates are OK with
some issues. Older kernels will be slower (a little slower or a lot
slower depending on which issues are triggered). Not all fixes are
backported.
Obsolete non-LTS kernels have a variety of unfixed issues and should
not be used with btrfs. For details see the table below.
bees requires btrfs kernel API version 4.2 or higher, and does not work
on older kernels.
bees will detect and use btrfs kernel API up to version 4.15 if present.
In some future bees release, this API version may become mandatory.
bees will not run at all on kernels before 4.2 due to lack of minimal
API support.
Kernel Bug Tracking Table
-------------------------
These bugs are particularly popular among bees users, though not all are specifically relevant to bees:
These bugs are particularly popular among bees users:
| First bad kernel | Last bad kernel | Issue Description | Fixed Kernel Versions | Fix Commit
| :---: | :---: | --- | :---: | ---
@@ -52,19 +58,10 @@ These bugs are particularly popular among bees users, though not all are specifi
| - | 5.8 | deadlock in `TREE_SEARCH` ioctl (core component of bees filesystem scanner), followed by regression in deadlock fix | 4.4.237, 4.9.237, 4.14.199, 4.19.146, 5.4.66, 5.8.10 and later | a48b73eca4ce btrfs: fix potential deadlock in the search ioctl, 1c78544eaa46 btrfs: fix wrong address when faulting in pages in the search ioctl
| 5.7 | 5.10 | kernel crash if balance receives fatal signal e.g. Ctrl-C | 5.4.93, 5.10.11, 5.11 and later | 18d3bff411c8 btrfs: don't get an EINTR during drop_snapshot for reloc
| 5.10 | 5.10 | 20x write performance regression | 5.10.8, 5.11 and later | e076ab2a2ca7 btrfs: shrink delalloc pages instead of full inodes
| 5.4 | 5.11 | spurious tree checker failures on extent ref hash | 5.4.125, 5.10.43, 5.11.5, 5.12 and later | 1119a72e223f btrfs: tree-checker: do not error out if extent ref hash doesn't match
| 5.4 | 5.11 | spurious tree checker failures on extent ref hash | 5.11.5, 5.12 and later | 1119a72e223f btrfs: tree-checker: do not error out if extent ref hash doesn't match
| - | 5.11 | tree mod log issue #5 | 4.4.263, 4.9.263, 4.14.227, 4.19.183, 5.4.108, 5.10.26, 5.11.9, 5.12 and later | dbcc7d57bffc btrfs: fix race when cloning extent buffer during rewind of an old root
| - | 5.12 | tree mod log issue #6 | 4.14.233, 4.19.191, 5.4.118, 5.10.36, 5.11.20, 5.12.3, 5.13 and later | f9690f426b21 btrfs: fix race when picking most recent mod log operation for an old root
| 4.15 | 5.16 | spurious warnings from `fs/fs-writeback.c` when `flushoncommit` is enabled | 5.15.27, 5.16.13, 5.17 and later | a0f0cf8341e3 btrfs: get rid of warning on transaction commit when using flushoncommit
| - | 5.17 | crash during device removal can make filesystem unmountable | 5.15.54, 5.16.20, 5.17.3, 5.18 and later | bbac58698a55 btrfs: remove device item and update super block in the same transaction
| - | 5.18 | wrong superblock num_devices makes filesystem unmountable | 4.14.283, 4.19.247, 5.4.198, 5.10.121, 5.15.46, 5.17.14, 5.18.3, 5.19 and later | d201238ccd2f btrfs: repair super block num_devices automatically
| 5.18 | 5.19 | parent transid verify failed during log tree replay after a crash during a rename operation | 5.18.18, 5.19.2, 6.0 and later | 723df2bcc9e1 btrfs: join running log transaction when logging new name
| 5.12 | 6.0 | space cache corruption and potential double allocations | 5.15.65, 5.19.6, 6.0 and later | ced8ecf026fd btrfs: fix space cache corruption and potential double allocations
| 6.0 | 6.5 | suboptimal allocation in multi-device filesystems due to chunk allocator regression | 6.1.60, 6.5.9, 6.6 and later | 8a540e990d7d btrfs: fix stripe length calculation for non-zoned data chunk allocation
| 6.3, backported to 5.15.107, 6.1.24, 6.2.11 | 6.3 | vmalloc error, failed to allocate pages | 6.3.10, 6.4 and later. Bug (f349b15e183d "mm: vmalloc: avoid warn_alloc noise caused by fatal signal" in v6.3-rc6) backported to 6.1.24, 6.2.11, and 5.15.107. | 95a301eefa82 mm/vmalloc: do not output a spurious warning when huge vmalloc() fails
| 6.2 | 6.3 | `IGNORE_OFFSET` flag ignored in `LOGICAL_INO` ioctl | 6.2.16, 6.3.3, 6.4 and later | 0cad8f14d70c btrfs: fix backref walking not returning all inode refs
| 6.10 | 6.11 | `adding refs to an existing tree ref`, `failed to run delayed ref`, then read-only | 6.11.10, 6.12 and later | 7d493a5ecc26 btrfs: fix incorrect comparison for delayed refs
| 5.4 | - | kernel hang when multiple threads are running `LOGICAL_INO` and dedupe/clone ioctl on the same extent | - | workaround: avoid doing that
| 4.15 | - | spurious warnings from `fs/fs-writeback.c` when `flushoncommit` is enabled | - | workaround: comment out the `WARN_ON`
"Last bad kernel" refers to that version's last stable update from
kernel.org. Distro kernels may backport additional fixes. Consult
@@ -79,69 +76,97 @@ through 5.4.13 inclusive.
A "-" for "first bad kernel" indicates the bug has been present since
the relevant feature first appeared in btrfs.
A "-" for "last bad kernel" indicates the bug has not yet been fixed in
current kernels (see top of this page for which kernel version that is).
A "-" for "last bad kernel" indicates the bug has not yet been fixed as
of 5.8.14.
In cases where issues are fixed by commits spread out over multiple
kernel versions, "fixed kernel version" refers to the version that
contains the last committed component of the fix.
contains all components of the fix.
Workarounds for known kernel bugs
---------------------------------
* **Hangs with concurrent `LOGICAL_INO` and dedupe/clone**: on all
kernel versions so far, multiple threads running `LOGICAL_INO` and
dedupe/clone ioctls at the same time on the same inodes or extents
can lead to a kernel hang. The kernel enters an infinite loop in
`add_all_parents`, where `count` is 0, `ref->count` is 1, and
`btrfs_next_item` or `btrfs_next_old_item` never find a matching ref.
* **Tree mod log issues**: bees will detect that a btrfs balance is
running, and pause bees activity until the balance is done. This avoids
running both the `LOGICAL_INO` ioctl and btrfs balance at the same time,
which avoids kernel crashes on old kernel versions.
bees has two workarounds for this bug: 1. schedule work so that multiple
threads do not simultaneously access the same inode or the same extent,
and 2. use a brute-force global lock within bees that prevents any
thread from running `LOGICAL_INO` while any other thread is running
dedupe.
The numbers for "tree mod log issue #" in the above table are arbitrary.
There are a lot of them, and they all behave fairly similarly.
Workaround #1 isn't really a workaround, since we want to do the same
thing for unrelated performance reasons. If multiple threads try to
perform dedupe operations on the same extent or inode, btrfs will make
all the threads wait for the same locks anyway, so it's better to have
bees find some other inode or extent to work on while waiting for btrfs
to finish.
This workaround is less necessary for kernels 5.4.19 and later.
Workaround #2 doesn't seem to be needed after implementing workaround
#1, but it's better to be slightly slower than to hang one CPU core
and the filesystem until the kernel is rebooted.
* **Slow backrefs** (aka toxic extents): Under certain conditions,
if the number of references to a single shared extent grows too
high, the kernel consumes more and more CPU while also holding locks
that delay write access to the filesystem. bees avoids this bug
by measuring the time the kernel spends performing `LOGICAL_INO`
operations and permanently blacklisting any extent or hash involved
where the kernel starts to get slow. In the bees log, such blocks
are labelled as 'toxic' hash/block addresses. Toxic extents are
rare (about 1 in 100,000 extents become toxic), but toxic extents can
become 8 orders of magnitude more expensive to process than the fastest
non-toxic extents. This seems to affect all dedupe agents on btrfs;
at this time of writing only bees has a workaround for this bug.
It is still theoretically possible to trigger the kernel bug when
running bees at the same time as other dedupers, or other programs
that use `LOGICAL_INO` like `btdu`, or when performing a reflink clone
operation such as `cp` or `mv`; however, it's extremely difficult to
reproduce the bug without closely cooperating threads.
* **Slow backrefs** (aka toxic extents): On older kernels, under certain
conditions, if the number of references to a single shared extent grows
too high, the kernel consumes more and more CPU while also holding
locks that delay write access to the filesystem. This is no longer
a concern on kernels after 5.7 (or an up-to-date 5.4 LTS version),
but there are still some remains of earlier workarounds for this issue
in bees that have not been fully removed.
bees avoided this bug by measuring the time the kernel spends performing
`LOGICAL_INO` operations and permanently blacklisting any extent or
hash involved where the kernel starts to get slow. In the bees log,
such blocks are labelled as 'toxic' hash/block addresses.
Future bees releases will remove toxic extent detection (it only detects
false positives now) and clear all previously saved toxic extent bits.
This workaround is less necessary for kernels 5.4.96, 5.7 and later,
though it can still take 2 ms of CPU to resolve each extent ref on a
fast machine on a large, heavily fragmented file.
* **dedupe breaks `btrfs send` in old kernels**. The bees option
`--workaround-btrfs-send` prevents any modification of read-only subvols
in order to avoid breaking `btrfs send` on kernels before 5.2.
in order to avoid breaking `btrfs send`.
This workaround is no longer necessary to avoid kernel crashes and
send performance failure on kernel 5.4.4 and later. bees will pause
dedupe until the send is finished on current kernels.
This workaround is no longer necessary to avoid kernel crashes
and send performance failure on kernel 4.9.207, 4.14.159, 4.19.90,
5.3.17, 5.4.4, 5.5 and later; however, some conflict between send
and dedupe still remains, so the workaround is still useful.
`btrfs receive` is not and has never been affected by this issue.
Unfixed kernel bugs
-------------------
As of 5.12.3:
* **The kernel does not permit `btrfs send` and dedupe to run at the
same time**. Recent kernels no longer crash, but now refuse one
operation with an error if the other operation was already running.
bees has not been updated to handle the new dedupe behavior optimally.
Optimal behavior is to defer dedupe operations when send is detected,
and resume after the send is finished. Current bees behavior is to
complain loudly about each individual dedupe failure in log messages,
and abandon duplicate data references in the snapshot that send is
processing. A future bees version shall have better handling for
this situation.
Workaround: send `SIGSTOP` to bees, or terminate the bees process,
before running `btrfs send`.
This workaround is not strictly required if snapshot is deleted after
sending. In that case, any duplicate data blocks that were not removed
by dedupe will be removed by snapshot delete instead. The workaround
still saves some IO.
`btrfs receive` is not affected by this issue.
* **Spurious warnings in `fs/fs-writeback.c`** on kernel 4.15 and later
when filesystem is mounted with `flushoncommit`. These
seem to be harmless (there are other locks which prevent
concurrent umount of the filesystem), but the underlying
problems that trigger the `WARN_ON` are [not trivial to
fix](https://www.spinics.net/lists/linux-btrfs/msg87752.html).
The warnings can be especially voluminous when bees is running.
Workarounds:
1. mount with `-o noflushoncommit`
2. patch kernel to remove warning in `fs/fs-writeback.c`.
Note that using kernels 4.14 and earlier is *not* a viable workaround
for this issue, because kernels 4.14 and earlier will eventually
deadlock when a filesystem is mounted with `-o flushoncommit` (a single
commit fixes one bug and introduces the other).

View File

@@ -3,34 +3,49 @@ Good Btrfs Feature Interactions
bees has been tested in combination with the following:
* btrfs compression (zlib, lzo, zstd)
* btrfs compression (zlib, lzo, zstd), mixtures of compressed and uncompressed extents
* PREALLOC extents (unconditionally replaced with holes)
* HOLE extents and btrfs no-holes feature
* Other deduplicators (`duperemove`, `jdupes`)
* Reflink copies (modern coreutils `cp` and `mv`)
* Concurrent file modification (e.g. PostgreSQL and sqlite databases, VMs, build daemons)
* All btrfs RAID profiles: single, dup, raid0, raid1, raid10, raid1c3, raid1c4, raid5, raid6
* IO errors during dedupe (affected extents are skipped)
* Other deduplicators, reflink copies (though bees may decide to redo their work)
* btrfs snapshots and non-snapshot subvols (RW and RO)
* Concurrent file modification (e.g. PostgreSQL and sqlite databases, build daemons)
* all btrfs RAID profiles
* IO errors during dedupe (read errors will throw exceptions, bees will catch them and skip over the affected extent)
* Filesystems mounted *with* the flushoncommit option ([lots of harmless kernel log warnings on 4.15 and later](btrfs-kernel.md))
* Filesystems mounted *without* the flushoncommit option
* 4K filesystem data block size / clone alignment
* 64-bit and 32-bit LE host CPUs (amd64, x86, arm)
* Large files (kernel 5.4 or later strongly recommended)
* Filesystem data sizes up to 100T+ bytes, 1000M+ files
* `open(O_DIRECT)` (seems to work as well--or as poorly--with bees as with any other btrfs feature)
* btrfs-convert from ext2/3/4
* btrfs `autodefrag` mount option
* btrfs balance (data balances cause rescan of relocated data)
* btrfs block-group-tree
* btrfs `flushoncommit` and `noflushoncommit` mount options
* btrfs mixed block groups
* btrfs `nodatacow`/`nodatasum` inode attribute or mount option (bees skips all nodatasum files)
* btrfs qgroups and quota support (_not_ squotas)
* Huge files (>1TB--although Btrfs performance on such files isn't great in general)
* filesystems up to 30T+ bytes, 100M+ files
* btrfs receive
* btrfs scrub
* btrfs send (dedupe pauses automatically, kernel 5.4 or later required)
* btrfs snapshot, non-snapshot subvols (RW and RO), snapshot delete
* btrfs nodatacow/nodatasum inode attribute or mount option (bees skips all nodatasum files)
* open(O_DIRECT) (seems to work as well--or as poorly--with bees as with any other btrfs feature)
* lvmcache: no problems observed in testing with recent kernels or reported by users in the last year.
**Note:** some btrfs features have minimum kernel versions which are
higher than the minimum kernel version for bees.
Bad Btrfs Feature Interactions
------------------------------
bees has been tested in combination with the following, and various problems are known:
* bcache: no data-losing problems observed in testing with recent kernels
or reported by users in the last year. Some issues observed with
bcache interacting badly with some SSD models' firmware, but so far
this only causes temporary loss of service, not filesystem damage.
This behavior does not seem to be specific to bees (ordinary filesystem
tests with rsync and snapshots will reproduce it), but it does prevent
any significant testing of bees on bcache.
* btrfs send: there are bugs in `btrfs send` that can be triggered by bees.
The [`--workaround-btrfs-send` option](options.md) works around this issue
by preventing bees from modifying read-only snapshots.
* btrfs qgroups: very slow, sometimes hangs...and it's even worse when
bees is running.
* btrfs autodefrag mount option: hangs and high CPU usage problems
reported by users. bees cannot distinguish autodefrag activity from
normal filesystem activity and will likely try to undo the autodefrag
if duplicate copies of the defragmented data exist.
Untested Btrfs Feature Interactions
-----------------------------------
@@ -39,6 +54,9 @@ bees has not been tested with the following, and undesirable interactions may oc
* Non-4K filesystem data block size (should work if recompiled)
* Non-equal hash (SUM) and filesystem data block (CLONE) sizes (need to fix that eventually)
* btrfs seed filesystems, raid-stripe-tree, squotas (no particular reason these wouldn't work, but no one has reported trying)
* btrfs out-of-tree kernel patches (e.g. encryption, extent tree v2)
* btrfs seed filesystems (does anyone even use those?)
* btrfs out-of-tree kernel patches (e.g. in-kernel dedupe or encryption)
* btrfs-convert from ext2/3/4 (never tested, might run out of space or ignore significant portions of the filesystem due to sanity checks)
* btrfs mixed block groups (don't know a reason why it would *not* work, but never tested)
* flashcache: an out-of-tree cache-HDD-on-SSD block layer helper.
* Host CPUs with exotic page sizes, alignment requirements, or endianness (ppc, alpha, sparc, strongarm, s390, mips, m68k...)

View File

@@ -8,10 +8,9 @@ are reasonable in most cases.
Hash Table Sizing
-----------------
Hash table entries are 16 bytes per data block. The hash table stores the
most recently read unique hashes. Once the hash table is full, each new
entry added to the table evicts an old entry. This makes the hash table
a sliding window over the most recently scanned data from the filesystem.
Hash table entries are 16 bytes per data block. The hash table stores
the most recently read unique hashes. Once the hash table is full,
each new entry in the table evicts an old entry.
Here are some numbers to estimate appropriate hash table sizes:
@@ -26,7 +25,9 @@ Here are some numbers to estimate appropriate hash table sizes:
Notes:
* If the hash table is too large, no extra dedupe efficiency is
obtained, and the extra space wastes RAM.
obtained, and the extra space just wastes RAM. Extra space can also slow
bees down by preventing old data from being evicted, so bees wastes time
looking for matching data that is no longer present on the filesystem.
* If the hash table is too small, bees extrapolates from matching
blocks to find matching adjacent blocks in the filesystem that have been
@@ -35,10 +36,6 @@ one block in common between two extents in order to be able to dedupe
the entire extents. This provides significantly more dedupe hit rate
per hash table byte than other dedupe tools.
* There is a fairly wide range of usable hash sizes, and performance
degrades according to a smooth probabilistic curve in both directions.
Double or half the optimum size usually works just as well.
* When counting unique data in compressed data blocks to estimate
optimum hash table size, count the *uncompressed* size of the data.
@@ -55,25 +52,25 @@ patterns on dedupe effectiveness without performing deep inspection of
both the filesystem data and its structure--a task that is as expensive
as performing the deduplication.
* **Compression** in files reduces the average extent length compared
to uncompressed files. The maximum compressed extent length on
btrfs is 128KB, while the maximum uncompressed extent length is 128MB.
Longer extents decrease the optimum hash table size while shorter extents
increase the optimum hash table size, because the probability of a hash
table entry being present (i.e. unevicted) in each extent is proportional
to the extent length.
* **Compression** on the filesystem reduces the average extent length
compared to uncompressed filesystems. The maximum compressed extent
length on btrfs is 128KB, while the maximum uncompressed extent length
is 128MB. Longer extents decrease the optimum hash table size while
shorter extents increase the optimum hash table size because the
probability of a hash table entry being present (i.e. unevicted) in
each extent is proportional to the extent length.
As a rule of thumb, the optimal hash table size for a compressed
filesystem is 2-4x larger than the optimal hash table size for the same
data on an uncompressed filesystem. Dedupe efficiency falls rapidly with
hash tables smaller than 128MB/TB as the average dedupe extent size is
larger than the largest possible compressed extent size (128KB).
data on an uncompressed filesystem. Dedupe efficiency falls dramatically
with hash tables smaller than 128MB/TB as the average dedupe extent size
is larger than the largest possible compressed extent size (128KB).
* **Short writes or fragmentation** also shorten the average extent
length and increase optimum hash table size. If a database writes to
files randomly using 4K page writes, all of these extents will be 4K
in length, and the hash table size must be increased to retain each one
(or the user must accept a lower dedupe hit rate).
* **Short writes** also shorten the average extent length and increase
optimum hash table size. If a database writes to files randomly using
4K page writes, all of these extents will be 4K in length, and the hash
table size must be increased to retain each one (or the user must accept
a lower dedupe hit rate).
Defragmenting files that have had many short writes increases the
extent length and therefore reduces the optimum hash table size.
@@ -94,222 +91,59 @@ code files over and over, so it will need a smaller hash table than a
backup server which has to refer to the oldest data on the filesystem
every time a new client machine's data is added to the server.
Scanning modes
--------------
Scanning modes for multiple subvols
-----------------------------------
The `--scan-mode` option affects how bees iterates over the filesystem,
schedules extents for scanning, and tracks progress.
The `--scan-mode` option affects how bees divides resources between
subvolumes. This is particularly relevant when there are snapshots,
as there are tradeoffs to be made depending on how snapshots are used
on the filesystem.
There are now two kinds of scan mode: the legacy **subvol** scan modes,
and the new **extent** scan mode.
Note that if a filesystem has only one subvolume (i.e. the root,
subvol ID 5) then the `--scan-mode` option has no effect, as there is
only one subvolume to scan.
Scan mode can be changed by restarting bees with a different scan mode
option.
The default mode is mode 0, "lockstep". In this mode, each inode of each
subvol is scanned at the same time, before moving to the next inode in
each subvol. This maximizes the likelihood that all of the references to
a snapshot of a file are scanned at the same time, which takes advantage
of VFS caching in the Linux kernel. If snapshots are created very often,
bees will not make very good progress as it constantly restarts the
filesystem scan from the beginning each time a new snapshot is created.
Extent scan mode:
Scan mode 1, "independent", simply scans every subvol independently
in parallel. Each subvol's scanner shares time equally with all other
subvol scanners. Whenever a new subvol appears, a new scanner is
created and the new subvol scanner doesn't affect the behavior of any
existing subvol scanner.
* Works with 4.15 and later kernels.
* Can estimate progress and provide an ETA.
* Can optimize scanning order to dedupe large extents first.
* Can keep up with frequent creation and deletion of snapshots.
Subvol scan modes:
* Work with 4.14 and earlier kernels.
* Cannot estimate or report progress.
* Cannot optimize scanning order by extent size.
* Have problems keeping up with multiple snapshots created during a scan.
The default scan mode is 4, "extent".
Scan mode 2, "sequential", processes each subvol completely before
proceeding to the next subvol. This is a good mode when using bees for
the first time on a filesystem that already has many existing snapshots
and a high rate of new snapshot creation. Short-lived snapshots
(e.g. those used for `btrfs send`) are effectively ignored, and bees
directs its efforts toward older subvols that are more likely to be
origin subvols for snapshots. By deduping origin subvols first, bees
ensures that future snapshots will already be deduplicated and do not
need to be deduplicated again.
If you are using bees for the first time on a filesystem with many
existing snapshots, you should read about [snapshot gotchas](gotchas.md).
Subvol scan modes
-----------------
Subvol scan modes are maintained for compatibility with existing
installations, but will not be developed further. New installations
should use extent scan mode instead.
The _quantity_ of text below detailing the shortcomings of each subvol
scan mode should be informative all by itself.
Subvol scan modes work on any kernel version supported by bees. They
are the only scan modes usable on kernel 4.14 and earlier.
The difference between the subvol scan modes is the order in which the
files from different subvols are fed into the scanner. They all scan
files in inode number order, from low to high offset within each inode,
the same way that a program like `cat` would read files (but skipping
over old data from earlier btrfs transactions).
If a filesystem has only one subvolume with data in it, then all of
the subvol scan modes are equivalent. In this case, there is only one
subvolume to scan, so every possible ordering of subvols is the same.
The `--workaround-btrfs-send` option pauses scanning subvols that are
read-only. If the subvol is made read-write (e.g. with `btrfs prop set
$subvol ro false`), or if the `--workaround-btrfs-send` option is removed,
then the scan of that subvol is unpaused and dedupe proceeds normally.
Space will only be recovered when the last read-only subvol is deleted.
Subvol scan modes cannot efficiently or accurately calculate an ETA for
completion or estimate progress through the data. They simply request
"the next new inode" from btrfs, and they are completed when btrfs says
there is no next new inode.
Between subvols, there are several scheduling algorithms with different
trade-offs:
Scan mode 0, "lockstep", scans the same inode number in each subvol at
close to the same time. This is useful if the subvols are snapshots
with a common ancestor, since the same inode number in each subvol will
have similar or identical contents. This maximizes the likelihood that
all of the references to a snapshot of a file are scanned at close to
the same time, improving dedupe hit rate. If the subvols are unrelated
(i.e. not snapshots of a single subvol) then this mode does not provide
any significant advantage. This mode uses smaller amounts of temporary
space for shorter periods of time when most subvols are snapshots. When a
new snapshot is created, this mode will stop scanning other subvols and
scan the new snapshot until the same inode number is reached in each
subvol, which will effectively stop dedupe temporarily as this data has
already been scanned and deduped in the other snapshots.
Scan mode 1, "independent", scans the next inode with new data in
each subvol. There is no coordination between the subvols, other than
round-robin distribution of files from each subvol to each worker thread.
This mode makes continuous forward progress in all subvols. When a new
snapshot is created, previous subvol scans continue as before, but the
worker threads are now divided among one more subvol.
Scan mode 2, "sequential", scans one subvol at a time, in numerical subvol
ID order, processing each subvol completely before proceeding to the next
subvol. This avoids spending time scanning short-lived snapshots that
will be deleted before they can be fully deduped (e.g. those used for
`btrfs send`). Scanning starts on older subvols that are more likely
to be origin subvols for future snapshots, eliminating the need to
dedupe future snapshots separately. This mode uses the largest amount
of temporary space for the longest time, and typically requires a larger
hash table to maintain dedupe hit rate.
Scan mode 3, "recent", scans the subvols with the highest `min_transid`
value first (i.e. the ones that were most recently completely scanned),
then falls back to "independent" mode to break ties. This interrupts
long scans of old subvols to give a rapid dedupe response to new data
in previously scanned subvols, then returns to the old subvols after
the new data is scanned.
Extent scan mode
----------------
Scan mode 4, "extent", scans the extent tree instead of the subvol trees.
Extent scan mode reads each extent once, regardless of the number of
reflinks or snapshots. It adapts to the creation of new snapshots
and reflinks immediately, without having to revisit old data.
In the extent scan mode, extents are separated into multiple size tiers
to prioritize large extents over small ones. Deduping large extents
keeps the metadata update cost low per block saved, resulting in faster
dedupe at the start of a scan cycle. This is important for maximizing
performance in use cases where bees runs for a limited time, such as
during an overnight maintenance window.
Once the larger size tiers are completed, dedupe space recovery speeds
slow down significantly. It may be desirable to stop bees running once
the larger size tiers are finished, then start bees running some time
later after new data has appeared.
Each extent is mapped in physical address order, and all extent references
are submitted to the scanner at the same time, resulting in much better
cache behavior and dedupe performance compared to the subvol scan modes.
The "extent" scan mode is not usable on kernels before 4.15 because
it relies on the `LOGICAL_INO_V2` ioctl added in that kernel release.
When using bees with an older kernel, only subvol scan modes will work.
Extents are divided into virtual subvols by size, using reserved btrfs
subvol IDs 250..255. The size tier groups are:
* 250: 32M+1 and larger
* 251: 8M+1..32M
* 252: 2M+1..8M
* 253: 512K+1..2M
* 254: 128K+1..512K
* 255: 128K and smaller (includes all compressed extents)
Extent scan mode can efficiently calculate dedupe progress within
the filesystem and estimate an ETA for completion within each size
tier; however, the accuracy of the ETA can be questionable due to the
non-uniform distribution of block addresses in a typical user filesystem.
Older versions of bees do not recognize the virtual subvols, so running
an old bees version after running a new bees version will reset the
"extent" scan mode's progress in `beescrawl.dat` to the beginning.
This may change in future bees releases, i.e. extent scans will store
their checkpoint data somewhere else.
The `--workaround-btrfs-send` option behaves differently in extent
scan modes: In extent scan mode, dedupe proceeds on all subvols that are
read-write, but all subvols that are read-only are excluded from dedupe.
Space will only be recovered when the last read-only subvol is deleted.
During `btrfs send` all duplicate extents in the sent subvol will not be
removed (the kernel will reject dedupe commands while send is active,
and bees currently will not re-issue them after the send is complete).
It may be preferable to terminate the bees process while running `btrfs
send` in extent scan mode, and restart bees after the `send` is complete.
Threads and load management
---------------------------
By default, bees creates one worker thread for each CPU detected. These
threads then perform scanning and dedupe operations. bees attempts to
maximize the amount of productive work each thread does, until either the
threads are all continuously busy, or there is no remaining work to do.
By default, bees creates one worker thread for each CPU detected.
These threads then perform scanning and dedupe operations. The number of
worker threads can be set with the [`--thread-count` and `--thread-factor`
options](options.md).
In many cases it is not desirable to continually run bees at maximum
performance. Maximum performance is not necessary if bees can dedupe
new data faster than it appears on the filesystem. If it only takes
bees 10 minutes per day to dedupe all new data on a filesystem, then
bees doesn't need to run for more than 10 minutes per day.
bees supports a number of options for reducing system load:
* Run bees for a few hours per day, at an off-peak time (i.e. during
a maintenance window), instead of running bees continuously. Any data
added to the filesystem while bees is not running will be scanned when
bees restarts. At the end of the maintenance window, terminate the
bees process with SIGTERM to write the hash table and scan position
for the next maintenance window.
* Temporarily pause bees operation by sending the bees process SIGUSR1,
and resume operation with SIGUSR2. This is preferable to freezing
and thawing the process, e.g. with freezer cgroups or SIGSTOP/SIGCONT
signals, because it allows bees to close open file handles that would
otherwise prevent those files from being deleted while bees is frozen.
* Reduce the number of worker threads with the [`--thread-count` or
`--thread-factor` options](options.md). This simply leaves CPU cores
idle so that other applications on the host can use them, or to save
power.
* Allow bees to automatically track system load and increase or decrease
the number of threads to reach a target system load. This reduces
impact on the rest of the system by pausing bees when other CPU and IO
intensive loads are active on the system, and resumes bees when the other
loads are inactive. This is configured with the [`--loadavg-target`
and `--thread-min` options](options.md).
* Allow bees to self-throttle operations that enqueue delayed work
within btrfs. These operations are not well controlled by Linux
features such as process priority or IO priority or IO rate-limiting,
because the enqueued work is submitted to btrfs several seconds before
btrfs performs the work. By the time btrfs performs the work, it's too
late for external throttling to be effective. The [`--throttle-factor`
option](options.md) tracks how long it takes btrfs to complete queued
operations, and reduces bees's queued work submission rate to match
btrfs's queued work completion rate (or a fraction thereof, to reduce
system load).
If desired, bees can automatically increase or decrease the number
of worker threads in response to system load. This reduces impact on
the rest of the system by pausing bees when other CPU and IO intensive
loads are active on the system, and resumes bees when the other loads
are inactive. This is configured with the [`--loadavg-target` and
`--thread-min` options](options.md).
Log verbosity
-------------

View File

@@ -67,12 +67,11 @@ The `adjust` event group consists of operations related to translating stored vi
* `adjust_exact`: A block address from the hash table corresponding to an uncompressed data block was processed to find its `(root, inode, offset)` references.
* `adjust_exact_correct`: A block address corresponding to an uncompressed block was retrieved from the hash table and resolved to a physical block containing data that matches another block bees has already read.
* `adjust_exact_wrong`: A block address corresponding to an uncompressed block was retrieved from the hash table and resolved to a physical block containing data that matches the hash but not the data from another block bees has already read (i.e. there was a hash collision).
* `adjust_hit`: A block address was retrieved from the hash table and resolved to a physical block in an uncompressed extent containing data that matches the data from another block bees has already read (i.e. a duplicate match was found).
* `adjust_hit`: A block address was retrieved from the hash table and resolved to a physical block containing data that matches the data from another block bees has already read (i.e. a duplicate match was found).
* `adjust_miss`: A block address was retrieved from the hash table and resolved to a physical block containing a hash that does not match the hash from another block bees has already read (i.e. the hash table contained a stale entry and the data it referred to has since been overwritten in the filesystem).
* `adjust_needle_too_long`: A block address was retrieved from the hash table, but when the corresponding extent item was retrieved, its offset or length were out of range to be a match (i.e. the hash table contained a stale entry and the data it referred to has since been overwritten in the filesystem).
* `adjust_no_match`: A hash collision occurred (i.e. a block on disk was located with the same hash as the hash table entry but different data). Effectively an alias for `hash_collision` as it is not possible to have one event without the other.
* `adjust_offset_high`: The `LOGICAL_INO` ioctl gave an extent item that does not overlap with the desired block because the extent item ends before the desired block in the extent data.
* `adjust_offset_hit`: A block address was retrieved from the hash table and resolved to a physical block in a compressed extent containing data that matches the data from another block bees has already read (i.e. a duplicate match was found).
* `adjust_offset_low`: The `LOGICAL_INO` ioctl gave an extent item that does not overlap with the desired block because the extent item begins after the desired block in the extent data.
* `adjust_try`: A block address and extent item candidate were passed to `BeesResolver::adjust_offset` for processing.
@@ -118,16 +117,11 @@ crawl
The `crawl` event group consists of operations related to scanning btrfs trees to find new extent refs to scan for dedupe.
* `crawl_again`: An inode crawl was restarted because the extent was already locked by another running crawl.
* `crawl_blacklisted`: An extent was not scanned because it belongs to a blacklisted file.
* `crawl_deferred_inode`: Two tasks attempted to scan the same inode at the same time, so one was deferred.
* `crawl_done`: One pass over a subvol was completed.
* `crawl_discard_high`: An extent that was too large for the crawler's size tier was discarded.
* `crawl_discard_low`: An extent that was too small for the crawler's size tier was discarded.
* `crawl_create`: A new subvol crawler was created.
* `crawl_done`: One pass over all subvols on the filesystem was completed.
* `crawl_empty`: A `TREE_SEARCH_V2` ioctl call failed or returned an empty set (usually because all data in the subvol was scanned).
* `crawl_extent`: The extent crawler queued all references to an extent for processing.
* `crawl_fail`: A `TREE_SEARCH_V2` ioctl call failed.
* `crawl_flop`: Small extent items were not skipped because the next extent started at or before the end of the previous extent.
* `crawl_gen_high`: An extent item in the search results refers to an extent that is newer than the current crawl's `max_transid` allows.
* `crawl_gen_low`: An extent item in the search results refers to an extent that is older than the current crawl's `min_transid` allows.
* `crawl_hole`: An extent item in the search results refers to a hole.
@@ -138,14 +132,10 @@ The `crawl` event group consists of operations related to scanning btrfs trees t
* `crawl_nondata`: An item in the search results is not data.
* `crawl_prealloc`: An extent item in the search results refers to a `PREALLOC` extent.
* `crawl_push`: An extent item in the search results is suitable for scanning and deduplication.
* `crawl_restart`: A subvol crawl was restarted with a new `min_transid..max_transid` range.
* `crawl_scan`: An extent item in the search results is submitted to `BeesContext::scan_forward` for scanning and deduplication.
* `crawl_skip`: Small extent items were skipped because no extent of sufficient size was found within the minimum search distance.
* `crawl_skip_ms`: Time spent skipping small extent items.
* `crawl_search`: A `TREE_SEARCH_V2` ioctl call was successful.
* `crawl_throttled`: Extent scan created too many work queue items and was prevented from creating any more.
* `crawl_tree_block`: Extent scan found and skipped a metadata tree block.
* `crawl_unknown`: An extent item in the search results has an unrecognized type.
* `crawl_unthrottled`: Extent scan allowed to create work queue items again.
dedup
-----
@@ -171,25 +161,6 @@ The `exception` event group consists of C++ exceptions. C++ exceptions are thro
* `exception_caught`: Total number of C++ exceptions thrown and caught by a generic exception handler.
* `exception_caught_silent`: Total number of "silent" C++ exceptions thrown and caught by a generic exception handler. These are exceptions which are part of the correct and normal operation of bees. The exceptions are logged at a lower log level.
extent
------
The `extent` event group consists of events that occur within the extent scanner.
* `extent_deferred_inode`: A lock conflict was detected when two worker threads attempted to manipulate the same inode at the same time.
* `extent_empty`: A complete list of references to an extent was created but the list was empty, e.g. because all refs are in deleted inodes or snapshots.
* `extent_fail`: An ioctl call to `LOGICAL_INO` failed.
* `extent_forward`: An extent reference was submitted for scanning.
* `extent_mapped`: A complete map of references to an extent was created and added to the crawl queue.
* `extent_ok`: An ioctl call to `LOGICAL_INO` completed successfully.
* `extent_overflow`: A complete map of references to an extent exceeded `BEES_MAX_EXTENT_REF_COUNT`, so the extent was dropped.
* `extent_ref_missing`: An extent reference reported by `LOGICAL_INO` was not found by later `TREE_SEARCH_V2` calls.
* `extent_ref_ok`: One extent reference was queued for scanning.
* `extent_restart`: An extent reference was requeued to be scanned again after an active extent lock is released.
* `extent_retry`: An extent reference was requeued to be scanned again after an active inode lock is released.
* `extent_skip`: A 4K extent with more than 1000 refs was skipped.
* `extent_zero`: An ioctl call to `LOGICAL_INO` succeeded, but reported an empty list of extents.
hash
----
@@ -208,6 +179,24 @@ The `hash` event group consists of operations related to the bees hash table.
* `hash_insert`: A `(hash, address)` pair was inserted by `BeesHashTable::push_random_hash_addr`.
* `hash_lookup`: The hash table was searched for `(hash, address)` pairs matching a given `hash`.
inserted
--------
The `inserted` event group consists of operations related to storing hash and address data in the hash table (i.e. the hash table client).
* `inserted_block`: Total number of data block references scanned and inserted into the hash table.
* `inserted_clobbered`: Total number of data block references scanned and eliminated from the filesystem.
matched
-------
The `matched` event group consists of events related to matching incoming data blocks against existing hash table entries.
* `matched_0`: A data block was scanned, hash table entries found, but no matching data blocks on the filesystem located.
* `matched_1_or_more`: A data block was scanned, hash table entries found, and one or more matching data blocks on the filesystem located.
* `matched_2_or_more`: A data block was scanned, hash table entries found, and two or more matching data blocks on the filesystem located.
* `matched_3_or_more`: A data block was scanned, hash table entries found, and three or more matching data blocks on the filesystem located.
open
----
@@ -269,29 +258,12 @@ The `pairforward` event group consists of events related to extending matching b
* `pairforward_try`: Started extending a pair of matching block ranges forward.
* `pairforward_zero`: A pair of matching block ranges could not be extended backward by one block because the src block contained all zeros and was not compressed.
progress
--------
The `progress` event group consists of events related to progress estimation.
* `progress_no_data_bg`: Failed to retrieve any data block groups from the filesystem.
* `progress_not_created`: A crawler for one size tier had not been created for the extent scanner.
* `progress_complete`: A crawler for one size tier has completed a scan.
* `progress_not_found`: The extent position for a crawler does not correspond to any block group.
* `progress_out_of_bg`: The extent position for a crawler does not correspond to any data block group.
* `progress_ok`: Table of progress and ETA created successfully.
readahead
---------
The `readahead` event group consists of events related to data prefetching (formerly calls to `posix_fadvise` or `readahead`, but now emulated in userspace).
The `readahead` event group consists of events related to calls to `posix_fadvise`.
* `readahead_bytes`: Number of bytes prefetched.
* `readahead_count`: Number of read calls.
* `readahead_clear`: Number of times the duplicate read cache was cleared.
* `readahead_fail`: Number of read errors during prefetch.
* `readahead_ms`: Total time spent emulating readahead in user-space (kernel readahead is not measured).
* `readahead_skip`: Number of times a duplicate read was identified in the cache and skipped.
* `readahead_ms`: Total time spent running `posix_fadvise(..., POSIX_FADV_WILLNEED)` aka `readahead()`.
* `readahead_unread_ms`: Total time spent running `posix_fadvise(..., POSIX_FADV_DONTNEED)`.
replacedst
@@ -323,12 +295,10 @@ resolve
The `resolve` event group consists of operations related to translating a btrfs virtual block address (i.e. physical block address) to a `(root, inode, offset)` tuple (i.e. locating and opening the file containing a matching block). `resolve` is the top level, `chase` and `adjust` are the lower two levels.
* `resolve_empty`: The `LOGICAL_INO` ioctl returned successfully with an empty reference list (0 items).
* `resolve_fail`: The `LOGICAL_INO` ioctl returned an error.
* `resolve_large`: The `LOGICAL_INO` ioctl returned more than 2730 results (the limit of the v1 ioctl).
* `resolve_ms`: Total time spent in the `LOGICAL_INO` ioctl (i.e. wallclock time, not kernel CPU time).
* `resolve_ok`: The `LOGICAL_INO` ioctl returned success.
* `resolve_overflow`: The `LOGICAL_INO` ioctl returned 9999 or more extents (the limit configured in `bees.h`).
* `resolve_toxic`: The `LOGICAL_INO` ioctl took more than 0.1 seconds of kernel CPU time.
root
@@ -356,49 +326,42 @@ The `scan` event group consists of operations related to scanning incoming data.
* `scan_blacklisted`: A blacklisted extent was passed to `scan_forward` and dropped.
* `scan_block`: A block of data was scanned.
* `scan_compressed_no_dedup`: An extent that was compressed contained non-zero, non-duplicate data.
* `scan_dup_block`: Number of duplicate block references deduped.
* `scan_dup_hit`: A pair of duplicate block ranges was found.
* `scan_bump`: After deduping a block range, the scan pointer had to be moved past the end of the deduped byte range.
* `scan_dup_block`: Number of duplicate blocks deduped.
* `scan_dup_hit`: A pair of duplicate block ranges was found and removed.
* `scan_dup_miss`: A pair of duplicate blocks was found in the hash table but not in the filesystem.
* `scan_eof`: Scan past EOF was attempted.
* `scan_erase_redundant`: Blocks in the hash table were removed because they were removed from the filesystem by dedupe.
* `scan_extent`: An extent was scanned (`scan_one_extent`).
* `scan_forward`: A logical byte range was scanned (`scan_forward`).
* `scan_found`: An entry was found in the hash table matching a scanned block from the filesystem.
* `scan_hash_hit`: A block was found on the filesystem corresponding to a block found in the hash table.
* `scan_hash_miss`: A block was not found on the filesystem corresponding to a block found in the hash table.
* `scan_hash_preinsert`: A non-zero data block's hash was prepared for possible insertion into the hash table.
* `scan_hash_insert`: A non-zero data block's hash was inserted into the hash table.
* `scan_hash_preinsert`: A block was prepared for insertion into the hash table.
* `scan_hole`: A hole extent was found during scan and ignored.
* `scan_interesting`: An extent had flags that were not recognized by bees and was ignored.
* `scan_lookup`: A hash was looked up in the hash table.
* `scan_malign`: A block being scanned matched a hash at EOF in the hash table, but the EOF was not aligned to a block boundary and the two blocks did not have the same length.
* `scan_no_fd`: References to a block from the hash table were found, but a FD could not be opened.
* `scan_no_rewrite`: All blocks in an extent were removed by dedupe (i.e. no copies).
* `scan_push_front`: An entry in the hash table matched a duplicate block, so the entry was moved to the head of its LRU list.
* `scan_reinsert`: A copied block's hash and block address was inserted into the hash table.
* `scan_resolve_hit`: A block address in the hash table was successfully resolved to an open FD and offset pair.
* `scan_resolve_zero`: A block address in the hash table was not resolved to any subvol/inode pair, so the corresponding hash table entry was removed.
* `scan_rewrite`: A range of bytes in a file was copied, then the copy deduped over the original data.
* `scan_root_dead`: A deleted subvol was detected.
* `scan_seen_clear`: The list of recently scanned extents reached maximum size and was cleared.
* `scan_seen_erase`: An extent reference was modified by scan, so all future references to the extent must be scanned.
* `scan_seen_hit`: A scan was skipped because the same extent had recently been scanned.
* `scan_seen_insert`: An extent reference was not modified by scan and its hashes have been inserted into the hash table, so all future references to the extent can be ignored.
* `scan_seen_miss`: A scan was not skipped because the same extent had not recently been scanned (i.e. the extent was scanned normally).
* `scan_skip_bytes`: Nuisance dedupe or hole-punching would save less than half of the data in an extent.
* `scan_skip_ops`: Nuisance dedupe or hole-punching would require too many dedupe/copy/hole-punch operations in an extent.
* `scan_toxic_hash`: A scanned block has the same hash as a hash table entry that is marked toxic.
* `scan_toxic_match`: A hash table entry points to a block that is discovered to be toxic.
* `scan_twice`: Two references to the same block have been found in the hash table.
* `scan_zero`: A data block containing only zero bytes was detected.
* `scan_zero_compressed`: An extent that was compressed and contained only zero bytes was found.
* `scan_zero_uncompressed`: A block that contained only zero bytes was found in an uncompressed extent.
scanf
-----
The `scanf` event group consists of operations related to `BeesContext::scan_forward`. This is the entry point where `crawl` schedules new data for scanning.
* `scanf_deferred_extent`: Two tasks attempted to scan the same extent at the same time, so one was deferred.
* `scanf_eof`: Scan past EOF was attempted.
* `scanf_extent`: A btrfs extent item was scanned.
* `scanf_extent_ms`: Total thread-seconds spent scanning btrfs extent items.
* `scanf_no_fd`: References to a block from the hash table were found, but a FD could not be opened.
* `scanf_total`: A logical byte range of a file was scanned.
* `scanf_total_ms`: Total thread-seconds spent scanning logical byte ranges.

View File

@@ -45,46 +45,87 @@ bees will loop billions of times considering all possibilities. This is
a waste of time, so an exception is currently used to break out of such
loops early. The exception text in this case is:
`FIXME: too many duplicate candidates, bailing out here`
`FIXME: bailing out here, need to fix this further up the call stack`
Terminating bees with SIGTERM
-----------------------------
bees is designed to survive host crashes, so it is safe to terminate bees
using SIGKILL; however, when bees next starts up, it will repeat some
work that was performed between the last bees crawl state save point
and the SIGKILL (up to 15 minutes), and a large hash table may not be
completely written back to disk, so some duplicate matches will be lost.
bees is designed to survive host crashes, so it is safe to terminate
bees using SIGKILL; however, when bees next starts up, it will repeat
some work that was performed between the last bees crawl state save point
and the SIGKILL (up to 15 minutes). If bees is stopped and started less
than once per day, then this is not a problem as the proportional impact
is quite small; however, users who stop and start bees daily or even
more often may prefer to have a clean shutdown with SIGTERM so bees can
restart faster.
If bees is stopped and started less than once per week, then this is not
a problem as the proportional impact is quite small; however, users who
stop and start bees daily or even more often may prefer to have a clean
shutdown with SIGTERM so bees can restart faster.
bees handling of SIGTERM can take a long time on machines with some or
all of:
The shutdown procedure performs these steps:
* Large RAM and `vm.dirty_ratio`
* Large number of active bees worker threads
* Large number of bees temporary files (proportional to thread count)
* Large hash table size
* Large filesystem size
* High IO latency, especially "low power" spinning disks
* High filesystem activity, especially duplicate data writes
1. Crawl state is saved to `$BEESHOME`. This is the most
Each of these factors individually increases the total time required
to perform a clean bees shutdown. When combined, the factors can
multiply with each other, dramatically increasing the time required to
flush bees state to disk.
On a large system with many of the above factors present, a "clean"
bees shutdown can take more than 20 minutes. Even a small machine
(16GB RAM, 1GB hash table, 1TB NVME disk) can take several seconds to
complete a SIGTERM shutdown.
The shutdown procedure performs potentially long-running tasks in
this order:
1. Worker threads finish executing their current Task and exit.
Threads executing `LOGICAL_INO` ioctl calls usually finish quickly,
but btrfs imposes no limit on the ioctl's running time, so it
can take several minutes in rare bad cases. If there is a btrfs
commit already in progress on the filesystem, then most worker
threads will be blocked until the btrfs commit is finished.
2. Crawl state is saved to `$BEESHOME`. This normally completes
relatively quickly (a few seconds at most). This is the most
important bees state to save to disk as it directly impacts
restart time, so it is done as early as possible
restart time, so it is done as early as possible (but no earlier).
2. Hash table is written to disk. Normally the hash table is
trickled back to disk at a rate of about 128KiB per second;
3. Hash table is written to disk. Normally the hash table is
trickled back to disk at a rate of about 2GB per hour;
however, SIGTERM causes bees to attempt to flush the whole table
immediately. The time spent here depends on the size of RAM, speed
of disks, and aggressiveness of competing filesystem workloads.
It can trigger `vm.dirty_bytes` limits and block other processes
writing to the filesystem for a while.
immediately. If bees has recently been idle then the hash table is
likely already flushed to disk, so this step will finish quickly;
however, if bees has recently been active and the hash table is
large relative to RAM size, the blast of rapidly written data
can force the Linux VFS to block all writes to the filesystem
for sufficient time to complete all pending btrfs metadata
writes which accumulated during the btrfs commit before bees
received SIGTERM...and _then_ let bees write out the hash table.
The time spent here depends on the size of RAM, speed of disks,
and aggressiveness of competing filesystem workloads.
3. The bees process calls `_exit`, which terminates all running
worker threads, closes and deletes all temporary files. This
can take a while _after_ the bees process exits, especially on
slow spinning disks.
4. bees temporary files are closed, which implies deletion of their
inodes. These are files which consist entirely of shared extent
structures, and btrfs takes an unusually long time to delete such
files (up to a few minutes for each on slow spinning disks).
If bees is terminated with SIGKILL, only step #1 and #4 are performed (the
kernel performs these automatically if bees exits). This reduces the
shutdown time at the cost of increased startup time.
Balances
--------
First, read [`LOGICAL_INO` and btrfs balance WARNING](btrfs-kernel.md).
bees will suspend operations during a btrfs balance to work around
kernel bugs.
A btrfs balance relocates data on disk by making a new copy of the
data, replacing all references to the old data with references to the
new copy, and deleting the old copy. To bees, this is the same as any
@@ -134,9 +175,7 @@ the beginning.
Each time bees dedupes an extent that is referenced by a snapshot,
the entire metadata page in the snapshot subvol (16KB by default) must
be CoWed in btrfs. Since all references must be removed at the same
time, this CoW operation is repeated in every snapshot containing the
duplicate data. This can result in a substantial increase in btrfs
be CoWed in btrfs. This can result in a substantial increase in btrfs
metadata size if there are many snapshots on a filesystem.
Normally, metadata is small (less than 1% of the filesystem) and dedupe
@@ -205,7 +244,7 @@ Other Gotchas
* bees avoids the [slow backrefs kernel bug](btrfs-kernel.md) by
measuring the time required to perform `LOGICAL_INO` operations.
If an extent requires over 5.0 kernel CPU seconds to perform a
If an extent requires over 0.1 kernel CPU seconds to perform a
`LOGICAL_INO` ioctl, then bees blacklists the extent and avoids
referencing it in future operations. In most cases, fewer than 0.1%
of extents in a filesystem must be avoided this way. This results
@@ -213,18 +252,17 @@ Other Gotchas
filesystem while `LOGICAL_INO` is running. Generally the CPU spends
most of the runtime of the `LOGICAL_INO` ioctl running the kernel,
so on a single-core CPU the entire system can freeze up for a second
during operations on toxic extents. Note this only occurs on older
kernels. See [the slow backrefs kernel bug section](btrfs-kernel.md).
during operations on toxic extents.
* If a process holds a directory FD open, the subvol containing the
directory cannot be deleted (`btrfs sub del` will start the deletion
process, but it will not proceed past the first open directory FD).
`btrfs-cleaner` will simply skip over the directory *and all of its
children* until the FD is closed. bees avoids this gotcha by closing
all of the FDs in its directory FD cache every btrfs transaction.
all of the FDs in its directory FD cache every 10 btrfs transactions.
* If a file is deleted while bees is caching an open FD to the file,
bees continues to scan the file. For very large files (e.g. VM
images), the deletion of the file can be delayed indefinitely.
To limit this delay, bees closes all FDs in its file FD cache every
btrfs transaction.
10 btrfs transactions.

View File

@@ -8,12 +8,10 @@ bees uses checkpoints for persistence to eliminate the IO overhead of a
transactional data store. On restart, bees will dedupe any data that
was added to the filesystem since the last checkpoint. Checkpoints
occur every 15 minutes for scan progress, stored in `beescrawl.dat`.
The hash table trickle-writes to disk at 128KiB/s to `beeshash.dat`,
but will flush immediately if bees is terminated by SIGTERM.
There are no special requirements for bees hash table storage--`.beeshome`
could be stored on a different btrfs filesystem, ext4, or even CIFS (but
not MS-DOS--beeshome does need filenames longer than 8.3).
The hash table trickle-writes to disk at 4GB/hour to `beeshash.dat`.
An hourly performance report is written to `beesstats.txt`. There are
no special requirements for bees hash table storage--`.beeshome` could
be stored on a different btrfs filesystem, ext4, or even CIFS.
bees uses a persistent dedupe hash table with a fixed size configured
by the user. Any size of hash table can be dedicated to dedupe. If a
@@ -22,7 +20,7 @@ small as 128KB.
The bees hash table is loaded into RAM at startup and `mlock`ed so it
will not be swapped out by the kernel (if swap is permitted, performance
degrades to nearly zero, for both bees and the swap device).
degrades to nearly zero).
bees scans the filesystem in a single pass which removes duplicate
extents immediately after they are detected. There are no distinct
@@ -85,12 +83,12 @@ of these functions in userspace, at the expense of encountering [some
kernel bugs in `LOGICAL_INO` performance](btrfs-kernel.md).
bees uses only the data-safe `FILE_EXTENT_SAME` (aka `FIDEDUPERANGE`)
kernel ioctl to manipulate user data, so it can dedupe live data
(e.g. build servers, sqlite databases, VM disk images). bees does not
modify file attributes or timestamps in deduplicated files.
kernel operations to manipulate user data, so it can dedupe live data
(e.g. build servers, sqlite databases, VM disk images). It does not
modify file attributes or timestamps.
When bees has scanned all of the data, bees will pause until a new
transaction has completed in the btrfs filesystem. bees tracks
When bees has scanned all of the data, bees will pause until 10
transactions have been completed in the btrfs filesystem. bees tracks
the current btrfs transaction ID over time so that it polls less often
on quiescent filesystems and more often on busy filesystems.

View File

@@ -6,30 +6,31 @@ Best-Effort Extent-Same, a btrfs deduplication agent.
About bees
----------
bees is a block-oriented userspace deduplication agent designed to scale
up to large btrfs filesystems. It is an offline dedupe combined with
an incremental data scan capability to minimize time data spends on disk
from write to dedupe.
bees is a block-oriented userspace deduplication agent designed for large
btrfs filesystems. It is an offline dedupe combined with an incremental
data scan capability to minimize time data spends on disk from write
to dedupe.
Strengths
---------
* Space-efficient hash table - can use as little as 1 GB hash table per 10 TB unique data (0.1GB/TB)
* Daemon mode - incrementally dedupes new data as it appears
* Largest extents first - recover more free space during fixed maintenance windows
* Space-efficient hash table and matching algorithms - can use as little as 1 GB hash table per 10 TB unique data (0.1GB/TB)
* Daemon incrementally dedupes new data using btrfs tree search
* Works with btrfs compression - dedupe any combination of compressed and uncompressed files
* Whole-filesystem dedupe - scans data only once, even with snapshots and reflinks
* **NEW** [Works around `btrfs send` problems with dedupe and incremental parent shapshots](options.md)
* Works around btrfs filesystem structure to free more disk space
* Persistent hash table for rapid restart after shutdown
* Whole-filesystem dedupe - including snapshots
* Constant hash table size - no increased RAM usage if data set becomes larger
* Works on live data - no scheduled downtime required
* Automatic self-throttling - reduces system load
* btrfs support - recovers more free space from btrfs than naive dedupers
* Automatic self-throttling based on system load
Weaknesses
----------
* Whole-filesystem dedupe - has no include/exclude filters, does not accept file lists
* Requires root privilege (`CAP_SYS_ADMIN` plus the usual filesystem read/modify caps)
* Requires root privilege (or `CAP_SYS_ADMIN`)
* First run may require temporary disk space for extent reorganization
* [First run may increase metadata space usage if many snapshots exist](gotchas.md)
* Constant hash table size - no decreased RAM usage if data set becomes smaller
* btrfs only
@@ -46,7 +47,7 @@ Recommended Reading
-------------------
* [bees Gotchas](gotchas.md)
* [btrfs kernel bugs](btrfs-kernel.md) - especially DATA CORRUPTION WARNING for old kernels
* [btrfs kernel bugs](btrfs-kernel.md) - especially DATA CORRUPTION WARNING
* [bees vs. other btrfs features](btrfs-other.md)
* [What to do when something goes wrong](wrong.md)
@@ -69,6 +70,6 @@ You can also use Github:
Copyright & License
-------------------
Copyright 2015-2025 Zygo Blaxell <bees@furryterror.org>.
Copyright 2015-2018 Zygo Blaxell <bees@furryterror.org>.
GPL (version 3 or later).

View File

@@ -4,7 +4,7 @@ Building bees
Dependencies
------------
* C++11 compiler (tested with GCC 8.1.0, 12.2.0)
* C++11 compiler (tested with GCC 4.9, 6.3.0, 8.1.0)
Sorry. I really like closures and shared_ptr, so support
for earlier compiler versions is unlikely.
@@ -19,7 +19,7 @@ Dependencies
* [Linux kernel version](btrfs-kernel.md) gets its own page.
* markdown to build the documentation
* markdown for documentation
* util-linux version that provides `blkid` command for the helper
script `scripts/beesd` to work
@@ -80,7 +80,7 @@ within a temporary runtime directory.
Packaging
---------
See 'Dependencies' above. Package maintainers can pick ideas for building and
See 'Dependencies' below. Package maintainers can pick ideas for building and
configuring the source package from the Gentoo ebuild:
<https://github.com/gentoo/gentoo/tree/master/sys-fs/bees>

View File

@@ -2,8 +2,8 @@ Features You Might Expect That bees Doesn't Have
------------------------------------------------
* There's no configuration file (patches welcome!). There are
some tunables hardcoded in the source (`src/bees.h`) that could eventually
become configuration options. There's also an incomplete option parser
some tunables hardcoded in the source that could eventually become
configuration options. There's also an incomplete option parser
(patches welcome!).
* The bees process doesn't fork and writes its log to stdout/stderr.
@@ -15,9 +15,16 @@ specific files (patches welcome).
* PREALLOC extents and extents containing blocks filled with zeros will
be replaced by holes. There is no way to turn this off.
* The fundamental unit of deduplication is the extent _reference_, when
it should be the _extent_ itself. This is an architectural limitation
that results in excess reads of extent data, even in the Extent scan mode.
* Consecutive runs of duplicate blocks that are less than 12K in length
can take 30% of the processing time while saving only 3% of the disk
space. There should be an option to just not bother with those, but it's
complicated by the btrfs requirement to always dedupe complete extents.
* There is a lot of duplicate reading of blocks in snapshots. bees will
scan all snapshots at close to the same time to try to get better
performance by caching, but really fixing this requires rewriting the
crawler to scan the btrfs extent tree directly instead of the subvol
FS trees.
* Block reads are currently more allocation- and CPU-intensive than they
should be, especially for filesystems on SSD where the IO overhead is
@@ -26,9 +33,8 @@ much smaller. This is a problem for CPU-power-constrained environments
* bees can currently fragment extents when required to remove duplicate
blocks, but has no defragmentation capability yet. When possible, bees
will attempt to work with existing extent boundaries and choose the
largest fragments available, but it will not aggregate blocks together
from multiple extents to create larger ones.
will attempt to work with existing extent boundaries, but it will not
aggregate blocks together from multiple extents to create larger ones.
* When bees fragments an extent, the copied data is compressed. There
is currently no way (other than by modifying the source) to select a
@@ -37,6 +43,3 @@ compression method or not compress the data (patches welcome!).
* It is theoretically possible to resize the hash table without starting
over with a new full-filesystem scan; however, this feature has not been
implemented yet.
* btrfs maintains csums of data blocks which bees could use to improve
scan speeds, but bees doesn't use them yet.

View File

@@ -36,70 +36,38 @@
Has no effect unless `--loadavg-target` is used to specify a target load.
* `--throttle-factor FACTOR`
In order to avoid saturating btrfs deferred work queues, bees tracks
the time that operations with delayed effect (dedupe and tmpfile copy)
and operations with long run times (`LOGICAL_INO`) run. If an operation
finishes before the average run time for that operation, bees will
sleep for the remainder of the average run time, so that operations
are submitted to btrfs at a rate similar to the rate that btrfs can
complete them.
The `FACTOR` is multiplied by the average run time for each operation
to calculate the target delay time.
`FACTOR` 0 is the default, which adds no delays. bees will attempt
to saturate btrfs delayed work queues as quickly as possible, which
may impact other processes on the same filesystem, or even slow down
bees itself.
`FACTOR` 1.0 will attempt to keep btrfs delayed work queues filled at
a steady average rate.
`FACTOR` more than 1.0 will add delays longer than the average
run time (e.g. 10.0 will delay all operations that take less than 10x
the average run time). High values of `FACTOR` may be desirable when
using bees with other applications on the same filesystem.
The maximum delay per operation is 60 seconds.
## Filesystem tree traversal options
* `--scan-mode MODE` or `-m`
Specify extent scanning algorithm.
Specify extent scanning algorithm. Default `MODE` is 0.
**EXPERIMENTAL** feature that may go away.
* Mode 0: lockstep
* Mode 1: independent
* Mode 2: sequential
* Mode 3: recent
* Mode 4: extent
For details of the different scanning modes and the default value of
this option, see [bees configuration](config.md).
* Mode 0: scan extents in ascending order of (inode, subvol, offset).
Keeps shared extents between snapshots together. Reads files sequentially.
Minimizes temporary space usage.
* Mode 1: scan extents from all subvols in parallel. Good performance
on non-spinning media when subvols are unrelated.
* Mode 2: scan all extents from one subvol at a time. Good sequential
read performance for spinning media. Maximizes temporary space usage.
## Workarounds
* `--workaround-btrfs-send` or `-a`
_This option is obsolete and should not be used any more._
Pretend that read-only snapshots are empty and silently discard any
request to dedupe files referenced through them. This is a workaround
for [problems with old kernels running `btrfs send` and `btrfs send
request to dedupe files referenced through them. This is a workaround for
[problems with the kernel implementation of `btrfs send` and `btrfs send
-p`](btrfs-kernel.md) which make these btrfs features unusable with bees.
This option was used to avoid breaking `btrfs send` on old kernels.
The affected kernels are now too old to be recommended for use with bees.
bees now waits for `btrfs send` to finish. There is no need for an
option to enable this.
This option should be used to avoid breaking `btrfs send` on the same
filesystem.
**Note:** There is a _significant_ space tradeoff when using this option:
it is likely no space will be recovered--and possibly significant extra
space used--until the read-only snapshots are deleted.
space used--until the read-only snapshots are deleted. On the other
hand, if snapshots are rotated frequently then bees will spend less time
scanning them.
## Logging options

View File

@@ -75,8 +75,9 @@ in the shell script that launches `bees`:
schedtool -D -n20 $$
ionice -c3 -p $$
You can also use the [load management options](options.md) to further
control the impact of bees on the rest of the system.
You can also use the [`--loadavg-target` and `--thread-min`
options](options.md) to further control the impact of bees on the rest
of the system.
Let the bees fly:

View File

@@ -4,13 +4,16 @@ What to do when something goes wrong with bees
Hangs and excessive slowness
----------------------------
### Are you using qgroups or autodefrag?
Read about [bad btrfs feature interactions](btrfs-other.md).
### Use load-throttling options
If bees is just more aggressive than you would like, consider using
[load throttling options](options.md). These are usually more effective
than `ionice`, `schedtool`, and the `blkio` cgroup (though you can
certainly use those too) because they limit work that bees queues up
for later execution inside btrfs.
certainly use those too).
### Check `$BEESSTATUS`
@@ -49,6 +52,10 @@ dst = 15 /run/bees/ede84fbd-cb59-0c60-9ea7-376fa4984887/data.new/home/builder/li
Thread names of note:
* `crawl_12345`: scan/dedupe worker threads (the number is the subvol
ID which the thread is currently working on). These threads appear
and disappear from the status dynamically according to the requirements
of the work queue and loadavg throttling.
* `bees`: main thread (doesn't do anything after startup, but its task execution time is that of the whole bees process)
* `crawl_master`: task that finds new extents in the filesystem and populates the work queue
* `crawl_transid`: btrfs transid (generation number) tracker and polling thread
@@ -57,13 +64,6 @@ dst = 15 /run/bees/ede84fbd-cb59-0c60-9ea7-376fa4984887/data.new/home/builder/li
* `hash_writeback`: trickle-writes the hash table back to `beeshash.dat`
* `hash_prefetch`: prefetches the hash table at startup and updates `beesstats.txt` hourly
Most other threads have names that are derived from the current dedupe
task that they are executing:
* `ref_205ad76b1000_24K_50`: extent scan performing dedupe of btrfs extent bytenr `205ad76b1000`, which is 24 KiB long and has 50 references
* `extent_250_32M_16E`: extent scan searching for extents between 32 MiB + 1 and 16 EiB bytes long, tracking scan position in virtual subvol `250`.
* `crawl_378_18916`: subvol scan searching for extent refs in subvol `378`, inode `18916`.
### Dump kernel stacks of hung processes
Check the kernel stacks of all blocked kernel processes:
@@ -91,7 +91,7 @@ bees Crashes
(gdb) thread apply all bt full
The last line generates megabytes of output and will often crash gdb.
Submit whatever output gdb can produce.
This is OK, submit whatever output gdb can produce.
**Note that this output may include filenames or data from your
filesystem.**
@@ -134,7 +134,7 @@ ulimit -c 0
# If there were core files, generate reports for them
for x in core*; do
if [ -e "$x" ]; then
if [ -e "$x" ]; then
gdb --core="$x" \
--eval-command='set pagination off' \
--eval-command='info shared' \
@@ -160,7 +160,8 @@ Kernel crashes, corruption, and filesystem damage
-------------------------------------------------
bees doesn't do anything that _should_ cause corruption or data loss;
however, [btrfs has kernel bugs](btrfs-kernel.md), so corruption is
however, [btrfs has kernel bugs](btrfs-kernel.md) and [interacts poorly
with some Linux block device layers](btrfs-other.md), so corruption is
not impossible.
Issues with the btrfs filesystem kernel code or other block device layers

View File

@@ -1,216 +0,0 @@
#ifndef CRUCIBLE_BTRFS_TREE_H
#define CRUCIBLE_BTRFS_TREE_H
#include "crucible/fd.h"
#include "crucible/fs.h"
#include "crucible/bytevector.h"
namespace crucible {
using namespace std;
class BtrfsTreeItem {
uint64_t m_objectid = 0;
uint64_t m_offset = 0;
uint64_t m_transid = 0;
ByteVector m_data;
uint8_t m_type = 0;
public:
uint64_t objectid() const { return m_objectid; }
uint64_t offset() const { return m_offset; }
uint64_t transid() const { return m_transid; }
uint8_t type() const { return m_type; }
const ByteVector data() const { return m_data; }
BtrfsTreeItem() = default;
BtrfsTreeItem(const BtrfsIoctlSearchHeader &bish);
BtrfsTreeItem& operator=(const BtrfsIoctlSearchHeader &bish);
bool operator!() const;
/// Member access methods. Invoking a method on the
/// wrong type of item will throw an exception.
/// @{ Block group items
uint64_t block_group_flags() const;
uint64_t block_group_used() const;
/// @}
/// @{ Chunk items
uint64_t chunk_length() const;
uint64_t chunk_type() const;
/// @}
/// @{ Dev extent items (physical byte ranges)
uint64_t dev_extent_chunk_offset() const;
uint64_t dev_extent_length() const;
/// @}
/// @{ Dev items (devices)
uint64_t dev_item_total_bytes() const;
uint64_t dev_item_bytes_used() const;
/// @}
/// @{ Inode items
uint64_t inode_size() const;
/// @}
/// @{ Extent refs (EXTENT_DATA)
uint64_t file_extent_logical_bytes() const;
uint64_t file_extent_generation() const;
uint64_t file_extent_offset() const;
uint64_t file_extent_bytenr() const;
uint8_t file_extent_type() const;
btrfs_compression_type file_extent_compression() const;
/// @}
/// @{ Extent items (EXTENT_ITEM)
uint64_t extent_begin() const;
uint64_t extent_end() const;
uint64_t extent_flags() const;
uint64_t extent_generation() const;
/// @}
/// @{ Root items
uint64_t root_flags() const;
uint64_t root_refs() const;
/// @}
/// @{ Root backref items.
uint64_t root_ref_dirid() const;
string root_ref_name() const;
uint64_t root_ref_parent_rootid() const;
/// @}
};
ostream &operator<<(ostream &os, const BtrfsTreeItem &bti);
class BtrfsTreeFetcher {
protected:
Fd m_fd;
BtrfsIoctlSearchKey m_sk;
uint64_t m_tree = 0;
uint64_t m_min_transid = 0;
uint64_t m_max_transid = numeric_limits<uint64_t>::max();
uint64_t m_block_size = 0;
uint64_t m_lookbehind_size = 0;
uint64_t m_scale_size = 0;
uint8_t m_type = 0;
uint64_t scale_logical(uint64_t logical) const;
uint64_t unscale_logical(uint64_t logical) const;
const static uint64_t s_max_logical = numeric_limits<uint64_t>::max();
uint64_t scaled_max_logical() const;
virtual void fill_sk(BtrfsIoctlSearchKey &key, uint64_t object);
virtual void next_sk(BtrfsIoctlSearchKey &key, const BtrfsIoctlSearchHeader &hdr);
virtual uint64_t hdr_logical(const BtrfsIoctlSearchHeader &hdr) = 0;
virtual bool hdr_match(const BtrfsIoctlSearchHeader &hdr) = 0;
virtual bool hdr_stop(const BtrfsIoctlSearchHeader &hdr) = 0;
Fd fd() const;
void fd(Fd fd);
public:
virtual ~BtrfsTreeFetcher() = default;
BtrfsTreeFetcher(Fd new_fd);
void type(uint8_t type);
uint8_t type();
void tree(uint64_t tree);
uint64_t tree();
void transid(uint64_t min_transid, uint64_t max_transid = numeric_limits<uint64_t>::max());
/// Block size (sectorsize) of filesystem
uint64_t block_size() const;
/// Fetch last object < logical, null if not found
BtrfsTreeItem prev(uint64_t logical);
/// Fetch first object > logical, null if not found
BtrfsTreeItem next(uint64_t logical);
/// Fetch object at exactly logical, null if not found
BtrfsTreeItem at(uint64_t);
/// Fetch first object >= logical
BtrfsTreeItem lower_bound(uint64_t logical);
/// Fetch last object <= logical
BtrfsTreeItem rlower_bound(uint64_t logical);
/// Estimated distance between objects
virtual uint64_t lookbehind_size() const;
virtual void lookbehind_size(uint64_t);
/// Scale size (normally block size but must be set to 1 for fs trees)
uint64_t scale_size() const;
void scale_size(uint64_t);
};
class BtrfsTreeObjectFetcher : public BtrfsTreeFetcher {
protected:
virtual void fill_sk(BtrfsIoctlSearchKey &key, uint64_t logical) override;
virtual uint64_t hdr_logical(const BtrfsIoctlSearchHeader &hdr) override;
virtual bool hdr_match(const BtrfsIoctlSearchHeader &hdr) override;
virtual bool hdr_stop(const BtrfsIoctlSearchHeader &hdr) override;
public:
using BtrfsTreeFetcher::BtrfsTreeFetcher;
};
class BtrfsTreeOffsetFetcher : public BtrfsTreeFetcher {
protected:
uint64_t m_objectid = 0;
virtual void fill_sk(BtrfsIoctlSearchKey &key, uint64_t offset) override;
virtual uint64_t hdr_logical(const BtrfsIoctlSearchHeader &hdr) override;
virtual bool hdr_match(const BtrfsIoctlSearchHeader &hdr) override;
virtual bool hdr_stop(const BtrfsIoctlSearchHeader &hdr) override;
public:
using BtrfsTreeFetcher::BtrfsTreeFetcher;
void objectid(uint64_t objectid);
uint64_t objectid() const;
};
class BtrfsCsumTreeFetcher : public BtrfsTreeOffsetFetcher {
public:
const uint32_t BTRFS_CSUM_TYPE_UNKNOWN = uint32_t(1) << 16;
private:
size_t m_sum_size = 0;
uint32_t m_sum_type = BTRFS_CSUM_TYPE_UNKNOWN;
public:
BtrfsCsumTreeFetcher(const Fd &fd);
uint32_t sum_type() const;
size_t sum_size() const;
void get_sums(uint64_t logical, size_t count, function<void(uint64_t logical, const uint8_t *buf, size_t count)> output);
};
/// Fetch extent items from extent tree.
/// Does not filter out metadata! See BtrfsDataExtentTreeFetcher for that.
class BtrfsExtentItemFetcher : public BtrfsTreeObjectFetcher {
public:
BtrfsExtentItemFetcher(const Fd &fd);
};
/// Fetch extent refs from an inode. Caller must set the tree and objectid.
class BtrfsExtentDataFetcher : public BtrfsTreeOffsetFetcher {
public:
BtrfsExtentDataFetcher(const Fd &fd);
};
/// Fetch raw inode items
class BtrfsInodeFetcher : public BtrfsTreeObjectFetcher {
public:
BtrfsInodeFetcher(const Fd &fd);
BtrfsTreeItem stat(uint64_t subvol, uint64_t inode);
};
/// Fetch a root (subvol) item
class BtrfsRootFetcher : public BtrfsTreeObjectFetcher {
public:
BtrfsRootFetcher(const Fd &fd);
BtrfsTreeItem root(uint64_t subvol);
BtrfsTreeItem root_backref(uint64_t subvol);
};
/// Fetch data extent items from extent tree, skipping metadata-only block groups
class BtrfsDataExtentTreeFetcher : public BtrfsExtentItemFetcher {
BtrfsTreeItem m_current_bg;
BtrfsTreeOffsetFetcher m_chunk_tree;
protected:
virtual void next_sk(BtrfsIoctlSearchKey &key, const BtrfsIoctlSearchHeader &hdr) override;
public:
BtrfsDataExtentTreeFetcher(const Fd &fd);
};
}
#endif

View File

@@ -216,28 +216,7 @@ enum btrfs_compression_type {
#define BTRFS_FS_INFO_FLAG_CSUM_INFO (1 << 0)
#endif
#ifndef BTRFS_FS_INFO_FLAG_GENERATION
/* Request information about filesystem generation */
#define BTRFS_FS_INFO_FLAG_GENERATION (1 << 1)
#endif
#ifndef BTRFS_FS_INFO_FLAG_METADATA_UUID
/* Request information about filesystem metadata UUID */
#define BTRFS_FS_INFO_FLAG_METADATA_UUID (1 << 2)
#endif
// BTRFS_CSUM_TYPE_CRC32 was a #define from 2008 to 2019.
// After that, it's an enum with the other 3 types.
// So if we do _not_ have CRC32 defined, it means we have the other 3;
// if we _do_ have CRC32 defined, it means we need the other 3.
// This seems likely to break some day.
#ifdef BTRFS_CSUM_TYPE_CRC32
#define BTRFS_CSUM_TYPE_XXHASH 1
#define BTRFS_CSUM_TYPE_SHA256 2
#define BTRFS_CSUM_TYPE_BLAKE2 3
#endif
struct btrfs_ioctl_fs_info_args_v3 {
struct btrfs_ioctl_fs_info_args_v2 {
__u64 max_id; /* out */
__u64 num_devices; /* out */
__u8 fsid[BTRFS_FSID_SIZE]; /* out */
@@ -248,9 +227,7 @@ struct btrfs_ioctl_fs_info_args_v3 {
__u16 csum_type; /* out */
__u16 csum_size; /* out */
__u64 flags; /* in/out */
__u64 generation; /* out */
__u8 metadata_uuid[BTRFS_FSID_SIZE]; /* out */
__u8 reserved[944]; /* pad to 1k */
__u8 reserved[968]; /* pad to 1k */
};
#endif // CRUCIBLE_BTRFS_H

View File

@@ -1,80 +0,0 @@
#ifndef _CRUCIBLE_BYTEVECTOR_H_
#define _CRUCIBLE_BYTEVECTOR_H_
#include <crucible/error.h>
#include <memory>
#include <mutex>
#include <ostream>
#include <cstdint>
#include <cstdlib>
namespace crucible {
using namespace std;
// new[] is a little slower than malloc
// shared_ptr is about 2x slower than unique_ptr
// vector<uint8_t> is ~160x slower
// so we won't bother with unique_ptr because we can't do shared copies with it
class ByteVector {
public:
using Pointer = shared_ptr<uint8_t>;
using value_type = Pointer::element_type;
using iterator = value_type*;
ByteVector() = default;
ByteVector(const ByteVector &that);
ByteVector& operator=(const ByteVector &that);
ByteVector(size_t size);
ByteVector(const ByteVector &that, size_t start, size_t length);
ByteVector(iterator begin, iterator end, size_t min_size = 0);
ByteVector at(size_t start, size_t length) const;
value_type& at(size_t) const;
iterator begin() const;
void clear();
value_type* data() const;
bool empty() const;
iterator end() const;
value_type& operator[](size_t) const;
size_t size() const;
bool operator==(const ByteVector &that) const;
// this version of erase only works at the beginning or end of the buffer, else throws exception
void erase(iterator first);
void erase(iterator first, iterator last);
// An important use case is ioctls that have a fixed-size header struct
// followed by a buffer for further arguments. These templates avoid
// doing reinterpret_casts every time.
template <class T> ByteVector(const T& object, size_t min_size);
template <class T> T* get() const;
private:
Pointer m_ptr;
size_t m_size = 0;
mutable mutex m_mutex;
};
template <class T>
ByteVector::ByteVector(const T& object, size_t min_size)
{
const auto size = max(min_size, sizeof(T));
m_ptr = Pointer(static_cast<value_type*>(malloc(size)), free);
memcpy(m_ptr.get(), &object, sizeof(T));
m_size = size;
}
template <class T>
T*
ByteVector::get() const
{
THROW_CHECK2(out_of_range, size(), sizeof(T), size() >= sizeof(T));
return reinterpret_cast<T*>(data());
}
ostream& operator<<(ostream &os, const ByteVector &bv);
}
#endif // _CRUCIBLE_BYTEVECTOR_H_

View File

@@ -30,7 +30,7 @@ namespace crucible {
map<Key, ListIter> m_map;
LockSet<Key> m_lockset;
size_t m_max_size;
mutable mutex m_mutex;
mutex m_mutex;
void check_overflow();
void recent_use(ListIter vp);
@@ -48,7 +48,6 @@ namespace crucible {
void expire(Arguments... args);
void insert(const Return &r, Arguments... args);
void clear();
size_t size() const;
};
template <class Return, class... Arguments>
@@ -191,14 +190,6 @@ namespace crucible {
lock.unlock();
}
template <class Return, class... Arguments>
size_t
LRUCache<Return, Arguments...>::size() const
{
unique_lock<mutex> lock(m_mutex);
return m_map.size();
}
template<class Return, class... Arguments>
Return
LRUCache<Return, Arguments...>::operator()(Arguments... args)

View File

@@ -28,7 +28,7 @@ namespace crucible {
};
template<> struct le_to_cpu_helper<uint16_t> {
uint16_t operator()(const uint16_t v) { return le16toh(v); }
uint16_t operator()(const uint16_t v) { return le64toh(v); }
};
template<> struct le_to_cpu_helper<uint8_t> {

View File

@@ -126,13 +126,6 @@ namespace crucible {
} \
} while(0)
#define THROW_CHECK4(type, value1, value2, value3, value4, expr) do { \
if (!(expr)) { \
THROW_ERROR(type, #value1 << " = " << (value1) << ", " #value2 << " = " << (value2) << ", " #value3 << " = " << (value3) << ", " #value4 << " = " << (value4) \
<< " failed constraint check (" << #expr << ")"); \
} \
} while(0)
#define THROW_CHECK_BIN_OP(type, value1, op, value2) do { \
if (!((value1) op (value2))) { \
THROW_ERROR(type, "failed constraint check " << #value1 << " (" << (value1) << ") " << #op << " " << #value2 << " (" << (value2) << ")"); \

View File

@@ -42,6 +42,9 @@ namespace crucible {
uint64_t bytenr() const;
bool operator==(const Extent &that) const;
bool operator!=(const Extent &that) const { return !(*this == that); }
Extent() = default;
Extent(const Extent &e) = default;
};
class ExtentWalker {

View File

@@ -1,7 +1,6 @@
#ifndef CRUCIBLE_FD_H
#define CRUCIBLE_FD_H
#include "crucible/bytevector.h"
#include "crucible/namedptr.h"
#include <cstring>
@@ -27,9 +26,9 @@
namespace crucible {
using namespace std;
/// File descriptor owner object. It closes them when destroyed.
/// Most of the functions here don't use it because these functions don't own FDs.
/// All good names for such objects are taken.
// IOHandle is a file descriptor owner object. It closes them when destroyed.
// Most of the functions here don't use it because these functions don't own FDs.
// All good names for such objects are taken.
class IOHandle {
IOHandle(const IOHandle &) = delete;
IOHandle(IOHandle &&) = delete;
@@ -43,7 +42,6 @@ namespace crucible {
int get_fd() const;
};
/// Copyable file descriptor.
class Fd {
static NamedPtr<IOHandle, int> s_named_ptr;
shared_ptr<IOHandle> m_handle;
@@ -63,29 +61,24 @@ namespace crucible {
// Functions named "foo_or_die" throw exceptions on failure.
/// Attempt to open the file with the given mode, throw exception on failure.
// Attempt to open the file with the given mode
int open_or_die(const string &file, int flags = O_RDONLY, mode_t mode = 0777);
/// Attempt to open the file with the given mode, throw exception on failure.
int openat_or_die(int dir_fd, const string &file, int flags = O_RDONLY, mode_t mode = 0777);
/// Decode open flags
// Decode open parameters
string o_flags_ntoa(int flags);
/// Decode open mode
string o_mode_ntoa(mode_t mode);
/// mmap with its one weird error case
// mmap with its one weird error case
void *mmap_or_die(void *addr, size_t length, int prot, int flags, int fd, off_t offset);
/// Decode mmap prot
// Decode mmap parameters
string mmap_prot_ntoa(int prot);
/// Decode mmap flags
string mmap_flags_ntoa(int flags);
/// Rename, throw exception on failure.
// Unlink, rename
void rename_or_die(const string &from, const string &to);
/// Rename, throw exception on failure.
void renameat_or_die(int fromfd, const string &frompath, int tofd, const string &topath);
/// Truncate, throw exception on failure.
void ftruncate_or_die(int fd, off_t size);
// Read or write structs:
@@ -93,25 +86,19 @@ namespace crucible {
// Three-arg version of read_or_die/write_or_die throws an error on incomplete read/writes
// Four-arg version returns number of bytes read/written through reference arg
/// Attempt read by pointer and length, throw exception on IO error or short read.
void read_or_die(int fd, void *buf, size_t size);
/// Attempt read of a POD struct, throw exception on IO error or short read.
template <class T> void read_or_die(int fd, T& buf)
{
return read_or_die(fd, static_cast<void *>(&buf), sizeof(buf));
}
/// Attempt read by pointer and length, throw exception on IO error but not short read.
void read_partial_or_die(int fd, void *buf, size_t size_wanted, size_t &size_read);
/// Attempt read of a POD struct, throw exception on IO error but not short read.
template <class T> void read_partial_or_die(int fd, T& buf, size_t &size_read)
{
return read_partial_or_die(fd, static_cast<void *>(&buf), sizeof(buf), size_read);
}
/// Attempt read at position by pointer and length, throw exception on IO error but not short read.
void pread_or_die(int fd, void *buf, size_t size, off_t offset);
/// Attempt read at position of a POD struct, throw exception on IO error but not short read.
template <class T> void pread_or_die(int fd, T& buf, off_t offset)
{
return pread_or_die(fd, static_cast<void *>(&buf), sizeof(buf), offset);
@@ -138,23 +125,20 @@ namespace crucible {
// Specialization for strings which reads/writes the string content, not the struct string
template<> void write_or_die<string>(int fd, const string& str);
template<> void pread_or_die<string>(int fd, string& str, off_t offset);
template<> void pread_or_die<vector<char>>(int fd, vector<char>& str, off_t offset);
template<> void pread_or_die<vector<uint8_t>>(int fd, vector<uint8_t>& str, off_t offset);
template<> void pwrite_or_die<string>(int fd, const string& str, off_t offset);
template<> void pread_or_die<ByteVector>(int fd, ByteVector& str, off_t offset);
template<> void pwrite_or_die<ByteVector>(int fd, const ByteVector& str, off_t offset);
// Deprecated
template<> void pread_or_die<vector<uint8_t>>(int fd, vector<uint8_t>& str, off_t offset) = delete;
template<> void pwrite_or_die<vector<uint8_t>>(int fd, const vector<uint8_t>& str, off_t offset) = delete;
template<> void pread_or_die<vector<char>>(int fd, vector<char>& str, off_t offset) = delete;
template<> void pwrite_or_die<vector<char>>(int fd, const vector<char>& str, off_t offset) = delete;
template<> void pwrite_or_die<vector<char>>(int fd, const vector<char>& str, off_t offset);
template<> void pwrite_or_die<vector<uint8_t>>(int fd, const vector<uint8_t>& str, off_t offset);
/// Read a simple string.
// A different approach to reading a simple string
string read_string(int fd, size_t size);
/// A lot of Unix API wants you to initialize a struct and call
/// one function to fill it, another function to throw it away,
/// and has some unknown third thing you have to do when there's
/// an error. That's also a C++ object with an exception-throwing
/// constructor.
// A lot of Unix API wants you to initialize a struct and call
// one function to fill it, another function to throw it away,
// and has some unknown third thing you have to do when there's
// an error. That's also a C++ object with an exception-throwing
// constructor.
struct Stat : public stat {
Stat();
Stat(int f);
@@ -168,17 +152,17 @@ namespace crucible {
string st_mode_ntoa(mode_t mode);
/// Because it's not trivial to do correctly
// Because it's not trivial to do correctly
string readlink_or_die(const string &path);
/// Determine the name of a FD by readlink through /proc/self/fd/
// Determine the name of a FD by readlink through /proc/self/fd/
string name_fd(int fd);
/// Returns Fd objects because it does own them.
// Returns Fd objects because it does own them.
pair<Fd, Fd> socketpair_or_die(int domain = AF_UNIX, int type = SOCK_STREAM, int protocol = 0);
/// like unique_lock but for flock instead of mutexes...and not trying
/// to hide the many and subtle differences between those two things *at all*.
// like unique_lock but for flock instead of mutexes...and not trying
// to hide the many and subtle differences between those two things *at all*.
class Flock {
int m_fd;
bool m_locked;
@@ -199,7 +183,7 @@ namespace crucible {
int fd();
};
/// Doesn't use Fd objects because it's usually just used to replace stdin/stdout/stderr.
// Doesn't use Fd objects because it's usually just used to replace stdin/stdout/stderr.
void dup2_or_die(int fd_in, int fd_out);
}

View File

@@ -1,9 +1,9 @@
#ifndef CRUCIBLE_FS_H
#define CRUCIBLE_FS_H
#include "crucible/bytevector.h"
#include "crucible/endian.h"
#include "crucible/error.h"
#include "crucible/spanner.h"
// Terribly Linux-specific FS-wrangling functions
@@ -27,16 +27,18 @@ namespace crucible {
// wrapper around fallocate(...FALLOC_FL_PUNCH_HOLE...)
void punch_hole(int fd, off_t offset, off_t len);
struct BtrfsExtentSame {
struct BtrfsExtentInfo : public btrfs_ioctl_same_extent_info {
BtrfsExtentInfo(int dst_fd, off_t dst_offset);
};
struct BtrfsExtentSame : public btrfs_ioctl_same_args {
virtual ~BtrfsExtentSame();
BtrfsExtentSame(int src_fd, off_t src_offset, off_t src_length);
void add(int fd, uint64_t offset);
void add(int fd, off_t offset);
virtual void do_ioctl();
uint64_t m_logical_offset = 0;
uint64_t m_length = 0;
int m_fd;
vector<btrfs_ioctl_same_extent_info> m_info;
vector<BtrfsExtentInfo> m_info;
};
ostream & operator<<(ostream &os, const btrfs_ioctl_same_extent_info *info);
@@ -51,30 +53,29 @@ namespace crucible {
ostream & operator<<(ostream &os, const BtrfsInodeOffsetRoot &p);
struct BtrfsDataContainer {
struct BtrfsDataContainer : public btrfs_data_container {
BtrfsDataContainer(size_t size = 64 * 1024);
void *prepare(size_t size);
size_t get_size() const;
decltype(btrfs_data_container::bytes_left) get_bytes_left() const;
decltype(btrfs_data_container::bytes_missing) get_bytes_missing() const;
decltype(btrfs_data_container::elem_cnt) get_elem_cnt() const;
decltype(btrfs_data_container::elem_missed) get_elem_missed() const;
decltype(bytes_left) get_bytes_left() const;
decltype(bytes_missing) get_bytes_missing() const;
decltype(elem_cnt) get_elem_cnt() const;
decltype(elem_missed) get_elem_missed() const;
ByteVector m_data;
vector<uint8_t> m_data;
};
struct BtrfsIoctlLogicalInoArgs {
struct BtrfsIoctlLogicalInoArgs : public btrfs_ioctl_logical_ino_args {
BtrfsIoctlLogicalInoArgs(uint64_t logical, size_t buf_size = 16 * 1024 * 1024);
uint64_t get_flags() const;
void set_flags(uint64_t new_flags);
void set_logical(uint64_t new_logical);
void set_size(uint64_t new_size);
void do_ioctl(int fd);
bool do_ioctl_nothrow(int fd);
virtual void do_ioctl(int fd);
virtual bool do_ioctl_nothrow(int fd);
size_t m_container_size;
struct BtrfsInodeOffsetRootSpan {
using iterator = BtrfsInodeOffsetRoot*;
using const_iterator = const BtrfsInodeOffsetRoot*;
@@ -85,17 +86,13 @@ namespace crucible {
const_iterator cend() const;
iterator data() const;
void clear();
operator vector<BtrfsInodeOffsetRoot>() const;
private:
iterator m_begin = nullptr;
iterator m_end = nullptr;
friend struct BtrfsIoctlLogicalInoArgs;
} m_iors;
private:
size_t m_container_size;
BtrfsDataContainer m_container;
uint64_t m_logical;
uint64_t m_flags = 0;
friend ostream & operator<<(ostream &os, const BtrfsIoctlLogicalInoArgs *p);
};
ostream & operator<<(ostream &os, const BtrfsIoctlLogicalInoArgs &p);
@@ -127,6 +124,15 @@ namespace crucible {
ostream & operator<<(ostream &os, const BtrfsIoctlDefragRangeArgs *p);
// in btrfs/ctree.h, but that's a nightmare to #include here
typedef enum {
BTRFS_COMPRESS_NONE = 0,
BTRFS_COMPRESS_ZLIB = 1,
BTRFS_COMPRESS_LZO = 2,
BTRFS_COMPRESS_ZSTD = 3,
BTRFS_COMPRESS_TYPES = 3
} btrfs_compression_type;
struct FiemapExtent : public fiemap_extent {
FiemapExtent();
FiemapExtent(const fiemap_extent &that);
@@ -135,26 +141,16 @@ namespace crucible {
off_t end() const;
};
struct Fiemap {
// because fiemap.h insists on giving FIEMAP_MAX_OFFSET
// a different type from the struct fiemap members
static const uint64_t s_fiemap_max_offset = FIEMAP_MAX_OFFSET;
struct Fiemap : public fiemap {
// Get entire file
Fiemap(uint64_t start = 0, uint64_t length = s_fiemap_max_offset);
Fiemap(uint64_t start = 0, uint64_t length = FIEMAP_MAX_OFFSET);
void do_ioctl(int fd);
vector<FiemapExtent> m_extents;
decltype(fiemap::fm_extent_count) m_min_count = (4096 - sizeof(fiemap)) / sizeof(fiemap_extent);
decltype(fiemap::fm_extent_count) m_max_count = 16 * 1024 * 1024 / sizeof(fiemap_extent);
uint64_t m_start;
uint64_t m_length;
// FIEMAP is slow and full of lies.
// This makes FIEMAP even slower, but reduces the lies a little.
decltype(fiemap::fm_flags) m_flags = FIEMAP_FLAG_SYNC;
friend ostream &operator<<(ostream &, const Fiemap &);
uint64_t m_min_count = (4096 - sizeof(fiemap)) / sizeof(fiemap_extent);
uint64_t m_max_count = 16 * 1024 * 1024 / sizeof(fiemap_extent);
};
ostream & operator<<(ostream &os, const fiemap_extent *info);
@@ -170,8 +166,8 @@ namespace crucible {
struct BtrfsIoctlSearchHeader : public btrfs_ioctl_search_header {
BtrfsIoctlSearchHeader();
ByteVector m_data;
size_t set_data(const ByteVector &v, size_t offset);
Spanner<const uint8_t> m_data;
size_t set_data(const vector<uint8_t> &v, size_t offset);
bool operator<(const BtrfsIoctlSearchHeader &that) const;
};
@@ -185,32 +181,24 @@ namespace crucible {
ostream & operator<<(ostream &os, const BtrfsIoctlSearchHeader &hdr);
struct BtrfsIoctlSearchKey : public btrfs_ioctl_search_key {
BtrfsIoctlSearchKey(size_t buf_size = 1024);
bool do_ioctl_nothrow(int fd);
void do_ioctl(int fd);
BtrfsIoctlSearchKey(size_t buf_size = 4096);
virtual bool do_ioctl_nothrow(int fd);
virtual void do_ioctl(int fd);
// Copy objectid/type/offset so we move forward
void next_min(const BtrfsIoctlSearchHeader& ref);
// move forward to next object of a single type
void next_min(const BtrfsIoctlSearchHeader& ref, const uint8_t type);
size_t m_buf_size;
vector<uint8_t> m_ioctl_arg;
set<BtrfsIoctlSearchHeader> m_result;
static thread_local size_t s_calls;
static thread_local size_t s_loops;
static thread_local size_t s_loops_empty;
static thread_local shared_ptr<ostream> s_debug_ostream;
};
ostream & operator<<(ostream &os, const btrfs_ioctl_search_key &key);
ostream & operator<<(ostream &os, const BtrfsIoctlSearchKey &key);
string btrfs_chunk_type_ntoa(uint64_t type);
string btrfs_search_type_ntoa(unsigned type);
string btrfs_search_objectid_ntoa(uint64_t objectid);
string btrfs_compress_type_ntoa(uint8_t type);
uint64_t btrfs_get_root_id(int fd);
uint64_t btrfs_get_root_transid(int fd);
@@ -245,14 +233,13 @@ namespace crucible {
unsigned long available() const;
};
struct BtrfsIoctlFsInfoArgs : public btrfs_ioctl_fs_info_args_v3 {
template<class V> ostream &hexdump(ostream &os, const V &v);
struct BtrfsIoctlFsInfoArgs : public btrfs_ioctl_fs_info_args_v2 {
BtrfsIoctlFsInfoArgs();
void do_ioctl(int fd);
bool do_ioctl_nothrow(int fd);
uint16_t csum_type() const;
uint16_t csum_size() const;
uint64_t generation() const;
vector<uint8_t> fsid() const;
};
ostream & operator<<(ostream &os, const BtrfsIoctlFsInfoArgs &a);

View File

@@ -1,38 +0,0 @@
#ifndef CRUCIBLE_HEXDUMP_H
#define CRUCIBLE_HEXDUMP_H
#include "crucible/string.h"
#include <ostream>
namespace crucible {
using namespace std;
template <class V>
ostream &
hexdump(ostream &os, const V &v)
{
const auto v_size = v.size();
const uint8_t* const v_data = reinterpret_cast<uint8_t*>(v.data());
os << "V { size = " << v_size << ", data:\n";
for (size_t i = 0; i < v_size; i += 8) {
string hex, ascii;
for (size_t j = i; j < i + 8; ++j) {
if (j < v_size) {
const uint8_t c = v_data[j];
char buf[8];
sprintf(buf, "%02x ", c);
hex += buf;
ascii += (c < 32 || c > 126) ? '.' : c;
} else {
hex += " ";
ascii += ' ';
}
}
os << astringprintf("\t%08x %s %s\n", i, hex.c_str(), ascii.c_str());
}
return os << "}";
}
};
#endif // CRUCIBLE_HEXDUMP_H

View File

@@ -117,7 +117,7 @@ namespace crucible {
while (full() || locked(name)) {
m_condvar.wait(lock);
}
auto rv = m_set.insert(make_pair(name, gettid()));
auto rv = m_set.insert(make_pair(name, crucible::gettid()));
THROW_CHECK0(runtime_error, rv.second);
}
@@ -129,7 +129,7 @@ namespace crucible {
if (full() || locked(name)) {
return false;
}
auto rv = m_set.insert(make_pair(name, gettid()));
auto rv = m_set.insert(make_pair(name, crucible::gettid()));
THROW_CHECK1(runtime_error, name, rv.second);
return true;
}

View File

@@ -1,42 +0,0 @@
#ifndef CRUCIBLE_MULTILOCK_H
#define CRUCIBLE_MULTILOCK_H
#include <condition_variable>
#include <map>
#include <memory>
#include <mutex>
#include <string>
namespace crucible {
using namespace std;
class MultiLocker {
mutex m_mutex;
condition_variable m_cv;
map<string, size_t> m_counters;
bool m_do_locking = true;
class LockHandle {
const string m_type;
MultiLocker &m_parent;
bool m_locked = false;
void set_locked(bool state);
public:
~LockHandle();
LockHandle(const string &type, MultiLocker &parent);
friend class MultiLocker;
};
friend class LockHandle;
bool is_lock_available(const string &type);
void put_lock(const string &type);
shared_ptr<LockHandle> get_lock_private(const string &type);
public:
static shared_ptr<LockHandle> get_lock(const string &type);
static void enable_locking(bool enabled);
};
}
#endif // CRUCIBLE_MULTILOCK_H

View File

@@ -12,18 +12,13 @@
namespace crucible {
using namespace std;
/// A thread-safe container for RAII of shared resources with unique names.
/// Storage for objects with unique names
template <class Return, class... Arguments>
class NamedPtr {
public:
/// The name in "NamedPtr"
using Key = tuple<Arguments...>;
/// A shared pointer to the named object with ownership
/// tracking that erases the object's stored name when
/// the last shared pointer is destroyed.
using Ptr = shared_ptr<Return>;
/// A function that translates a name into a shared pointer to an object.
using Func = function<Ptr(Arguments...)>;
private:
struct Value;
@@ -34,7 +29,6 @@ namespace crucible {
mutex m_mutex;
};
using MapPtr = shared_ptr<MapRep>;
/// Container for Return pointers. Destructor removes entry from map.
struct Value {
Ptr m_ret_ptr;
MapPtr m_map_rep;
@@ -56,21 +50,15 @@ namespace crucible {
void func(Func f);
Ptr operator()(Arguments... args);
Ptr insert(const Ptr &r, Arguments... args);
};
/// Construct NamedPtr map and define a function to turn a name into a pointer.
template <class Return, class... Arguments>
NamedPtr<Return, Arguments...>::NamedPtr(Func f) :
m_fn(f)
{
}
/// Construct a Value wrapper: the value to store, the argument key to store the value under,
/// and a pointer to the map. Everything needed to remove the key from the map when the
/// last NamedPtr is deleted. NamedPtr then releases its own pointer to the value, which
/// may or may not trigger deletion there.
template <class Return, class... Arguments>
NamedPtr<Return, Arguments...>::Value::Value(Ptr&& ret_ptr, const Key &key, const MapPtr &map_rep) :
m_ret_ptr(ret_ptr),
@@ -79,8 +67,6 @@ namespace crucible {
{
}
/// Destroy a Value wrapper: remove a dead Key from the map, then let the member destructors
/// do the rest. The Key might be in the map and not dead, so leave it alone in that case.
template <class Return, class... Arguments>
NamedPtr<Return, Arguments...>::Value::~Value()
{
@@ -96,23 +82,21 @@ namespace crucible {
// "our" map entry if it exists and is expired. The other
// thread would have done the same for us if the race had
// a different winner.
const auto found = m_map_rep->m_map.find(m_ret_key);
auto found = m_map_rep->m_map.find(m_ret_key);
if (found != m_map_rep->m_map.end() && found->second.expired()) {
m_map_rep->m_map.erase(found);
}
}
/// Find a Return by key and fetch a strong Return pointer.
/// Ignore Keys that have expired weak pointers.
template <class Return, class... Arguments>
typename NamedPtr<Return, Arguments...>::Ptr
NamedPtr<Return, Arguments...>::lookup_item(const Key &k)
{
// Must be called with lock held
const auto found = m_map_rep->m_map.find(k);
auto found = m_map_rep->m_map.find(k);
if (found != m_map_rep->m_map.end()) {
// Get the strong pointer back
const auto rv = found->second.lock();
auto rv = found->second.lock();
if (rv) {
// Have strong pointer. Return value that shares map entry.
return shared_ptr<Return>(rv, rv->m_ret_ptr.get());
@@ -125,11 +109,6 @@ namespace crucible {
return Ptr();
}
/// Insert the Return value of calling Func(Arguments...).
/// If the value already exists in the map, return the existing value.
/// If another thread is already running Func(Arguments...) then this thread
/// will block until the other thread finishes inserting the Return in the
/// map, and both threads will return the same Return value.
template <class Return, class... Arguments>
typename NamedPtr<Return, Arguments...>::Ptr
NamedPtr<Return, Arguments...>::insert_item(Func fn, Arguments... args)
@@ -137,36 +116,34 @@ namespace crucible {
Key k(args...);
// Is it already in the map?
unique_lock<mutex> lock_lookup(m_map_rep->m_mutex);
unique_lock<mutex> lock(m_map_rep->m_mutex);
auto rv = lookup_item(k);
if (rv) {
return rv;
}
// Release map lock and acquire key lock
lock_lookup.unlock();
const auto key_lock = m_lockset.make_lock(k);
lock.unlock();
auto key_lock = m_lockset.make_lock(k);
// Did item appear in map while we were waiting for key?
lock_lookup.lock();
lock.lock();
rv = lookup_item(k);
if (rv) {
return rv;
}
// We now hold key and index locks, but item not in map (or expired).
// Release map lock so other threads can use the map
lock_lookup.unlock();
// Call the function and create a new Value outside of the map
const auto new_value_ptr = make_shared<Value>(fn(args...), k, m_map_rep);
// Release map lock
lock.unlock();
// Call the function and create a new Value
auto new_value_ptr = make_shared<Value>(fn(args...), k, m_map_rep);
// Function must return a non-null pointer
THROW_CHECK0(runtime_error, new_value_ptr->m_ret_ptr);
// Reacquire index lock for map insertion. We still hold the key lock.
// Use a different lock object to make exceptions unlock in the right order
unique_lock<mutex> lock_insert(m_map_rep->m_mutex);
// Reacquire index lock for map insertion
lock.lock();
// Insert return value in map or overwrite existing
// empty or expired weak_ptr value.
@@ -181,16 +158,16 @@ namespace crucible {
// to find and fix.
assert(new_item_ref.expired());
// Update the map slot we are sure is empty
// Update the empty map slot
new_item_ref = new_value_ptr;
// Drop lock so we don't deadlock in constructor exceptions
lock.unlock();
// Return shared_ptr to Return using strong pointer's reference counter
return shared_ptr<Return>(new_value_ptr, new_value_ptr->m_ret_ptr.get());
// Release map lock, then key lock
}
/// (Re)define a function to turn a name into a pointer.
template <class Return, class... Arguments>
void
NamedPtr<Return, Arguments...>::func(Func func)
@@ -199,7 +176,6 @@ namespace crucible {
m_fn = func;
}
/// Convert a name into a pointer using the configured function.
template<class Return, class... Arguments>
typename NamedPtr<Return, Arguments...>::Ptr
NamedPtr<Return, Arguments...>::operator()(Arguments... args)
@@ -207,19 +183,14 @@ namespace crucible {
return insert_item(m_fn, args...);
}
/// Insert a pointer that has already been created under the
/// given name. Useful for inserting a pointer to a derived
/// class when the name doesn't contain all of the information
/// required for the object, or when the Return is already known by
/// some cheaper method than calling the function.
template<class Return, class... Arguments>
typename NamedPtr<Return, Arguments...>::Ptr
NamedPtr<Return, Arguments...>::insert(const Ptr &r, Arguments... args)
{
THROW_CHECK0(invalid_argument, r);
return insert_item([&](Arguments...) { return r; }, args...);
return insert_item([&](Arguments...) -> Ptr { return r; }, args...);
}
}
#endif // CRUCIBLE_NAMEDPTR_H
#endif // NAMEDPTR_H

View File

@@ -20,7 +20,7 @@ namespace crucible {
#define NTOA_TABLE_ENTRY_BITS(x) { .n = (x), .mask = (x), .a = (#x) }
// Enumerations (entire value matches all bits)
#define NTOA_TABLE_ENTRY_ENUM(x) { .n = (x), .mask = ~0ULL, .a = (#x) }
#define NTOA_TABLE_ENTRY_ENUM(x) { .n = (x), .mask = ~0UL, .a = (#x) }
// End of table (sorry, C++ didn't get C99's compound literals, so we have to write out all the member names)
#define NTOA_TABLE_ENTRY_END() { .n = 0, .mask = 0, .a = nullptr }

View File

@@ -1,52 +0,0 @@
#ifndef CRUCIBLE_OPENAT2_H
#define CRUCIBLE_OPENAT2_H
#include <cstdlib>
// Compatibility for building on old libc for new kernel
#include <linux/version.h>
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0)
#include <linux/openat2.h>
#else
#include <linux/types.h>
#ifndef RESOLVE_NO_XDEV
#define RESOLVE_NO_XDEV 1
// RESOLVE_NO_XDEV was there from the beginning of openat2,
// so if that's missing, so is open_how
struct open_how {
__u64 flags;
__u64 mode;
__u64 resolve;
};
#endif
#ifndef RESOLVE_NO_MAGICLINKS
#define RESOLVE_NO_MAGICLINKS 2
#endif
#ifndef RESOLVE_NO_SYMLINKS
#define RESOLVE_NO_SYMLINKS 4
#endif
#ifndef RESOLVE_BENEATH
#define RESOLVE_BENEATH 8
#endif
#ifndef RESOLVE_IN_ROOT
#define RESOLVE_IN_ROOT 16
#endif
#endif // Linux version >= v5.6
extern "C" {
/// Weak symbol to support libc with no syscall wrapper
int openat2(int dirfd, const char *pathname, struct open_how *how, size_t size) throw();
};
#endif // CRUCIBLE_OPENAT2_H

View File

@@ -10,10 +10,6 @@
#include <sys/wait.h>
#include <unistd.h>
extern "C" {
pid_t gettid() throw();
};
namespace crucible {
using namespace std;
@@ -77,6 +73,7 @@ namespace crucible {
typedef ResourceHandle<Process::id, Process> Pid;
pid_t gettid();
double getloadavg1();
double getloadavg5();
double getloadavg15();

View File

@@ -4,20 +4,13 @@
#include "crucible/error.h"
#include <functional>
#include <map>
#include <memory>
#include <mutex>
#include <set>
#include <cassert>
namespace crucible {
using namespace std;
/// A class to track progress of multiple workers using only two points:
/// the first and last incomplete state. The first incomplete
/// state can be recorded as a checkpoint to resume later on.
/// The last completed state is the starting point for workers that
/// need something to do.
template <class T>
class ProgressTracker {
struct ProgressTrackerState;
@@ -26,17 +19,9 @@ namespace crucible {
using value_type = T;
using ProgressHolder = shared_ptr<ProgressHolderState>;
/// Create ProgressTracker with initial begin and end state 'v'.
ProgressTracker(const value_type &v);
/// The first incomplete state. This is not "sticky",
/// it will revert to the end state if there are no
/// items in progress.
value_type begin() const;
/// The last incomplete state. This is "sticky",
/// it can only increase and never decrease.
value_type end() const;
value_type begin();
value_type end();
ProgressHolder hold(const value_type &v);
@@ -46,7 +31,7 @@ namespace crucible {
struct ProgressTrackerState {
using key_type = pair<value_type, ProgressHolderState *>;
mutex m_mutex;
set<key_type> m_in_progress;
map<key_type, bool> m_in_progress;
value_type m_begin;
value_type m_end;
};
@@ -54,7 +39,6 @@ namespace crucible {
class ProgressHolderState {
shared_ptr<ProgressTrackerState> m_state;
const value_type m_value;
using key_type = typename ProgressTrackerState::key_type;
public:
ProgressHolderState(shared_ptr<ProgressTrackerState> state, const value_type &v);
~ProgressHolderState();
@@ -67,7 +51,7 @@ namespace crucible {
template <class T>
typename ProgressTracker<T>::value_type
ProgressTracker<T>::begin() const
ProgressTracker<T>::begin()
{
unique_lock<mutex> lock(m_state->m_mutex);
return m_state->m_begin;
@@ -75,7 +59,7 @@ namespace crucible {
template <class T>
typename ProgressTracker<T>::value_type
ProgressTracker<T>::end() const
ProgressTracker<T>::end()
{
unique_lock<mutex> lock(m_state->m_mutex);
return m_state->m_end;
@@ -102,11 +86,7 @@ namespace crucible {
m_value(v)
{
unique_lock<mutex> lock(m_state->m_mutex);
const auto rv = m_state->m_in_progress.insert(key_type(m_value, this));
THROW_CHECK1(runtime_error, m_value, rv.second);
// Set the beginning to the first existing in-progress item
m_state->m_begin = m_state->m_in_progress.begin()->first;
// If this value is past the end, move the end, but don't go backwards
m_state->m_in_progress[make_pair(m_value, this)] = true;
if (m_state->m_end < m_value) {
m_state->m_end = m_value;
}
@@ -116,15 +96,17 @@ namespace crucible {
ProgressTracker<T>::ProgressHolderState::~ProgressHolderState()
{
unique_lock<mutex> lock(m_state->m_mutex);
const auto rv = m_state->m_in_progress.erase(key_type(m_value, this));
// THROW_CHECK2(runtime_error, m_value, rv, rv == 1);
assert(rv == 1);
if (m_state->m_in_progress.empty()) {
// If we made the list empty, then m_begin == m_end
m_state->m_begin = m_state->m_end;
} else {
// If we deleted the first element, then m_begin = current first element
m_state->m_begin = m_state->m_in_progress.begin()->first;
m_state->m_in_progress[make_pair(m_value, this)] = false;
auto p = m_state->m_in_progress.begin();
while (p != m_state->m_in_progress.end()) {
if (p->second) {
break;
}
if (m_state->m_begin < p->first.first) {
m_state->m_begin = p->first.first;
}
m_state->m_in_progress.erase(p);
p = m_state->m_in_progress.begin();
}
}

View File

@@ -1,163 +0,0 @@
#ifndef _CRUCIBLE_SEEKER_H_
#define _CRUCIBLE_SEEKER_H_
#include "crucible/error.h"
#include <algorithm>
#include <limits>
#include <cstdint>
#if 0
#include <iostream>
#include <sstream>
#define DINIT(__x) __x
#define DLOG(__x) do { logs << __x << std::endl; } while (false)
#define DOUT(__err) do { __err << logs.str(); } while (false)
#else
#define DINIT(__x) do {} while (false)
#define DLOG(__x) do {} while (false)
#define DOUT(__x) do {} while (false)
#endif
namespace crucible {
using namespace std;
// Requirements for Container<Pos> Fetch(Pos lower, Pos upper):
// - fetches objects in Pos order, starting from lower (must be >= lower)
// - must return upper if present, may or may not return objects after that
// - returns a container of Pos objects with begin(), end(), rbegin(), rend()
// - container must iterate over objects in Pos order
// - uniqueness of Pos objects not required
// - should store the underlying data as a side effect
//
// Requirements for Pos:
// - should behave like an unsigned integer type
// - must have specializations in numeric_limits<T> for digits, max(), min()
// - must support +, -, -=, and related operators
// - must support <, <=, ==, and related operators
// - must support Pos / 2 (only)
//
// Requirements for seek_backward:
// - calls Fetch to search Pos space near target_pos
// - if no key exists with value <= target_pos, returns the minimum Pos value
// - returns the highest key value <= target_pos
// - returned key value may not be part of most recent Fetch result
// - 1 loop iteration when target_pos exists
template <class Fetch, class Pos = uint64_t>
Pos
seek_backward(Pos const target_pos, Fetch fetch, Pos min_step = 1, size_t max_loops = numeric_limits<size_t>::max())
{
DINIT(ostringstream logs);
try {
static const Pos end_pos = numeric_limits<Pos>::max();
// TBH this probably won't work if begin_pos != 0, i.e. any signed type
static const Pos begin_pos = numeric_limits<Pos>::min();
// Run a binary search looking for the highest key below target_pos.
// Initial upper bound of the search is target_pos.
// Find initial lower bound by doubling the size of the range until a key below target_pos
// is found, or the lower bound reaches the beginning of the search space.
// If the lower bound search reaches the beginning of the search space without finding a key,
// return the beginning of the search space; otherwise, perform a binary search between
// the bounds now established.
Pos lower_bound = 0;
Pos upper_bound = target_pos;
bool found_low = false;
Pos probe_pos = target_pos;
// We need one loop for each bit of the search space to find the lower bound,
// one loop for each bit of the search space to find the upper bound,
// and one extra loop to confirm the boundary is correct.
for (size_t loop_count = min(numeric_limits<Pos>::digits * size_t(2) + 1, max_loops); loop_count; --loop_count) {
DLOG("fetch(probe_pos = " << probe_pos << ", target_pos = " << target_pos << ")");
auto result = fetch(probe_pos, target_pos);
const Pos low_pos = result.empty() ? end_pos : *result.begin();
const Pos high_pos = result.empty() ? end_pos : *result.rbegin();
DLOG(" = " << low_pos << ".." << high_pos);
// check for correct behavior of the fetch function
THROW_CHECK2(out_of_range, high_pos, probe_pos, probe_pos <= high_pos);
THROW_CHECK2(out_of_range, low_pos, probe_pos, probe_pos <= low_pos);
THROW_CHECK2(out_of_range, low_pos, high_pos, low_pos <= high_pos);
if (!found_low) {
// if target_pos == end_pos then we will find it in every empty result set,
// so in that case we force the lower bound to be lower than end_pos
if ((target_pos == end_pos) ? (low_pos < target_pos) : (low_pos <= target_pos)) {
// found a lower bound, set the low bound there and switch to binary search
found_low = true;
lower_bound = low_pos;
DLOG("found_low = true, lower_bound = " << lower_bound);
} else {
// still looking for lower bound
// if probe_pos was begin_pos then we can stop with no result
if (probe_pos == begin_pos) {
DLOG("return: probe_pos == begin_pos " << begin_pos);
return begin_pos;
}
// double the range size, or use the distance between objects found so far
THROW_CHECK2(out_of_range, upper_bound, probe_pos, probe_pos <= upper_bound);
// already checked low_pos <= high_pos above
const Pos want_delta = max(upper_bound - probe_pos, min_step);
// avoid underflowing the beginning of the search space
const Pos have_delta = min(want_delta, probe_pos - begin_pos);
THROW_CHECK2(out_of_range, want_delta, have_delta, have_delta <= want_delta);
// move probe and try again
probe_pos = probe_pos - have_delta;
DLOG("probe_pos " << probe_pos << " = probe_pos - have_delta " << have_delta << " (want_delta " << want_delta << ")");
continue;
}
}
if (low_pos <= target_pos && target_pos <= high_pos) {
// have keys on either side of target_pos in result
// search from the high end until we find the highest key below target
for (auto i = result.rbegin(); i != result.rend(); ++i) {
// more correctness checking for fetch
THROW_CHECK2(out_of_range, *i, probe_pos, probe_pos <= *i);
if (*i <= target_pos) {
DLOG("return: *i " << *i << " <= target_pos " << target_pos);
return *i;
}
}
// if the list is empty then low_pos = high_pos = end_pos
// if target_pos = end_pos also, then we will execute the loop
// above but not find any matching entries.
THROW_CHECK0(runtime_error, result.empty());
}
if (target_pos <= low_pos) {
// results are all too high, so probe_pos..low_pos is too high
// lower the high bound to the probe pos
upper_bound = probe_pos;
DLOG("upper_bound = probe_pos " << probe_pos);
}
if (high_pos < target_pos) {
// results are all too low, so probe_pos..high_pos is too low
// raise the low bound to the high_pos
DLOG("lower_bound = high_pos " << high_pos);
lower_bound = high_pos;
}
// compute a new probe pos at the middle of the range and try again
// we can't have a zero-size range here because we would not have set found_low yet
THROW_CHECK2(out_of_range, lower_bound, upper_bound, lower_bound <= upper_bound);
const Pos delta = (upper_bound - lower_bound) / 2;
probe_pos = lower_bound + delta;
if (delta < 1) {
// nothing can exist in the range (lower_bound, upper_bound)
// and an object is known to exist at lower_bound
DLOG("return: probe_pos == lower_bound " << lower_bound);
return lower_bound;
}
THROW_CHECK2(out_of_range, lower_bound, probe_pos, lower_bound <= probe_pos);
THROW_CHECK2(out_of_range, upper_bound, probe_pos, probe_pos <= upper_bound);
DLOG("loop: lower_bound " << lower_bound << ", probe_pos " << probe_pos << ", upper_bound " << upper_bound);
}
THROW_ERROR(runtime_error, "FIXME: should not reach this line: "
"lower_bound..upper_bound " << lower_bound << ".." << upper_bound << ", "
"found_low " << found_low);
} catch (...) {
DOUT(cerr);
throw;
}
}
}
#endif // _CRUCIBLE_SEEKER_H_

167
include/crucible/spanner.h Normal file
View File

@@ -0,0 +1,167 @@
#ifndef CRUCIBLE_SPANNER_H
#define CRUCIBLE_SPANNER_H
#include "crucible/error.h"
#include <memory>
namespace crucible {
using namespace std;
// C++20 is already using the name "span" for something similar.
template <class T, class Head = T*, class Iter = Head>
class Spanner {
public:
using iterator = Iter;
using head_pointer = Head;
using value_type = T;
template <class Container>
Spanner(Container& container);
Spanner(head_pointer begin, iterator end);
Spanner(size_t size, head_pointer begin);
Spanner() = default;
Spanner &operator=(const Spanner &that) = default;
iterator begin() const;
iterator end() const;
value_type *data() const;
value_type &at(size_t n) const;
size_t size() const;
bool empty() const;
void clear();
value_type &operator[](size_t n) const;
iterator erase(iterator first, iterator last);
iterator erase(iterator first);
private:
head_pointer m_begin;
size_t m_size;
};
template <class Container, class Head = typename Container::value_type *, class Iter = Head>
Spanner<typename Container::value_type, Head, Iter> make_spanner(Container &container)
{
return Spanner<typename Container::value_type, Head, Iter>(container);
}
// This template is an attempt to turn a shared_ptr to a container
// into a range view that can be cheaply passed around.
// It probably doesn't quite work in the general case.
template <class Container, class Head = shared_ptr<typename Container::value_type>, class Iter = typename Container::value_type *>
Spanner<typename Container::value_type, Head, Iter> make_spanner(shared_ptr<Container> &cont_ptr)
{
shared_ptr<typename Container::value_type> head(cont_ptr, cont_ptr->data());
size_t const size = cont_ptr->size();
return Spanner<typename Container::value_type, Head, Iter>(size, head);
}
template <class T, class Head, class Iter>
template <class Container>
Spanner<T, Head, Iter>::Spanner(Container &container) :
m_begin(container.data()),
m_size(container.size())
{
}
template <class T, class Head, class Iter>
Spanner<T, Head, Iter>::Spanner(head_pointer begin, iterator end) :
m_begin(begin),
m_size(end - begin)
{
}
template <class T, class Head, class Iter>
Spanner<T, Head, Iter>::Spanner(size_t size, head_pointer begin) :
m_begin(begin),
m_size(size)
{
}
template <class T, class Head, class Iter>
typename Spanner<T, Head, Iter>::iterator
Spanner<T, Head, Iter>::erase(iterator first, iterator last)
{
auto end = m_begin + m_size;
if (first == m_begin) {
THROW_CHECK0(invalid_argument, last <= end);
m_begin = last;
return last;
}
if (last == end) {
THROW_CHECK0(invalid_argument, m_begin <= first);
m_size = first - m_begin;
return first;
}
THROW_ERROR(invalid_argument, "first != begin() and last != end()");
}
template <class T, class Head, class Iter>
typename Spanner<T, Head, Iter>::iterator
Spanner<T, Head, Iter>::erase(iterator first)
{
return erase(first, first + 1);
}
template <class T, class Head, class Iter>
typename Spanner<T, Head, Iter>::value_type &
Spanner<T, Head, Iter>::operator[](size_t n) const
{
return at(n);
}
template <class T, class Head, class Iter>
void
Spanner<T, Head, Iter>::clear()
{
m_begin = head_pointer();
m_size = 0;
}
template <class T, class Head, class Iter>
bool
Spanner<T, Head, Iter>::empty() const
{
return m_size == 0;
}
template <class T, class Head, class Iter>
size_t
Spanner<T, Head, Iter>::size() const
{
return m_size;
}
template <class T, class Head, class Iter>
typename Spanner<T, Head, Iter>::value_type *
Spanner<T, Head, Iter>::data() const
{
return &(*m_begin);
}
template <class T, class Head, class Iter>
typename Spanner<T, Head, Iter>::iterator
Spanner<T, Head, Iter>::begin() const
{
return data();
}
template <class T, class Head, class Iter>
typename Spanner<T, Head, Iter>::iterator
Spanner<T, Head, Iter>::end() const
{
return data() + m_size;
}
template <class T, class Head, class Iter>
typename Spanner<T, Head, Iter>::value_type &
Spanner<T, Head, Iter>::at(size_t n) const
{
THROW_CHECK2(out_of_range, n, size(), n < size());
return *(data() + n);
}
}
#endif // CRUCIBLE_SPANNER_H

View File

@@ -11,6 +11,23 @@
namespace crucible {
using namespace std;
// Zero-initialize a base class object (usually a C struct)
template <class Base>
void
memset_zero(Base *that)
{
memset(that, 0, sizeof(Base));
}
// Copy a base class object (usually a C struct) into a vector<uint8_t>
template <class Base>
vector<uint8_t>
vector_copy_struct(Base *that)
{
const uint8_t *begin_that = reinterpret_cast<const uint8_t *>(static_cast<const Base *>(that));
return vector<uint8_t>(begin_that, begin_that + sizeof(Base));
}
// int->hex conversion with sprintf
string to_hex(uint64_t i);

View File

@@ -1,106 +0,0 @@
#ifndef CRUCIBLE_TABLE_H
#define CRUCIBLE_TABLE_H
#include <functional>
#include <limits>
#include <map>
#include <memory>
#include <ostream>
#include <sstream>
#include <string>
#include <vector>
namespace crucible {
namespace Table {
using namespace std;
using Content = function<string(size_t width, size_t height)>;
const size_t endpos = numeric_limits<size_t>::max();
Content Fill(const char c);
Content Text(const string& s);
template <class T>
Content Number(const T& num)
{
ostringstream oss;
oss << num;
return Text(oss.str());
}
class Cell {
Content m_content;
public:
Cell(const Content &fn = [](size_t, size_t) { return string(); } );
Cell& operator=(const Content &fn);
string text(size_t width, size_t height) const;
};
class Dimension {
size_t m_next_pos = 0;
vector<size_t> m_elements;
friend class Table;
size_t at(size_t) const;
public:
size_t size() const;
size_t insert(size_t pos);
void erase(size_t pos);
};
class Table {
Dimension m_rows, m_cols;
map<pair<size_t, size_t>, Cell> m_cells;
string m_left = "|";
string m_mid = "|";
string m_right = "|";
public:
Dimension &rows();
const Dimension& rows() const;
Dimension &cols();
const Dimension& cols() const;
Cell& at(size_t row, size_t col);
const Cell& at(size_t row, size_t col) const;
template <class T> void insert_row(size_t pos, const T& container);
template <class T> void insert_col(size_t pos, const T& container);
void left(const string &s);
void mid(const string &s);
void right(const string &s);
const string& left() const;
const string& mid() const;
const string& right() const;
};
ostream& operator<<(ostream &os, const Table &table);
template <class T>
void
Table::insert_row(size_t pos, const T& container)
{
const auto new_pos = m_rows.insert(pos);
size_t col = 0;
for (const auto &i : container) {
if (col >= cols().size()) {
cols().insert(col);
}
at(new_pos, col++) = i;
}
}
template <class T>
void
Table::insert_col(size_t pos, const T& container)
{
const auto new_pos = m_cols.insert(pos);
size_t row = 0;
for (const auto &i : container) {
if (row >= rows().size()) {
rows().insert(row);
}
at(row++, new_pos) = i;
}
}
}
}
#endif // CRUCIBLE_TABLE_H

View File

@@ -3,7 +3,6 @@
#include <functional>
#include <memory>
#include <mutex>
#include <ostream>
#include <string>
@@ -40,17 +39,10 @@ namespace crucible {
/// after the current instance exits.
void run() const;
/// Schedule task to run when no other Task is available.
void idle() const;
/// Schedule Task to run after this Task has run or
/// been destroyed.
void append(const Task &task) const;
/// Schedule Task to run after this Task has run or
/// been destroyed, in Task ID order.
void insert(const Task &task) const;
/// Describe Task as text.
string title() const;
@@ -100,89 +92,92 @@ namespace crucible {
/// Gets the current number of active workers
static size_t get_thread_count();
/// Gets the current load tracking statistics
struct LoadStats {
/// Current load extracted from last two 5-second load average samples
double current_load;
/// Target thread count computed from previous thread count and current load
double thread_target;
/// Load average for last 60 seconds
double loadavg;
};
static LoadStats get_current_load();
/// Drop the current queue and discard new Tasks without
/// running them. Currently executing tasks are not
/// affected (use set_thread_count(0) to wait for those
/// to complete).
static void cancel();
/// Stop running any new Tasks. All existing
/// Consumer threads will exit. Does not affect queue.
/// Does not wait for threads to exit. Reversible.
static void pause(bool paused = true);
};
// Barrier executes waiting Tasks once the last BarrierLock
// is released. Multiple unique Tasks may be scheduled while
// BarrierLocks exist and all will be run() at once upon
// release. If no BarrierLocks exist, Tasks are executed
// immediately upon insertion.
class BarrierState;
/// Barrier delays the execution of one or more Tasks.
/// The Tasks are executed when the last shared reference to the
/// BarrierState is released. Copies of Barrier objects refer
/// to the same Barrier state.
class Barrier {
class BarrierLock {
shared_ptr<BarrierState> m_barrier_state;
BarrierLock(shared_ptr<BarrierState> pbs);
friend class Barrier;
public:
Barrier();
/// Schedule a task for execution when last Barrier is released.
void insert_task(Task t);
/// Release this reference to the barrier state.
/// Last released reference executes the task.
/// Barrier can only be released once, after which the
/// object can no longer be used.
// Release this Lock immediately and permanently
void release();
};
class Barrier {
shared_ptr<BarrierState> m_barrier_state;
Barrier(shared_ptr<BarrierState> pbs);
public:
Barrier();
// Prevent execution of tasks behind barrier until
// BarrierLock destructor or release() method is called.
BarrierLock lock();
// Schedule a task for execution when no Locks exist
void insert_task(Task t);
};
// Exclusion provides exclusive access to a ExclusionLock.
// One Task will be able to obtain the ExclusionLock; other Tasks
// may schedule themselves for re-execution after the ExclusionLock
// is released.
class ExclusionState;
class Exclusion;
class ExclusionLock {
shared_ptr<Task> m_owner;
ExclusionLock(shared_ptr<Task> owner);
shared_ptr<ExclusionState> m_exclusion_state;
ExclusionLock(shared_ptr<ExclusionState> pes);
ExclusionLock() = default;
friend class Exclusion;
public:
/// Explicit default constructor because we have other kinds
ExclusionLock() = default;
// Calls release()
~ExclusionLock();
/// Release this Lock immediately and permanently
// Release this Lock immediately and permanently
void release();
/// Test for locked state
// Test for locked state
operator bool() const;
};
class Exclusion {
mutex m_mutex;
weak_ptr<Task> m_owner;
shared_ptr<ExclusionState> m_exclusion_state;
Exclusion(shared_ptr<ExclusionState> pes);
public:
/// Attempt to obtain a Lock. If successful, current Task
/// owns the Lock until the ExclusionLock is released
/// (it is the ExclusionLock that owns the lock, so it can
/// be passed to other Tasks or threads, but this is not
/// recommended practice).
/// If not successful, the argument Task is appended to the
/// task that currently holds the lock. Current task is
/// expected to immediately release any other ExclusionLock
/// objects it holds, and exit its Task function.
ExclusionLock try_lock(const Task &task);
Exclusion(const string &title);
// Attempt to obtain a Lock. If successful, current Task
// owns the Lock until the ExclusionLock is released
// (it is the ExclusionLock that owns the lock, so it can
// be passed to other Tasks or threads, but this is not
// recommended practice).
// If not successful, current Task is expected to call
// insert_task(current_task()), release any ExclusionLock
// objects it holds, and exit its Task function.
ExclusionLock try_lock();
// Execute Task when Exclusion is unlocked (possibly
// immediately).
void insert_task(Task t = Task::current_task());
};
/// Wrapper around pthread_setname_np which handles length limits
void pthread_setname(const string &name);
/// Wrapper around pthread_getname_np for symmetry
string pthread_getname();
}
#endif // CRUCIBLE_TASK_H

View File

@@ -34,7 +34,7 @@ namespace crucible {
double m_rate;
double m_burst;
double m_tokens = 0.0;
mutable mutex m_mutex;
mutex m_mutex;
void update_tokens();
RateLimiter() = delete;
@@ -45,8 +45,6 @@ namespace crucible {
double sleep_time(double cost = 1.0);
bool is_ready();
void borrow(double cost = 1.0);
void rate(double new_rate);
double rate() const;
};
class RateEstimator {
@@ -90,9 +88,6 @@ namespace crucible {
// Read count
uint64_t count() const;
/// Increment count (like update(count() + more), but atomic)
void increment(uint64_t more = 1);
// Convert counts to chrono types
chrono::high_resolution_clock::time_point time_point(uint64_t absolute_count) const;
chrono::duration<double> duration(uint64_t relative_count) const;

View File

@@ -1,14 +0,0 @@
#ifndef CRUCIBLE_UNAME_H
#define CRUCIBLE_UNAME_H
#include <sys/utsname.h>
namespace crucible {
using namespace std;
struct Uname : public utsname {
Uname();
};
}
#endif

View File

@@ -1,9 +1,9 @@
TAG ?= $(shell git describe --always --dirty || echo UNKNOWN)
default: libcrucible.a
%.a: Makefile
CRUCIBLE_OBJS = \
bytevector.o \
btrfs-tree.o \
chatter.o \
city.o \
cleanup.o \
@@ -12,16 +12,12 @@ CRUCIBLE_OBJS = \
extentwalker.o \
fd.o \
fs.o \
multilock.o \
ntoa.o \
openat2.o \
path.o \
process.o \
string.o \
table.o \
task.o \
time.o \
uname.o \
include ../makeflags
-include ../localconf
@@ -32,13 +28,24 @@ BEES_LDFLAGS = $(LDFLAGS)
configure.h: configure.h.in
$(TEMPLATE_COMPILER)
%.dep: %.cc configure.h Makefile
.depends:
mkdir -p $@
.depends/%.dep: %.cc configure.h Makefile | .depends
$(CXX) $(BEES_CXXFLAGS) -M -MF $@ -MT $(<:.cc=.o) $<
include $(CRUCIBLE_OBJS:%.o=%.dep)
depends.mk: $(CRUCIBLE_OBJS:%.o=.depends/%.dep)
cat $^ > $@.new
mv -f $@.new $@
.version.cc: configure.h Makefile ../makeflags $(CRUCIBLE_OBJS:.o=.cc) ../include/crucible/*.h
echo "namespace crucible { const char *VERSION = \"$(TAG)\"; }" > $@.new
if ! cmp "$@.new" "$@"; then mv -fv $@.new $@; fi
include depends.mk
%.o: %.cc ../makeflags
$(CXX) $(BEES_CXXFLAGS) -o $@ -c $<
libcrucible.a: $(CRUCIBLE_OBJS)
libcrucible.a: $(CRUCIBLE_OBJS) .version.o
$(AR) rcs $@ $^

View File

@@ -1,783 +0,0 @@
#include "crucible/btrfs-tree.h"
#include "crucible/btrfs.h"
#include "crucible/error.h"
#include "crucible/fs.h"
#include "crucible/hexdump.h"
#include "crucible/seeker.h"
#define CRUCIBLE_BTRFS_TREE_DEBUG(x) do { \
if (BtrfsIoctlSearchKey::s_debug_ostream) { \
(*BtrfsIoctlSearchKey::s_debug_ostream) << x; \
} \
} while (false)
namespace crucible {
using namespace std;
uint64_t
BtrfsTreeItem::extent_begin() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_EXTENT_ITEM_KEY);
return m_objectid;
}
uint64_t
BtrfsTreeItem::extent_end() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_EXTENT_ITEM_KEY);
return m_objectid + m_offset;
}
uint64_t
BtrfsTreeItem::extent_flags() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_EXTENT_ITEM_KEY);
return btrfs_get_member(&btrfs_extent_item::flags, m_data);
}
uint64_t
BtrfsTreeItem::extent_generation() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_EXTENT_ITEM_KEY);
return btrfs_get_member(&btrfs_extent_item::generation, m_data);
}
uint64_t
BtrfsTreeItem::root_ref_dirid() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_ROOT_BACKREF_KEY);
return btrfs_get_member(&btrfs_root_ref::dirid, m_data);
}
string
BtrfsTreeItem::root_ref_name() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_ROOT_BACKREF_KEY);
const auto name_len = btrfs_get_member(&btrfs_root_ref::name_len, m_data);
const auto name_start = sizeof(struct btrfs_root_ref);
const auto name_end = name_len + name_start;
THROW_CHECK2(runtime_error, m_data.size(), name_end, m_data.size() >= name_end);
return string(m_data.data() + name_start, m_data.data() + name_end);
}
uint64_t
BtrfsTreeItem::root_ref_parent_rootid() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_ROOT_BACKREF_KEY);
return offset();
}
uint64_t
BtrfsTreeItem::root_flags() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_ROOT_ITEM_KEY);
return btrfs_get_member(&btrfs_root_item::flags, m_data);
}
uint64_t
BtrfsTreeItem::root_refs() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_ROOT_ITEM_KEY);
return btrfs_get_member(&btrfs_root_item::refs, m_data);
}
ostream &
operator<<(ostream &os, const BtrfsTreeItem &bti)
{
os << "BtrfsTreeItem {"
<< " objectid = " << to_hex(bti.objectid())
<< ", type = " << btrfs_search_type_ntoa(bti.type())
<< ", offset = " << to_hex(bti.offset())
<< ", transid = " << bti.transid()
<< ", data = ";
hexdump(os, bti.data());
return os;
}
uint64_t
BtrfsTreeItem::block_group_flags() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_BLOCK_GROUP_ITEM_KEY);
return btrfs_get_member(&btrfs_block_group_item::flags, m_data);
}
uint64_t
BtrfsTreeItem::block_group_used() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_BLOCK_GROUP_ITEM_KEY);
return btrfs_get_member(&btrfs_block_group_item::used, m_data);
}
uint64_t
BtrfsTreeItem::chunk_length() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_CHUNK_ITEM_KEY);
return btrfs_get_member(&btrfs_chunk::length, m_data);
}
uint64_t
BtrfsTreeItem::chunk_type() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_CHUNK_ITEM_KEY);
return btrfs_get_member(&btrfs_chunk::type, m_data);
}
uint64_t
BtrfsTreeItem::dev_extent_chunk_offset() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_DEV_EXTENT_KEY);
return btrfs_get_member(&btrfs_dev_extent::chunk_offset, m_data);
}
uint64_t
BtrfsTreeItem::dev_extent_length() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_DEV_EXTENT_KEY);
return btrfs_get_member(&btrfs_dev_extent::length, m_data);
}
uint64_t
BtrfsTreeItem::dev_item_total_bytes() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_DEV_ITEM_KEY);
return btrfs_get_member(&btrfs_dev_item::total_bytes, m_data);
}
uint64_t
BtrfsTreeItem::dev_item_bytes_used() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_DEV_ITEM_KEY);
return btrfs_get_member(&btrfs_dev_item::bytes_used, m_data);
}
uint64_t
BtrfsTreeItem::inode_size() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_INODE_ITEM_KEY);
return btrfs_get_member(&btrfs_inode_item::size, m_data);
}
uint64_t
BtrfsTreeItem::file_extent_logical_bytes() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_EXTENT_DATA_KEY);
const auto file_extent_item_type = btrfs_get_member(&btrfs_file_extent_item::type, m_data);
switch (file_extent_item_type) {
case BTRFS_FILE_EXTENT_INLINE:
return btrfs_get_member(&btrfs_file_extent_item::ram_bytes, m_data);
case BTRFS_FILE_EXTENT_PREALLOC:
case BTRFS_FILE_EXTENT_REG:
return btrfs_get_member(&btrfs_file_extent_item::num_bytes, m_data);
default:
THROW_ERROR(runtime_error, "unknown btrfs_file_extent_item type " << file_extent_item_type);
}
}
uint64_t
BtrfsTreeItem::file_extent_offset() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_EXTENT_DATA_KEY);
const auto file_extent_item_type = btrfs_get_member(&btrfs_file_extent_item::type, m_data);
switch (file_extent_item_type) {
case BTRFS_FILE_EXTENT_INLINE:
THROW_ERROR(invalid_argument, "extent is inline " << *this);
case BTRFS_FILE_EXTENT_PREALLOC:
case BTRFS_FILE_EXTENT_REG:
return btrfs_get_member(&btrfs_file_extent_item::offset, m_data);
default:
THROW_ERROR(runtime_error, "unknown btrfs_file_extent_item type " << file_extent_item_type << " in " << *this);
}
}
uint64_t
BtrfsTreeItem::file_extent_generation() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_EXTENT_DATA_KEY);
return btrfs_get_member(&btrfs_file_extent_item::generation, m_data);
}
uint64_t
BtrfsTreeItem::file_extent_bytenr() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_EXTENT_DATA_KEY);
auto file_extent_item_type = btrfs_get_member(&btrfs_file_extent_item::type, m_data);
switch (file_extent_item_type) {
case BTRFS_FILE_EXTENT_INLINE:
THROW_ERROR(invalid_argument, "extent is inline " << *this);
case BTRFS_FILE_EXTENT_PREALLOC:
case BTRFS_FILE_EXTENT_REG:
return btrfs_get_member(&btrfs_file_extent_item::disk_bytenr, m_data);
default:
THROW_ERROR(runtime_error, "unknown btrfs_file_extent_item type " << file_extent_item_type << " in " << *this);
}
}
uint8_t
BtrfsTreeItem::file_extent_type() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_EXTENT_DATA_KEY);
return btrfs_get_member(&btrfs_file_extent_item::type, m_data);
}
btrfs_compression_type
BtrfsTreeItem::file_extent_compression() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_EXTENT_DATA_KEY);
return static_cast<btrfs_compression_type>(btrfs_get_member(&btrfs_file_extent_item::compression, m_data));
}
BtrfsTreeItem::BtrfsTreeItem(const BtrfsIoctlSearchHeader &bish) :
m_objectid(bish.objectid),
m_offset(bish.offset),
m_transid(bish.transid),
m_data(bish.m_data),
m_type(bish.type)
{
}
BtrfsTreeItem &
BtrfsTreeItem::operator=(const BtrfsIoctlSearchHeader &bish)
{
m_objectid = bish.objectid;
m_offset = bish.offset;
m_transid = bish.transid;
m_data = bish.m_data;
m_type = bish.type;
return *this;
}
bool
BtrfsTreeItem::operator!() const
{
return m_transid == 0 && m_objectid == 0 && m_offset == 0 && m_type == 0;
}
uint64_t
BtrfsTreeFetcher::block_size() const
{
return m_block_size;
}
BtrfsTreeFetcher::BtrfsTreeFetcher(Fd new_fd) :
m_fd(new_fd)
{
BtrfsIoctlFsInfoArgs bifia;
bifia.do_ioctl(fd());
m_block_size = bifia.sectorsize;
THROW_CHECK1(runtime_error, m_block_size, m_block_size > 0);
// We don't believe sector sizes that aren't multiples of 4K
THROW_CHECK1(runtime_error, m_block_size, (m_block_size % 4096) == 0);
m_lookbehind_size = 128 * 1024;
m_scale_size = m_block_size;
}
Fd
BtrfsTreeFetcher::fd() const
{
return m_fd;
}
void
BtrfsTreeFetcher::fd(Fd fd)
{
m_fd = fd;
}
void
BtrfsTreeFetcher::type(uint8_t type)
{
m_type = type;
}
uint8_t
BtrfsTreeFetcher::type()
{
return m_type;
}
void
BtrfsTreeFetcher::tree(uint64_t tree)
{
m_tree = tree;
}
uint64_t
BtrfsTreeFetcher::tree()
{
return m_tree;
}
void
BtrfsTreeFetcher::transid(uint64_t min_transid, uint64_t max_transid)
{
m_min_transid = min_transid;
m_max_transid = max_transid;
}
uint64_t
BtrfsTreeFetcher::lookbehind_size() const
{
return m_lookbehind_size;
}
void
BtrfsTreeFetcher::lookbehind_size(uint64_t lookbehind_size)
{
m_lookbehind_size = lookbehind_size;
}
uint64_t
BtrfsTreeFetcher::scale_size() const
{
return m_scale_size;
}
void
BtrfsTreeFetcher::scale_size(uint64_t scale_size)
{
m_scale_size = scale_size;
}
void
BtrfsTreeFetcher::fill_sk(BtrfsIoctlSearchKey &sk, uint64_t object)
{
(void)object;
// btrfs allows tree ID 0 meaning the current tree, but we do not.
THROW_CHECK0(invalid_argument, m_tree != 0);
sk.tree_id = m_tree;
sk.min_type = m_type;
sk.max_type = m_type;
sk.min_transid = m_min_transid;
sk.max_transid = m_max_transid;
sk.nr_items = 1;
}
void
BtrfsTreeFetcher::next_sk(BtrfsIoctlSearchKey &key, const BtrfsIoctlSearchHeader &hdr)
{
key.next_min(hdr, m_type);
}
BtrfsTreeItem
BtrfsTreeFetcher::at(uint64_t logical)
{
CRUCIBLE_BTRFS_TREE_DEBUG("at " << logical);
BtrfsIoctlSearchKey &sk = m_sk;
fill_sk(sk, logical);
// Exact match, should return 0 or 1 items
sk.max_type = sk.min_type;
sk.nr_items = 1;
sk.do_ioctl(fd());
THROW_CHECK1(runtime_error, sk.m_result.size(), sk.m_result.size() < 2);
for (const auto &i : sk.m_result) {
if (hdr_logical(i) == logical && hdr_match(i)) {
return i;
}
}
return BtrfsTreeItem();
}
uint64_t
BtrfsTreeFetcher::scale_logical(const uint64_t logical) const
{
THROW_CHECK1(invalid_argument, logical, (logical % m_scale_size) == 0 || logical == s_max_logical);
return logical / m_scale_size;
}
uint64_t
BtrfsTreeFetcher::scaled_max_logical() const
{
return scale_logical(s_max_logical);
}
uint64_t
BtrfsTreeFetcher::unscale_logical(const uint64_t logical) const
{
THROW_CHECK1(invalid_argument, logical, logical <= scaled_max_logical());
if (logical == scaled_max_logical()) {
return s_max_logical;
}
return logical * scale_size();
}
BtrfsTreeItem
BtrfsTreeFetcher::rlower_bound(uint64_t logical)
{
#if 0
static bool btfrlb_debug = getenv("BTFLRB_DEBUG");
#define BTFRLB_DEBUG(x) do { if (btfrlb_debug) cerr << x; } while (false)
#else
#define BTFRLB_DEBUG(x) CRUCIBLE_BTRFS_TREE_DEBUG(x)
#endif
BtrfsTreeItem closest_item;
uint64_t closest_logical = 0;
BtrfsIoctlSearchKey &sk = m_sk;
size_t loops = 0;
BTFRLB_DEBUG("rlower_bound: " << to_hex(logical) << " in tree " << tree() << endl);
seek_backward(scale_logical(logical), [&](uint64_t const lower_bound, uint64_t const upper_bound) {
++loops;
fill_sk(sk, unscale_logical(min(scaled_max_logical(), lower_bound)));
set<uint64_t> rv;
do {
sk.nr_items = 4;
sk.do_ioctl(fd());
BTFRLB_DEBUG("fetch: loop " << loops << " lower_bound..upper_bound " << to_hex(lower_bound) << ".." << to_hex(upper_bound));
for (auto &i : sk.m_result) {
next_sk(sk, i);
// If hdr_stop or !hdr_match, don't inspect the item
if (hdr_stop(i)) {
rv.insert(numeric_limits<uint64_t>::max());
BTFRLB_DEBUG("(stop)");
break;
}
if (!hdr_match(i)) {
BTFRLB_DEBUG("(no match)");
continue;
}
const auto this_logical = hdr_logical(i);
BTFRLB_DEBUG(" " << to_hex(this_logical) << " " << i);
const auto scaled_hdr_logical = scale_logical(this_logical);
BTFRLB_DEBUG(" " << "(match)");
if (this_logical <= logical && this_logical > closest_logical) {
closest_logical = this_logical;
closest_item = i;
BTFRLB_DEBUG("(closest)");
}
rv.insert(scaled_hdr_logical);
if (scaled_hdr_logical > upper_bound) {
BTFRLB_DEBUG("(" << to_hex(scaled_hdr_logical) << " >= " << to_hex(upper_bound) << ")");
break;
}
BTFRLB_DEBUG("(cont'd)");
}
BTFRLB_DEBUG(endl);
// We might get a search result that contains only non-matching items.
// Keep looping until we find any matching item or we run out of tree.
} while (rv.empty() && !sk.m_result.empty());
return rv;
}, scale_logical(lookbehind_size()));
return closest_item;
#undef BTFRLB_DEBUG
}
BtrfsTreeItem
BtrfsTreeFetcher::lower_bound(uint64_t logical)
{
BtrfsIoctlSearchKey &sk = m_sk;
fill_sk(sk, logical);
do {
assert(sk.max_offset == s_max_logical);
sk.do_ioctl(fd());
for (const auto &i : sk.m_result) {
if (hdr_match(i)) {
return i;
}
if (hdr_stop(i)) {
return BtrfsTreeItem();
}
next_sk(sk, i);
}
} while (!sk.m_result.empty());
return BtrfsTreeItem();
}
BtrfsTreeItem
BtrfsTreeFetcher::next(uint64_t logical)
{
	// Return the first item strictly after 'logical' (in scaled units),
	// or an empty item when 'logical' is already at the top of the range.
	CRUCIBLE_BTRFS_TREE_DEBUG("next " << logical);
	const auto scaled = scale_logical(logical);
	if (scaled + 1 > scaled_max_logical()) {
		return BtrfsTreeItem();
	}
	return lower_bound(unscale_logical(scaled + 1));
}
BtrfsTreeItem
BtrfsTreeFetcher::prev(uint64_t logical)
{
	// Return the last item strictly before 'logical' (in scaled units),
	// or an empty item when 'logical' is already at the bottom.
	CRUCIBLE_BTRFS_TREE_DEBUG("prev " << logical);
	const auto scaled = scale_logical(logical);
	if (scaled < 1) {
		return BtrfsTreeItem();
	}
	return rlower_bound(unscale_logical(scaled - 1));
}
void
BtrfsTreeObjectFetcher::fill_sk(BtrfsIoctlSearchKey &sk, uint64_t object)
{
	// Seed a search key that scans by objectid, starting at 'object',
	// covering the full offset range for every object.
	BtrfsTreeFetcher::fill_sk(sk, object);
	sk.min_objectid = object;
	sk.max_objectid = numeric_limits<decltype(sk.max_objectid)>::max();
	sk.min_offset = 0;
	sk.max_offset = numeric_limits<decltype(sk.max_offset)>::max();
}
uint64_t
BtrfsTreeObjectFetcher::hdr_logical(const BtrfsIoctlSearchHeader &hdr)
{
	// For object-keyed trees, the "logical" position is the objectid.
	return hdr.objectid;
}
bool
BtrfsTreeObjectFetcher::hdr_match(const BtrfsIoctlSearchHeader &hdr)
{
	// A header matches when its item type equals the configured type.
	// If you're calling this method without overriding it, you should have set type first
	assert(m_type);
	return m_type == hdr.type;
}
bool
BtrfsTreeObjectFetcher::hdr_stop(const BtrfsIoctlSearchHeader &hdr)
{
	// Object searches never terminate early.
	// Suppress the unused-parameter warning before returning; the
	// original placed the cast after 'return', leaving it unreachable.
	(void)hdr;
	return false;
}
uint64_t
BtrfsTreeOffsetFetcher::hdr_logical(const BtrfsIoctlSearchHeader &hdr)
{
	// For offset-keyed trees, the "logical" position is the key offset.
	return hdr.offset;
}
bool
BtrfsTreeOffsetFetcher::hdr_match(const BtrfsIoctlSearchHeader &hdr)
{
	// Match requires both the configured item type and objectid.
	assert(m_type);
	return hdr.objectid == m_objectid && hdr.type == m_type;
}
bool
BtrfsTreeOffsetFetcher::hdr_stop(const BtrfsIoctlSearchHeader &hdr)
{
	// Stop as soon as the scan leaves the configured objectid or
	// advances past the configured item type.
	assert(m_type);
	if (hdr.objectid > m_objectid) {
		return true;
	}
	return hdr.type > m_type;
}
void
BtrfsTreeOffsetFetcher::objectid(uint64_t objectid)
{
	// Pin subsequent searches to this objectid.
	m_objectid = objectid;
}
uint64_t
BtrfsTreeOffsetFetcher::objectid() const
{
	// Report the objectid that searches are currently pinned to.
	return m_objectid;
}
void
BtrfsTreeOffsetFetcher::fill_sk(BtrfsIoctlSearchKey &sk, uint64_t offset)
{
	// Seed a search key that scans a single object by key offset,
	// starting at 'offset' and unbounded above.
	BtrfsTreeFetcher::fill_sk(sk, offset);
	sk.min_objectid = sk.max_objectid = m_objectid;
	sk.min_offset = offset;
	sk.max_offset = numeric_limits<decltype(sk.max_offset)>::max();
}
void
BtrfsCsumTreeFetcher::get_sums(uint64_t const logical, size_t count, function<void(uint64_t logical, const uint8_t *buf, size_t bytes)> output)
{
	// Stream the checksums covering 'count' blocks starting at 'logical'
	// to 'output'.  Each csum tree item covers a run of blocks, so a
	// requested range may overlap several items; each overlap is clipped
	// and delivered as one output() call with the overlap's starting
	// logical address, a pointer into the item's csum bytes, and the
	// byte count of checksum data.
#if 0
	static bool bctfgs_debug = getenv("BCTFGS_DEBUG");
#define BCTFGS_DEBUG(x) do { if (bctfgs_debug) cerr << x; } while (false)
#else
#define BCTFGS_DEBUG(x) CRUCIBLE_BTRFS_TREE_DEBUG(x)
#endif
	const uint64_t logical_end = logical + count * block_size();
	// Start from the item at or before 'logical': its run may extend
	// forward into the requested range.
	BtrfsTreeItem bti = rlower_bound(logical);
	size_t __attribute__((unused)) loops = 0;
	BCTFGS_DEBUG("get_sums " << to_hex(logical) << ".." << to_hex(logical_end) << endl);
	while (!!bti) {
		BCTFGS_DEBUG("get_sums[" << loops << "]: " << bti << endl);
		++loops;
		// Reject wrong type or objectid
		THROW_CHECK1(runtime_error, bti.type(), bti.type() == BTRFS_EXTENT_CSUM_KEY);
		THROW_CHECK1(runtime_error, bti.objectid(), bti.objectid() == BTRFS_EXTENT_CSUM_OBJECTID);
		// Is this object in range?
		const uint64_t data_logical = bti.offset();
		if (data_logical >= logical_end) {
			// csum object is past end of range, we are done
			return;
		}
		// Figure out how long this csum item is in various units
		const size_t csum_byte_count = bti.data().size();
		THROW_CHECK1(runtime_error, csum_byte_count, (csum_byte_count % m_sum_size) == 0);
		THROW_CHECK1(runtime_error, csum_byte_count, csum_byte_count > 0);
		const size_t csum_count = csum_byte_count / m_sum_size;
		const uint64_t data_byte_count = csum_count * block_size();
		const uint64_t data_logical_end = data_logical + data_byte_count;
		if (data_logical_end <= logical) {
			// too low, look at next item
			bti = lower_bound(logical);
			continue;
		}
		// There is some overlap?
		const uint64_t overlap_begin = max(logical, data_logical);
		const uint64_t overlap_end = min(logical_end, data_logical_end);
		THROW_CHECK2(runtime_error, overlap_begin, overlap_end, overlap_begin < overlap_end);
		// Clip the overlap to this item: offset of the first
		// overlapping block within the item's data run...
		const uint64_t overlap_offset = overlap_begin - data_logical;
		THROW_CHECK1(runtime_error, overlap_offset, (overlap_offset % block_size()) == 0);
		// ...converted to a byte index into the item's csum array
		const uint64_t overlap_index = overlap_offset * m_sum_size / block_size();
		const uint64_t overlap_byte_count = overlap_end - overlap_begin;
		const uint64_t overlap_csum_byte_count = overlap_byte_count * m_sum_size / block_size();
		// Can't be bigger than a btrfs item
		THROW_CHECK1(runtime_error, overlap_index, overlap_index < 65536);
		THROW_CHECK1(runtime_error, overlap_csum_byte_count, overlap_csum_byte_count < 65536);
		// Yes, process the overlap
		output(overlap_begin, bti.data().data() + overlap_index, overlap_csum_byte_count);
		// Advance
		bti = lower_bound(overlap_end);
	}
#undef BCTFGS_DEBUG
}
uint32_t
BtrfsCsumTreeFetcher::sum_type() const
{
	// Checksum algorithm identifier reported by the filesystem.
	return m_sum_type;
}
size_t
BtrfsCsumTreeFetcher::sum_size() const
{
	// Size in bytes of a single checksum value.
	return m_sum_size;
}
BtrfsCsumTreeFetcher::BtrfsCsumTreeFetcher(const Fd &new_fd) :
	BtrfsTreeOffsetFetcher(new_fd)
{
	// Configure the fetcher for EXTENT_CSUM items in the csum tree;
	// all csum items share the single EXTENT_CSUM objectid.
	type(BTRFS_EXTENT_CSUM_KEY);
	tree(BTRFS_CSUM_TREE_OBJECTID);
	objectid(BTRFS_EXTENT_CSUM_OBJECTID);
	// Query the filesystem for its checksum algorithm and size
	BtrfsIoctlFsInfoArgs bifia;
	bifia.do_ioctl(fd());
	// NOTE(review): csum_type() is cast to btrfs_compression_type here,
	// but it describes a checksum algorithm, not a compression type --
	// the numeric value is stored unchanged; confirm the intended enum.
	m_sum_type = static_cast<btrfs_compression_type>(bifia.csum_type());
	m_sum_size = bifia.csum_size();
	if (m_sum_type == BTRFS_CSUM_TYPE_CRC32 && m_sum_size == 0) {
		// Older kernel versions don't fill in this field
		m_sum_size = 4;
	}
	THROW_CHECK1(runtime_error, m_sum_size, m_sum_size > 0);
}
BtrfsExtentItemFetcher::BtrfsExtentItemFetcher(const Fd &new_fd) :
	BtrfsTreeObjectFetcher(new_fd)
{
	// Scan EXTENT_ITEM records in the extent tree.
	type(BTRFS_EXTENT_ITEM_KEY);
	tree(BTRFS_EXTENT_TREE_OBJECTID);
}
BtrfsExtentDataFetcher::BtrfsExtentDataFetcher(const Fd &new_fd) :
	BtrfsTreeOffsetFetcher(new_fd)
{
	// Scan EXTENT_DATA (file extent) records; the caller selects the
	// tree and objectid separately.
	type(BTRFS_EXTENT_DATA_KEY);
}
BtrfsInodeFetcher::BtrfsInodeFetcher(const Fd &fd) :
	BtrfsTreeObjectFetcher(fd)
{
	// Look up INODE_ITEM records; inode numbers are not block-scaled.
	type(BTRFS_INODE_ITEM_KEY);
	scale_size(1);
}
BtrfsTreeItem
BtrfsInodeFetcher::stat(uint64_t subvol, uint64_t inode)
{
	// Fetch the INODE_ITEM for 'inode' within subvolume 'subvol';
	// returns an empty item when the inode does not exist.
	tree(subvol);
	const auto item = at(inode);
	if (!item) {
		return item;
	}
	// The search must have returned exactly the requested key
	THROW_CHECK2(runtime_error, item.objectid(), inode, inode == item.objectid());
	THROW_CHECK2(runtime_error, item.type(), BTRFS_INODE_ITEM_KEY, item.type() == BTRFS_INODE_ITEM_KEY);
	return item;
}
BtrfsRootFetcher::BtrfsRootFetcher(const Fd &fd) :
	BtrfsTreeObjectFetcher(fd)
{
	// Look up records in the root tree; subvolume ids are not block-scaled.
	scale_size(1);
	tree(BTRFS_ROOT_TREE_OBJECTID);
}
BtrfsTreeItem
BtrfsRootFetcher::root(const uint64_t subvol)
{
	// Fetch the ROOT_ITEM for 'subvol', or an empty item if absent.
	const auto my_type = BTRFS_ROOT_ITEM_KEY;
	type(my_type);
	const auto item = at(subvol);
	if (!item) {
		return item;
	}
	// The search must have returned exactly the requested key
	THROW_CHECK2(runtime_error, item.objectid(), subvol, subvol == item.objectid());
	THROW_CHECK2(runtime_error, item.type(), my_type, item.type() == my_type);
	return item;
}
BtrfsTreeItem
BtrfsRootFetcher::root_backref(const uint64_t subvol)
{
	// Fetch the ROOT_BACKREF for 'subvol', or an empty item if absent.
	const auto my_type = BTRFS_ROOT_BACKREF_KEY;
	type(my_type);
	const auto item = at(subvol);
	if (!item) {
		return item;
	}
	// The search must have returned exactly the requested key
	THROW_CHECK2(runtime_error, item.objectid(), subvol, subvol == item.objectid());
	THROW_CHECK2(runtime_error, item.type(), my_type, item.type() == my_type);
	return item;
}
BtrfsDataExtentTreeFetcher::BtrfsDataExtentTreeFetcher(const Fd &fd) :
	BtrfsExtentItemFetcher(fd),
	m_chunk_tree(fd)
{
	// Scan EXTENT_ITEMs in the extent tree...
	type(BTRFS_EXTENT_ITEM_KEY);
	tree(BTRFS_EXTENT_TREE_OBJECTID);
	// ...and use a secondary fetcher over the chunk tree to identify
	// data block groups so next_sk can skip non-data ranges.
	m_chunk_tree.objectid(BTRFS_FIRST_CHUNK_TREE_OBJECTID);
	m_chunk_tree.type(BTRFS_CHUNK_ITEM_KEY);
	m_chunk_tree.tree(BTRFS_CHUNK_TREE_OBJECTID);
}
void
BtrfsDataExtentTreeFetcher::next_sk(BtrfsIoctlSearchKey &key, const BtrfsIoctlSearchHeader &hdr)
{
	// Advance the search key past 'hdr', then skip forward over any
	// address ranges that do not belong to a data block group, so the
	// extent scan never wastes ioctls on metadata/system chunks.
	key.min_type = key.max_type = type();
	key.max_objectid = key.max_offset = numeric_limits<uint64_t>::max();
	key.min_offset = 0;
	key.min_objectid = hdr.objectid;
	// Step to the next bytenr, saturating at the top of the key space
	const auto step = scale_size();
	if (key.min_objectid < numeric_limits<uint64_t>::max() - step) {
		key.min_objectid += step;
	} else {
		key.min_objectid = numeric_limits<uint64_t>::max();
	}
	// If we're still in our current block group, check here
	if (!!m_current_bg) {
		const auto bg_begin = m_current_bg.offset();
		const auto bg_end = bg_begin + m_current_bg.chunk_length();
		// If we are still in our current block group, return early
		if (key.min_objectid >= bg_begin && key.min_objectid < bg_end) return;
	}
	// We don't have a current block group or we're out of range
	// Find the chunk that this bytenr belongs to
	m_current_bg = m_chunk_tree.rlower_bound(key.min_objectid);
	// Make sure it's a data block group
	while (!!m_current_bg) {
		// Data block group, stop here
		if (m_current_bg.chunk_type() & BTRFS_BLOCK_GROUP_DATA) break;
		// Not a data block group, skip to end
		key.min_objectid = m_current_bg.offset() + m_current_bg.chunk_length();
		m_current_bg = m_chunk_tree.lower_bound(key.min_objectid);
	}
	if (!m_current_bg) {
		// Ran out of data block groups, stop here
		return;
	}
	// Check to see if bytenr is in the current data block group
	const auto bg_begin = m_current_bg.offset();
	if (key.min_objectid < bg_begin) {
		// Move forward to start of data block group
		key.min_objectid = bg_begin;
	}
}
}

View File

@@ -1,189 +0,0 @@
#include "crucible/bytevector.h"
#include "crucible/error.h"
#include "crucible/hexdump.h"
#include "crucible/string.h"
#include <cassert>
namespace crucible {
using namespace std;
ByteVector::iterator
ByteVector::begin() const
{
	// Iterator to the first byte; null when no buffer is allocated.
	unique_lock<mutex> guard(m_mutex);
	return m_ptr.get();
}
ByteVector::iterator
ByteVector::end() const
{
	// Iterator one past the last byte.
	unique_lock<mutex> guard(m_mutex);
	return m_ptr.get() + m_size;
}
size_t
ByteVector::size() const
{
	// NOTE(review): m_size is read without taking m_mutex here, unlike
	// begin()/end()/data() -- presumably a word-sized read is treated
	// as safe enough; confirm against concurrent writers.
	return m_size;
}
bool
ByteVector::empty() const
{
	// True when there is no buffer or the buffer has zero length.
	// NOTE(review): reads m_ptr and m_size without m_mutex, unlike most
	// other accessors -- confirm this is acceptable for callers.
	return !m_ptr || !m_size;
}
void
ByteVector::clear()
{
unique_lock<mutex> lock(m_mutex);
m_ptr.reset();
m_size = 0;
}
ByteVector::value_type&
ByteVector::operator[](size_t index) const
{
	// Unchecked element access (see at() for the checked variant).
	unique_lock<mutex> guard(m_mutex);
	return m_ptr.get()[index];
}
ByteVector::ByteVector(const ByteVector &that)
{
	// Shallow copy: afterwards both objects share the same buffer.
	// Only 'that' needs locking; 'this' is still under construction
	// and not yet visible to other threads.
	unique_lock<mutex> lock(that.m_mutex);
	m_ptr = that.m_ptr;
	m_size = that.m_size;
}
ByteVector&
ByteVector::operator=(const ByteVector &that)
{
	// Shallow assignment: afterwards both objects share the same buffer.
	// If &that == this, there's no need to do anything, but
	// especially don't try to lock the same mutex twice.
	if (&m_mutex != &that.m_mutex) {
		// Acquire both mutexes deadlock-free via std::lock
		unique_lock<mutex> lock_this(m_mutex, defer_lock);
		unique_lock<mutex> lock_that(that.m_mutex, defer_lock);
		lock(lock_this, lock_that);
		m_ptr = that.m_ptr;
		m_size = that.m_size;
	}
	return *this;
}
ByteVector::ByteVector(const ByteVector &that, size_t start, size_t length)
{
	// Construct a view of [start, start + length) within 'that',
	// sharing ownership of the parent buffer (aliased shared_ptr).
	THROW_CHECK0(out_of_range, that.m_ptr);
	THROW_CHECK2(out_of_range, start, that.m_size, start <= that.m_size);
	// The previous check compared start + length against m_size + length,
	// which is always true (absent overflow) and never bounds 'length'.
	// Bound the subrange end against the parent size, overflow-safely:
	// start <= m_size was established above, so m_size - start is exact.
	THROW_CHECK2(out_of_range, length, that.m_size - start, length <= that.m_size - start);
	m_ptr = Pointer(that.m_ptr, that.m_ptr.get() + start);
	m_size = length;
}
ByteVector
ByteVector::at(size_t start, size_t length) const
{
	// Bounds-checked subrange view; delegates to the aliasing constructor.
	return ByteVector(*this, start, length);
}
ByteVector::value_type&
ByteVector::at(size_t size) const
{
	// Bounds-checked element access: throws out_of_range when there is
	// no buffer or the index is past the end.
	unique_lock<mutex> guard(m_mutex);
	THROW_CHECK0(out_of_range, m_ptr);
	THROW_CHECK2(out_of_range, size, m_size, size < m_size);
	return m_ptr.get()[size];
}
static
void *
bv_allocate(size_t size)
{
	// Allocate a raw buffer for ByteVector.  Under BEES_VALGRIND the
	// memory is zeroed so valgrind does not report uninitialized reads;
	// otherwise plain malloc is used and contents are uninitialized.
#ifdef BEES_VALGRIND
	// XXX: only do this to shut up valgrind
	return calloc(1, size);
#else
	return malloc(size);
#endif
}
ByteVector::ByteVector(size_t size)
{
	// Allocate an owning buffer of 'size' bytes.  Contents are
	// uninitialized unless built with BEES_VALGRIND (see bv_allocate).
	m_ptr = Pointer(static_cast<value_type*>(bv_allocate(size)), free);
	// bad_alloc doesn't fit THROW_CHECK's template
	THROW_CHECK0(runtime_error, m_ptr);
	m_size = size;
}
ByteVector::ByteVector(iterator begin, iterator end, size_t min_size)
{
	// Copy the bytes in [begin, end) into a new buffer of at least
	// min_size bytes.  m_size reflects the allocated size, so when
	// min_size exceeds the copied range the tail bytes are
	// uninitialized (unless built with BEES_VALGRIND).
	const size_t size = end - begin;
	const size_t alloc_size = max(size, min_size);
	m_ptr = Pointer(static_cast<value_type*>(bv_allocate(alloc_size)), free);
	// bad_alloc doesn't fit THROW_CHECK's template
	THROW_CHECK0(runtime_error, m_ptr);
	m_size = alloc_size;
	memcpy(m_ptr.get(), begin, size);
}
bool
ByteVector::operator==(const ByteVector &that) const
{
	// Equal when both are unallocated, or when sizes match and the
	// contents are byte-identical (shared buffers short-circuit).
	unique_lock<mutex> lock_this(m_mutex, defer_lock);
	unique_lock<mutex> lock_that(that.m_mutex, defer_lock);
	lock(lock_this, lock_that);
	if (!m_ptr || !that.m_ptr) {
		// One side has no buffer: equal only if neither does
		return !m_ptr && !that.m_ptr;
	}
	if (m_size != that.m_size) {
		return false;
	}
	if (m_ptr.get() == that.m_ptr.get()) {
		// Same buffer, same size: trivially equal
		return true;
	}
	return memcmp(m_ptr.get(), that.m_ptr.get(), m_size) == 0;
}
void
ByteVector::erase(iterator begin, iterator end)
{
	// Remove the bytes in [begin, end).  Only a prefix or a suffix of
	// the vector may be erased (the range must touch one end); erasing
	// everything releases the buffer.
	unique_lock<mutex> lock(m_mutex);
	const size_t size = end - begin;
	if (!size) return;
	THROW_CHECK0(out_of_range, m_ptr);
	const iterator my_begin = m_ptr.get();
	const iterator my_end = my_begin + m_size;
	// Enforce the prefix-or-suffix restriction
	THROW_CHECK4(out_of_range, my_begin, begin, my_end, end, my_begin == begin || my_end == end);
	if (begin == my_begin) {
		if (end == my_end) {
			// Entire range erased: drop the buffer
			m_size = 0;
			m_ptr.reset();
			return;
		}
		// Prefix erased: alias the shared buffer at the new start
		m_ptr = Pointer(m_ptr, end);
	}
	// Prefix or suffix erase both shrink the size by the erased length
	m_size -= size;
}
void
ByteVector::erase(iterator begin)
{
	// Erase the single byte at 'begin' (must be at either end of the
	// vector, per the range overload's restriction).
	erase(begin, begin + 1);
}
ByteVector::value_type*
ByteVector::data() const
{
	// Raw pointer to the buffer; null when unallocated.
	unique_lock<mutex> guard(m_mutex);
	return m_ptr.get();
}
ostream&
operator<<(ostream &os, const ByteVector &bv) {
	// Render the buffer contents as a hex dump.
	hexdump(os, bv);
	return os;
}
}

View File

@@ -76,7 +76,7 @@ namespace crucible {
DIE_IF_ZERO(strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", &ltm));
header_stream << buf;
header_stream << " " << getpid() << "." << gettid();
header_stream << " " << getpid() << "." << crucible::gettid();
if (add_prefix_level) {
header_stream << "<" << m_loglevel << ">";
}
@@ -88,7 +88,7 @@ namespace crucible {
header_stream << "<" << m_loglevel << ">";
}
header_stream << (m_name.empty() ? "thread" : m_name);
header_stream << "[" << gettid() << "]";
header_stream << "[" << crucible::gettid() << "]";
}
header_stream << ": ";

View File

@@ -496,7 +496,7 @@ namespace crucible {
BtrfsExtentWalker::Vec
BtrfsExtentWalker::get_extent_map(off_t pos)
{
BtrfsIoctlSearchKey sk;
BtrfsIoctlSearchKey sk(65536);
if (!m_root_fd) {
m_root_fd = m_fd;
}
@@ -640,7 +640,9 @@ namespace crucible {
ExtentWalker::get_extent_map(off_t pos)
{
EWLOG("get_extent_map(" << to_hex(pos) << ")");
Fiemap fm(ranged_cast<uint64_t>(pos), ranged_cast<uint64_t>(numeric_limits<off_t>::max() - pos));
Fiemap fm;
fm.fm_start = ranged_cast<uint64_t>(pos);
fm.fm_length = ranged_cast<uint64_t>(numeric_limits<off_t>::max() - pos);
fm.m_max_count = fm.m_min_count = sc_extent_fetch_max;
fm.do_ioctl(m_fd);
Vec rv;

View File

@@ -361,11 +361,8 @@ namespace crucible {
THROW_ERROR(invalid_argument, "pwrite: trying to write on a closed file descriptor");
}
int rv = ::pwrite(fd, buf, size, offset);
if (rv < 0) {
THROW_ERRNO("pwrite: could not write " << size << " bytes at fd " << name_fd(fd) << " offset " << offset);
}
if (rv != static_cast<ssize_t>(size)) {
THROW_ERROR(runtime_error, "pwrite: only " << rv << " of " << size << " bytes written at fd " << name_fd(fd) << " offset " << offset);
if (rv != static_cast<int>(size)) {
THROW_ERROR(runtime_error, "pwrite: only " << rv << " of " << size << " bytes written at offset " << offset);
}
}
@@ -395,7 +392,7 @@ namespace crucible {
}
THROW_ERRNO("read: " << size << " bytes");
}
if (rv > static_cast<ssize_t>(size)) {
if (rv > static_cast<int>(size)) {
THROW_ERROR(runtime_error, "read: somehow read more bytes (" << rv << ") than requested (" << size << ")");
}
if (rv == 0) break;
@@ -444,8 +441,8 @@ namespace crucible {
}
THROW_ERRNO("pread: " << size << " bytes");
}
if (rv != static_cast<ssize_t>(size)) {
THROW_ERROR(runtime_error, "pread: " << size << " bytes at fd " << name_fd(fd) << " offset " << offset << " returned " << rv);
if (rv != static_cast<int>(size)) {
THROW_ERROR(runtime_error, "pread: " << size << " bytes at offset " << offset << " returned " << rv);
}
break;
}
@@ -461,14 +458,28 @@ namespace crucible {
template<>
void
pread_or_die<ByteVector>(int fd, ByteVector &text, off_t offset)
pread_or_die<vector<char>>(int fd, vector<char> &text, off_t offset)
{
return pread_or_die(fd, text.data(), text.size(), offset);
}
template<>
void
pwrite_or_die<ByteVector>(int fd, const ByteVector &text, off_t offset)
pread_or_die<vector<uint8_t>>(int fd, vector<uint8_t> &text, off_t offset)
{
return pread_or_die(fd, text.data(), text.size(), offset);
}
template<>
void
pwrite_or_die<vector<uint8_t>>(int fd, const vector<uint8_t> &text, off_t offset)
{
return pwrite_or_die(fd, text.data(), text.size(), offset);
}
template<>
void
pwrite_or_die<vector<char>>(int fd, const vector<char> &text, off_t offset)
{
return pwrite_or_die(fd, text.data(), text.size(), offset);
}
@@ -480,9 +491,9 @@ namespace crucible {
return pwrite_or_die(fd, text.data(), text.size(), offset);
}
Stat::Stat() :
stat( (stat) { } )
Stat::Stat()
{
memset_zero<stat>(this);
}
Stat &
@@ -501,15 +512,15 @@ namespace crucible {
return *this;
}
Stat::Stat(int fd) :
stat( (stat) { } )
Stat::Stat(int fd)
{
memset_zero<stat>(this);
fstat(fd);
}
Stat::Stat(const string &filename) :
stat( (stat) { } )
Stat::Stat(const string &filename)
{
memset_zero<stat>(this);
lstat(filename);
}
@@ -524,14 +535,7 @@ namespace crucible {
void
ioctl_iflags_set(int fd, int attr)
{
// This bit of nonsense brought to you by Valgrind.
union {
int attr;
long zero;
} u;
u.zero = 0;
u.attr = attr;
DIE_IF_MINUS_ONE(ioctl(fd, FS_IOC_SETFLAGS, &u.attr));
DIE_IF_MINUS_ONE(ioctl(fd, FS_IOC_SETFLAGS, &attr));
}
string

431
lib/fs.cc
View File

@@ -2,7 +2,6 @@
#include "crucible/error.h"
#include "crucible/fd.h"
#include "crucible/hexdump.h"
#include "crucible/limits.h"
#include "crucible/ntoa.h"
#include "crucible/string.h"
@@ -33,11 +32,19 @@ namespace crucible {
#endif
}
BtrfsExtentInfo::BtrfsExtentInfo(int dst_fd, off_t dst_offset)
{
memset_zero<btrfs_ioctl_same_extent_info>(this);
fd = dst_fd;
logical_offset = dst_offset;
}
BtrfsExtentSame::BtrfsExtentSame(int src_fd, off_t src_offset, off_t src_length) :
m_logical_offset(src_offset),
m_length(src_length),
m_fd(src_fd)
{
memset_zero<btrfs_ioctl_same_args>(this);
logical_offset = src_offset;
length = src_length;
}
BtrfsExtentSame::~BtrfsExtentSame()
@@ -45,12 +52,9 @@ namespace crucible {
}
void
BtrfsExtentSame::add(int const fd, uint64_t const offset)
BtrfsExtentSame::add(int fd, off_t offset)
{
m_info.push_back( (btrfs_ioctl_same_extent_info) {
.fd = fd,
.logical_offset = offset,
});
m_info.push_back(BtrfsExtentInfo(fd, offset));
}
ostream &
@@ -107,8 +111,11 @@ namespace crucible {
os << " '" << fd_name << "'";
});
}
os << ", .logical_offset = " << to_hex(bes.m_logical_offset);
os << ", .length = " << to_hex(bes.m_length);
os << ", .logical_offset = " << to_hex(bes.logical_offset);
os << ", .length = " << to_hex(bes.length);
os << ", .dest_count = " << bes.dest_count;
os << ", .reserved1 = " << bes.reserved1;
os << ", .reserved2 = " << bes.reserved2;
os << ", .info[] = {";
for (size_t i = 0; i < bes.m_info.size(); ++i) {
os << " [" << i << "] = " << &(bes.m_info[i]) << ",";
@@ -119,25 +126,22 @@ namespace crucible {
void
btrfs_clone_range(int src_fd, off_t src_offset, off_t src_length, int dst_fd, off_t dst_offset)
{
btrfs_ioctl_clone_range_args args ( (btrfs_ioctl_clone_range_args) {
.src_fd = src_fd,
.src_offset = ranged_cast<uint64_t, off_t>(src_offset),
.src_length = ranged_cast<uint64_t, off_t>(src_length),
.dest_offset = ranged_cast<uint64_t, off_t>(dst_offset),
} );
struct btrfs_ioctl_clone_range_args args;
memset_zero(&args);
args.src_fd = src_fd;
args.src_offset = src_offset;
args.src_length = src_length;
args.dest_offset = dst_offset;
DIE_IF_MINUS_ONE(ioctl(dst_fd, BTRFS_IOC_CLONE_RANGE, &args));
}
void
BtrfsExtentSame::do_ioctl()
{
const size_t buf_size = sizeof(btrfs_ioctl_same_args) + m_info.size() * sizeof(btrfs_ioctl_same_extent_info);
ByteVector ioctl_arg( (btrfs_ioctl_same_args) {
.logical_offset = m_logical_offset,
.length = m_length,
.dest_count = ranged_cast<decltype(btrfs_ioctl_same_args::dest_count)>(m_info.size()),
}, buf_size);
btrfs_ioctl_same_args *const ioctl_ptr = ioctl_arg.get<btrfs_ioctl_same_args>();
dest_count = m_info.size();
vector<uint8_t> ioctl_arg = vector_copy_struct<btrfs_ioctl_same_args>(this);
ioctl_arg.resize(sizeof(btrfs_ioctl_same_args) + dest_count * sizeof(btrfs_ioctl_same_extent_info), 0);
btrfs_ioctl_same_args *ioctl_ptr = reinterpret_cast<btrfs_ioctl_same_args *>(ioctl_arg.data());
size_t count = 0;
for (auto i = m_info.cbegin(); i != m_info.cend(); ++i) {
ioctl_ptr->info[count] = static_cast<const btrfs_ioctl_same_extent_info &>(m_info[count]);
@@ -159,13 +163,12 @@ namespace crucible {
{
THROW_CHECK1(invalid_argument, src_length, src_length > 0);
while (src_length > 0) {
BtrfsExtentSame bes(src_fd, src_offset, src_length);
off_t length = min(off_t(BTRFS_MAX_DEDUPE_LEN), src_length);
BtrfsExtentSame bes(src_fd, src_offset, length);
bes.add(dst_fd, dst_offset);
bes.do_ioctl();
const auto status = bes.m_info.at(0).status;
auto status = bes.m_info.at(0).status;
if (status == 0) {
const off_t length = bes.m_info.at(0).bytes_deduped;
THROW_CHECK0(invalid_argument, length > 0);
src_offset += length;
dst_offset += length;
src_length -= length;
@@ -191,15 +194,18 @@ namespace crucible {
void *
BtrfsDataContainer::prepare(size_t container_size)
{
if (m_data.size() < container_size) {
m_data.resize(container_size);
}
btrfs_data_container *p = reinterpret_cast<btrfs_data_container *>(m_data.data());
const size_t min_size = offsetof(btrfs_data_container, val);
if (container_size < min_size) {
THROW_ERROR(out_of_range, "container size " << container_size << " smaller than minimum " << min_size);
}
if (m_data.size() < container_size) {
m_data = ByteVector(container_size);
}
const auto p = m_data.get<btrfs_data_container>();
*p = (btrfs_data_container) { };
p->bytes_left = 0;
p->bytes_missing = 0;
p->elem_cnt = 0;
p->elem_missed = 0;
return p;
}
@@ -212,29 +218,25 @@ namespace crucible {
decltype(btrfs_data_container::bytes_left)
BtrfsDataContainer::get_bytes_left() const
{
const auto p = m_data.get<btrfs_data_container>();
return p->bytes_left;
return bytes_left;
}
decltype(btrfs_data_container::bytes_missing)
BtrfsDataContainer::get_bytes_missing() const
{
const auto p = m_data.get<btrfs_data_container>();
return p->bytes_missing;
return bytes_missing;
}
decltype(btrfs_data_container::elem_cnt)
BtrfsDataContainer::get_elem_cnt() const
{
const auto p = m_data.get<btrfs_data_container>();
return p->elem_cnt;
return elem_cnt;
}
decltype(btrfs_data_container::elem_missed)
BtrfsDataContainer::get_elem_missed() const
{
const auto p = m_data.get<btrfs_data_container>();
return p->elem_missed;
return elem_missed;
}
ostream &
@@ -244,7 +246,7 @@ namespace crucible {
return os << "BtrfsIoctlLogicalInoArgs NULL";
}
os << "BtrfsIoctlLogicalInoArgs {";
os << " .m_logical = " << to_hex(p->m_logical);
os << " .logical = " << to_hex(p->logical);
os << " .inodes[] = {\n";
unsigned count = 0;
for (auto i = p->m_iors.cbegin(); i != p->m_iors.cend(); ++i) {
@@ -256,9 +258,10 @@ namespace crucible {
BtrfsIoctlLogicalInoArgs::BtrfsIoctlLogicalInoArgs(uint64_t new_logical, size_t new_size) :
m_container_size(new_size),
m_container(new_size),
m_logical(new_logical)
m_container(new_size)
{
memset_zero<btrfs_ioctl_logical_ino_args>(this);
logical = new_logical;
}
size_t
@@ -297,6 +300,11 @@ namespace crucible {
return m_begin;
}
BtrfsIoctlLogicalInoArgs::BtrfsInodeOffsetRootSpan::operator vector<BtrfsInodeOffsetRoot>() const
{
return vector<BtrfsInodeOffsetRoot>(m_begin, m_end);
}
void
BtrfsIoctlLogicalInoArgs::BtrfsInodeOffsetRootSpan::clear()
{
@@ -306,40 +314,23 @@ namespace crucible {
void
BtrfsIoctlLogicalInoArgs::set_flags(uint64_t new_flags)
{
m_flags = new_flags;
// We are still supporting building with old headers that don't have .flags yet
*(&reserved[0] + 3) = new_flags;
}
uint64_t
BtrfsIoctlLogicalInoArgs::get_flags() const
{
// We are still supporting building with old headers that don't have .flags yet
return m_flags;
}
void
BtrfsIoctlLogicalInoArgs::set_logical(uint64_t new_logical)
{
m_logical = new_logical;
}
void
BtrfsIoctlLogicalInoArgs::set_size(uint64_t new_size)
{
m_container_size = new_size;
return *(&reserved[0] + 3);
}
bool
BtrfsIoctlLogicalInoArgs::do_ioctl_nothrow(int fd)
{
btrfs_ioctl_logical_ino_args args = (btrfs_ioctl_logical_ino_args) {
.logical = m_logical,
.size = m_container_size,
.inodes = reinterpret_cast<uintptr_t>(m_container.prepare(m_container_size)),
};
// We are still supporting building with old headers that don't have .flags yet
*(&args.reserved[0] + 3) = m_flags;
btrfs_ioctl_logical_ino_args *const p = &args;
btrfs_ioctl_logical_ino_args *p = static_cast<btrfs_ioctl_logical_ino_args *>(this);
inodes = reinterpret_cast<uint64_t>(m_container.prepare(m_container_size));
size = m_container.get_size();
m_iors.clear();
@@ -376,13 +367,13 @@ namespace crucible {
bili_version = BTRFS_IOC_LOGICAL_INO_V2;
}
btrfs_data_container *const bdc = reinterpret_cast<btrfs_data_container *>(p->inodes);
BtrfsInodeOffsetRoot *const ior_iter = reinterpret_cast<BtrfsInodeOffsetRoot *>(bdc->val);
btrfs_data_container *bdc = reinterpret_cast<btrfs_data_container *>(p->inodes);
BtrfsInodeOffsetRoot *input_iter = reinterpret_cast<BtrfsInodeOffsetRoot *>(bdc->val);
// elem_cnt counts uint64_t, but BtrfsInodeOffsetRoot is 3x uint64_t
THROW_CHECK1(runtime_error, bdc->elem_cnt, bdc->elem_cnt % 3 == 0);
m_iors.m_begin = ior_iter;
m_iors.m_end = ior_iter + bdc->elem_cnt / 3;
m_iors.m_begin = input_iter;
m_iors.m_end = input_iter + bdc->elem_cnt / 3;
return true;
}
@@ -405,10 +396,9 @@ namespace crucible {
}
BtrfsIoctlInoPathArgs::BtrfsIoctlInoPathArgs(uint64_t inode, size_t new_size) :
btrfs_ioctl_ino_path_args( (btrfs_ioctl_ino_path_args) { } ),
m_container_size(new_size)
{
assert(inum == 0);
memset_zero<btrfs_ioctl_ino_path_args>(this);
inum = inode;
}
@@ -417,7 +407,7 @@ namespace crucible {
{
btrfs_ioctl_ino_path_args *p = static_cast<btrfs_ioctl_ino_path_args *>(this);
BtrfsDataContainer container(m_container_size);
fspath = reinterpret_cast<uintptr_t>(container.prepare(m_container_size));
fspath = reinterpret_cast<uint64_t>(container.prepare(m_container_size));
size = container.get_size();
m_paths.clear();
@@ -426,14 +416,14 @@ namespace crucible {
return false;
}
btrfs_data_container *const bdc = reinterpret_cast<btrfs_data_container *>(p->fspath);
btrfs_data_container *bdc = reinterpret_cast<btrfs_data_container *>(p->fspath);
m_paths.reserve(bdc->elem_cnt);
const uint64_t *up = reinterpret_cast<const uint64_t *>(bdc->val);
const char *const cp = reinterpret_cast<const char *>(bdc->val);
const char *cp = reinterpret_cast<const char *>(bdc->val);
for (auto count = bdc->elem_cnt; count > 0; --count) {
const char *const path = cp + *up++;
const char *path = cp + *up++;
if (static_cast<size_t>(path - cp) > container.get_size()) {
THROW_ERROR(out_of_range, "offset " << (path - cp) << " > size " << container.get_size() << " in " << __PRETTY_FUNCTION__);
}
@@ -468,10 +458,9 @@ namespace crucible {
return os;
}
BtrfsIoctlInoLookupArgs::BtrfsIoctlInoLookupArgs(uint64_t new_objectid) :
btrfs_ioctl_ino_lookup_args( (btrfs_ioctl_ino_lookup_args) { } )
BtrfsIoctlInoLookupArgs::BtrfsIoctlInoLookupArgs(uint64_t new_objectid)
{
assert(objectid == 0);
memset_zero<btrfs_ioctl_ino_lookup_args>(this);
objectid = new_objectid;
}
@@ -489,9 +478,9 @@ namespace crucible {
}
}
BtrfsIoctlDefragRangeArgs::BtrfsIoctlDefragRangeArgs() :
btrfs_ioctl_defrag_range_args( (btrfs_ioctl_defrag_range_args) { } )
BtrfsIoctlDefragRangeArgs::BtrfsIoctlDefragRangeArgs()
{
memset_zero<btrfs_ioctl_defrag_range_args>(this);
}
bool
@@ -521,10 +510,9 @@ namespace crucible {
}
string
btrfs_compress_type_ntoa(uint8_t compress_type)
btrfs_ioctl_defrag_range_compress_type_ntoa(uint32_t compress_type)
{
static const bits_ntoa_table table[] = {
NTOA_TABLE_ENTRY_ENUM(BTRFS_COMPRESS_NONE),
NTOA_TABLE_ENTRY_ENUM(BTRFS_COMPRESS_ZLIB),
NTOA_TABLE_ENTRY_ENUM(BTRFS_COMPRESS_LZO),
NTOA_TABLE_ENTRY_ENUM(BTRFS_COMPRESS_ZSTD),
@@ -544,14 +532,14 @@ namespace crucible {
os << " .len = " << p->len;
os << " .flags = " << btrfs_ioctl_defrag_range_flags_ntoa(p->flags);
os << " .extent_thresh = " << p->extent_thresh;
os << " .compress_type = " << btrfs_compress_type_ntoa(p->compress_type);
os << " .compress_type = " << btrfs_ioctl_defrag_range_compress_type_ntoa(p->compress_type);
os << " .unused[4] = { " << p->unused[0] << ", " << p->unused[1] << ", " << p->unused[2] << ", " << p->unused[3] << "} }";
return os;
}
FiemapExtent::FiemapExtent() :
fiemap_extent( (fiemap_extent) { } )
FiemapExtent::FiemapExtent()
{
memset_zero<fiemap_extent>(this);
}
FiemapExtent::FiemapExtent(const fiemap_extent &that)
@@ -658,10 +646,13 @@ namespace crucible {
operator<<(ostream &os, const Fiemap &args)
{
os << "Fiemap {";
os << " .m_start = " << to_hex(args.m_start) << ".." << to_hex(args.m_start + args.m_length);
os << ", .m_length = " << to_hex(args.m_length);
os << ", .m_flags = " << fiemap_flags_ntoa(args.m_flags);
os << ", .fm_extents[" << args.m_extents.size() << "] = {";
os << " .fm_start = " << to_hex(args.fm_start) << ".." << to_hex(args.fm_start + args.fm_length);
os << ", .fm_length = " << to_hex(args.fm_length);
if (args.fm_flags) os << ", .fm_flags = " << fiemap_flags_ntoa(args.fm_flags);
os << ", .fm_mapped_extents = " << args.fm_mapped_extents;
os << ", .fm_extent_count = " << args.fm_extent_count;
if (args.fm_reserved) os << ", .fm_reserved = " << args.fm_reserved;
os << ", .fm_extents[] = {";
size_t count = 0;
for (auto i = args.m_extents.cbegin(); i != args.m_extents.cend(); ++i) {
os << "\n\t[" << count++ << "] = " << &(*i) << ",";
@@ -669,35 +660,41 @@ namespace crucible {
return os << "\n}";
}
Fiemap::Fiemap(uint64_t start, uint64_t length) :
m_start(start),
m_length(length)
Fiemap::Fiemap(uint64_t start, uint64_t length)
{
memset_zero<fiemap>(this);
fm_start = start;
fm_length = length;
// FIEMAP is slow and full of lines.
// This makes FIEMAP even slower, but reduces the lies a little.
fm_flags = FIEMAP_FLAG_SYNC;
}
void
Fiemap::do_ioctl(int fd)
{
THROW_CHECK1(out_of_range, m_min_count, m_min_count <= m_max_count);
THROW_CHECK1(out_of_range, m_min_count, m_min_count > 0);
const auto extent_count = m_min_count;
ByteVector ioctl_arg(sizeof(fiemap) + extent_count * sizeof(fiemap_extent));
auto extent_count = m_min_count;
vector<uint8_t> ioctl_arg = vector_copy_struct<fiemap>(this);
fiemap *const ioctl_ptr = ioctl_arg.get<fiemap>();
ioctl_arg.resize(sizeof(fiemap) + extent_count * sizeof(fiemap_extent), 0);
auto start = m_start;
const auto end = m_start + m_length;
fiemap *ioctl_ptr = reinterpret_cast<fiemap *>(ioctl_arg.data());
auto start = fm_start;
auto end = fm_start + fm_length;
auto orig_start = fm_start;
auto orig_length = fm_length;
vector<FiemapExtent> extents;
while (start < end && extents.size() < m_max_count) {
*ioctl_ptr = (fiemap) {
.fm_start = start,
.fm_length = end - start,
.fm_flags = m_flags,
.fm_extent_count = extent_count,
};
ioctl_ptr->fm_start = start;
ioctl_ptr->fm_length = end - start;
ioctl_ptr->fm_extent_count = extent_count;
ioctl_ptr->fm_mapped_extents = 0;
// cerr << "Before (fd = " << fd << ") : " << ioctl_ptr << endl;
DIE_IF_MINUS_ONE(ioctl(fd, FS_IOC_FIEMAP, ioctl_ptr));
@@ -723,107 +720,74 @@ namespace crucible {
}
}
fiemap *this_ptr = static_cast<fiemap *>(this);
*this_ptr = *ioctl_ptr;
fm_start = orig_start;
fm_length = orig_length;
fm_extent_count = extents.size();
m_extents = extents;
}
BtrfsIoctlSearchKey::BtrfsIoctlSearchKey(size_t buf_size) :
btrfs_ioctl_search_key( (btrfs_ioctl_search_key) {
.max_objectid = numeric_limits<decltype(max_objectid)>::max(),
.max_offset = numeric_limits<decltype(max_offset)>::max(),
.max_transid = numeric_limits<decltype(max_transid)>::max(),
.max_type = numeric_limits<decltype(max_type)>::max(),
.nr_items = 1,
}),
m_buf_size(buf_size)
{
memset_zero<btrfs_ioctl_search_key>(this);
max_objectid = numeric_limits<decltype(max_objectid)>::max();
max_offset = numeric_limits<decltype(max_offset)>::max();
max_transid = numeric_limits<decltype(max_transid)>::max();
max_type = numeric_limits<decltype(max_type)>::max();
nr_items = numeric_limits<decltype(nr_items)>::max();
}
BtrfsIoctlSearchHeader::BtrfsIoctlSearchHeader() :
btrfs_ioctl_search_header( (btrfs_ioctl_search_header) { } )
BtrfsIoctlSearchHeader::BtrfsIoctlSearchHeader()
{
memset_zero<btrfs_ioctl_search_header>(this);
}
size_t
BtrfsIoctlSearchHeader::set_data(const ByteVector &v, size_t offset)
BtrfsIoctlSearchHeader::set_data(const vector<uint8_t> &v, size_t offset)
{
THROW_CHECK2(invalid_argument, offset, v.size(), offset + sizeof(btrfs_ioctl_search_header) <= v.size());
memcpy(static_cast<btrfs_ioctl_search_header *>(this), &v[offset], sizeof(btrfs_ioctl_search_header));
offset += sizeof(btrfs_ioctl_search_header);
THROW_CHECK2(invalid_argument, offset + len, v.size(), offset + len <= v.size());
m_data = ByteVector(v, offset, len);
m_data = Spanner<const uint8_t>(&v[offset], &v[offset + len]);
return offset + len;
}
thread_local size_t BtrfsIoctlSearchKey::s_calls = 0;
thread_local size_t BtrfsIoctlSearchKey::s_loops = 0;
thread_local size_t BtrfsIoctlSearchKey::s_loops_empty = 0;
thread_local shared_ptr<ostream> BtrfsIoctlSearchKey::s_debug_ostream;
bool
BtrfsIoctlSearchKey::do_ioctl_nothrow(int fd)
{
// It would be really nice if the kernel tells us whether our
// buffer overflowed or how big the overflowing object
// was; instead, we have to guess.
// Normally we like to be paranoid and fill empty bytes with zero,
// but these buffers can be huge. 80% of a 4GHz CPU huge.
// Keep the ioctl buffer from one run to the next to save on malloc costs
size_t target_buf_size = sizeof(btrfs_ioctl_search_args_v2) + m_buf_size;
m_ioctl_arg = vector_copy_struct<btrfs_ioctl_search_key>(this);
m_ioctl_arg.resize(target_buf_size);
m_result.clear();
// Make sure there is space for at least the search key and one (empty) header
size_t buf_size = max(m_buf_size, sizeof(btrfs_ioctl_search_args_v2) + sizeof(btrfs_ioctl_search_header));
ByteVector ioctl_arg;
btrfs_ioctl_search_args_v2 *ioctl_ptr;
do {
// ioctl buffer size does not include search key header or buffer size
ioctl_arg = ByteVector(buf_size + sizeof(btrfs_ioctl_search_args_v2));
ioctl_ptr = ioctl_arg.get<btrfs_ioctl_search_args_v2>();
ioctl_ptr->key = static_cast<const btrfs_ioctl_search_key&>(*this);
ioctl_ptr->buf_size = buf_size;
if (s_debug_ostream) {
(*s_debug_ostream) << "bisk " << (ioctl_ptr->key) << "\n";
}
// Don't bother supporting V1. Kernels that old have other problems.
int rv = ioctl(fd, BTRFS_IOC_TREE_SEARCH_V2, ioctl_arg.data());
++s_calls;
if (rv != 0 && errno == ENOENT) {
// If we are searching a tree that is deleted or no longer exists, just return an empty list
ioctl_ptr->key.nr_items = 0;
break;
}
if (rv != 0 && errno != EOVERFLOW) {
return false;
}
if (rv == 0 && nr_items <= ioctl_ptr->key.nr_items) {
// got all the items we wanted, thanks
m_buf_size = max(m_buf_size, buf_size);
break;
}
// Didn't get all the items we wanted. Increase the buf size and try again.
// These sizes are very common on default-formatted btrfs, so use these
// instead of naive doubling.
if (buf_size < 4096) {
buf_size = 4096;
} else if (buf_size < 16384) {
buf_size = 16384;
} else if (buf_size < 65536) {
buf_size = 65536;
} else {
buf_size *= 2;
}
// don't automatically raise the buf size higher than 64K, the largest possible btrfs item
++s_loops;
if (ioctl_ptr->key.nr_items == 0) {
++s_loops_empty;
}
} while (buf_size < 65536);
// ioctl changes nr_items, this has to be copied back
btrfs_ioctl_search_args_v2 *ioctl_ptr = reinterpret_cast<btrfs_ioctl_search_args_v2 *>(m_ioctl_arg.data());
ioctl_ptr->buf_size = m_buf_size;
// Don't bother supporting V1. Kernels that old have other problems.
int rv = ioctl(fd, BTRFS_IOC_TREE_SEARCH_V2, ioctl_ptr);
if (rv != 0) {
return false;
}
static_cast<btrfs_ioctl_search_key&>(*this) = ioctl_ptr->key;
size_t offset = pointer_distance(ioctl_ptr->buf, ioctl_ptr);
for (decltype(nr_items) i = 0; i < nr_items; ++i) {
BtrfsIoctlSearchHeader item;
offset = item.set_data(ioctl_arg, offset);
offset = item.set_data(m_ioctl_arg, offset);
m_result.insert(item);
}
return true;
}
@@ -831,7 +795,7 @@ namespace crucible {
BtrfsIoctlSearchKey::do_ioctl(int fd)
{
if (!do_ioctl_nothrow(fd)) {
THROW_ERRNO("BTRFS_IOC_TREE_SEARCH_V2: " << name_fd(fd) << ": " << *this);
THROW_ERRNO("BTRFS_IOC_TREE_SEARCH_V2: " << name_fd(fd));
}
}
@@ -842,67 +806,33 @@ namespace crucible {
min_type = ref.type;
min_offset = ref.offset + 1;
if (min_offset < ref.offset) {
// We wrapped, try the next type
++min_type;
assert(min_offset == 0);
if (min_type < ref.type) {
assert(min_type == 0);
// We wrapped, try the next objectid
++min_objectid;
// no advancement possible at end
THROW_CHECK1(runtime_error, min_type, min_type == 0);
}
// We wrapped, try the next objectid
++min_objectid;
}
}
void
BtrfsIoctlSearchKey::next_min(const BtrfsIoctlSearchHeader &ref, const uint8_t type)
template <class V>
ostream &
hexdump(ostream &os, const V &v)
{
if (ref.type < type) {
// forward to type in same object with zero offset
min_objectid = ref.objectid;
min_type = type;
min_offset = 0;
} else if (ref.type > type) {
// skip directly to start of next objectid with target type
min_objectid = ref.objectid + 1;
// no advancement possible at end
THROW_CHECK2(out_of_range, min_objectid, ref.objectid, min_objectid > ref.objectid);
min_type = type;
min_offset = 0;
} else {
// advance within this type
min_objectid = ref.objectid;
min_type = ref.type;
min_offset = ref.offset + 1;
if (min_offset < ref.offset) {
// We wrapped, try the next objectid, same type
++min_objectid;
THROW_CHECK2(out_of_range, min_objectid, ref.objectid, min_objectid > ref.objectid);
min_type = type;
assert(min_offset == 0);
os << "vector<uint8_t> { size = " << v.size() << ", data:\n";
for (size_t i = 0; i < v.size(); i += 8) {
string hex, ascii;
for (size_t j = i; j < i + 8; ++j) {
if (j < v.size()) {
uint8_t c = v[j];
char buf[8];
sprintf(buf, "%02x ", c);
hex += buf;
ascii += (c < 32 || c > 126) ? '.' : c;
} else {
hex += " ";
ascii += ' ';
}
}
os << astringprintf("\t%08x %s %s\n", i, hex.c_str(), ascii.c_str());
}
}
string
btrfs_chunk_type_ntoa(uint64_t type)
{
static const bits_ntoa_table table[] = {
NTOA_TABLE_ENTRY_BITS(BTRFS_BLOCK_GROUP_DATA),
NTOA_TABLE_ENTRY_BITS(BTRFS_BLOCK_GROUP_METADATA),
NTOA_TABLE_ENTRY_BITS(BTRFS_BLOCK_GROUP_SYSTEM),
NTOA_TABLE_ENTRY_BITS(BTRFS_BLOCK_GROUP_DUP),
NTOA_TABLE_ENTRY_BITS(BTRFS_BLOCK_GROUP_RAID0),
NTOA_TABLE_ENTRY_BITS(BTRFS_BLOCK_GROUP_RAID1),
NTOA_TABLE_ENTRY_BITS(BTRFS_BLOCK_GROUP_RAID10),
NTOA_TABLE_ENTRY_BITS(BTRFS_BLOCK_GROUP_RAID1C3),
NTOA_TABLE_ENTRY_BITS(BTRFS_BLOCK_GROUP_RAID1C4),
NTOA_TABLE_ENTRY_BITS(BTRFS_BLOCK_GROUP_RAID5),
NTOA_TABLE_ENTRY_BITS(BTRFS_BLOCK_GROUP_RAID6),
NTOA_TABLE_ENTRY_END()
};
return bits_ntoa(type, table);
return os << "}";
}
string
@@ -1099,9 +1029,9 @@ namespace crucible {
return rv;
}
Statvfs::Statvfs() :
statvfs( (statvfs) { } )
Statvfs::Statvfs()
{
memset_zero<statvfs>(this);
}
Statvfs::Statvfs(int fd) :
@@ -1152,27 +1082,17 @@ namespace crucible {
return os << " }";
};
BtrfsIoctlFsInfoArgs::BtrfsIoctlFsInfoArgs() :
btrfs_ioctl_fs_info_args_v3( (btrfs_ioctl_fs_info_args_v3) {
.flags = 0
| BTRFS_FS_INFO_FLAG_CSUM_INFO
| BTRFS_FS_INFO_FLAG_GENERATION
,
})
BtrfsIoctlFsInfoArgs::BtrfsIoctlFsInfoArgs()
{
}
bool
BtrfsIoctlFsInfoArgs::do_ioctl_nothrow(int const fd)
{
btrfs_ioctl_fs_info_args_v3 *p = static_cast<btrfs_ioctl_fs_info_args_v3 *>(this);
return 0 == ioctl(fd, BTRFS_IOC_FS_INFO, p);
memset_zero<btrfs_ioctl_fs_info_args_v2>(this);
flags = BTRFS_FS_INFO_FLAG_CSUM_INFO;
}
void
BtrfsIoctlFsInfoArgs::do_ioctl(int const fd)
BtrfsIoctlFsInfoArgs::do_ioctl(int fd)
{
if (!do_ioctl_nothrow(fd)) {
btrfs_ioctl_fs_info_args_v2 *p = static_cast<btrfs_ioctl_fs_info_args_v2 *>(this);
if (ioctl(fd, BTRFS_IOC_FS_INFO, p)) {
THROW_ERRNO("BTRFS_IOC_FS_INFO: fd " << fd);
}
}
@@ -1180,26 +1100,13 @@ namespace crucible {
uint16_t
BtrfsIoctlFsInfoArgs::csum_type() const
{
return this->btrfs_ioctl_fs_info_args_v3::csum_type;
return this->btrfs_ioctl_fs_info_args_v2::csum_type;
}
uint16_t
BtrfsIoctlFsInfoArgs::csum_size() const
{
return this->btrfs_ioctl_fs_info_args_v3::csum_size;
}
vector<uint8_t>
BtrfsIoctlFsInfoArgs::fsid() const
{
const auto begin = btrfs_ioctl_fs_info_args_v3::fsid;
return vector<uint8_t>(begin, begin + BTRFS_FSID_SIZE);
}
uint64_t
BtrfsIoctlFsInfoArgs::generation() const
{
return this->btrfs_ioctl_fs_info_args_v3::generation;
return this->btrfs_ioctl_fs_info_args_v2::csum_size;
}
};

View File

@@ -1,83 +0,0 @@
#include "crucible/multilock.h"
#include "crucible/error.h"
namespace crucible {
using namespace std;
MultiLocker::LockHandle::LockHandle(const string &type, MultiLocker &parent) :
m_type(type),
m_parent(parent)
{
}
void
MultiLocker::LockHandle::set_locked(const bool state)
{
m_locked = state;
}
MultiLocker::LockHandle::~LockHandle()
{
if (m_locked) {
m_parent.put_lock(m_type);
m_locked = false;
}
}
bool
MultiLocker::is_lock_available(const string &type)
{
for (const auto &i : m_counters) {
if (i.second != 0 && i.first != type) {
return false;
}
}
return true;
}
void
MultiLocker::put_lock(const string &type)
{
unique_lock<mutex> lock(m_mutex);
auto &counter = m_counters[type];
THROW_CHECK2(runtime_error, type, counter, counter > 0);
--counter;
if (counter == 0) {
m_cv.notify_all();
}
}
shared_ptr<MultiLocker::LockHandle>
MultiLocker::get_lock_private(const string &type)
{
unique_lock<mutex> lock(m_mutex);
m_counters.insert(make_pair(type, size_t(0)));
while (!is_lock_available(type)) {
m_cv.wait(lock);
}
const auto rv = make_shared<LockHandle>(type, *this);
++m_counters[type];
rv->set_locked(true);
return rv;
}
static MultiLocker s_process_instance;
shared_ptr<MultiLocker::LockHandle>
MultiLocker::get_lock(const string &type)
{
if (s_process_instance.m_do_locking) {
return s_process_instance.get_lock_private(type);
} else {
return shared_ptr<MultiLocker::LockHandle>();
}
}
void
MultiLocker::enable_locking(const bool enabled)
{
s_process_instance.m_do_locking = enabled;
}
}

View File

@@ -1,40 +0,0 @@
#include "crucible/openat2.h"
#include <sys/syscall.h>
// Compatibility for building on old libc for new kernel
#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 6, 0)
// Every arch that defines this uses 437, except Alpha, where 437 is
// mq_getsetattr.
#ifndef SYS_openat2
#ifdef __alpha__
#define SYS_openat2 547
#else
#define SYS_openat2 437
#endif
#endif
#endif // Linux version >= v5.6
#include <fcntl.h>
#include <unistd.h>
extern "C" {
int
__attribute__((weak))
openat2(int const dirfd, const char *const pathname, struct open_how *const how, size_t const size)
throw()
{
#ifdef SYS_openat2
return syscall(SYS_openat2, dirfd, pathname, how, size);
#else
errno = ENOSYS;
return -1;
#endif
}
};

View File

@@ -7,18 +7,13 @@
#include <cstdlib>
#include <utility>
// for gettid()
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <unistd.h>
#include <sys/syscall.h>
extern "C" {
pid_t
__attribute__((weak))
gettid() throw()
{
return syscall(SYS_gettid);
}
};
namespace crucible {
using namespace std;
@@ -116,6 +111,12 @@ namespace crucible {
}
}
pid_t
gettid()
{
return syscall(SYS_gettid);
}
double
getloadavg1()
{

View File

@@ -1,254 +0,0 @@
#include "crucible/table.h"
#include "crucible/string.h"
namespace crucible {
namespace Table {
using namespace std;
Content
Fill(const char c)
{
return [=](size_t width, size_t height) -> string {
string rv;
while (height--) {
rv += string(width, c);
if (height) {
rv += "\n";
}
}
return rv;
};
}
Content
Text(const string &s)
{
return [=](size_t width, size_t height) -> string {
const auto lines = split("\n", s);
string rv;
size_t line_count = 0;
for (const auto &i : lines) {
if (line_count++) {
rv += "\n";
}
if (i.length() < width) {
rv += string(width - i.length(), ' ');
}
rv += i;
}
while (line_count < height) {
if (line_count++) {
rv += "\n";
}
rv += string(width, ' ');
}
return rv;
};
}
Content
Number(const string &s)
{
return [=](size_t width, size_t height) -> string {
const auto lines = split("\n", s);
string rv;
size_t line_count = 0;
for (const auto &i : lines) {
if (line_count++) {
rv += "\n";
}
if (i.length() < width) {
rv += string(width - i.length(), ' ');
}
rv += i;
}
while (line_count < height) {
if (line_count++) {
rv += "\n";
}
rv += string(width, ' ');
}
return rv;
};
}
Cell::Cell(const Content &fn) :
m_content(fn)
{
}
Cell&
Cell::operator=(const Content &fn)
{
m_content = fn;
return *this;
}
string
Cell::text(size_t width, size_t height) const
{
return m_content(width, height);
}
size_t
Dimension::size() const
{
return m_elements.size();
}
size_t
Dimension::insert(size_t pos)
{
++m_next_pos;
const auto insert_pos = min(m_elements.size(), pos);
const auto it = m_elements.begin() + insert_pos;
m_elements.insert(it, m_next_pos);
return insert_pos;
}
void
Dimension::erase(size_t pos)
{
const auto it = m_elements.begin() + min(m_elements.size(), pos);
m_elements.erase(it);
}
size_t
Dimension::at(size_t pos) const
{
return m_elements.at(pos);
}
Dimension&
Table::rows()
{
return m_rows;
};
const Dimension&
Table::rows() const
{
return m_rows;
};
Dimension&
Table::cols()
{
return m_cols;
};
const Dimension&
Table::cols() const
{
return m_cols;
};
const Cell&
Table::at(size_t row, size_t col) const
{
const auto row_idx = m_rows.at(row);
const auto col_idx = m_cols.at(col);
const auto found = m_cells.find(make_pair(row_idx, col_idx));
if (found == m_cells.end()) {
static const Cell s_empty(Fill('.'));
return s_empty;
}
return found->second;
};
Cell&
Table::at(size_t row, size_t col)
{
const auto row_idx = m_rows.at(row);
const auto col_idx = m_cols.at(col);
return m_cells[make_pair(row_idx, col_idx)];
};
static
pair<size_t, size_t>
text_size(const string &s)
{
const auto s_split = split("\n", s);
size_t width = 0;
for (const auto &i : s_split) {
width = max(width, i.length());
}
return make_pair(width, s_split.size());
}
ostream& operator<<(ostream &os, const Table &table)
{
const auto rows = table.rows().size();
const auto cols = table.cols().size();
vector<size_t> row_heights(rows, 1);
vector<size_t> col_widths(cols, 1);
// Get the size of all fixed- and minimum-sized content cells
for (size_t row = 0; row < table.rows().size(); ++row) {
vector<string> col_text;
for (size_t col = 0; col < table.cols().size(); ++col) {
col_text.push_back(table.at(row, col).text(0, 0));
const auto tsize = text_size(*col_text.rbegin());
row_heights[row] = max(row_heights[row], tsize.second);
col_widths[col] = max(col_widths[col], tsize.first);
}
}
// Render the table
for (size_t row = 0; row < table.rows().size(); ++row) {
vector<string> lines(row_heights[row], "");
for (size_t col = 0; col < table.cols().size(); ++col) {
const auto& table_cell = table.at(row, col);
const auto table_text = table_cell.text(col_widths[col], row_heights[row]);
auto col_lines = split("\n", table_text);
col_lines.resize(row_heights[row], "");
for (size_t line = 0; line < row_heights[row]; ++line) {
if (col > 0) {
lines[line] += table.mid();
}
lines[line] += col_lines[line];
}
}
for (const auto &line : lines) {
os << table.left() << line << table.right() << "\n";
}
}
return os;
}
void
Table::left(const string &s)
{
m_left = s;
}
void
Table::mid(const string &s)
{
m_mid = s;
}
void
Table::right(const string &s)
{
m_right = s;
}
const string&
Table::left() const
{
return m_left;
}
const string&
Table::mid() const
{
return m_mid;
}
const string&
Table::right() const
{
return m_right;
}
}
}

View File

@@ -18,27 +18,6 @@
namespace crucible {
using namespace std;
static const size_t thread_name_length = 15; // TASK_COMM_LEN on Linux
void
pthread_setname(const string &name)
{
auto name_copy = name.substr(0, thread_name_length);
// Don't care if a debugging facility fails
pthread_setname_np(pthread_self(), name_copy.c_str());
}
string
pthread_getname()
{
char buf[thread_name_length + 1] = { 0 };
// We'll get an empty name if this fails...
pthread_getname_np(pthread_self(), buf, sizeof(buf));
// ...or at least null-terminated garbage
buf[thread_name_length] = '\0';
return buf;
}
class TaskState;
using TaskStatePtr = shared_ptr<TaskState>;
using TaskStateWeak = weak_ptr<TaskState>;
@@ -51,8 +30,7 @@ namespace crucible {
static thread_local TaskStatePtr tl_current_task;
/// because we don't want to bump -std=c++-17 just to get scoped_lock.
/// Also we don't want to self-deadlock if both mutexes are the same mutex.
/// because we don't want to bump -std=c++-17 just to get scoped_lock
class PairLock {
unique_lock<mutex> m_lock1, m_lock2;
public:
@@ -76,24 +54,13 @@ namespace crucible {
/// Tasks to be executed after the current task is executed
list<TaskStatePtr> m_post_exec_queue;
/// Set by run(), append(), and insert(). Cleared by exec().
bool m_run_now = false;
/// Set by insert(). Cleared by exec() and destructor.
bool m_sort_queue = false;
/// Incremented by run() and append(). Decremented by exec().
size_t m_run_count = 0;
/// Set when task starts execution by exec().
/// Cleared when exec() ends.
bool m_is_running = false;
/// Set when task is queued while already running.
/// Cleared when task is requeued.
bool m_run_again = false;
/// Set when task is queued as idle task while already running.
/// Cleared when task is queued as non-idle task.
bool m_idle = false;
/// Sequential identifier for next task
static atomic<TaskId> s_next_id;
@@ -118,11 +85,10 @@ namespace crucible {
static void clear_queue(TaskQueue &tq);
/// Rescue any TaskQueue, not just this one.
static void rescue_queue(TaskQueue &tq, const bool sort_queue);
static void rescue_queue(TaskQueue &tq);
TaskState &operator=(const TaskState &) = delete;
TaskState(const TaskState &) = delete;
TaskState(TaskState &&) = delete;
public:
~TaskState();
@@ -135,9 +101,6 @@ namespace crucible {
/// instance at the end of TaskMaster's global queue.
void run();
/// Run the task when there are no more Tasks on the main queue.
void idle();
/// Execute task immediately in current thread if it is not already
/// executing in another thread; otherwise, append the current task
/// to itself to be executed immediately in the other thread.
@@ -153,10 +116,6 @@ namespace crucible {
/// or is destroyed.
void append(const TaskStatePtr &task);
/// Queue task to execute after current task finishes executing
/// or is destroyed, in task ID order.
void insert(const TaskStatePtr &task);
/// How masy Tasks are there? Good for catching leaks
static size_t instance_count();
};
@@ -168,7 +127,6 @@ namespace crucible {
mutex m_mutex;
condition_variable m_condvar;
TaskQueue m_queue;
TaskQueue m_idle_queue;
size_t m_thread_max;
size_t m_thread_min = 0;
set<TaskConsumerPtr> m_threads;
@@ -178,8 +136,6 @@ namespace crucible {
size_t m_configured_thread_max;
double m_thread_target;
bool m_cancelled = false;
bool m_paused = false;
TaskMaster::LoadStats m_load_stats;
friend class TaskConsumer;
friend class TaskMaster;
@@ -193,7 +149,6 @@ namespace crucible {
void set_loadavg_target(double target);
void loadavg_thread_fn();
void cancel();
void pause(bool paused = true);
TaskMasterState &operator=(const TaskMasterState &) = delete;
TaskMasterState(const TaskMasterState &) = delete;
@@ -203,11 +158,9 @@ namespace crucible {
TaskMasterState(size_t thread_max = thread::hardware_concurrency());
static void push_back(const TaskStatePtr &task);
static void push_back_idle(const TaskStatePtr &task);
static void push_front(TaskQueue &queue);
size_t get_queue_count();
size_t get_thread_count();
static TaskMaster::LoadStats get_current_load();
};
class TaskConsumer : public enable_shared_from_this<TaskConsumer> {
@@ -234,47 +187,30 @@ namespace crucible {
static auto s_tms = make_shared<TaskMasterState>();
void
TaskState::rescue_queue(TaskQueue &queue, const bool sort_queue)
TaskState::rescue_queue(TaskQueue &queue)
{
if (queue.empty()) {
return;
}
const auto &tlcc = tl_current_consumer;
auto tlcc = tl_current_consumer;
if (tlcc) {
// We are executing under a TaskConsumer, splice our post-exec queue at front.
// No locks needed because we are using only thread-local objects.
tlcc->m_local_queue.splice(tlcc->m_local_queue.begin(), queue);
if (sort_queue) {
tlcc->m_local_queue.sort([&](const TaskStatePtr &a, const TaskStatePtr &b) {
return a->m_id < b->m_id;
});
}
} else {
// We are not executing under a TaskConsumer.
// If there is only one task, then just insert it at the front of the queue.
if (queue.size() == 1) {
TaskMasterState::push_front(queue);
} else {
// If there are multiple tasks, create a new task to wrap our post-exec queue,
// then push it to the front of the global queue using normal locking methods.
TaskStatePtr rescue_task(make_shared<TaskState>("rescue_task", [](){}));
swap(rescue_task->m_post_exec_queue, queue);
// Do the sort--once--when a new Consumer has picked up the Task
rescue_task->m_sort_queue = sort_queue;
TaskQueue tq_one { rescue_task };
TaskMasterState::push_front(tq_one);
}
// Create a new task to wrap our post-exec queue,
// then push it to the front of the global queue using normal locking methods.
TaskStatePtr rescue_task(make_shared<TaskState>("rescue_task", [](){}));
swap(rescue_task->m_post_exec_queue, queue);
TaskQueue tq_one { rescue_task };
TaskMasterState::push_front(tq_one);
}
assert(queue.empty());
}
TaskState::~TaskState()
{
--s_instance_count;
unique_lock<mutex> lock(m_mutex);
// If any dependent Tasks were appended since the last exec, run them now
TaskState::rescue_queue(m_post_exec_queue, m_sort_queue);
// No need to clear m_sort_queue here, it won't exist soon
}
TaskState::TaskState(string title, function<void()> exec_fn) :
@@ -311,10 +247,11 @@ namespace crucible {
void
TaskState::clear_queue(TaskQueue &tq)
{
for (auto &i : tq) {
while (!tq.empty()) {
auto i = *tq.begin();
tq.pop_front();
i->clear();
}
tq.clear();
}
void
@@ -329,28 +266,10 @@ namespace crucible {
{
THROW_CHECK0(invalid_argument, task);
PairLock lock(m_mutex, task->m_mutex);
if (!task->m_run_now) {
task->m_run_now = true;
if (!task->m_run_count) {
++task->m_run_count;
append_nolock(task);
}
task->m_idle = false;
}
void
TaskState::insert(const TaskStatePtr &task)
{
THROW_CHECK0(invalid_argument, task);
THROW_CHECK2(invalid_argument, m_id, task->m_id, m_id != task->m_id);
PairLock lock(m_mutex, task->m_mutex);
if (!task->m_run_now) {
task->m_run_now = true;
// Move the task and its post-exec queue to follow this task,
// and request a sort of the flattened list.
m_sort_queue = true;
m_post_exec_queue.push_back(task);
m_post_exec_queue.splice(m_post_exec_queue.end(), task->m_post_exec_queue);
}
task->m_idle = false;
}
void
@@ -361,44 +280,33 @@ namespace crucible {
unique_lock<mutex> lock(m_mutex);
if (m_is_running) {
m_run_again = true;
append_nolock(shared_from_this());
return;
} else {
m_run_now = false;
--m_run_count;
m_is_running = true;
}
lock.unlock();
char buf[24] = { 0 };
DIE_IF_MINUS_ERRNO(pthread_getname_np(pthread_self(), buf, sizeof(buf)));
DIE_IF_MINUS_ERRNO(pthread_setname_np(pthread_self(), m_title.c_str()));
TaskStatePtr this_task = shared_from_this();
swap(this_task, tl_current_task);
lock.unlock();
const auto old_thread_name = pthread_getname();
pthread_setname(m_title);
catch_all([&]() {
m_exec_fn();
});
pthread_setname(old_thread_name);
swap(this_task, tl_current_task);
pthread_setname_np(pthread_self(), buf);
lock.lock();
swap(this_task, tl_current_task);
m_is_running = false;
if (m_run_again) {
m_run_again = false;
if (m_idle) {
// All the way back to the end of the line
TaskMasterState::push_back_idle(shared_from_this());
} else {
// Insert after any dependents waiting for this Task
m_post_exec_queue.push_back(shared_from_this());
}
}
// Splice task post_exec queue at front of local queue
TaskState::rescue_queue(m_post_exec_queue, m_sort_queue);
m_sort_queue = false;
TaskState::rescue_queue(m_post_exec_queue);
}
string
@@ -418,46 +326,24 @@ namespace crucible {
TaskState::run()
{
unique_lock<mutex> lock(m_mutex);
m_idle = false;
if (m_run_now) {
if (m_run_count) {
return;
}
m_run_now = true;
if (m_is_running) {
m_run_again = true;
} else {
TaskMasterState::push_back(shared_from_this());
}
}
void
TaskState::idle()
{
unique_lock<mutex> lock(m_mutex);
m_idle = true;
if (m_run_now) {
return;
}
m_run_now = true;
if (m_is_running) {
m_run_again = true;
} else {
TaskMasterState::push_back_idle(shared_from_this());
}
++m_run_count;
TaskMasterState::push_back(shared_from_this());
}
TaskMasterState::TaskMasterState(size_t thread_max) :
m_thread_max(thread_max),
m_configured_thread_max(thread_max),
m_thread_target(thread_max),
m_load_stats(TaskMaster::LoadStats { 0 })
m_thread_target(thread_max)
{
}
void
TaskMasterState::start_threads_nolock()
{
while (m_threads.size() < m_thread_max && !m_paused) {
while (m_threads.size() < m_thread_max) {
m_threads.insert(make_shared<TaskConsumer>(shared_from_this()));
}
}
@@ -489,20 +375,6 @@ namespace crucible {
s_tms->start_threads_nolock();
}
void
TaskMasterState::push_back_idle(const TaskStatePtr &task)
{
THROW_CHECK0(runtime_error, task);
unique_lock<mutex> lock(s_tms->m_mutex);
if (s_tms->m_cancelled) {
task->clear();
return;
}
s_tms->m_idle_queue.push_back(task);
s_tms->m_condvar.notify_all();
s_tms->start_threads_nolock();
}
void
TaskMasterState::push_front(TaskQueue &queue)
{
@@ -538,37 +410,16 @@ namespace crucible {
return s_tms->m_threads.size();
}
TaskMaster::LoadStats
TaskMaster::get_current_load()
{
unique_lock<mutex> lock(s_tms->m_mutex);
return s_tms->m_load_stats;
}
ostream &
TaskMaster::print_queue(ostream &os)
{
unique_lock<mutex> lock(s_tms->m_mutex);
auto queue_copy = s_tms->m_queue;
lock.unlock();
os << "Queue (size " << queue_copy.size() << "):" << endl;
os << "Queue (size " << s_tms->m_queue.size() << "):" << endl;
size_t counter = 0;
for (auto i : queue_copy) {
for (auto i : s_tms->m_queue) {
os << "Queue #" << ++counter << " Task ID " << i->id() << " " << i->title() << endl;
}
os << "Queue End" << endl;
lock.lock();
queue_copy = s_tms->m_idle_queue;
lock.unlock();
os << "Idle (size " << queue_copy.size() << "):" << endl;
counter = 0;
for (const auto &i : queue_copy) {
os << "Idle #" << ++counter << " Task ID " << i->id() << " " << i->title() << endl;
}
os << "Idle End" << endl;
return os;
return os << "Queue End" << endl;
}
ostream &
@@ -593,6 +444,11 @@ namespace crucible {
size_t
TaskMasterState::calculate_thread_count_nolock()
{
if (m_cancelled) {
// No threads running while cancelled
return 0;
}
if (m_load_target == 0) {
// No limits, no stats, use configured thread count
return m_configured_thread_max;
@@ -621,20 +477,18 @@ namespace crucible {
m_prev_loadavg = loadavg;
const double load_deficit = m_load_target - loadavg;
if (load_deficit > 0) {
// Load is too low, solve by adding another worker
m_thread_target += load_deficit / 3;
} else if (load_deficit < 0) {
// Load is too high, solve by removing all known excess tasks
m_thread_target += load_deficit;
}
// Change the thread target based on the
// difference between current and desired load
// but don't get too close all at once due to rounding and sample error.
// If m_load_target < 1.0 then we are just doing PWM with one thread.
m_load_stats = TaskMaster::LoadStats {
.current_load = current_load,
.thread_target = m_thread_target,
.loadavg = loadavg,
};
if (m_load_target <= 1.0) {
m_thread_target = 1.0;
} else if (m_load_target - current_load >= 1.0) {
m_thread_target += (m_load_target - current_load - 1.0) / 2.0;
} else if (m_load_target < current_load) {
m_thread_target += m_load_target - current_load;
}
// Cannot exceed configured maximum thread count or less than zero
m_thread_target = min(max(0.0, m_thread_target), double(m_configured_thread_max));
@@ -665,6 +519,12 @@ namespace crucible {
TaskMasterState::set_thread_count(size_t thread_max)
{
unique_lock<mutex> lock(m_mutex);
// XXX: someday we might want to uncancel, and this would be the place to do it;
// however, when we cancel we destroy the entire Task queue, and that might be
// non-trivial to recover from
if (m_cancelled) {
return;
}
m_configured_thread_max = thread_max;
lock.unlock();
adjust_thread_count();
@@ -681,11 +541,9 @@ namespace crucible {
TaskMasterState::cancel()
{
unique_lock<mutex> lock(m_mutex);
m_paused = true;
m_cancelled = true;
decltype(m_queue) empty_queue;
m_queue.swap(empty_queue);
empty_queue.splice(empty_queue.end(), m_idle_queue);
m_condvar.notify_all();
lock.unlock();
TaskState::clear_queue(empty_queue);
@@ -697,28 +555,14 @@ namespace crucible {
s_tms->cancel();
}
void
TaskMasterState::pause(const bool paused)
{
unique_lock<mutex> lock(m_mutex);
m_paused = paused;
m_condvar.notify_all();
if (!m_paused) {
start_threads_nolock();
}
lock.unlock();
}
void
TaskMaster::pause(const bool paused)
{
s_tms->pause(paused);
}
void
TaskMasterState::set_thread_min_count(size_t thread_min)
{
unique_lock<mutex> lock(m_mutex);
// XXX: someday we might want to uncancel, and this would be the place to do it
if (m_cancelled) {
return;
}
m_thread_min = thread_min;
lock.unlock();
adjust_thread_count();
@@ -734,7 +578,7 @@ namespace crucible {
void
TaskMasterState::loadavg_thread_fn()
{
pthread_setname("load_tracker");
pthread_setname_np(pthread_self(), "load_tracker");
while (!m_cancelled) {
adjust_thread_count();
nanosleep(5.0);
@@ -788,13 +632,6 @@ namespace crucible {
m_task_state->run();
}
void
Task::idle() const
{
THROW_CHECK0(runtime_error, m_task_state);
m_task_state->idle();
}
void
Task::append(const Task &that) const
{
@@ -803,14 +640,6 @@ namespace crucible {
m_task_state->append(that.m_task_state);
}
void
Task::insert(const Task &that) const
{
THROW_CHECK0(runtime_error, m_task_state);
THROW_CHECK0(runtime_error, that);
m_task_state->insert(that.m_task_state);
}
Task
Task::current_task()
{
@@ -865,7 +694,7 @@ namespace crucible {
TaskConsumer::consumer_thread()
{
// Keep a copy because we will be destroying *this later
const auto master_copy = m_master;
auto master_copy = m_master;
// Constructor is running with master locked.
// Wait until that is done before trying to do anything.
@@ -875,13 +704,13 @@ namespace crucible {
m_thread->detach();
// Set thread name so it isn't empty or the name of some other thread
pthread_setname("task_consumer");
DIE_IF_MINUS_ERRNO(pthread_setname_np(pthread_self(), "task_consumer"));
// It is now safe to access our own shared_ptr
TaskConsumerPtr this_consumer = shared_from_this();
swap(this_consumer, tl_current_consumer);
while (!master_copy->m_paused) {
while (!master_copy->m_cancelled) {
if (master_copy->m_thread_max < master_copy->m_threads.size()) {
// We are one of too many threads, exit now
break;
@@ -893,9 +722,6 @@ namespace crucible {
} else if (!master_copy->m_queue.empty()) {
m_current_task = *master_copy->m_queue.begin();
master_copy->m_queue.pop_front();
} else if (!master_copy->m_idle_queue.empty()) {
m_current_task = *master_copy->m_idle_queue.begin();
master_copy->m_idle_queue.pop_front();
} else {
master_copy->m_condvar.wait(lock);
continue;
@@ -923,15 +749,12 @@ namespace crucible {
// There is no longer a current consumer, but hold our own shared
// state so it's still there in the destructor
swap(this_consumer, tl_current_consumer);
assert(!tl_current_consumer);
// Release lock to rescue queue (may attempt to queue a
// new task at TaskMaster). rescue_queue normally sends
// tasks to the local queue of the current TaskConsumer
// thread, but we just disconnected ourselves from that.
// No sorting here because this is not a TaskState.
// Release lock to rescue queue (may attempt to queue a new task at TaskMaster).
// rescue_queue normally sends tasks to the local queue of the current TaskConsumer thread,
// but we just disconnected ourselves from that.
lock.unlock();
TaskState::rescue_queue(m_local_queue, false);
TaskState::rescue_queue(m_local_queue);
// Hold lock so we can erase ourselves
lock.lock();
@@ -957,16 +780,24 @@ namespace crucible {
void insert_task(Task t);
};
Barrier::Barrier(shared_ptr<BarrierState> pbs) :
m_barrier_state(pbs)
{
}
Barrier::Barrier() :
m_barrier_state(make_shared<BarrierState>())
{
}
void
BarrierState::release()
{
set<Task> tasks_local;
unique_lock<mutex> lock(m_mutex);
swap(tasks_local, m_tasks);
lock.unlock();
for (const auto &i : tasks_local) {
for (auto i : m_tasks) {
i.run();
}
m_tasks.clear();
}
BarrierState::~BarrierState()
@@ -974,6 +805,17 @@ namespace crucible {
release();
}
BarrierLock::BarrierLock(shared_ptr<BarrierState> pbs) :
m_barrier_state(pbs)
{
}
void
BarrierLock::release()
{
m_barrier_state.reset();
}
void
BarrierState::insert_task(Task t)
{
@@ -981,54 +823,122 @@ namespace crucible {
m_tasks.insert(t);
}
Barrier::Barrier() :
m_barrier_state(make_shared<BarrierState>())
{
}
void
Barrier::insert_task(Task t)
{
m_barrier_state->insert_task(t);
}
void
Barrier::release()
BarrierLock
Barrier::lock()
{
m_barrier_state.reset();
return BarrierLock(m_barrier_state);
}
ExclusionLock::ExclusionLock(shared_ptr<Task> owner) :
m_owner(owner)
class ExclusionState {
mutex m_mutex;
bool m_locked = false;
Task m_task;
public:
ExclusionState(const string &title);
~ExclusionState();
void release();
bool try_lock();
void insert_task(Task t);
};
Exclusion::Exclusion(shared_ptr<ExclusionState> pbs) :
m_exclusion_state(pbs)
{
}
Exclusion::Exclusion(const string &title) :
m_exclusion_state(make_shared<ExclusionState>(title))
{
}
ExclusionState::ExclusionState(const string &title) :
m_task(title, [](){})
{
}
void
ExclusionState::release()
{
unique_lock<mutex> lock(m_mutex);
m_locked = false;
m_task.run();
}
ExclusionState::~ExclusionState()
{
release();
}
ExclusionLock::ExclusionLock(shared_ptr<ExclusionState> pbs) :
m_exclusion_state(pbs)
{
}
void
ExclusionLock::release()
{
m_owner.reset();
if (m_exclusion_state) {
m_exclusion_state->release();
m_exclusion_state.reset();
}
}
ExclusionLock
Exclusion::try_lock(const Task &task)
ExclusionLock::~ExclusionLock()
{
release();
}
void
ExclusionState::insert_task(Task task)
{
unique_lock<mutex> lock(m_mutex);
const auto sp = m_owner.lock();
if (sp) {
if (task) {
sp->insert(task);
}
return ExclusionLock();
if (m_locked) {
// If Exclusion is locked then queue task for release;
m_task.append(task);
} else {
const auto rv = make_shared<Task>(task);
m_owner = rv;
return ExclusionLock(rv);
// otherwise, run the inserted task immediately
task.run();
}
}
bool
ExclusionState::try_lock()
{
unique_lock<mutex> lock(m_mutex);
if (m_locked) {
return false;
} else {
m_locked = true;
return true;
}
}
void
Exclusion::insert_task(Task t)
{
m_exclusion_state->insert_task(t);
}
ExclusionLock::operator bool() const
{
return !!m_owner;
return !!m_exclusion_state;
}
ExclusionLock
Exclusion::try_lock()
{
THROW_CHECK0(runtime_error, m_exclusion_state);
if (m_exclusion_state->try_lock()) {
return ExclusionLock(m_exclusion_state);
} else {
return ExclusionLock();
}
}
}

View File

@@ -98,16 +98,12 @@ namespace crucible {
m_rate(rate),
m_burst(burst)
{
THROW_CHECK1(invalid_argument, m_rate, m_rate > 0);
THROW_CHECK1(invalid_argument, m_burst, m_burst >= 0);
}
RateLimiter::RateLimiter(double rate) :
m_rate(rate),
m_burst(rate)
{
THROW_CHECK1(invalid_argument, m_rate, m_rate > 0);
THROW_CHECK1(invalid_argument, m_burst, m_burst >= 0);
}
void
@@ -123,7 +119,6 @@ namespace crucible {
double
RateLimiter::sleep_time(double cost)
{
THROW_CHECK1(invalid_argument, m_rate, m_rate > 0);
borrow(cost);
unique_lock<mutex> lock(m_mutex);
update_tokens();
@@ -159,21 +154,6 @@ namespace crucible {
m_tokens -= cost;
}
void
RateLimiter::rate(double const new_rate)
{
THROW_CHECK1(invalid_argument, new_rate, new_rate > 0);
unique_lock<mutex> lock(m_mutex);
m_rate = new_rate;
}
double
RateLimiter::rate() const
{
unique_lock<mutex> lock(m_mutex);
return m_rate;
}
RateEstimator::RateEstimator(double min_delay, double max_delay) :
m_min_delay(min_delay),
m_max_delay(max_delay)
@@ -222,13 +202,6 @@ namespace crucible {
}
}
void
RateEstimator::increment(const uint64_t more)
{
unique_lock<mutex> lock(m_mutex);
return update_unlocked(m_last_count + more);
}
uint64_t
RateEstimator::count() const
{

View File

@@ -1,11 +0,0 @@
#include "crucible/error.h"
#include "crucible/uname.h"
namespace crucible {
using namespace std;
Uname::Uname()
{
DIE_IF_NON_ZERO(uname(static_cast<utsname*>(this)));
}
}

View File

@@ -10,4 +10,4 @@ CCFLAGS = -Wall -Wextra -Werror -O3
CCFLAGS += -I../include -D_FILE_OFFSET_BITS=64
BEES_CFLAGS = $(CCFLAGS) -std=c99 $(CFLAGS)
BEES_CXXFLAGS = $(CCFLAGS) -std=c++11 -Wold-style-cast -Wno-missing-field-initializers $(CXXFLAGS)
BEES_CXXFLAGS = $(CCFLAGS) -std=c++11 -Wold-style-cast $(CXXFLAGS)

View File

@@ -1,13 +1,5 @@
#!/bin/bash
# if not called from systemd try to replicate mount unsharing on ctrl+c
# see: https://github.com/Zygo/bees/issues/281
if [ -z "${SYSTEMD_EXEC_PID}" -a -z "${UNSHARE_DONE}" ]; then
UNSHARE_DONE=true
export UNSHARE_DONE
exec unshare -m --propagation private -- "$0" "$@"
fi
## Helpful functions
INFO(){ echo "INFO:" "$@"; }
ERRO(){ echo "ERROR:" "$@"; exit 1; }
@@ -23,7 +15,7 @@ readonly AL128K="$((128*1024))"
readonly AL16M="$((16*1024*1024))"
readonly CONFIG_DIR=@ETC_PREFIX@/bees/
readonly bees_bin=$(realpath @DESTDIR@/@LIBEXEC_PREFIX@/bees)
readonly bees_bin=$(realpath @LIBEXEC_PREFIX@/bees)
command -v "$bees_bin" &> /dev/null || ERRO "Missing 'bees' agent"
@@ -116,11 +108,13 @@ mkdir -p "$WORK_DIR" || exit 1
INFO "MOUNT DIR: $MNT_DIR"
mkdir -p "$MNT_DIR" || exit 1
mount --make-private -osubvolid=5,nodev,noexec /dev/disk/by-uuid/$UUID "$MNT_DIR" || exit 1
mount --make-private -osubvolid=5 /dev/disk/by-uuid/$UUID "$MNT_DIR" || exit 1
if [ ! -d "$BEESHOME" ]; then
INFO "Create subvol $BEESHOME for store bees data"
btrfs sub cre "$BEESHOME"
else
btrfs sub show "$BEESHOME" &> /dev/null || ERRO "$BEESHOME MUST BE A SUBVOL!"
fi
# Check DB size
@@ -134,7 +128,7 @@ fi
fi
if (( "$OLD_SIZE" != "$NEW_SIZE" )); then
INFO "Resize db: $OLD_SIZE -> $NEW_SIZE"
rm -f "$BEESHOME/beescrawl.dat"
[ -f "$BEESHOME/beescrawl.$UUID.dat" ] && rm "$BEESHOME/beescrawl.$UUID.dat"
truncate -s $NEW_SIZE $DB_PATH
fi
chmod 700 "$DB_PATH"

View File

@@ -17,7 +17,6 @@ KillSignal=SIGTERM
MemoryAccounting=true
Nice=19
Restart=on-abnormal
RuntimeDirectory=bees
StartupCPUWeight=25
StartupIOWeight=25

View File

@@ -1,6 +1,11 @@
BEES = ../bin/bees
PROGRAMS = \
../bin/fiemap \
../bin/fiewalk \
all: $(BEES)
PROGRAM_OBJS = $(foreach b,$(PROGRAMS),$(patsubst ../bin/%,%.o,$(b)))
all: $(BEES) $(PROGRAMS)
include ../makeflags
-include ../localconf
@@ -20,18 +25,25 @@ BEES_OBJS = \
ALL_OBJS = $(BEES_OBJS) $(PROGRAM_OBJS)
bees-version.c: bees.h $(BEES_OBJS:.o=.cc) Makefile ../lib/libcrucible.a
echo "const char *BEES_VERSION = \"$(BEES_VERSION)\";" > bees-version.c.new
if ! [ -e "$@" ] || ! cmp -s "$@.new" "$@"; then mv -fv $@.new $@; fi
bees-version.c: bees.h $(BEES_OBJS:.o=.cc) Makefile
echo "const char *BEES_VERSION = \"$(BEES_VERSION)\";" > bees-version.new.c
mv -f bees-version.new.c bees-version.c
bees-usage.c: bees-usage.txt Makefile
(echo 'const char *BEES_USAGE = '; sed -r 's/^(.*)$$/"\1\\n"/' < bees-usage.txt; echo ';') > bees-usage.new.c
mv -f bees-usage.new.c bees-usage.c
%.dep: %.cc Makefile
.depends:
mkdir -p $@
.depends/%.dep: %.cc Makefile | .depends
$(CXX) $(BEES_CXXFLAGS) -M -MF $@ -MT $(<:.cc=.o) $<
include $(ALL_OBJS:%.o=%.dep)
depends.mk: $(ALL_OBJS:%.o=.depends/%.dep)
cat $^ > $@.new
mv -f $@.new $@
include depends.mk
%.o: %.c ../makeflags
$(CC) $(BEES_CFLAGS) -o $@ -c $<
@@ -39,6 +51,11 @@ include $(ALL_OBJS:%.o=%.dep)
%.o: %.cc ../makeflags
$(CXX) $(BEES_CXXFLAGS) -o $@ -c $<
$(PROGRAMS): ../bin/%: %.o
$(CXX) $(BEES_CXXFLAGS) $(BEES_LDFLAGS) -o $@ $< $(LIBS)
$(PROGRAMS): ../lib/libcrucible.a
$(BEES): $(BEES_OBJS) bees-version.o bees-usage.o ../lib/libcrucible.a
$(CXX) $(BEES_CXXFLAGS) $(BEES_LDFLAGS) -o $@ $^ $(LIBS)

File diff suppressed because it is too large Load Diff

View File

@@ -3,9 +3,9 @@
#include "crucible/city.h"
#include "crucible/crc64.h"
#include "crucible/string.h"
#include "crucible/uname.h"
#include <algorithm>
#include <random>
#include <sys/mman.h>
@@ -106,6 +106,12 @@ BeesHashTable::flush_dirty_extent(uint64_t extent_index)
BEESNOTE("flushing extent #" << extent_index << " of " << m_extents << " extents");
auto lock = lock_extent_by_index(extent_index);
// Not dirty, nothing to do
if (!m_extent_metadata.at(extent_index).m_dirty) {
return false;
}
bool wrote_extent = false;
catch_all([&]() {
@@ -117,7 +123,10 @@ BeesHashTable::flush_dirty_extent(uint64_t extent_index)
THROW_CHECK2(out_of_range, dirty_extent_end, dirty_extent, dirty_extent_end - dirty_extent == BLOCK_SIZE_HASHTAB_EXTENT);
BEESTOOLONG("pwrite(fd " << m_fd << " '" << name_fd(m_fd)<< "', length " << to_hex(dirty_extent_end - dirty_extent) << ", offset " << to_hex(dirty_extent - m_byte_ptr) << ")");
// Copy the extent because we might be stuck writing for a while
ByteVector extent_copy(dirty_extent, dirty_extent_end);
vector<uint8_t> extent_copy(dirty_extent, dirty_extent_end);
// Mark extent non-dirty while we still hold the lock
m_extent_metadata.at(extent_index).m_dirty = false;
// Release the lock
lock.unlock();
@@ -130,10 +139,6 @@ BeesHashTable::flush_dirty_extent(uint64_t extent_index)
// const size_t dirty_extent_size = dirty_extent_end - dirty_extent;
// bees_unreadahead(m_fd, dirty_extent_offset, dirty_extent_size);
// Mark extent clean if write was successful
lock.lock();
m_extent_metadata.at(extent_index).m_dirty = false;
wrote_extent = true;
});
@@ -147,28 +152,25 @@ BeesHashTable::flush_dirty_extents(bool slowly)
uint64_t wrote_extents = 0;
for (size_t extent_index = 0; extent_index < m_extents; ++extent_index) {
// Skip the clean ones
auto lock = lock_extent_by_index(extent_index);
if (!m_extent_metadata.at(extent_index).m_dirty) {
continue;
}
lock.unlock();
if (flush_dirty_extent(extent_index)) {
++wrote_extents;
if (slowly) {
if (m_stop_requested) {
slowly = false;
continue;
}
BEESNOTE("flush rate limited after extent #" << extent_index << " of " << m_extents << " extents");
chrono::duration<double> sleep_time(m_flush_rate_limit.sleep_time(BLOCK_SIZE_HASHTAB_EXTENT));
unique_lock<mutex> lock(m_stop_mutex);
if (m_stop_requested) {
BEESLOGDEBUG("Stop requested in hash table flush_dirty_extents");
// This function is called by another thread with !slowly,
// so we just get out of the way here.
break;
}
m_stop_condvar.wait_for(lock, sleep_time);
}
}
}
BEESLOGINFO("Flushed " << wrote_extents << " of " << m_extents << " hash table extents");
if (!slowly) {
BEESLOGINFO("Flushed " << wrote_extents << " of " << m_extents << " extents");
}
return wrote_extents;
}
@@ -202,28 +204,10 @@ BeesHashTable::writeback_loop()
m_dirty_condvar.wait(lock);
}
}
// The normal loop exits at the end of one iteration when stop requested,
// but stop request will be in the middle of the loop, and some extents
// will still be dirty. Run the flush loop again to get those.
BEESNOTE("flushing hash table, round 2");
BEESLOGDEBUG("Flushing hash table");
flush_dirty_extents(false);
// If there were any Tasks still running, they may have updated
// some hash table pages during the second flush. These updates
// will be lost. The Tasks will be repeated on the next run because
// they were not completed prior to the stop request, and the
// Crawl progress was already flushed out before the Hash table
// started writing, so nothing is really lost here.
catch_all([&]() {
// trigger writeback on our way out
#if 0
// seems to trigger huge latency spikes
BEESTOOLONG("unreadahead hash table size " <<
pretty(m_size)); bees_unreadahead(m_fd, 0, m_size);
#endif
BEESTOOLONG("unreadahead hash table size " << pretty(m_size));
bees_unreadahead(m_fd, 0, m_size);
});
BEESLOGDEBUG("Exited hash table writeback_loop");
}
@@ -242,7 +226,6 @@ percent(size_t num, size_t den)
void
BeesHashTable::prefetch_loop()
{
Uname uname;
bool not_locked = true;
while (!m_stop_requested) {
size_t width = 64;
@@ -336,7 +319,6 @@ BeesHashTable::prefetch_loop()
graph_blob << "Now: " << format_time(time(NULL)) << "\n";
graph_blob << "Uptime: " << m_ctx->total_timer().age() << " seconds\n";
graph_blob << "Version: " << BEES_VERSION << "\n";
graph_blob << "Kernel: " << uname.sysname << " " << uname.release << " " << uname.machine << " " << uname.version << "\n";
graph_blob
<< "\nHash table page occupancy histogram (" << occupied_count << "/" << total_count << " cells occupied, " << (occupied_count * 100 / total_count) << "%)\n"
@@ -356,8 +338,6 @@ BeesHashTable::prefetch_loop()
auto avg_rates = thisStats / m_ctx->total_timer().age();
graph_blob << "\t" << avg_rates << "\n";
graph_blob << m_ctx->get_progress();
BEESLOGINFO(graph_blob.str());
catch_all([&]() {
m_stats_file.write(graph_blob.str());
@@ -448,38 +428,10 @@ BeesHashTable::fetch_missing_extent_by_index(uint64_t extent_index)
// If we are in prefetch, give the kernel a hint about the next extent
if (m_prefetch_running) {
// Use the kernel readahead here, because it might work for this use case
readahead(m_fd, dirty_extent_offset + dirty_extent_size, dirty_extent_size);
// XXX: don't call this if bees_readahead is implemented by pread()
bees_readahead(m_fd, dirty_extent_offset + dirty_extent_size, dirty_extent_size);
}
});
Cell *cell = m_extent_ptr[extent_index ].p_buckets[0].p_cells;
Cell *cell_end = m_extent_ptr[extent_index + 1].p_buckets[0].p_cells;
size_t toxic_cleared_count = 0;
set<BeesHashTable::Cell> seen_it(cell, cell_end);
while (cell < cell_end) {
if (cell->e_addr & BeesAddress::c_toxic_mask) {
++toxic_cleared_count;
cell->e_addr &= ~BeesAddress::c_toxic_mask;
// Clearing the toxic bit might mean we now have a duplicate.
// This could be due to a race between two
// inserts, one finds the extent toxic while the
// other does not. That's arguably a bug elsewhere,
// but we should rewrite the whole extent lookup/insert
// loop, not spend time fixing code that will be
// thrown out later anyway.
// If there is a cell that is identical to this one
// except for the toxic bit, then we don't need this one.
if (seen_it.count(*cell)) {
cell->e_addr = 0;
cell->e_hash = 0;
}
}
++cell;
}
if (toxic_cleared_count) {
BEESLOGDEBUG("Cleared " << toxic_cleared_count << " hashes while fetching hash table extent " << extent_index);
}
}
void
@@ -586,8 +538,6 @@ BeesHashTable::push_front_hash_addr(HashType hash, AddrType addr)
return found;
}
thread_local uniform_int_distribution<size_t> BeesHashTable::tl_distribution(0, c_cells_per_bucket - 1);
/// Insert a hash entry at some unspecified point in the list.
/// If entry is already present in list, returns true and does not
/// modify list. If entry is not present in list, returns false and
@@ -605,7 +555,9 @@ BeesHashTable::push_random_hash_addr(HashType hash, AddrType addr)
Cell *ip = find(er.first, er.second, mv);
bool found = (ip < er.second);
const auto pos = tl_distribution(bees_generator);
thread_local default_random_engine generator;
thread_local uniform_int_distribution<int> distribution(0, c_cells_per_bucket - 1);
auto pos = distribution(generator);
int case_cond = 0;
#if 0
@@ -797,7 +749,7 @@ BeesHashTable::BeesHashTable(shared_ptr<BeesContext> ctx, string filename, off_t
for (auto fp = madv_flags; fp->value; ++fp) {
BEESTOOLONG("madvise(" << fp->name << ")");
if (madvise(m_byte_ptr, m_size, fp->value)) {
BEESLOGNOTICE("madvise(..., " << fp->name << "): " << strerror(errno) << " (ignored)");
BEESLOGWARN("madvise(..., " << fp->name << "): " << strerror(errno) << " (ignored)");
}
}
@@ -811,18 +763,8 @@ BeesHashTable::BeesHashTable(shared_ptr<BeesContext> ctx, string filename, off_t
prefetch_loop();
});
// Blacklist might fail if the hash table is not stored on a btrfs,
// or if it's on a _different_ btrfs
// Blacklist might fail if the hash table is not stored on a btrfs
catch_all([&]() {
// Root is definitely a btrfs
BtrfsIoctlFsInfoArgs root_info;
root_info.do_ioctl(m_ctx->root_fd());
// Hash might not be a btrfs
BtrfsIoctlFsInfoArgs hash_info;
if (hash_info.do_ioctl_nothrow(m_fd)) return;
// If Hash is a btrfs, Root must be the same one
if (root_info.fsid() != hash_info.fsid()) return;
// Hash is on the same one, blacklist it
m_ctx->blacklist_insert(BeesFileId(m_fd));
});
}
@@ -847,7 +789,7 @@ BeesHashTable::~BeesHashTable()
}
void
BeesHashTable::stop_request()
BeesHashTable::stop()
{
BEESNOTE("stopping BeesHashTable threads");
BEESLOGDEBUG("Stopping BeesHashTable threads");
@@ -861,11 +803,7 @@ BeesHashTable::stop_request()
unique_lock<mutex> dirty_lock(m_dirty_mutex);
m_dirty_condvar.notify_all();
dirty_lock.unlock();
}
void
BeesHashTable::stop_wait()
{
BEESNOTE("waiting for hash_prefetch thread");
BEESLOGDEBUG("Waiting for hash_prefetch thread");
m_prefetch_thread.join();
@@ -874,5 +812,11 @@ BeesHashTable::stop_wait()
BEESLOGDEBUG("Waiting for hash_writeback thread");
m_writeback_thread.join();
if (m_cell_ptr && m_size) {
BEESLOGDEBUG("Flushing hash table");
BEESNOTE("flushing hash table");
flush_dirty_extents(false);
}
BEESLOGDEBUG("BeesHashTable stopped");
}

View File

@@ -384,29 +384,26 @@ BeesResolver::for_each_extent_ref(BeesBlockData bbd, function<bool(const BeesFil
return stop_now;
}
BeesRangePair
BeesResolver::replace_dst(const BeesFileRange &dst_bfr_in)
BeesFileRange
BeesResolver::replace_dst(const BeesFileRange &dst_bfr)
{
BEESTRACE("replace_dst dst_bfr " << dst_bfr_in);
BEESTRACE("replace_dst dst_bfr " << dst_bfr);
BEESCOUNT(replacedst_try);
// Open dst, reuse it for all src
BEESNOTE("Opening dst bfr " << dst_bfr_in);
BEESTRACE("Opening dst bfr " << dst_bfr_in);
auto dst_bfr = dst_bfr_in;
BEESNOTE("Opening dst bfr " << dst_bfr);
BEESTRACE("Opening dst bfr " << dst_bfr);
dst_bfr.fd(m_ctx);
BeesFileRange overlap_bfr;
BEESTRACE("overlap_bfr " << overlap_bfr);
BeesBlockData bbd(dst_bfr);
BeesRangePair rv = { BeesFileRange(), BeesFileRange() };
for_each_extent_ref(bbd, [&](const BeesFileRange &src_bfr_in) -> bool {
for_each_extent_ref(bbd, [&](const BeesFileRange &src_bfr) -> bool {
// Open src
BEESNOTE("Opening src bfr " << src_bfr_in);
BEESTRACE("Opening src bfr " << src_bfr_in);
auto src_bfr = src_bfr_in;
BEESNOTE("Opening src bfr " << src_bfr);
BEESTRACE("Opening src bfr " << src_bfr);
src_bfr.fd(m_ctx);
if (dst_bfr.overlaps(src_bfr)) {
@@ -421,7 +418,7 @@ BeesResolver::replace_dst(const BeesFileRange &dst_bfr_in)
BEESCOUNT(replacedst_same);
// stop looping here, all the other srcs will probably fail this test too
BeesTracer::set_silent();
throw runtime_error("FIXME: too many duplicate candidates, bailing out here");
throw runtime_error("FIXME: bailing out here, need to fix this further up the call stack");
}
// Make pair(src, dst)
@@ -437,12 +434,21 @@ BeesResolver::replace_dst(const BeesFileRange &dst_bfr_in)
BEESCOUNT(replacedst_grown);
}
rv = brp;
m_found_dup = true;
return true;
// Dedup
BEESNOTE("dedup " << brp);
if (m_ctx->dedup(brp)) {
BEESCOUNT(replacedst_dedup_hit);
m_found_dup = true;
overlap_bfr = brp.second;
// FIXME: find best range first, then dedupe that
return true; // i.e. break
} else {
BEESCOUNT(replacedst_dedup_miss);
return false; // i.e. continue
}
});
// BEESLOG("overlap_bfr after " << overlap_bfr);
return rv;
return overlap_bfr.copy_closed();
}
BeesFileRange

File diff suppressed because it is too large Load Diff

View File

@@ -91,9 +91,9 @@ BeesNote::~BeesNote()
tl_next = m_prev;
unique_lock<mutex> lock(s_mutex);
if (tl_next) {
s_status[gettid()] = tl_next;
s_status[crucible::gettid()] = tl_next;
} else {
s_status.erase(gettid());
s_status.erase(crucible::gettid());
}
}
@@ -104,14 +104,16 @@ BeesNote::BeesNote(function<void(ostream &os)> f) :
m_prev = tl_next;
tl_next = this;
unique_lock<mutex> lock(s_mutex);
s_status[gettid()] = tl_next;
s_status[crucible::gettid()] = tl_next;
}
void
BeesNote::set_name(const string &name)
{
tl_name = name;
pthread_setname(name);
catch_all([&]() {
DIE_IF_MINUS_ERRNO(pthread_setname_np(pthread_self(), name.c_str()));
});
}
string
@@ -132,12 +134,19 @@ BeesNote::get_name()
}
// OK try the pthread name next.
char buf[24];
memset(buf, '\0', sizeof(buf));
int err = pthread_getname_np(pthread_self(), buf, sizeof(buf));
if (err) {
return string("pthread_getname_np: ") + strerror(err);
}
buf[sizeof(buf) - 1] = '\0';
// thread_getname_np returns process name
// ...by default? ...for the main thread?
// ...except during exception handling?
// ...randomly?
return pthread_getname();
return buf;
}
BeesNote::ThreadStatusMap

View File

@@ -183,24 +183,6 @@ BeesFileRange::grow_begin(off_t delta)
return m_begin;
}
off_t
BeesFileRange::shrink_begin(off_t delta)
{
THROW_CHECK1(invalid_argument, delta, delta > 0);
THROW_CHECK3(invalid_argument, delta, m_begin, m_end, delta + m_begin < m_end);
m_begin += delta;
return m_begin;
}
off_t
BeesFileRange::shrink_end(off_t delta)
{
THROW_CHECK1(invalid_argument, delta, delta > 0);
THROW_CHECK2(invalid_argument, delta, m_end, m_end >= delta);
m_end -= delta;
return m_end;
}
BeesFileRange::BeesFileRange(const BeesBlockData &bbd) :
m_fd(bbd.fd()),
m_begin(bbd.begin()),
@@ -256,6 +238,42 @@ BeesFileRange::overlaps(const BeesFileRange &that) const
return false;
}
bool
BeesFileRange::coalesce(const BeesFileRange &that)
{
// Let's define coalesce-with-null as identity,
// and coalesce-null-with-null as coalesced
if (!*this) {
operator=(that);
return true;
}
if (!that) {
return true;
}
// Can't coalesce different files
if (!is_same_file(that)) return false;
pair<uint64_t, uint64_t> a(m_begin, m_end);
pair<uint64_t, uint64_t> b(that.m_begin, that.m_end);
// range a starts lower than or equal b
if (b.first < a.first) {
swap(a, b);
}
// if b starts within a, they overlap
// (and the intersecting region is b.first..min(a.second, b.second))
// (and the union region is a.first..max(a.second, b.second))
if (b.first >= a.first && b.first < a.second) {
m_begin = a.first;
m_end = max(a.second, b.second);
return true;
}
return false;
}
BeesFileRange::operator BeesBlockData() const
{
BEESTRACE("operator BeesBlockData " << *this);
@@ -269,7 +287,7 @@ BeesFileRange::fd() const
}
Fd
BeesFileRange::fd(const shared_ptr<BeesContext> &ctx)
BeesFileRange::fd(const shared_ptr<BeesContext> &ctx) const
{
// If we don't have a fid we can't do much here
if (m_fid) {
@@ -367,8 +385,8 @@ BeesRangePair::grow(shared_ptr<BeesContext> ctx, bool constrained)
BEESTRACE("e_second " << e_second);
// Preread entire extent
bees_readahead_pair(second.fd(), e_second.begin(), e_second.size(),
first.fd(), e_second.begin() + first.begin() - second.begin(), e_second.size());
bees_readahead(second.fd(), e_second.begin(), e_second.size());
bees_readahead(first.fd(), e_second.begin() + first.begin() - second.begin(), e_second.size());
auto hash_table = ctx->hash_table();
@@ -406,6 +424,17 @@ BeesRangePair::grow(shared_ptr<BeesContext> ctx, bool constrained)
break;
}
// Source extent cannot be toxic
BeesAddress first_addr(first.fd(), new_first.begin());
if (!first_addr.is_magic()) {
auto first_resolved = ctx->resolve_addr(first_addr);
if (first_resolved.is_toxic()) {
BEESLOGWARN("WORKAROUND: not growing matching pair backward because src addr is toxic:\n" << *this);
BEESCOUNT(pairbackward_toxic_addr);
break;
}
}
// Extend second range. If we hit BOF we can go no further.
BeesFileRange new_second = second;
BEESTRACE("new_second = " << new_second);
@@ -441,7 +470,6 @@ BeesRangePair::grow(shared_ptr<BeesContext> ctx, bool constrained)
}
// Source block cannot be zero in a non-compressed non-magic extent
BeesAddress first_addr(first.fd(), new_first.begin());
if (first_bbd.is_data_zero() && !first_addr.is_magic() && !first_addr.is_compressed()) {
BEESCOUNT(pairbackward_zero);
break;
@@ -457,7 +485,7 @@ BeesRangePair::grow(shared_ptr<BeesContext> ctx, bool constrained)
}
}
if (found_toxic) {
BEESLOGDEBUG("WORKAROUND: found toxic hash in " << first_bbd << " while extending backward:\n" << *this);
BEESLOGWARN("WORKAROUND: found toxic hash in " << first_bbd << " while extending backward:\n" << *this);
BEESCOUNT(pairbackward_toxic_hash);
break;
}
@@ -499,6 +527,17 @@ BeesRangePair::grow(shared_ptr<BeesContext> ctx, bool constrained)
break;
}
// Source extent cannot be toxic
BeesAddress first_addr(first.fd(), new_first.begin());
if (!first_addr.is_magic()) {
auto first_resolved = ctx->resolve_addr(first_addr);
if (first_resolved.is_toxic()) {
BEESLOGWARN("WORKAROUND: not growing matching pair forward because src is toxic:\n" << *this);
BEESCOUNT(pairforward_toxic);
break;
}
}
// Extend second range. If we hit EOF we can go no further.
BeesFileRange new_second = second;
BEESTRACE("new_second = " << new_second);
@@ -542,7 +581,6 @@ BeesRangePair::grow(shared_ptr<BeesContext> ctx, bool constrained)
}
// Source block cannot be zero in a non-compressed non-magic extent
BeesAddress first_addr(first.fd(), new_first.begin());
if (first_bbd.is_data_zero() && !first_addr.is_magic() && !first_addr.is_compressed()) {
BEESCOUNT(pairforward_zero);
break;
@@ -558,7 +596,7 @@ BeesRangePair::grow(shared_ptr<BeesContext> ctx, bool constrained)
}
}
if (found_toxic) {
BEESLOGDEBUG("WORKAROUND: found toxic hash in " << first_bbd << " while extending forward:\n" << *this);
BEESLOGWARN("WORKAROUND: found toxic hash in " << first_bbd << " while extending forward:\n" << *this);
BEESCOUNT(pairforward_toxic_hash);
break;
}
@@ -587,22 +625,6 @@ BeesRangePair::copy_closed() const
return BeesRangePair(first.copy_closed(), second.copy_closed());
}
void
BeesRangePair::shrink_begin(off_t const delta)
{
first.shrink_begin(delta);
second.shrink_begin(delta);
THROW_CHECK2(runtime_error, first.size(), second.size(), first.size() == second.size());
}
void
BeesRangePair::shrink_end(off_t const delta)
{
first.shrink_end(delta);
second.shrink_end(delta);
THROW_CHECK2(runtime_error, first.size(), second.size(), first.size() == second.size());
}
ostream &
operator<<(ostream &os, const BeesAddress &ba)
{

View File

@@ -12,10 +12,9 @@ Load management options:
-C, --thread-factor Worker thread factor (default 1)
-G, --thread-min Minimum worker thread count (default 0)
-g, --loadavg-target Target load average for worker threads (default none)
--throttle-factor Idle time between operations (default 1.0)
Filesystem tree traversal options:
-m, --scan-mode Scanning mode (0..4, default 4)
-m, --scan-mode Scanning mode (0..2, default 0)
Workarounds:
-a, --workaround-btrfs-send Workaround for btrfs send

View File

@@ -198,7 +198,7 @@ BeesTooLong::check() const
if (age() > m_limit) {
ostringstream oss;
m_func(oss);
BEESLOGINFO("PERFORMANCE: " << *this << " sec: " << oss.str());
BEESLOGWARN("PERFORMANCE: " << *this << " sec: " << oss.str());
}
}
@@ -214,86 +214,44 @@ BeesTooLong::operator=(const func_type &f)
return *this;
}
static
bool
bees_readahead_check(int const fd, off_t const offset, size_t const size)
void
bees_sync(int fd)
{
// FIXME: the rest of the code calls this function more often than necessary,
// usually back-to-back calls on the same range in a loop.
// Simply discard requests that are identical to recent requests.
const Stat stat_rv(fd);
auto tup = make_tuple(offset, size, stat_rv.st_dev, stat_rv.st_ino);
static mutex s_recent_mutex;
static set<decltype(tup)> s_recent;
unique_lock<mutex> lock(s_recent_mutex);
if (s_recent.size() > BEES_MAX_EXTENT_REF_COUNT) {
s_recent.clear();
BEESCOUNT(readahead_clear);
}
const auto rv = s_recent.insert(tup);
// If we recently did this readahead, we're done here
if (!rv.second) {
BEESCOUNT(readahead_skip);
}
return rv.second;
Timer sync_timer;
BEESNOTE("syncing " << name_fd(fd));
BEESTOOLONG("syncing " << name_fd(fd));
DIE_IF_NON_ZERO(fsync(fd));
BEESCOUNT(sync_count);
BEESCOUNTADD(sync_ms, sync_timer.age() * 1000);
}
static
void
bees_readahead_nolock(int const fd, const off_t offset, const size_t size)
bees_readahead(int const fd, off_t offset, size_t size)
{
if (!bees_readahead_check(fd, offset, size)) return;
Timer readahead_timer;
BEESNOTE("readahead " << name_fd(fd) << " offset " << to_hex(offset) << " len " << pretty(size));
BEESTOOLONG("readahead " << name_fd(fd) << " offset " << to_hex(offset) << " len " << pretty(size));
// In the kernel, readahead() is identical to posix_fadvise(..., POSIX_FADV_DONTNEED)
DIE_IF_NON_ZERO(readahead(fd, offset, size));
#if 0
// Make sure this data is in page cache by brute force
// The btrfs kernel code does readahead with lower ioprio
// and might discard the readahead request entirely.
// This isn't necessary and it might even be slower
BEESNOTE("emulating readahead " << name_fd(fd) << " offset " << to_hex(offset) << " len " << pretty(size));
auto working_size = size;
auto working_offset = offset;
while (working_size) {
// don't care about multithreaded writes to this buffer--it is garbage anyway
while (size) {
static uint8_t dummy[BEES_READAHEAD_SIZE];
const size_t this_read_size = min(working_size, sizeof(dummy));
// Ignore errors and short reads. It turns out our size
// parameter isn't all that accurate, so we can't use
// the pread_or_die template.
const auto pr_rv = pread(fd, dummy, this_read_size, working_offset);
if (pr_rv >= 0) {
BEESCOUNT(readahead_count);
BEESCOUNTADD(readahead_bytes, pr_rv);
} else {
BEESCOUNT(readahead_fail);
}
working_offset += this_read_size;
working_size -= this_read_size;
size_t this_read_size = min(size, sizeof(dummy));
// Ignore errors and short reads.
// It turns out our size parameter isn't all that accurate.
(void)!pread(fd, dummy, this_read_size, offset);
BEESCOUNT(readahead_count);
BEESCOUNTADD(readahead_bytes, this_read_size);
offset += this_read_size;
size -= this_read_size;
}
#endif
BEESCOUNTADD(readahead_ms, readahead_timer.age() * 1000);
}
static mutex s_only_one;
void
bees_readahead_pair(int fd, off_t offset, size_t size, int fd2, off_t offset2, size_t size2)
{
if (!bees_readahead_check(fd, offset, size) && !bees_readahead_check(fd2, offset2, size2)) return;
BEESNOTE("waiting to readahead " << name_fd(fd) << " offset " << to_hex(offset) << " len " << pretty(size) << ","
<< "\n\t" << name_fd(fd2) << " offset " << to_hex(offset2) << " len " << pretty(size2));
unique_lock<mutex> m_lock(s_only_one);
bees_readahead_nolock(fd, offset, size);
bees_readahead_nolock(fd2, offset2, size2);
}
void
bees_readahead(int const fd, const off_t offset, const size_t size)
{
if (!bees_readahead_check(fd, offset, size)) return;
BEESNOTE("waiting to readahead " << name_fd(fd) << " offset " << to_hex(offset) << " len " << pretty(size));
unique_lock<mutex> m_lock(s_only_one);
bees_readahead_nolock(fd, offset, size);
}
void
bees_unreadahead(int const fd, off_t offset, size_t size)
{
@@ -304,55 +262,6 @@ bees_unreadahead(int const fd, off_t offset, size_t size)
BEESCOUNTADD(readahead_unread_ms, unreadahead_timer.age() * 1000);
}
static double bees_throttle_factor = 0.0;
void
bees_throttle(const double time_used, const char *const context)
{
static mutex s_mutex;
unique_lock<mutex> throttle_lock(s_mutex);
struct time_pair {
double time_used = 0;
double time_count = 0;
double longest_sleep_time = 0;
};
static map<string, time_pair> s_time_map;
auto &this_time = s_time_map[context];
auto &this_time_used = this_time.time_used;
auto &this_time_count = this_time.time_count;
auto &longest_sleep_time = this_time.longest_sleep_time;
this_time_used += time_used;
++this_time_count;
// Keep the timing data fresh
static Timer s_fresh_timer;
if (s_fresh_timer.age() > 60) {
s_fresh_timer.reset();
this_time_count *= 0.9;
this_time_used *= 0.9;
}
// Wait for enough data to calculate rates
if (this_time_used < 1.0 || this_time_count < 1.0) return;
const auto avg_time = this_time_used / this_time_count;
const auto sleep_time = min(60.0, bees_throttle_factor * avg_time - time_used);
if (sleep_time <= 0) {
return;
}
if (sleep_time > longest_sleep_time) {
BEESLOGDEBUG(context << ": throttle delay " << sleep_time << " s, time used " << time_used << " s, avg time " << avg_time << " s");
longest_sleep_time = sleep_time;
}
throttle_lock.unlock();
BEESNOTE(context << ": throttle delay " << sleep_time << " s, time used " << time_used << " s, avg time " << avg_time << " s");
nanosleep(sleep_time);
}
thread_local random_device bees_random_device;
thread_local uniform_int_distribution<default_random_engine::result_type> bees_random_seed_dist(
numeric_limits<default_random_engine::result_type>::min(),
numeric_limits<default_random_engine::result_type>::max()
);
thread_local default_random_engine bees_generator(bees_random_seed_dist(bees_random_device));
BeesStringFile::BeesStringFile(Fd dir_fd, string name, size_t limit) :
m_dir_fd(dir_fd),
m_name(name),
@@ -442,8 +351,6 @@ BeesTempFile::resize(off_t offset)
// Count time spent here
BEESCOUNTADD(tmp_resize_ms, resize_timer.age() * 1000);
bees_throttle(resize_timer.age(), "tmpfile_resize");
}
void
@@ -561,6 +468,7 @@ BeesTempFile::make_copy(const BeesFileRange &src)
auto src_p = src.begin();
auto dst_p = begin;
bool did_block_write = false;
while (dst_p < end) {
auto len = min(BLOCK_SIZE_CLONE, end - dst_p);
BeesBlockData bbd(src.fd(), src_p, len);
@@ -571,6 +479,7 @@ BeesTempFile::make_copy(const BeesFileRange &src)
BEESNOTE("copying " << src << " to " << rv << "\n"
"\tpwrite " << bbd << " to " << name_fd(m_fd) << " offset " << to_hex(dst_p) << " len " << len);
pwrite_or_die(m_fd, bbd.data().data(), len, dst_p);
did_block_write = true;
BEESCOUNT(tmp_block);
BEESCOUNTADD(tmp_bytes, len);
}
@@ -579,7 +488,15 @@ BeesTempFile::make_copy(const BeesFileRange &src)
}
BEESCOUNTADD(tmp_copy_ms, copy_timer.age() * 1000);
bees_throttle(copy_timer.age(), "tmpfile_copy");
if (did_block_write) {
#if 0
// There were a lot of kernel bugs leading to lockups.
// Most of them are fixed now.
// Unnecessary sync makes us slow, but maybe it has some robustness utility.
// TODO: make this configurable.
bees_sync(m_fd);
#endif
}
BEESCOUNT(tmp_copy);
return rv;
@@ -619,23 +536,19 @@ operator<<(ostream &os, const siginfo_t &si)
static sigset_t new_sigset, old_sigset;
static
void
block_signals()
block_term_signal()
{
BEESLOGDEBUG("Masking signals");
DIE_IF_NON_ZERO(sigemptyset(&new_sigset));
DIE_IF_NON_ZERO(sigaddset(&new_sigset, SIGTERM));
DIE_IF_NON_ZERO(sigaddset(&new_sigset, SIGINT));
DIE_IF_NON_ZERO(sigaddset(&new_sigset, SIGUSR1));
DIE_IF_NON_ZERO(sigaddset(&new_sigset, SIGUSR2));
DIE_IF_NON_ZERO(sigprocmask(SIG_BLOCK, &new_sigset, &old_sigset));
}
static
void
wait_for_signals()
wait_for_term_signal()
{
BEESNOTE("waiting for signals");
BEESLOGDEBUG("Waiting for signals...");
@@ -652,28 +565,14 @@ wait_for_signals()
THROW_ERRNO("sigwaitinfo errno = " << errno);
} else {
BEESLOGNOTICE("Received signal " << rv << " info " << info);
// If SIGTERM or SIGINT, unblock so we die immediately if signalled again
switch (info.si_signo) {
case SIGUSR1:
BEESLOGNOTICE("Received SIGUSR1 - pausing workers");
TaskMaster::pause(true);
break;
case SIGUSR2:
BEESLOGNOTICE("Received SIGUSR2 - unpausing workers");
TaskMaster::pause(false);
break;
case SIGTERM:
case SIGINT:
default:
DIE_IF_NON_ZERO(sigprocmask(SIG_BLOCK, &old_sigset, &new_sigset));
BEESLOGDEBUG("Signal catcher exiting");
return;
}
// Unblock so we die immediately if signalled again
DIE_IF_NON_ZERO(sigprocmask(SIG_BLOCK, &old_sigset, &new_sigset));
break;
}
}
BEESLOGDEBUG("Signal catcher exiting");
}
static
int
bees_main(int argc, char *argv[])
{
@@ -697,51 +596,47 @@ bees_main(int argc, char *argv[])
// Have to block signals now before we create a bunch of threads
// so the threads will also have the signals blocked.
block_signals();
block_term_signal();
// Create a context so we can apply configuration to it
shared_ptr<BeesContext> bc = make_shared<BeesContext>();
BEESLOGDEBUG("context constructed");
string cwd(readlink_or_die("/proc/self/cwd"));
// Defaults
bool use_relative_paths = false;
bool chatter_prefix_timestamp = true;
double thread_factor = 0;
unsigned thread_count = 0;
unsigned thread_min = 0;
double load_target = 0;
bool workaround_btrfs_send = false;
BeesRoots::ScanMode root_scan_mode = BeesRoots::SCAN_MODE_EXTENT;
BeesRoots::ScanMode root_scan_mode = BeesRoots::SCAN_MODE_ZERO;
// Configure getopt_long
// Options with no short form
enum {
BEES_OPT_THROTTLE_FACTOR = 256,
};
static const struct option long_options[] = {
{ .name = "thread-factor", .has_arg = required_argument, .val = 'C' },
{ .name = "throttle-factor", .has_arg = required_argument, .val = BEES_OPT_THROTTLE_FACTOR },
{ .name = "thread-min", .has_arg = required_argument, .val = 'G' },
{ .name = "strip-paths", .has_arg = no_argument, .val = 'P' },
{ .name = "no-timestamps", .has_arg = no_argument, .val = 'T' },
{ .name = "workaround-btrfs-send", .has_arg = no_argument, .val = 'a' },
{ .name = "thread-count", .has_arg = required_argument, .val = 'c' },
{ .name = "loadavg-target", .has_arg = required_argument, .val = 'g' },
{ .name = "help", .has_arg = no_argument, .val = 'h' },
{ .name = "scan-mode", .has_arg = required_argument, .val = 'm' },
{ .name = "absolute-paths", .has_arg = no_argument, .val = 'p' },
{ .name = "timestamps", .has_arg = no_argument, .val = 't' },
{ .name = "verbose", .has_arg = required_argument, .val = 'v' },
{ 0 },
{ "thread-factor", required_argument, NULL, 'C' },
{ "thread-min", required_argument, NULL, 'G' },
{ "strip-paths", no_argument, NULL, 'P' },
{ "no-timestamps", no_argument, NULL, 'T' },
{ "workaround-btrfs-send", no_argument, NULL, 'a' },
{ "thread-count", required_argument, NULL, 'c' },
{ "loadavg-target", required_argument, NULL, 'g' },
{ "help", no_argument, NULL, 'h' },
{ "scan-mode", required_argument, NULL, 'm' },
{ "absolute-paths", no_argument, NULL, 'p' },
{ "timestamps", no_argument, NULL, 't' },
{ "verbose", required_argument, NULL, 'v' },
{ 0, 0, 0, 0 },
};
// Build getopt_long's short option list from the long_options table.
// While we're at it, make sure we didn't duplicate any options.
string getopt_list;
map<decltype(option::val), string> option_vals;
set<decltype(option::val)> option_vals;
for (const struct option *op = long_options; op->val; ++op) {
const auto ins_rv = option_vals.insert(make_pair(op->val, op->name));
THROW_CHECK1(runtime_error, op->val, ins_rv.second);
THROW_CHECK1(runtime_error, op->val, !option_vals.count(op->val));
option_vals.insert(op->val);
if ((op->val & 0xff) != op->val) {
continue;
}
@@ -752,31 +647,27 @@ bees_main(int argc, char *argv[])
}
// Parse options
int c;
while (true) {
int option_index = 0;
const auto c = getopt_long(argc, argv, getopt_list.c_str(), long_options, &option_index);
c = getopt_long(argc, argv, getopt_list.c_str(), long_options, &option_index);
if (-1 == c) {
break;
}
// getopt_long should have weeded out any invalid options,
// so we can go ahead and throw here
BEESLOGDEBUG("Parsing option '" << option_vals.at(c) << "'");
BEESLOGDEBUG("Parsing option '" << static_cast<char>(c) << "'");
switch (c) {
case 'C':
thread_factor = stod(optarg);
break;
case BEES_OPT_THROTTLE_FACTOR:
bees_throttle_factor = stod(optarg);
break;
case 'G':
thread_min = stoul(optarg);
break;
case 'P':
use_relative_paths = true;
crucible::set_relative_path(cwd);
break;
case 'T':
chatter_prefix_timestamp = false;
@@ -794,7 +685,7 @@ bees_main(int argc, char *argv[])
root_scan_mode = static_cast<BeesRoots::ScanMode>(stoul(optarg));
break;
case 'p':
use_relative_paths = false;
crucible::set_relative_path("");
break;
case 't':
chatter_prefix_timestamp = true;
@@ -812,12 +703,12 @@ bees_main(int argc, char *argv[])
case 'h':
default:
do_cmd_help(argv);
return EXIT_SUCCESS;
return EXIT_FAILURE;
}
}
if (optind + 1 != argc) {
BEESLOGERR("Exactly one filesystem path required");
BEESLOGERR("Only one filesystem path per bees process");
return EXIT_FAILURE;
}
@@ -857,32 +748,22 @@ bees_main(int argc, char *argv[])
BEESLOGNOTICE("setting worker thread pool maximum size to " << thread_count);
TaskMaster::set_thread_count(thread_count);
BEESLOGNOTICE("setting throttle factor to " << bees_throttle_factor);
// Set root path
string root_path = argv[optind++];
BEESLOGNOTICE("setting root path to '" << root_path << "'");
bc->set_root_path(root_path);
// Set path prefix
if (use_relative_paths) {
crucible::set_relative_path(name_fd(bc->root_fd()));
}
// Workaround for btrfs send
bc->roots()->set_workaround_btrfs_send(workaround_btrfs_send);
// Set root scan mode
bc->roots()->set_scan_mode(root_scan_mode);
// Workaround for the logical-ino-vs-clone kernel bug
MultiLocker::enable_locking(true);
// Start crawlers
bc->start();
// Now we just wait forever
wait_for_signals();
wait_for_term_signal();
// Shut it down
bc->stop();
@@ -901,8 +782,8 @@ main(int argc, char *argv[])
return EXIT_FAILURE;
}
int rv = EXIT_FAILURE;
catch_all([&]() {
int rv = 1;
catch_and_explain([&]() {
rv = bees_main(argc, argv);
});
BEESLOGNOTICE("Exiting with status " << rv << " " << (rv ? "(failure)" : "(success)"));

View File

@@ -1,7 +1,6 @@
#ifndef BEES_H
#define BEES_H
#include "crucible/btrfs-tree.h"
#include "crucible/cache.h"
#include "crucible/chatter.h"
#include "crucible/error.h"
@@ -9,21 +8,20 @@
#include "crucible/fd.h"
#include "crucible/fs.h"
#include "crucible/lockset.h"
#include "crucible/multilock.h"
#include "crucible/pool.h"
#include "crucible/progress.h"
#include "crucible/time.h"
#include "crucible/task.h"
#include <atomic>
#include <functional>
#include <list>
#include <mutex>
#include <string>
#include <random>
#include <thread>
#include <endian.h>
#include <syslog.h>
#include <endian.h>
using namespace crucible;
using namespace std;
@@ -61,9 +59,8 @@ const off_t BLOCK_SIZE_HASHTAB_BUCKET = BLOCK_SIZE_MMAP;
// Extent size for hash table (since the nocow file attribute does not seem to be working today)
const off_t BLOCK_SIZE_HASHTAB_EXTENT = BLOCK_SIZE_MAX_COMPRESSED_EXTENT;
// Bytes per second we want to flush from hash table
// Optimistic sustained write rate for SD cards
const double BEES_FLUSH_RATE = 128 * 1024;
// Bytes per second we want to flush (8GB every two hours)
const double BEES_FLUSH_RATE = 8.0 * 1024 * 1024 * 1024 / 7200.0;
// Interval between writing crawl state to disk
const int BEES_WRITEBACK_INTERVAL = 900;
@@ -78,13 +75,13 @@ const int BEES_PROGRESS_INTERVAL = BEES_STATS_INTERVAL;
const int BEES_STATUS_INTERVAL = 1;
// Number of file FDs to cache when not in active use
const size_t BEES_FILE_FD_CACHE_SIZE = 524288;
const size_t BEES_FILE_FD_CACHE_SIZE = 4096;
// Number of root FDs to cache when not in active use
const size_t BEES_ROOT_FD_CACHE_SIZE = 65536;
const size_t BEES_ROOT_FD_CACHE_SIZE = 1024;
// Number of FDs to open (rlimit)
const size_t BEES_OPEN_FILE_LIMIT = BEES_FILE_FD_CACHE_SIZE + BEES_ROOT_FD_CACHE_SIZE + 100;
const size_t BEES_OPEN_FILE_LIMIT = (BEES_FILE_FD_CACHE_SIZE + BEES_ROOT_FD_CACHE_SIZE) * 2 + 100;
// Worker thread factor (multiplied by detected number of CPU cores)
const double BEES_DEFAULT_THREAD_FACTOR = 1.0;
@@ -93,17 +90,37 @@ const double BEES_DEFAULT_THREAD_FACTOR = 1.0;
const double BEES_TOO_LONG = 5.0;
// Avoid any extent where LOGICAL_INO takes this much kernel CPU time
const double BEES_TOXIC_SYS_DURATION = 5.0;
const double BEES_TOXIC_SYS_DURATION = 0.1;
// Maximum number of refs to a single extent before we have other problems
// If we have more than 10K refs to an extent, adding another will save 0.01% space
const size_t BEES_MAX_EXTENT_REF_COUNT = 9999; // (16 * 1024 * 1024 / 24);
// Maximum number of refs to a single extent
const size_t BEES_MAX_EXTENT_REF_COUNT = (16 * 1024 * 1024 / 24) - 1;
// How long between hash table histograms
const double BEES_HASH_TABLE_ANALYZE_INTERVAL = BEES_STATS_INTERVAL;
// Wait at least this long for a new transid
const double BEES_TRANSID_POLL_INTERVAL = 30.0;
// Stop growing the work queue after we have this many tasks queued
const size_t BEES_MAX_QUEUE_SIZE = 128;
// Read this many items at a time in SEARCHv2
const size_t BEES_MAX_CRAWL_ITEMS = 8;
// Read this many bytes at a time in SEARCHv2 (one maximum-sized metadata page)
const size_t BEES_MAX_CRAWL_BYTES = 64 * 1024;
// Insert this many items before switching to a new subvol
const size_t BEES_MAX_CRAWL_BATCH = 128;
// Wait this many transids between crawls
const size_t BEES_TRANSID_FACTOR = 10;
// Wait this long for a balance to stop
const double BEES_BALANCE_POLL_INTERVAL = 60.0;
// Workaround for backref bugs
const bool BEES_SERIALIZE_RESOLVE = false;
// Workaround for tree mod log bugs
const bool BEES_SERIALIZE_BALANCE = false;
// Workaround for silly dedupe / ineffective readahead behavior
const size_t BEES_READAHEAD_SIZE = 1024 * 1024;
@@ -124,7 +141,7 @@ const int FLAGS_OPEN_FANOTIFY = O_RDWR | O_NOATIME | O_CLOEXEC | O_LARGEFILE;
#define BEESLOG(lv,x) do { if (lv < bees_log_level) { Chatter __chatter(lv, BeesNote::get_name()); __chatter << x; } } while (0)
#define BEESLOGTRACE(x) do { BEESLOG(LOG_DEBUG, x); BeesTracer::trace_now(); } while (0)
#define BEESTRACE(x) BeesTracer SRSLY_WTF_C(beesTracer_, __LINE__) ([&]() { BEESLOG(LOG_ERR, x << " at " << __FILE__ << ":" << __LINE__); })
#define BEESTRACE(x) BeesTracer SRSLY_WTF_C(beesTracer_, __LINE__) ([&]() { BEESLOG(LOG_ERR, x); })
#define BEESTOOLONG(x) BeesTooLong SRSLY_WTF_C(beesTooLong_, __LINE__) ([&](ostream &_btl_os) { _btl_os << x; })
#define BEESNOTE(x) BeesNote SRSLY_WTF_C(beesNote_, __LINE__) ([&](ostream &_btl_os) { _btl_os << x; })
@@ -252,7 +269,7 @@ ostream& operator<<(ostream &os, const BeesFileId &bfi);
class BeesFileRange {
protected:
Fd m_fd;
mutable Fd m_fd;
mutable BeesFileId m_fid;
off_t m_begin = 0, m_end = 0;
mutable off_t m_file_size = -1;
@@ -274,36 +291,35 @@ public:
bool is_same_file(const BeesFileRange &that) const;
bool overlaps(const BeesFileRange &that) const;
// If file ranges overlap, extends this to include that.
// Coalesce with empty bfr = non-empty bfr
bool coalesce(const BeesFileRange &that);
// Remove that from this, creating 0, 1, or 2 new objects
pair<BeesFileRange, BeesFileRange> subtract(const BeesFileRange &that) const;
off_t begin() const { return m_begin; }
off_t end() const { return m_end; }
off_t size() const;
/// @{ Lazy accessors
// Lazy accessors
off_t file_size() const;
BeesFileId fid() const;
/// @}
/// Get the fd if there is one
// Get the fd if there is one
Fd fd() const;
/// Get the fd, opening it if necessary
Fd fd(const shared_ptr<BeesContext> &ctx);
// Get the fd, opening it if necessary
Fd fd(const shared_ptr<BeesContext> &ctx) const;
/// Copy the BeesFileId but not the Fd
BeesFileRange copy_closed() const;
/// Is it defined?
// Is it defined?
operator bool() const { return !!m_fd || m_fid; }
/// @{ Make range larger
// Make range larger
off_t grow_end(off_t delta);
off_t grow_begin(off_t delta);
/// @}
/// @{ Make range smaller
off_t shrink_end(off_t delta);
off_t shrink_begin(off_t delta);
/// @}
friend ostream & operator<<(ostream &os, const BeesFileRange &bfr);
};
@@ -329,7 +345,6 @@ public:
BeesAddress(Type addr = ZERO) : m_addr(addr) {}
BeesAddress(MagicValue addr) : m_addr(addr) {}
BeesAddress& operator=(const BeesAddress &that) = default;
BeesAddress(const BeesAddress &that) = default;
operator Type() const { return m_addr; }
bool operator==(const BeesAddress &that) const;
bool operator==(const MagicValue that) const { return *this == BeesAddress(that); }
@@ -390,7 +405,6 @@ public:
HashType e_hash;
AddrType e_addr;
Cell(const Cell &) = default;
Cell &operator=(const Cell &) = default;
Cell(HashType hash, AddrType addr) : e_hash(hash), e_addr(addr) { }
bool operator==(const Cell &e) const { return tie(e_hash, e_addr) == tie(e.e_hash, e.e_addr); }
bool operator!=(const Cell &e) const { return tie(e_hash, e_addr) != tie(e.e_hash, e.e_addr); }
@@ -415,14 +429,12 @@ public:
BeesHashTable(shared_ptr<BeesContext> ctx, string filename, off_t size = BLOCK_SIZE_HASHTAB_EXTENT);
~BeesHashTable();
void stop_request();
void stop_wait();
void stop();
vector<Cell> find_cell(HashType hash);
bool push_random_hash_addr(HashType hash, AddrType addr);
void erase_hash_addr(HashType hash, AddrType addr);
bool push_front_hash_addr(HashType hash, AddrType addr);
bool flush_dirty_extent(uint64_t extent_index);
private:
string m_filename;
@@ -456,7 +468,7 @@ private:
// Mutex/condvar for the writeback thread
mutex m_dirty_mutex;
condition_variable m_dirty_condvar;
bool m_dirty = false;
bool m_dirty;
// Mutex/condvar to stop
mutex m_stop_mutex;
@@ -482,6 +494,7 @@ private:
void fetch_missing_extent_by_index(uint64_t extent_index);
void set_extent_dirty_locked(uint64_t extent_index);
size_t flush_dirty_extents(bool slowly);
bool flush_dirty_extent(uint64_t extent_index);
size_t hash_to_extent_index(HashType ht);
unique_lock<mutex> lock_extent_by_hash(HashType ht);
@@ -489,8 +502,6 @@ private:
BeesHashTable(const BeesHashTable &) = delete;
BeesHashTable &operator=(const BeesHashTable &) = delete;
static thread_local uniform_int_distribution<size_t> tl_distribution;
};
ostream &operator<<(ostream &os, const BeesHashTable::Cell &bhte);
@@ -510,52 +521,43 @@ class BeesCrawl {
shared_ptr<BeesContext> m_ctx;
mutex m_mutex;
BtrfsTreeItem m_next_extent_data;
set<BeesFileRange> m_extents;
bool m_deferred = false;
bool m_finished = false;
mutex m_state_mutex;
ProgressTracker<BeesCrawlState> m_state;
BtrfsTreeObjectFetcher m_btof;
bool fetch_extents();
void fetch_extents_harder();
bool restart_crawl_unlocked();
BeesFileRange bti_to_bfr(const BtrfsTreeItem &bti) const;
bool next_transid();
public:
BeesCrawl(shared_ptr<BeesContext> ctx, BeesCrawlState initial_state);
BeesFileRange peek_front();
BeesFileRange pop_front();
ProgressTracker<BeesCrawlState>::ProgressHolder hold_state(const BeesCrawlState &bcs);
ProgressTracker<BeesCrawlState>::ProgressHolder hold_state(const BeesFileRange &bfr);
BeesCrawlState get_state_begin();
BeesCrawlState get_state_end() const;
BeesCrawlState get_state_end();
void set_state(const BeesCrawlState &bcs);
void deferred(bool def_setting);
bool deferred() const;
bool finished() const;
bool restart_crawl();
};
class BeesScanMode;
class BeesRoots : public enable_shared_from_this<BeesRoots> {
shared_ptr<BeesContext> m_ctx;
BeesStringFile m_crawl_state_file;
using CrawlMap = map<uint64_t, shared_ptr<BeesCrawl>>;
CrawlMap m_root_crawl_map;
map<uint64_t, shared_ptr<BeesCrawl>> m_root_crawl_map;
mutex m_mutex;
uint64_t m_crawl_dirty = 0;
uint64_t m_crawl_clean = 0;
bool m_crawl_dirty = false;
Timer m_crawl_timer;
BeesThread m_crawl_thread;
BeesThread m_writeback_thread;
RateEstimator m_transid_re;
size_t m_transid_factor = BEES_TRANSID_FACTOR;
Task m_crawl_task;
bool m_workaround_btrfs_send = false;
shared_ptr<BeesScanMode> m_scanner;
LRUCache<bool, uint64_t> m_root_ro_cache;
mutex m_tmpfiles_mutex;
map<BeesFileId, Fd> m_tmpfiles;
@@ -564,13 +566,18 @@ class BeesRoots : public enable_shared_from_this<BeesRoots> {
condition_variable m_stop_condvar;
bool m_stop_requested = false;
CrawlMap insert_new_crawl();
void insert_new_crawl();
void insert_root(const BeesCrawlState &bcs);
Fd open_root_nocache(uint64_t root);
Fd open_root_ino_nocache(uint64_t root, uint64_t ino);
bool is_root_ro_nocache(uint64_t root);
uint64_t transid_min();
uint64_t transid_max();
uint64_t transid_max_nocache();
void state_load();
ostream &state_to_stream(ostream &os);
void state_save();
bool crawl_roots();
string crawl_state_filename() const;
void crawl_state_set_dirty();
void crawl_state_erase(const BeesCrawlState &bcs);
@@ -578,47 +585,42 @@ class BeesRoots : public enable_shared_from_this<BeesRoots> {
void writeback_thread();
uint64_t next_root(uint64_t root = 0);
void current_state_set(const BeesCrawlState &bcs);
bool crawl_batch(shared_ptr<BeesCrawl> crawl);
RateEstimator& transid_re();
size_t crawl_batch(shared_ptr<BeesCrawl> crawl);
void clear_caches();
void insert_tmpfile(Fd fd);
void erase_tmpfile(Fd fd);
shared_ptr<BeesCrawl> insert_root(const BeesCrawlState &bcs);
friend class BeesCrawl;
friend class BeesFdCache;
friend class BeesScanMode;
friend class BeesScanModeSubvol;
friend class BeesScanModeExtent;
friend class BeesCrawl;
friend class BeesTempFile;
public:
BeesRoots(shared_ptr<BeesContext> ctx);
void start();
void stop_request();
void stop_wait();
void insert_tmpfile(Fd fd);
void erase_tmpfile(Fd fd);
void stop();
Fd open_root(uint64_t root);
Fd open_root_ino(uint64_t root, uint64_t ino);
Fd open_root_ino(const BeesFileId &bfi) { return open_root_ino(bfi.root(), bfi.ino()); }
bool is_root_ro(uint64_t root);
// TODO: think of better names for these.
// or TODO: do extent-tree scans instead
enum ScanMode {
SCAN_MODE_LOCKSTEP,
SCAN_MODE_INDEPENDENT,
SCAN_MODE_SEQUENTIAL,
SCAN_MODE_RECENT,
SCAN_MODE_EXTENT,
SCAN_MODE_ZERO,
SCAN_MODE_ONE,
SCAN_MODE_TWO,
SCAN_MODE_COUNT, // must be last
};
void set_scan_mode(ScanMode new_mode);
void set_workaround_btrfs_send(bool do_avoid);
uint64_t transid_min();
uint64_t transid_max();
private:
ScanMode m_scan_mode = SCAN_MODE_ZERO;
static string scan_mode_ntoa(ScanMode new_mode);
void wait_for_transid(const uint64_t count);
};
struct BeesHash {
@@ -637,7 +639,7 @@ private:
ostream & operator<<(ostream &os, const BeesHash &bh);
class BeesBlockData {
using Blob = ByteVector;
using Blob = vector<uint8_t>;
mutable Fd m_fd;
off_t m_offset;
@@ -678,8 +680,6 @@ class BeesRangePair : public pair<BeesFileRange, BeesFileRange> {
public:
BeesRangePair(const BeesFileRange &src, const BeesFileRange &dst);
bool grow(shared_ptr<BeesContext> ctx, bool constrained);
void shrink_begin(const off_t delta);
void shrink_end(const off_t delta);
BeesRangePair copy_closed() const;
bool operator<(const BeesRangePair &that) const;
friend ostream & operator<<(ostream &os, const BeesRangePair &brp);
@@ -723,14 +723,19 @@ struct BeesResolveAddrResult {
bool is_toxic() const { return m_is_toxic; }
};
struct BeesHalt : exception {
const char *what() const noexcept override;
};
class BeesContext : public enable_shared_from_this<BeesContext> {
shared_ptr<BeesContext> m_parent_ctx;
Fd m_home_fd;
shared_ptr<BeesFdCache> m_fd_cache;
shared_ptr<BeesHashTable> m_hash_table;
shared_ptr<BeesRoots> m_roots;
Pool<BeesTempFile> m_tmpfile_pool;
Pool<BtrfsIoctlLogicalInoArgs> m_logical_ino_pool;
LRUCache<BeesResolveAddrResult, BeesAddress> m_resolve_cache;
@@ -742,28 +747,30 @@ class BeesContext : public enable_shared_from_this<BeesContext> {
Timer m_total_timer;
NamedPtr<Exclusion, uint64_t> m_extent_locks;
NamedPtr<Exclusion, uint64_t> m_inode_locks;
LockSet<uint64_t> m_extent_lock_set;
mutable mutex m_stop_mutex;
condition_variable m_stop_condvar;
bool m_stop_requested = false;
bool m_stop_status = false;
mutable mutex m_abort_mutex;
condition_variable m_abort_condvar;
bool m_abort_requested = false;
shared_ptr<BeesThread> m_progress_thread;
shared_ptr<BeesThread> m_status_thread;
mutex m_progress_mtx;
string m_progress_str;
void set_root_fd(Fd fd);
BeesResolveAddrResult resolve_addr_uncached(BeesAddress addr);
void wait_for_balance();
void scan_one_extent(const BeesFileRange &bfr, const Extent &e);
BeesFileRange scan_one_extent(const BeesFileRange &bfr, const Extent &e);
void rewrite_file_range(const BeesFileRange &bfr);
public:
BeesContext() = default;
void set_root_path(string path);
@@ -771,9 +778,7 @@ public:
Fd home_fd();
string root_path() const { return m_root_path; }
bool scan_forward(const BeesFileRange &bfr);
shared_ptr<BtrfsIoctlLogicalInoArgs> logical_ino(uint64_t bytenr, bool all_refs);
BeesFileRange scan_forward(const BeesFileRange &bfr);
bool is_root_ro(uint64_t root);
BeesRangePair dup_extent(const BeesFileRange &src, const shared_ptr<BeesTempFile> &tmpfile);
@@ -783,16 +788,11 @@ public:
void blacklist_erase(const BeesFileId &fid);
bool is_blacklisted(const BeesFileId &fid) const;
shared_ptr<Exclusion> get_inode_mutex(uint64_t inode);
BeesResolveAddrResult resolve_addr(BeesAddress addr);
void invalidate_addr(BeesAddress addr);
void resolve_cache_clear();
void dump_status();
void show_progress();
void set_progress(const string &str);
string get_progress();
void start();
void stop();
@@ -804,6 +804,7 @@ public:
shared_ptr<BeesTempFile> tmpfile();
const Timer &total_timer() const { return m_total_timer; }
LockSet<uint64_t> &extent_lock_set() { return m_extent_lock_set; }
};
class BeesResolver {
@@ -811,7 +812,7 @@ class BeesResolver {
BeesAddress m_addr;
vector<BtrfsInodeOffsetRoot> m_biors;
set<BeesFileRange> m_ranges;
size_t m_bior_count;
unsigned m_bior_count;
// We found matching data, so we can dedupe
bool m_found_data = false;
@@ -855,7 +856,7 @@ public:
BeesFileRange find_one_match(BeesHash hash);
void replace_src(const BeesFileRange &src_bfr);
BeesRangePair replace_dst(const BeesFileRange &dst_bfr);
BeesFileRange replace_dst(const BeesFileRange &dst_bfr);
bool found_addr() const { return m_found_addr; }
bool found_data() const { return m_found_data; }
@@ -886,12 +887,10 @@ public:
extern int bees_log_level;
extern const char *BEES_USAGE;
extern const char *BEES_VERSION;
extern thread_local default_random_engine bees_generator;
string pretty(double d);
void bees_sync(int fd);
void bees_readahead(int fd, off_t offset, size_t size);
void bees_readahead_pair(int fd, off_t offset, size_t size, int fd2, off_t offset2, size_t size2);
void bees_unreadahead(int fd, off_t offset, size_t size);
void bees_throttle(double time_used, const char *context);
string format_time(time_t t);
#endif

53
src/fiemap.cc Normal file
View File

@@ -0,0 +1,53 @@
#include "crucible/fd.h"
#include "crucible/fs.h"
#include "crucible/error.h"
#include "crucible/string.h"
#include <iostream>
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>
using namespace crucible;
using namespace std;
// fiemap: print a file's extent map using the FIEMAP ioctl.
// Usage: fiemap <file> [start] [length] [flags]
// Numeric arguments are parsed with stoull base 0 (decimal, 0x hex, 0 octal).
int
main(int argc, char **argv)
{
catch_all([&]() {
THROW_CHECK1(invalid_argument, argc, argc > 1);
string filename = argv[1];
cout << "File: " << filename << endl;
Fd fd = open_or_die(filename, O_RDONLY);
Fiemap fm;
// Don't force a sync of the file before mapping
fm.fm_flags &= ~(FIEMAP_FLAG_SYNC);
// Fetch up to 100 extent records per ioctl call
fm.m_max_count = 100;
// Optional overrides: start offset, mapping length, raw flag bits
if (argc > 2) { fm.fm_start = stoull(argv[2], nullptr, 0); }
if (argc > 3) { fm.fm_length = stoull(argv[3], nullptr, 0); }
if (argc > 4) { fm.fm_flags = stoull(argv[4], nullptr, 0); }
// Clamp so that start + length cannot exceed FIEMAP_MAX_OFFSET
fm.fm_length = min(fm.fm_length, FIEMAP_MAX_OFFSET - fm.fm_start);
uint64_t stop_at = fm.fm_start + fm.fm_length;
uint64_t last_byte = fm.fm_start;
do {
fm.do_ioctl(fd);
// cerr << fm;
// If this batch returns no extents, last_logical keeps this value,
// which terminates the outer loop below
uint64_t last_logical = FIEMAP_MAX_OFFSET;
for (auto &extent : fm.m_extents) {
// A gap between the previous extent's end and this one is a hole
if (extent.fe_logical > last_byte) {
cout << "Log " << to_hex(last_byte) << ".." << to_hex(extent.fe_logical) << " Hole" << endl;
}
cout << "Log " << to_hex(extent.fe_logical) << ".." << to_hex(extent.fe_logical + extent.fe_length)
<< " Phy " << to_hex(extent.fe_physical) << ".." << to_hex(extent.fe_physical + extent.fe_length)
<< " Flags " << fiemap_extent_flags_ntoa(extent.fe_flags) << endl;
last_logical = extent.fe_logical + extent.fe_length;
last_byte = last_logical;
}
// Resume the next batch where the last extent ended
fm.fm_start = last_logical;
} while (fm.fm_start < stop_at);
});
exit(EXIT_SUCCESS);
}

40
src/fiewalk.cc Normal file
View File

@@ -0,0 +1,40 @@
#include "crucible/extentwalker.h"
#include "crucible/error.h"
#include "crucible/string.h"
#include <iostream>
#include <fcntl.h>
#include <unistd.h>
using namespace crucible;
using namespace std;
int
main(int argc, char **argv)
{
catch_all([&]() {
THROW_CHECK1(invalid_argument, argc, argc > 1);
string filename = argv[1];
cout << "File: " << filename << endl;
Fd fd = open_or_die(filename, O_RDONLY);
BtrfsExtentWalker ew(fd);
off_t pos = 0;
if (argc > 2) { pos = stoull(argv[2], nullptr, 0); }
ew.seek(pos);
do {
// cout << "\n\n>>>" << ew.current() << "<<<\n\n" << endl;
cout << ew.current() << endl;
} while (ew.next());
#if 0
cout << "\n\n\nAnd now, backwards...\n\n\n" << endl;
do {
cout << "\n\n>>>" << ew.current() << "<<<\n\n" << endl;
} while (ew.prev());
cout << "\n\n\nDone!\n\n\n" << endl;
#endif
});
exit(EXIT_SUCCESS);
}

View File

@@ -7,8 +7,6 @@ PROGRAMS = \
path \
process \
progress \
seeker \
table \
task \
all: test
@@ -22,10 +20,17 @@ include ../makeflags
LIBS = -lcrucible -lpthread
BEES_LDFLAGS = -L../lib $(LDFLAGS)
%.dep: %.cc tests.h Makefile
.depends:
mkdir -p $@
.depends/%.dep: %.cc tests.h Makefile | .depends
$(CXX) $(BEES_CXXFLAGS) -M -MF $@ -MT $(<:.cc=.o) $<
include $(PROGRAMS:%=%.dep)
depends.mk: $(PROGRAMS:%=.depends/%.dep)
cat $^ > $@.new
mv -f $@.new $@
include depends.mk
$(PROGRAMS:%=%.o): %.o: %.cc ../makeflags Makefile
$(CXX) $(BEES_CXXFLAGS) -o $@ -c $<

View File

@@ -3,7 +3,6 @@
#include "crucible/limits.h"
#include <cassert>
#include <cstdint>
using namespace crucible;

View File

@@ -12,49 +12,23 @@ using namespace std;
void
test_progress()
{
// On create, begin == end == constructor argument
ProgressTracker<uint64_t> pt(123);
auto hold = pt.hold(234);
auto hold2 = pt.hold(345);
assert(pt.begin() == 123);
assert(pt.end() == 123);
// Holding a position past the end increases the end (and moves begin to match)
auto hold345 = pt.hold(345);
assert(pt.end() == 345);
auto hold3 = pt.hold(456);
assert(pt.begin() == 123);
assert(pt.end() == 456);
hold2.reset();
assert(pt.begin() == 123);
assert(pt.end() == 456);
hold.reset();
assert(pt.begin() == 345);
assert(pt.end() == 345);
// Holding a position before begin reduces begin, without changing end
auto hold234 = pt.hold(234);
assert(pt.begin() == 234);
assert(pt.end() == 345);
// Holding a position past the end increases the end, without affecting begin
auto hold456 = pt.hold(456);
assert(pt.begin() == 234);
assert(pt.end() == 456);
// Releasing a position in the middle affects neither begin nor end
hold345.reset();
assert(pt.begin() == 234);
assert(pt.end() == 456);
// Hold another position in the middle to test begin moving forward
auto hold400 = pt.hold(400);
// Releasing a position at the beginning moves begin forward
hold234.reset();
assert(pt.begin() == 400);
assert(pt.end() == 456);
// Releasing a position at the end doesn't move end backward
hold456.reset();
assert(pt.begin() == 400);
assert(pt.end() == 456);
// Releasing a position in the middle doesn't move end backward but does move begin forward
hold400.reset();
hold3.reset();
assert(pt.begin() == 456);
assert(pt.end() == 456);
}
int

View File

@@ -1,101 +0,0 @@
#include "tests.h"
#include "crucible/seeker.h"
#include <set>
#include <vector>
#include <unistd.h>
using namespace crucible;
/// Test fixture for seek_backward: simulate a range fetch over a sorted
/// universe of keys.  Returns a set starting at the first key >= lower,
/// always including up to two keys past that point (when they exist),
/// then continuing only while keys remain <= upper.
static
set<uint64_t>
seeker_finder(const vector<uint64_t> &vec, uint64_t lower, uint64_t upper)
{
	const set<uint64_t> universe(vec.begin(), vec.end());
	const auto first = universe.lower_bound(lower);
	auto last = first;
	// Unconditionally take up to two keys from the lower bound...
	for (int skip = 0; skip < 2 && last != universe.end(); ++skip) {
		++last;
	}
	// ...then keep extending while keys stay within the upper bound
	while (last != universe.end() && *last <= upper) {
		++last;
	}
	return set<uint64_t>(first, last);
}
// Set when any test case produces a wrong answer or throws;
// read by main to choose the process exit status.
static bool test_fails = false;
// Run seek_backward once against a sorted key universe and report the
// result on stderr.  The expected answer is computed here by brute force
// (largest key <= target, or 0 if none); a mismatch or an exception
// sets test_fails.
static
void
seeker_test(const vector<uint64_t> &vec, uint64_t const target)
{
cerr << "Find " << target << " in {";
for (auto i : vec) {
cerr << " " << i;
}
cerr << " } = ";
// Counts how many times seek_backward invokes the fetch callback
size_t loops = 0;
bool excepted = catch_all([&]() {
auto found = seek_backward(target, [&](uint64_t lower, uint64_t upper) {
++loops;
return seeker_finder(vec, lower, upper);
});
cerr << found;
// Brute-force reference answer: largest element <= target (0 if none)
uint64_t my_found = 0;
for (auto i : vec) {
if (i <= target) {
my_found = i;
}
}
if (found == my_found) {
cerr << " (correct)";
} else {
cerr << " (INCORRECT - right answer is " << my_found << ")";
test_fails = true;
}
});
// NOTE(review): assumed catch_all returns truthy when the body threw —
// treated as a failure here; confirm against catch_all's contract
if (excepted) {
test_fails = true;
}
}
// Exercise seek_backward across a battery of (key set, target) cases:
// dense runs, sparse keys, empty sets, single keys, and values at or
// near the top of the uint64_t range.  Behavior-identical to calling
// seeker_test once per case, in this exact order.
static
void
test_seeker()
{
	struct Case {
		vector<uint64_t> keys;
		uint64_t target;
	};
	const uint64_t top = numeric_limits<uint64_t>::max();
	const vector<Case> cases {
		{ { 0, 1, 2, 3, 4, 5 }, 3 },
		{ { 0, 1, 2, 3, 4, 5 }, 5 },
		{ { 0, 1, 2, 3, 4, 5 }, 0 },
		{ { 0, 1, 2, 3, 4, 5 }, 1 },
		{ { 0, 1, 2, 3, 4, 5 }, 4 },
		{ { 0, 1, 2, 3, 4, 5 }, 2 },
		{ { 11, 22, 33, 44, 55 }, 2 },
		{ { 11, 22, 33, 44, 55 }, 25 },
		{ { 11, 22, 33, 44, 55 }, 52 },
		{ { 11, 22, 33, 44, 55 }, 99 },
		{ { 11, 22, 33, 44, 55, 56 }, 99 },
		{ { 11, 22, 33, 44, 55 }, 1 },
		{ { 11, 22, 33, 44, 55 }, 55 },
		{ { 11 }, 55 },
		{ { 11 }, 10 },
		{ { 55 }, 55 },
		{ { }, 55 },
		{ { 55 }, top },
		{ { 55 }, top - 1 },
		{ { }, top },
		{ { 0, top }, top },
		{ { 0, top }, top - 1 },
		{ { 0, top - 1 }, top },
	};
	for (const auto &c : cases) {
		seeker_test(c.keys, c.target);
	}
}
// Entry point: run the seek_backward test battery and exit nonzero if
// any case failed or threw (recorded in test_fails by seeker_test).
int main(int, const char **)
{
RUN_A_TEST(test_seeker());
return test_fails ? EXIT_FAILURE : EXIT_SUCCESS;
}

View File

@@ -1,63 +0,0 @@
#include "tests.h"
#include "crucible/table.h"
using namespace crucible;
using namespace std;
/// Dump a table to stderr between BEGIN/END markers, followed by a
/// blank line (the trailing endl also flushes the stream).
void
print_table(const Table::Table& t)
{
	cerr << "BEGIN TABLE\n" << t << "END TABLE\n" << endl;
}
void
test_table()
{
Table::Table t;
t.insert_row(Table::endpos, vector<Table::Content> {
Table::Text("Hello, World!"),
Table::Text("2"),
Table::Text("3"),
Table::Text("4"),
});
print_table(t);
t.insert_row(Table::endpos, vector<Table::Content> {
Table::Text("Greeting"),
Table::Text("two"),
Table::Text("three"),
Table::Text("four"),
});
print_table(t);
t.insert_row(Table::endpos, vector<Table::Content> {
Table::Fill('-'),
Table::Text("ii"),
Table::Text("iii"),
Table::Text("iv"),
});
print_table(t);
t.mid(" | ");
t.left("| ");
t.right(" |");
print_table(t);
t.insert_col(1, vector<Table::Content> {
Table::Text("1"),
Table::Text("one"),
Table::Text("i"),
Table::Text("I"),
});
print_table(t);
t.at(2, 1) = Table::Text("Two\nLines");
print_table(t);
}
// Entry point: run the table-formatting demo/test and exit successfully
// (failures surface via RUN_A_TEST's own reporting)
int
main(int, char**)
{
RUN_A_TEST(test_table());
exit(EXIT_SUCCESS);
}

View File

@@ -90,51 +90,47 @@ test_barrier(size_t count)
mutex mtx;
condition_variable cv;
bool done_flag = false;
unique_lock<mutex> lock(mtx);
Barrier b;
auto b = make_shared<Barrier>();
// Run several tasks in parallel
for (size_t c = 0; c < count; ++c) {
auto bl = b->lock();
ostringstream oss;
oss << "task #" << c;
auto b_hold = b;
Task t(
oss.str(),
[c, &task_done, &mtx, b_hold]() mutable {
// ostringstream oss;
// oss << "Task #" << c << endl;
[c, &task_done, &mtx, bl]() mutable {
// cerr << "Task #" << c << endl;
unique_lock<mutex> lock(mtx);
// cerr << oss.str();
task_done.at(c) = true;
b_hold.release();
bl.release();
}
);
t.run();
}
// Need completed to go out of local scope so it will release b
{
Task completed(
"Waiting for Barrier",
[&mtx, &cv, &done_flag]() {
unique_lock<mutex> lock(mtx);
// cerr << "Running cv notify" << endl;
done_flag = true;
cv.notify_all();
}
);
b.insert_task(completed);
}
// Get current status
// TaskMaster::print_queue(cerr);
// TaskMaster::print_workers(cerr);
ostringstream oss;
TaskMaster::print_queue(oss);
TaskMaster::print_workers(oss);
// Release our b
b.release();
bool done_flag = false;
Task completed(
"Waiting for Barrier",
[&mtx, &cv, &done_flag]() {
unique_lock<mutex> lock(mtx);
// cerr << "Running cv notify" << endl;
done_flag = true;
cv.notify_all();
}
);
b->insert_task(completed);
b.reset();
while (true) {
size_t tasks_done = 0;
@@ -143,7 +139,7 @@ test_barrier(size_t count)
++tasks_done;
}
}
cerr << "Tasks done: " << tasks_done << " done_flag " << done_flag << endl;
// cerr << "Tasks done: " << tasks_done << " done_flag " << done_flag << endl;
if (tasks_done == count && done_flag) {
break;
}
@@ -157,7 +153,7 @@ void
test_exclusion(size_t count)
{
mutex only_one;
auto excl = make_shared<Exclusion>();
auto excl = make_shared<Exclusion>("test_excl");
mutex mtx;
condition_variable cv;
@@ -178,8 +174,9 @@ test_exclusion(size_t count)
[c, &only_one, excl, &lock_success_count, &lock_failure_count, &pings, &tasks_running, &cv, &mtx]() mutable {
// cerr << "Task #" << c << endl;
(void)c;
auto lock = excl->try_lock(Task::current_task());
auto lock = excl->try_lock();
if (!lock) {
excl->insert_task(Task::current_task());
++lock_failure_count;
return;
}
@@ -199,7 +196,7 @@ test_exclusion(size_t count)
t.run();
}
excl.reset();
// excl.reset();
unique_lock<mutex> lock(mtx);
while (tasks_running) {