1
0
mirror of https://github.com/Zygo/bees.git synced 2025-08-02 05:43:29 +02:00

1 Commits

Author SHA1 Message Date
Zygo Blaxell
11f69ff6c1 fanotify-watch: Not really part of Bees, but a useful tool nonetheless 2016-11-18 12:48:40 -05:00
112 changed files with 4330 additions and 13963 deletions

6
.gitignore vendored
View File

@@ -1,8 +1,6 @@
*.[ao]
*.bak
*.dep
*.new
*.tmp
*.so*
Doxyfile
README.html
@@ -12,7 +10,3 @@ html/
latex/
make.log
make.log.new
localconf
lib/configure.h
scripts/beesd
scripts/beesd@.service

View File

@@ -1,9 +0,0 @@
MAKE += PREFIX=$(PREFIX) LIBEXEC_PREFIX=$(LIBEXEC_PREFIX) ETC_PREFIX=$(ETC_PREFIX)
define TEMPLATE_COMPILER =
sed $< >$@ \
-e's#@DESTDIR@#$(DESTDIR)#' \
-e's#@PREFIX@#$(PREFIX)#' \
-e's#@ETC_PREFIX@#$(ETC_PREFIX)#' \
-e's#@LIBEXEC_PREFIX@#$(LIBEXEC_PREFIX)#'
endef

View File

@@ -1,71 +1,19 @@
PREFIX ?= /usr
ETC_PREFIX ?= /etc
LIBDIR ?= lib
default install all: lib src test README.html
LIB_PREFIX ?= $(PREFIX)/$(LIBDIR)
LIBEXEC_PREFIX ?= $(LIB_PREFIX)/bees
clean:
git clean -dfx
SYSTEMD_SYSTEM_UNIT_DIR ?= $(shell pkg-config systemd --variable=systemdsystemunitdir)
.PHONY: lib src
BEES_VERSION ?= $(shell git describe --always --dirty || echo UNKNOWN)
lib:
$(MAKE) -C lib
# allow local configuration to override above variables
-include localconf
DEFAULT_MAKE_TARGET ?= reallyall
ifeq ($(DEFAULT_MAKE_TARGET),reallyall)
RUN_INSTALL_TESTS = test
endif
include Defines.mk
default: $(DEFAULT_MAKE_TARGET)
all: lib src scripts
reallyall: all doc test
clean: ## Cleanup
git clean -dfx -e localconf
.PHONY: lib src test doc
lib: ## Build libs
+$(MAKE) TAG="$(BEES_VERSION)" -C lib
src: ## Build bins
src: lib
+$(MAKE) BEES_VERSION="$(BEES_VERSION)" -C src
$(MAKE) -C src
test: ## Run tests
test: lib src
+$(MAKE) -C test
$(MAKE) -C test
doc: ## Build docs
+$(MAKE) -C docs
scripts/%: scripts/%.in
$(TEMPLATE_COMPILER)
scripts: scripts/beesd scripts/beesd@.service
install_bees: ## Install bees + libs
install_bees: src $(RUN_INSTALL_TESTS)
install -Dm755 bin/bees $(DESTDIR)$(LIBEXEC_PREFIX)/bees
install_scripts: ## Install scripts
install_scripts: scripts
install -Dm755 scripts/beesd $(DESTDIR)$(PREFIX)/sbin/beesd
install -Dm644 scripts/beesd.conf.sample $(DESTDIR)$(ETC_PREFIX)/bees/beesd.conf.sample
ifneq ($(SYSTEMD_SYSTEM_UNIT_DIR),)
install -Dm644 scripts/beesd@.service $(DESTDIR)$(SYSTEMD_SYSTEM_UNIT_DIR)/beesd@.service
endif
install: ## Install distribution
install: install_bees install_scripts
help: ## Show help
@fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/\t/'
bees: reallyall
fly: install
README.html: README.md
markdown README.md > README.html.new
mv -f README.html.new README.html

407
README.md
View File

@@ -1,61 +1,376 @@
BEES
====
Best-Effort Extent-Same, a btrfs deduplication agent.
Best-Effort Extent-Same, a btrfs deduplication daemon.
About bees
About Bees
----------
bees is a block-oriented userspace deduplication agent designed to scale
up to large btrfs filesystems. It is an offline dedupe combined with
an incremental data scan capability to minimize time data spends on disk
from write to dedupe.
Bees is a daemon designed to run continuously on live file servers.
Bees scans and deduplicates whole filesystems in a single pass instead
of separate scan and dedup phases. RAM usage does _not_ depend on
unique data size or the number of input files. Hash tables and scan
progress are stored persistently so the daemon can resume after a reboot.
Bees uses the Linux kernel's `dedupe_file_range` feature to ensure data
is handled safely even if other applications concurrently modify it.
Strengths
---------
Bees is intentionally btrfs-specific for performance and capability.
Bees uses the btrfs `SEARCH_V2` ioctl to scan for new data without the
overhead of repeatedly walking filesystem trees with the POSIX API.
Bees uses `LOGICAL_INO` and `INO_PATHS` to leverage btrfs's existing
metadata instead of building its own redundant data structures.
Bees can cope with Btrfs filesystem compression. Bees can reassemble
Btrfs extents to deduplicate extents that contain a mix of duplicate
and unique data blocks.
* Space-efficient hash table - can use as little as 1 GB hash table per 10 TB unique data (0.1GB/TB)
* Daemon mode - incrementally dedupes new data as it appears
* Largest extents first - recover more free space during fixed maintenance windows
* Works with btrfs compression - dedupe any combination of compressed and uncompressed files
* Whole-filesystem dedupe - scans data only once, even with snapshots and reflinks
* Persistent hash table for rapid restart after shutdown
* Constant hash table size - no increased RAM usage if data set becomes larger
* Works on live data - no scheduled downtime required
* Automatic self-throttling - reduces system load
* btrfs support - recovers more free space from btrfs than naive dedupers
Bees includes a number of workarounds for Btrfs kernel bugs to (try to)
avoid ruining your day. You're welcome.
Weaknesses
----------
How Bees Works
--------------
* Whole-filesystem dedupe - has no include/exclude filters, does not accept file lists
* Requires root privilege (`CAP_SYS_ADMIN` plus the usual filesystem read/modify caps)
* [First run may increase metadata space usage if many snapshots exist](docs/gotchas.md)
* Constant hash table size - no decreased RAM usage if data set becomes smaller
* btrfs only
Bees uses a fixed-size persistent dedup hash table with a variable dedup
block size. Any size of hash table can be dedicated to dedup. Bees will
scale the dedup block size to fit the filesystem's unique data size
using a weighted sampling algorithm. This allows Bees to adapt itself
to its filesystem size without forcing admins to do math at install time.
At the same time, the duplicate block alignment constraint can be as low
as 4K, allowing efficient deduplication of files with narrowly-aligned
duplicate block offsets (e.g. compiled binaries and VM/disk images)
even if the effective block size is much larger.
Installation and Usage
----------------------
The Bees hash table is loaded into RAM at startup (using hugepages if
available), mlocked, and synced to persistent storage by trickle-writing
over a period of several hours. This avoids issues related to seeking
or fragmentation, and enables the hash table to be efficiently stored
on Btrfs with compression (or an ext4 filesystem, or a raw disk, or
on CIFS...).
* [Installation](docs/install.md)
* [Configuration](docs/config.md)
* [Running](docs/running.md)
* [Command Line Options](docs/options.md)
Once a duplicate block is identified, Bees examines the nearby blocks
in the files where the block appears. This allows Bees to find long runs
of adjacent duplicate block pairs if it has an entry for any one of
the blocks in its hash table. The stored hash entry plus the block
recently scanned from disk form a duplicate pair. On typical data sets,
this means most of the blocks in the hash table are redundant and can
be discarded without significant performance impact.
Recommended Reading
-------------------
Hash table entries are grouped together into LRU lists. As each block
is scanned, its hash table entry is inserted into the LRU list at a
random position. If the LRU list is full, the entry at the end of the
list is deleted. If a hash table entry is used to discover duplicate
blocks, the entry is moved to the beginning of the list. This makes Bees
unable to detect a small number of duplicates (less than 1% on typical
filesystems), but it dramatically improves efficiency on filesystems
with many small files. Bees has found a net 13% more duplicate bytes
than a naive fixed-block-size algorithm with a 64K block size using the
same size of hash table, even after discarding 1% of the duplicate bytes.
* [bees Gotchas](docs/gotchas.md)
* [btrfs kernel bugs](docs/btrfs-kernel.md) - especially DATA CORRUPTION WARNING for old kernels
* [bees vs. other btrfs features](docs/btrfs-other.md)
* [What to do when something goes wrong](docs/wrong.md)
Hash Table Sizing
-----------------
More Information
----------------
Hash table entries are 16 bytes each (64-bit hash, 52-bit block number,
and some metadata bits). Each entry represents a minimum of 4K on disk.
unique data size hash table size average dedup block size
1TB 4GB 4K
1TB 1GB 16K
1TB 256MB 64K
1TB 16MB 1024K
64TB 1GB 1024K
It is possible to resize the hash table by changing the size of
`beeshash.dat` (e.g. with `truncate`) and restarting `bees`. This
does not preserve all the existing hash table entries, but it does
preserve more than zero of them--especially if the old and new sizes
are a power-of-two multiple of each other.
Things You Might Expect That Bees Doesn't Have
----------------------------------------------
* There's no configuration file or getopt command line option processing
(patches welcome!). There are some tunables hardcoded in the source
that could eventually become configuration options.
* There's no way to *stop* the Bees daemon. Use SIGKILL, SIGTERM, or
Ctrl-C for now. Some of the destructors are unreachable and have never
been tested. Bees will repeat some work when restarted.
* The Bees process doesn't fork and writes its log to stdout/stderr.
A shell wrapper is required to make it behave more like a daemon.
* There's no facility to exclude any part of a filesystem (patches
welcome).
* PREALLOC extents and extents containing blocks filled with zeros will
be replaced by holes unconditionally.
* Duplicate block groups that are less than 12K in length can take 30%
of the run time while saving only 3% of the disk space. There should
be an option to just not bother with those.
* There is a lot of duplicate reading of blocks in snapshots. Bees will
scan all snapshots at close to the same time to try to get better
performance by caching, but really fixing this requires rewriting the
crawler to scan the btrfs extent tree directly instead of the subvol
FS trees.
* Bees had support for multiple worker threads in the past; however,
this was removed because it made Bees too aggressive to coexist with
other applications on the same machine. It also hit the *slow backrefs*
on N CPU cores instead of just one.
* Block reads are currently more allocation- and CPU-intensive than they
should be, especially for filesystems on SSD where the IO overhead is
much smaller. This is a problem for power-constrained environments
(e.g. laptops with slow CPU).
* Bees can currently fragment extents when required to remove duplicate
blocks, but has no defragmentation capability yet. When possible, Bees
will attempt to work with existing extent boundaries, but it will not
aggregate blocks together from multiple extents to create larger ones.
Good Btrfs Feature Interactions
-------------------------------
Bees has been tested in combination with the following:
* btrfs compression (either method), mixtures of compressed and uncompressed extents
* PREALLOC extents (unconditionally replaced with holes)
* HOLE extents and btrfs no-holes feature
* Other deduplicators, reflink copies (though Bees may decide to redo their work)
* btrfs snapshots and non-snapshot subvols (RW only)
* Concurrent file modification (e.g. PostgreSQL and sqlite databases, build daemons)
* all btrfs RAID profiles (people ask about this, but it's irrelevant)
* IO errors during dedup (read errors will throw exceptions, Bees will catch them and skip over the affected extent)
* Filesystems mounted *with* the flushoncommit option
* 4K filesystem data block size / clone alignment
* 64-bit CPUs (amd64)
* Large (>16M) extents
* Huge files (>1TB--although Btrfs performance on such files isn't great in general)
* filesystems up to 25T bytes, 100M+ files
Bad Btrfs Feature Interactions
------------------------------
Bees has not been tested with the following, and undesirable interactions may occur:
* Non-4K filesystem data block size (should work if recompiled)
* 32-bit CPUs (x86, arm)
* Non-equal hash (SUM) and filesystem data block (CLONE) sizes (probably never will work)
* btrfs read-only snapshots (never tested, probably wouldn't work well)
* btrfs send/receive (receive is probably OK, but send requires RO snapshots. See above)
* btrfs qgroups (never tested, no idea what might happen)
* btrfs seed filesystems (does anyone even use those?)
* btrfs autodefrag mount option (never tested, could fight with Bees)
* btrfs nodatacow mount option or inode attribute (*could* work, but might not)
* btrfs out-of-tree kernel patches (e.g. in-band dedup or encryption)
* btrfs-convert from ext2/3/4 (never tested)
* btrfs mixed block groups (don't know a reason why it would *not* work, but never tested)
* open(O_DIRECT)
* Filesystems mounted *without* the flushoncommit option
Other Caveats
-------------
* btrfs balance will invalidate parts of the dedup table. Bees will
happily rebuild the table, but it will have to scan all the blocks
again.
* btrfs defrag will cause Bees to rescan the defragmented file. If it
contained duplicate blocks and other references to the original
fragmented duplicates still exist, Bees will replace the defragmented
extents with the original fragmented ones.
* Bees creates temporary files (with O_TMPFILE) and uses them to split
and combine extents elsewhere in btrfs. These will take up to 2GB
during normal operation.
* Like all deduplicators, Bees will replace data blocks with metadata
references. It is a good idea to ensure there are several GB of
unallocated space (see `btrfs fi df`) on the filesystem before running
Bees for the first time. Use
btrfs balance start -dusage=100,limit=1 /your/filesystem
If possible, raise the `limit` parameter to the current size of metadata
usage (from `btrfs fi df`) plus 1.
A Brief List Of Btrfs Kernel Bugs
---------------------------------
Fixed bugs:
* 3.13: `FILE_EXTENT_SAME` ioctl added. No way to reliably dedup with
concurrent modifications before this.
* 3.16: `SEARCH_V2` ioctl added. Bees could use `SEARCH` instead.
* 4.2: `FILE_EXTENT_SAME` no longer updates mtime, can be used at EOF.
Kernel deadlock bugs fixed.
* 4.7: *slow backref* bug no longer triggers a softlockup panic. It still
takes too long to resolve a block address to a root/inode/offset triple.
Unfixed kernel bugs (as of 4.5.7) with workarounds in Bees:
* *slow backref*: If the number of references to a single shared extent
within a single file grows above a few thousand, the kernel consumes CPU
for up to 40 uninterruptible minutes while holding various locks that
block access to the filesystem. Bees avoids this bug by measuring the
time the kernel spends performing certain operations and permanently
blacklisting any extent or hash where the kernel starts to get slow.
Inside Bees, such blocks are marked as 'toxic' hash/block addresses.
* `LOGICAL_INO` output is arbitrarily limited to 2730 references
even if more buffer space is provided for results. Once this number
has been reached, Bees can no longer replace the extent since it can't
find and remove all existing references. Bees refrains from adding
any more references after the first 2560. Offending blocks are
marked 'toxic' even if there is no corresponding performance problem.
This places an obvious limit on dedup efficiency for extremely common
blocks or filesystems with many snapshots (although this limit is
far greater than the effective limit imposed by the *slow backref* bug).
* `FILE_EXTENT_SAME` is arbitrarily limited to 16MB. This is less than
128MB which is the maximum extent size that can be created by defrag
or prealloc. Bees avoids feedback loops this can generate while
attempting to replace extents over 16MB in length.
* `DEFRAG_RANGE` is useless. The ioctl attempts to implement `btrfs
fi defrag` in the kernel, and will arbitrarily defragment more or
less than the range requested to match the behavior expected from the
userspace tool. Bees implements its own defrag instead, copying data
to a temporary file and using the `FILE_EXTENT_SAME` ioctl to replace
precisely the specified range of offending fragmented blocks.
* When writing BeesStringFile, a crash can cause the directory entry
`beescrawl.UUID.dat.tmp` to exist without a corresponding inode.
This directory entry cannot be renamed or removed; however, it does
not prevent the creation of a second directory entry with the same
name that functions normally, so it doesn't prevent Bees operation.
The orphan directory entry can be removed by deleting its subvol,
so place BEESHOME on a separate subvol so you can delete these orphan
directory entries when they occur (or use btrfs zero-log before mounting
the filesystem after a crash).
* If the fsync() in BeesTempFile::make_copy is removed, the filesystem
hangs within a few hours, requiring a reboot to recover.
Not really a bug, but a gotcha nonetheless:
* If a process holds a directory FD open, the subvol containing the
directory cannot be deleted (`btrfs sub del` will start the deletion
process, but it will not proceed past the first open directory FD).
`btrfs-cleaner` will simply skip over the directory *and all of its
children* until the FD is closed. Bees avoids this gotcha by closing
all of the FDs in its directory FD cache every 15 minutes.
Requirements
------------
* C++11 compiler (tested with GCC 4.9)
Sorry. I really like closures.
* btrfs-progs (tested with 4.1..4.7)
Needed for btrfs.h and ctree.h during compile.
Not needed at runtime.
* libuuid-dev
TODO: remove the one function used from this library.
It supports a feature Bees no longer implements.
* Linux kernel 4.2 or later
Don't bother trying to make Bees work with older kernels.
It won't end well.
* 64-bit host and target CPU
This code has never been tested on a 32-bit target CPU.
A 64-bit host CPU may be required for the self-tests.
Some of the ioctls don't work properly with a 64-bit
kernel and 32-bit userspace.
Build
-----
Build with `make`.
The build produces `bin/bees` and `lib/libcrucible.so`, which must be
copied to somewhere in `$PATH` and `$LD_LIBRARY_PATH` on the target
system respectively.
Setup
-----
Create a directory for bees state files:
export BEESHOME=/some/path
mkdir -p "$BEESHOME"
Create an empty hash table (your choice of size, but it must be a multiple
of 16M). This example creates a 1GB hash table:
truncate -s 1g "$BEESHOME/beeshash.dat"
chmod 700 "$BEESHOME/beeshash.dat"
Configuration
-------------
The only runtime configurable options are environment variables:
* BEESHOME: Directory containing Bees state files:
* beeshash.dat | persistent hash table (must be a multiple of 16M)
* beescrawl.`UUID`.dat | state of SEARCH_V2 crawlers
* beesstats.txt | statistics and performance counters
* BEESSTATS: File containing a snapshot of current Bees state (performance
counters and current status of each thread).
Other options (e.g. interval between filesystem crawls) can be configured
in src/bees.h.
Running
-------
We created this directory in the previous section:
export BEESHOME=/some/path
Use a tmpfs for BEESSTATUS, it updates once per second:
export BEESSTATUS=/run/bees.status
bees can only process the root subvol of a btrfs (seriously--if the
argument is not the root subvol directory, Bees will just throw an
exception and stop).
Use a bind mount, and let only bees access it:
mount -osubvol=/ /dev/<your-filesystem> /var/lib/bees/root
Reduce CPU and IO priority to be kinder to other applications
sharing this host (or raise them for more aggressive disk space
recovery). If you use cgroups, put `bees` in its own cgroup, then reduce
the `blkio.weight` and `cpu.shares` parameters. You can also use
`schedtool` and `ionice` in the shell script that launches `bees`:
schedtool -D -n20 $$
ionice -c3 -p $$
Let the bees fly:
bees /var/lib/bees/root >> /var/log/bees.log 2>&1
You'll probably want to arrange for /var/log/bees.log to be rotated
periodically. You may also want to set umask to 077 to prevent disclosure
of information about the contents of the filesystem through the log file.
* [How bees works](docs/how-it-works.md)
* [Missing bees features](docs/missing.md)
* [Event counter descriptions](docs/event-counters.md)
Bug Reports and Contributions
-----------------------------
@@ -66,9 +381,11 @@ You can also use Github:
https://github.com/Zygo/bees
Copyright & License
-------------------
Copyright 2015-2025 Zygo Blaxell <bees@furryterror.org>.
Copyright & License
===================
Copyright 2015-2016 Zygo Blaxell <bees@furryterror.org>.
GPL (version 3 or later).

1
docs/.gitignore vendored
View File

@@ -1 +0,0 @@
*.html

View File

@@ -1,18 +0,0 @@
MARKDOWN := $(firstword $(shell command -v cmark-gfm redcarpet markdown2 markdown markdown_py 2>/dev/null || echo markdown))
# If you have cmark-gfm, you get Github-style tables; otherwise, you don't.
ifeq ($(notdir $(MARKDOWN)),cmark-gfm)
MARKDOWN += -e table
endif
.PHONY: docs
docs: $(subst .md,.html,$(wildcard *.md)) index.html ../README.html
%.html: %.md Makefile
$(MARKDOWN) $< | sed -e 's/\.md/\.html/g' > $@.new
mv -f $@.new $@
index.md: ../README.md
sed -e 's:docs/::g' < ../README.md > index.md.new
mv -f index.md.new index.md

View File

@@ -1 +0,0 @@
theme: jekyll-theme-cayman

View File

@@ -1,148 +0,0 @@
Recommended Linux Kernel Version for bees
=========================================
First, a warning about old Linux kernel versions:
> **Linux kernel version 5.1, 5.2, and 5.3 should not be used with btrfs
due to a severe regression that can lead to fatal metadata corruption.**
This issue is fixed in version 5.4.14 and later.
**Recommended Linux kernel versions for bees are 5.4, 5.10, 5.15, 6.1,
6.6, or 6.12 with recent LTS and -stable updates.** The latest released
kernel as of this writing is 6.12.9, and the earliest supported LTS
kernel is 5.4.
Some optional bees features use kernel APIs introduced in kernel 4.15
(extent scan) and 5.6 (`openat2` support). These bees features are not
available on older kernels. Support for older kernels may be removed
in a future bees release.
bees will not run at all on kernels before 4.2 due to lack of minimal
API support.
Kernel Bug Tracking Table
-------------------------
These bugs are particularly popular among bees users, though not all are specifically relevant to bees:
| First bad kernel | Last bad kernel | Issue Description | Fixed Kernel Versions | Fix Commit
| :---: | :---: | --- | :---: | ---
| - | 4.10 | garbage inserted in read data when reading compressed inline extent followed by a hole | 3.18.89, 4.1.49, 4.4.107, 4.9.71, 4.11 and later | e1699d2d7bf6 btrfs: add missing memset while reading compressed inline extents
| - | 4.14 | spurious warnings from `fs/btrfs/backref.c` in `find_parent_nodes` | 3.16.57, 4.14.29, 4.15.12, 4.16 and later | c8195a7b1ad5 btrfs: remove spurious WARN_ON(ref->count < 0) in find_parent_nodes
| 4.15 | 4.18 | compression ratio and performance regression on bees test corpus | improved in 4.19 | 4.14 performance not fully restored yet
| - | 5.0 | silently corrupted data returned when reading compressed extents around a punched hole (bees dedupes all-zero data blocks with holes which can produce a similar effect to hole punching) | 3.16.70, 3.18.137, 4.4.177, 4.9.165, 4.14.108, 4.19.31, 5.0.4, 5.1 and later | 8e928218780e Btrfs: fix corruption reading shared and compressed extents after hole punching
| - | 5.0 | deadlock when dedupe and rename are used simultaneously on the same files | 5.0.4, 5.1 and later | 4ea748e1d2c9 Btrfs: fix deadlock between clone/dedupe and rename
| - | 5.1 | send failure or kernel crash while running send and dedupe on same snapshot at same time | 5.0.18, 5.1.4, 5.2 and later | 62d54f3a7fa2 Btrfs: fix race between send and deduplication that lead to failures and crashes
| - | 5.2 | alternating send and dedupe results in incremental send failure | 4.9.188, 4.14.137, 4.19.65, 5.2.7, 5.3 and later | b4f9a1a87a48 Btrfs: fix incremental send failure after deduplication
| 4.20 | 5.3 | balance convert to single rejected with error on 32-bit CPUs | 5.3.7, 5.4 and later | 7a54789074a5 btrfs: fix balance convert to single on 32-bit host CPUs
| - | 5.3 | kernel crash due to tree mod log issue #1 (often triggered by bees) | 3.16.79, 4.4.195, 4.9.195, 4.14.147, 4.19.77, 5.2.19, 5.3.4, 5.4 and later | efad8a853ad2 Btrfs: fix use-after-free when using the tree modification log
| - | 5.4 | kernel crash due to tree mod log issue #2 (often triggered by bees) | 3.16.83, 4.4.208, 4.9.208, 4.14.161, 4.19.92, 5.4.7, 5.5 and later | 6609fee8897a Btrfs: fix removal logic of the tree mod log that leads to use-after-free issues
| 5.1 | 5.4 | metadata corruption resulting in loss of filesystem when a write operation occurs while balance starts a new block group. **Do not use kernel 5.1 with btrfs.** Kernel 5.2 and 5.3 have workarounds that may detect corruption in progress and abort before it becomes permanent, but do not prevent corruption from occurring. Also kernel crash due to tree mod log issue #4. | 5.4.14, 5.5 and later | 6282675e6708 btrfs: relocation: fix reloc_root lifespan and access
| - | 5.4 | send performance failure when shared extents have too many references | 4.9.207, 4.14.159, 4.19.90, 5.3.17, 5.4.4, 5.5 and later | fd0ddbe25095 Btrfs: send, skip backreference walking for extents with many references
| 5.0 | 5.5 | dedupe fails to remove the last extent in a file if the file size is not a multiple of 4K | 5.4.19, 5.5.3, 5.6 and later | 831d2fa25ab8 Btrfs: make deduplication with range including the last block work
| 4.5, backported to 3.18.31, 4.1.22, 4.4.4 | 5.5 | `df` incorrectly reports 0 free space while data space is available. Triggered by changes in metadata size, including those typical of large-scale dedupe. Occurs more often starting in 5.3 and especially 5.4 | 4.4.213, 4.9.213, 4.14.170, 4.19.102, 5.4.18, 5.5.2, 5.6 and later | d55966c4279b btrfs: do not zero f_bavail if we have available space
| - | 5.5 | kernel crash due to tree mod log issue #3 (often triggered by bees) | 3.16.84, 4.4.214, 4.9.214, 4.14.171, 4.19.103, 5.4.19, 5.5.3, 5.6 and later | 7227ff4de55d Btrfs: fix race between adding and putting tree mod seq elements and nodes
| - | 5.6 | deadlock when enumerating file references to physical extent addresses while some references still exist in deleted subvols | 5.7 and later | 39dba8739c4e btrfs: do not resolve backrefs for roots that are being deleted
| - | 5.6 | deadlock when many extent reference updates are pending and available memory is low | 4.14.177, 4.19.116, 5.4.33, 5.5.18, 5.6.5, 5.7 and later | 351cbf6e4410 btrfs: use nofs allocations for running delayed items
| - | 5.6 | excessive CPU usage in `LOGICAL_INO` and `FIEMAP` ioctl and increased btrfs write latency in other processes when bees translates from extent physical address to list of referencing files and offsets. Also affects other tools like `duperemove` and `btrfs send` | 5.4.96, 5.7 and later | b25b0b871f20 btrfs: backref, use correct count to resolve normal data refs, plus 3 parent commits. Some improvements also in earlier kernels.
| - | 5.7 | filesystem becomes read-only if out of space while deleting snapshot | 4.9.238, 4.14.200, 4.19.149, 5.4.69, 5.8 and later | 7c09c03091ac btrfs: don't force read-only after error in drop snapshot
| 5.1 | 5.7 | balance, device delete, or filesystem shrink operations loop endlessly on a single block group without decreasing extent count | 5.4.54, 5.7.11, 5.8 and later | 1dae7e0e58b4 btrfs: reloc: clear DEAD\_RELOC\_TREE bit for orphan roots to prevent runaway balance
| - | 5.8 | deadlock in `TREE_SEARCH` ioctl (core component of bees filesystem scanner), followed by regression in deadlock fix | 4.4.237, 4.9.237, 4.14.199, 4.19.146, 5.4.66, 5.8.10 and later | a48b73eca4ce btrfs: fix potential deadlock in the search ioctl, 1c78544eaa46 btrfs: fix wrong address when faulting in pages in the search ioctl
| 5.7 | 5.10 | kernel crash if balance receives fatal signal e.g. Ctrl-C | 5.4.93, 5.10.11, 5.11 and later | 18d3bff411c8 btrfs: don't get an EINTR during drop_snapshot for reloc
| 5.10 | 5.10 | 20x write performance regression | 5.10.8, 5.11 and later | e076ab2a2ca7 btrfs: shrink delalloc pages instead of full inodes
| 5.4 | 5.11 | spurious tree checker failures on extent ref hash | 5.4.125, 5.10.43, 5.11.5, 5.12 and later | 1119a72e223f btrfs: tree-checker: do not error out if extent ref hash doesn't match
| - | 5.11 | tree mod log issue #5 | 4.4.263, 4.9.263, 4.14.227, 4.19.183, 5.4.108, 5.10.26, 5.11.9, 5.12 and later | dbcc7d57bffc btrfs: fix race when cloning extent buffer during rewind of an old root
| - | 5.12 | tree mod log issue #6 | 4.14.233, 4.19.191, 5.4.118, 5.10.36, 5.11.20, 5.12.3, 5.13 and later | f9690f426b21 btrfs: fix race when picking most recent mod log operation for an old root
| 5.11 | 5.12 | subvols marked for deletion with `btrfs sub del` become permanently undeletable ("ghost" subvols) | 5.12 stopped creation of new ghost subvols | Partially fixed in 8d488a8c7ba2 btrfs: fix subvolume/snapshot deletion not triggered on mount. Qu wrote a [patch](https://github.com/adam900710/linux/commit/9de990fcc8864c376eb28aa7482c54321f94acd4) to allow `btrfs sub del -i` to remove "ghost" subvols, but it was never merged upstream.
| 4.15 | 5.16 | spurious warnings from `fs/fs-writeback.c` when `flushoncommit` is enabled | 5.15.27, 5.16.13, 5.17 and later | a0f0cf8341e3 btrfs: get rid of warning on transaction commit when using flushoncommit
| - | 5.17 | crash during device removal can make filesystem unmountable | 5.15.54, 5.16.20, 5.17.3, 5.18 and later | bbac58698a55 btrfs: remove device item and update super block in the same transaction
| - | 5.18 | wrong superblock num_devices makes filesystem unmountable | 4.14.283, 4.19.247, 5.4.198, 5.10.121, 5.15.46, 5.17.14, 5.18.3, 5.19 and later | d201238ccd2f btrfs: repair super block num_devices automatically
| 5.18 | 5.19 | parent transid verify failed during log tree replay after a crash during a rename operation | 5.18.18, 5.19.2, 6.0 and later | 723df2bcc9e1 btrfs: join running log transaction when logging new name
| 5.12 | 6.0 | space cache corruption and potential double allocations | 5.15.65, 5.19.6, 6.0 and later | ced8ecf026fd btrfs: fix space cache corruption and potential double allocations
| 6.0 | 6.5 | suboptimal allocation in multi-device filesystems due to chunk allocator regression | 6.1.60, 6.5.9, 6.6 and later | 8a540e990d7d btrfs: fix stripe length calculation for non-zoned data chunk allocation
| 6.3, backported to 5.15.107, 6.1.24, 6.2.11 | 6.3 | vmalloc error, failed to allocate pages | 6.3.10, 6.4 and later. Bug (f349b15e183d "mm: vmalloc: avoid warn_alloc noise caused by fatal signal" in v6.3-rc6) backported to 6.1.24, 6.2.11, and 5.15.107. | 95a301eefa82 mm/vmalloc: do not output a spurious warning when huge vmalloc() fails
| 6.2 | 6.3 | `IGNORE_OFFSET` flag ignored in `LOGICAL_INO` ioctl | 6.2.16, 6.3.3, 6.4 and later | 0cad8f14d70c btrfs: fix backref walking not returning all inode refs
| 6.10 | 6.11 | `adding refs to an existing tree ref`, `failed to run delayed ref`, then read-only | 6.11.10, 6.12 and later | 7d493a5ecc26 btrfs: fix incorrect comparison for delayed refs
| 5.4 | - | kernel hang when multiple threads are running `LOGICAL_INO` and dedupe/clone ioctl on the same extent | - | workaround: avoid doing that
"Last bad kernel" refers to that version's last stable update from
kernel.org. Distro kernels may backport additional fixes. Consult
your distro's kernel support for details.
When the same version appears in both "last bad kernel" and "fixed kernel
version" columns, it means the bug appears in the `.0` release and is
fixed in the stated `.y` release. e.g. a "last bad kernel" of 5.4 and
a "fixed kernel version" of 5.4.14 has the bug in kernel versions 5.4.0
through 5.4.13 inclusive.
A "-" for "first bad kernel" indicates the bug has been present since
the relevant feature first appeared in btrfs.
A "-" for "last bad kernel" indicates the bug has not yet been fixed in
current kernels (see top of this page for which kernel version that is).
In cases where issues are fixed by commits spread out over multiple
kernel versions, "fixed kernel version" refers to the version that
contains the last committed component of the fix.
Workarounds for known kernel bugs
---------------------------------
* **Hangs with concurrent `LOGICAL_INO` and dedupe/clone**: on all
kernel versions so far, multiple threads running `LOGICAL_INO` and
dedupe/clone ioctls at the same time on the same inodes or extents
can lead to a kernel hang. The kernel enters an infinite loop in
`add_all_parents`, where `count` is 0, `ref->count` is 1, and
`btrfs_next_item` or `btrfs_next_old_item` never find a matching ref.
bees has two workarounds for this bug: 1. schedule work so that multiple
threads do not simultaneously access the same inode or the same extent,
and 2. use a brute-force global lock within bees that prevents any
thread from running `LOGICAL_INO` while any other thread is running
dedupe.
Workaround #1 isn't really a workaround, since we want to do the same
thing for unrelated performance reasons. If multiple threads try to
perform dedupe operations on the same extent or inode, btrfs will make
all the threads wait for the same locks anyway, so it's better to have
bees find some other inode or extent to work on while waiting for btrfs
to finish.
Workaround #2 doesn't seem to be needed after implementing workaround
#1, but it's better to be slightly slower than to hang one CPU core
and the filesystem until the kernel is rebooted.
It is still theoretically possible to trigger the kernel bug when
running bees at the same time as other dedupers, or other programs
that use `LOGICAL_INO` like `btdu`, or when performing a reflink clone
operation such as `cp` or `mv`; however, it's extremely difficult to
reproduce the bug without closely cooperating threads.
* **Slow backrefs** (aka toxic extents): On older kernels, under certain
conditions, if the number of references to a single shared extent grows
too high, the kernel consumes more and more CPU while also holding
locks that delay write access to the filesystem. This is no longer
a concern on kernels after 5.7 (or an up-to-date 5.4 LTS version),
but there are still some remains of earlier workarounds for this issue
in bees that have not been fully removed.
bees avoided this bug by measuring the time the kernel spends performing
`LOGICAL_INO` operations and permanently blacklisting any extent or
hash involved where the kernel starts to get slow. In the bees log,
such blocks are labelled as 'toxic' hash/block addresses.
Future bees releases will remove toxic extent detection (it only detects
false positives now) and clear all previously saved toxic extent bits.
* **dedupe breaks `btrfs send` in old kernels**. The bees option
`--workaround-btrfs-send` prevents any modification of read-only subvols
in order to avoid breaking `btrfs send` on kernels before 5.2.
This workaround is no longer necessary to avoid kernel crashes and
send performance failure on kernel 5.4.4 and later. bees will pause
dedupe until the send is finished on current kernels.
`btrfs receive` is not and has never been affected by this issue.

View File

@@ -1,44 +0,0 @@
Good Btrfs Feature Interactions
-------------------------------
bees has been tested in combination with the following:
* btrfs compression (zlib, lzo, zstd)
* PREALLOC extents (unconditionally replaced with holes)
* HOLE extents and btrfs no-holes feature
* Other deduplicators (`duperemove`, `jdupes`)
* Reflink copies (modern coreutils `cp` and `mv`)
* Concurrent file modification (e.g. PostgreSQL and sqlite databases, VMs, build daemons)
* All btrfs RAID profiles: single, dup, raid0, raid1, raid10, raid1c3, raid1c4, raid5, raid6
* IO errors during dedupe (affected extents are skipped)
* 4K filesystem data block size / clone alignment
* 64-bit and 32-bit LE host CPUs (amd64, x86, arm)
* Large files (kernel 5.4 or later strongly recommended)
* Filesystem data sizes up to 100T+ bytes, 1000M+ files
* `open(O_DIRECT)` (seems to work as well--or as poorly--with bees as with any other btrfs feature)
* btrfs-convert from ext2/3/4
* btrfs `autodefrag` mount option
* btrfs balance (data balances cause rescan of relocated data)
* btrfs block-group-tree
* btrfs `flushoncommit` and `noflushoncommit` mount options
* btrfs mixed block groups
* btrfs `nodatacow`/`nodatasum` inode attribute or mount option (bees skips all nodatasum files)
* btrfs qgroups and quota support (_not_ squotas)
* btrfs receive
* btrfs scrub
* btrfs send (dedupe pauses automatically, kernel 5.4 or later required)
* btrfs snapshot, non-snapshot subvols (RW and RO), snapshot delete
**Note:** some btrfs features have minimum kernel versions which are
higher than the minimum kernel version for bees.
Untested Btrfs Feature Interactions
-----------------------------------
bees has not been tested with the following, and undesirable interactions may occur:
* Non-4K filesystem data block size (should work if recompiled)
* Non-equal hash (SUM) and filesystem data block (CLONE) sizes (need to fix that eventually)
* btrfs seed filesystems, raid-stripe-tree, squotas (no particular reason these wouldn't work, but no one has reported trying)
* btrfs out-of-tree kernel patches (e.g. encryption, extent tree v2)
* Host CPUs with exotic page sizes, alignment requirements, or endianness (ppc, alpha, sparc, strongarm, s390, mips, m68k...)

View File

@@ -1,317 +0,0 @@
bees Configuration
==================
The only configuration parameter that *must* be provided is the hash
table size. Other parameters are optional or hardcoded, and the defaults
are reasonable in most cases.
Hash Table Sizing
-----------------
Hash table entries are 16 bytes per data block. The hash table stores the
most recently read unique hashes. Once the hash table is full, each new
entry added to the table evicts an old entry. This makes the hash table
a sliding window over the most recently scanned data from the filesystem.
Here are some numbers to estimate appropriate hash table sizes:
unique data size | hash table size |average dedupe extent size
1TB | 4GB | 4K
1TB | 1GB | 16K
1TB | 256MB | 64K
1TB | 128MB | 128K <- recommended
1TB | 16MB | 1024K
64TB | 1GB | 1024K
Notes:
* If the hash table is too large, no extra dedupe efficiency is
obtained, and the extra space wastes RAM.
* If the hash table is too small, bees extrapolates from matching
blocks to find matching adjacent blocks in the filesystem that have been
evicted from the hash table. In other words, bees only needs to find
one block in common between two extents in order to be able to dedupe
the entire extents. This provides significantly more dedupe hit rate
per hash table byte than other dedupe tools.
* There is a fairly wide range of usable hash sizes, and performance
degrades according to a smooth probabilistic curve in both directions.
Double or half the optimum size usually works just as well.
* When counting unique data in compressed data blocks to estimate
optimum hash table size, count the *uncompressed* size of the data.
* Another way to approach the hash table size is to simply decide how much
RAM can be spared without too much discomfort, give bees that amount of
RAM, and accept whatever dedupe hit rate occurs as a result. bees will
do the best job it can with the RAM it is given.
Factors affecting optimal hash table size
-----------------------------------------
It is difficult to predict the net effect of data layout and access
patterns on dedupe effectiveness without performing deep inspection of
both the filesystem data and its structure--a task that is as expensive
as performing the deduplication.
* **Compression** in files reduces the average extent length compared
to uncompressed files. The maximum compressed extent length on
btrfs is 128KB, while the maximum uncompressed extent length is 128MB.
Longer extents decrease the optimum hash table size while shorter extents
increase the optimum hash table size, because the probability of a hash
table entry being present (i.e. unevicted) in each extent is proportional
to the extent length.
As a rule of thumb, the optimal hash table size for a compressed
filesystem is 2-4x larger than the optimal hash table size for the same
data on an uncompressed filesystem. Dedupe efficiency falls rapidly with
hash tables smaller than 128MB/TB as the average dedupe extent size is
larger than the largest possible compressed extent size (128KB).
* **Short writes or fragmentation** also shorten the average extent
length and increase optimum hash table size. If a database writes to
files randomly using 4K page writes, all of these extents will be 4K
in length, and the hash table size must be increased to retain each one
(or the user must accept a lower dedupe hit rate).
Defragmenting files that have had many short writes increases the
extent length and therefore reduces the optimum hash table size.
* **Time between duplicate writes** also affects the optimum hash table
size. bees reads data blocks in logical order during its first pass,
and after that new data blocks are read incrementally a few seconds or
minutes after they are written. bees finds more matching blocks if there
is a smaller amount of data between the matching reads, i.e. there are
fewer blocks evicted from the hash table. If most identical writes to
the filesystem occur near the same time, the optimum hash table size is
smaller. If most identical writes occur over longer intervals of time,
the optimum hash table size must be larger to avoid evicting hashes from
the table before matches are found.
For example, a build server normally writes out very similar source
code files over and over, so it will need a smaller hash table than a
backup server which has to refer to the oldest data on the filesystem
every time a new client machine's data is added to the server.
Scanning modes
--------------
The `--scan-mode` option affects how bees iterates over the filesystem,
schedules extents for scanning, and tracks progress.
There are now two kinds of scan mode: the legacy **subvol** scan modes,
and the new **extent** scan mode.
Scan mode can be changed by restarting bees with a different scan mode
option.
Extent scan mode:
* Works with 4.15 and later kernels.
* Can estimate progress and provide an ETA.
* Can optimize scanning order to dedupe large extents first.
* Can keep up with frequent creation and deletion of snapshots.
Subvol scan modes:
* Work with 4.14 and earlier kernels.
* Cannot estimate or report progress.
* Cannot optimize scanning order by extent size.
* Have problems keeping up with multiple snapshots created during a scan.
The default scan mode is 4, "extent".
If you are using bees for the first time on a filesystem with many
existing snapshots, you should read about [snapshot gotchas](gotchas.md).
Subvol scan modes
-----------------
Subvol scan modes are maintained for compatibility with existing
installations, but will not be developed further. New installations
should use extent scan mode instead.
The _quantity_ of text below detailing the shortcomings of each subvol
scan mode should be informative all by itself.
Subvol scan modes work on any kernel version supported by bees. They
are the only scan modes usable on kernel 4.14 and earlier.
The difference between the subvol scan modes is the order in which the
files from different subvols are fed into the scanner. They all scan
files in inode number order, from low to high offset within each inode,
the same way that a program like `cat` would read files (but skipping
over old data from earlier btrfs transactions).
If a filesystem has only one subvolume with data in it, then all of
the subvol scan modes are equivalent. In this case, there is only one
subvolume to scan, so every possible ordering of subvols is the same.
The `--workaround-btrfs-send` option pauses scanning subvols that are
read-only. If the subvol is made read-write (e.g. with `btrfs prop set
$subvol ro false`), or if the `--workaround-btrfs-send` option is removed,
then the scan of that subvol is unpaused and dedupe proceeds normally.
Space will only be recovered when the last read-only subvol is deleted.
Subvol scan modes cannot efficiently or accurately calculate an ETA for
completion or estimate progress through the data. They simply request
"the next new inode" from btrfs, and they are completed when btrfs says
there is no next new inode.
Between subvols, there are several scheduling algorithms with different
trade-offs:
Scan mode 0, "lockstep", scans the same inode number in each subvol at
close to the same time. This is useful if the subvols are snapshots
with a common ancestor, since the same inode number in each subvol will
have similar or identical contents. This maximizes the likelihood that
all of the references to a snapshot of a file are scanned at close to
the same time, improving dedupe hit rate. If the subvols are unrelated
(i.e. not snapshots of a single subvol) then this mode does not provide
any significant advantage. This mode uses smaller amounts of temporary
space for shorter periods of time when most subvols are snapshots. When a
new snapshot is created, this mode will stop scanning other subvols and
scan the new snapshot until the same inode number is reached in each
subvol, which will effectively stop dedupe temporarily as this data has
already been scanned and deduped in the other snapshots.
Scan mode 1, "independent", scans the next inode with new data in
each subvol. There is no coordination between the subvols, other than
round-robin distribution of files from each subvol to each worker thread.
This mode makes continuous forward progress in all subvols. When a new
snapshot is created, previous subvol scans continue as before, but the
worker threads are now divided among one more subvol.
Scan mode 2, "sequential", scans one subvol at a time, in numerical subvol
ID order, processing each subvol completely before proceeding to the next
subvol. This avoids spending time scanning short-lived snapshots that
will be deleted before they can be fully deduped (e.g. those used for
`btrfs send`). Scanning starts on older subvols that are more likely
to be origin subvols for future snapshots, eliminating the need to
dedupe future snapshots separately. This mode uses the largest amount
of temporary space for the longest time, and typically requires a larger
hash table to maintain dedupe hit rate.
Scan mode 3, "recent", scans the subvols with the highest `min_transid`
value first (i.e. the ones that were most recently completely scanned),
then falls back to "independent" mode to break ties. This interrupts
long scans of old subvols to give a rapid dedupe response to new data
in previously scanned subvols, then returns to the old subvols after
the new data is scanned.
Extent scan mode
----------------
Scan mode 4, "extent", scans the extent tree instead of the subvol trees.
Extent scan mode reads each extent once, regardless of the number of
reflinks or snapshots. It adapts to the creation of new snapshots
and reflinks immediately, without having to revisit old data.
In the extent scan mode, extents are separated into multiple size tiers
to prioritize large extents over small ones. Deduping large extents
keeps the metadata update cost low per block saved, resulting in faster
dedupe at the start of a scan cycle. This is important for maximizing
performance in use cases where bees runs for a limited time, such as
during an overnight maintenance window.
Once the larger size tiers are completed, dedupe space recovery speeds
slow down significantly. It may be desirable to stop bees running once
the larger size tiers are finished, then start bees running some time
later after new data has appeared.
Each extent is mapped in physical address order, and all extent references
are submitted to the scanner at the same time, resulting in much better
cache behavior and dedupe performance compared to the subvol scan modes.
The "extent" scan mode is not usable on kernels before 4.15 because
it relies on the `LOGICAL_INO_V2` ioctl added in that kernel release.
When using bees with an older kernel, only subvol scan modes will work.
Extents are divided into virtual subvols by size, using reserved btrfs
subvol IDs 250..255. The size tier groups are:
* 250: 32M+1 and larger
* 251: 8M+1..32M
* 252: 2M+1..8M
* 253: 512K+1..2M
* 254: 128K+1..512K
* 255: 128K and smaller (includes all compressed extents)
Extent scan mode can efficiently calculate dedupe progress within
the filesystem and estimate an ETA for completion within each size
tier; however, the accuracy of the ETA can be questionable due to the
non-uniform distribution of block addresses in a typical user filesystem.
Older versions of bees do not recognize the virtual subvols, so running
an old bees version after running a new bees version will reset the
"extent" scan mode's progress in `beescrawl.dat` to the beginning.
This may change in future bees releases, i.e. extent scans will store
their checkpoint data somewhere else.
The `--workaround-btrfs-send` option behaves differently in extent
scan modes: In extent scan mode, dedupe proceeds on all subvols that are
read-write, but all subvols that are read-only are excluded from dedupe.
Space will only be recovered when the last read-only subvol is deleted.
During `btrfs send` all duplicate extents in the sent subvol will not be
removed (the kernel will reject dedupe commands while send is active,
and bees currently will not re-issue them after the send is complete).
It may be preferable to terminate the bees process while running `btrfs
send` in extent scan mode, and restart bees after the `send` is complete.
Threads and load management
---------------------------
By default, bees creates one worker thread for each CPU detected. These
threads then perform scanning and dedupe operations. bees attempts to
maximize the amount of productive work each thread does, until either the
threads are all continuously busy, or there is no remaining work to do.
In many cases it is not desirable to continually run bees at maximum
performance. Maximum performance is not necessary if bees can dedupe
new data faster than it appears on the filesystem. If it only takes
bees 10 minutes per day to dedupe all new data on a filesystem, then
bees doesn't need to run for more than 10 minutes per day.
bees supports a number of options for reducing system load:
* Run bees for a few hours per day, at an off-peak time (i.e. during
a maintenance window), instead of running bees continuously. Any data
added to the filesystem while bees is not running will be scanned when
bees restarts. At the end of the maintenance window, terminate the
bees process with SIGTERM to write the hash table and scan position
for the next maintenance window.
* Temporarily pause bees operation by sending the bees process SIGUSR1,
and resume operation with SIGUSR2. This is preferable to freezing
and thawing the process, e.g. with freezer cgroups or SIGSTOP/SIGCONT
signals, because it allows bees to close open file handles that would
otherwise prevent those files from being deleted while bees is frozen.
* Reduce the number of worker threads with the [`--thread-count` or
`--thread-factor` options](options.md). This simply leaves CPU cores
idle so that other applications on the host can use them, or to save
power.
* Allow bees to automatically track system load and increase or decrease
the number of threads to reach a target system load. This reduces
impact on the rest of the system by pausing bees when other CPU and IO
intensive loads are active on the system, and resumes bees when the other
loads are inactive. This is configured with the [`--loadavg-target`
and `--thread-min` options](options.md).
* Allow bees to self-throttle operations that enqueue delayed work
within btrfs. These operations are not well controlled by Linux
features such as process priority or IO priority or IO rate-limiting,
because the enqueued work is submitted to btrfs several seconds before
btrfs performs the work. By the time btrfs performs the work, it's too
late for external throttling to be effective. The [`--throttle-factor`
option](options.md) tracks how long it takes btrfs to complete queued
operations, and reduces bees's queued work submission rate to match
btrfs's queued work completion rate (or a fraction thereof, to reduce
system load).
Log verbosity
-------------
bees can be made less chatty with the [`--verbose` option](options.md).

View File

@@ -1,435 +0,0 @@
Event Counters
==============
General
-------
Event counters are used in bees to collect simple branch-coverage
statistics. Every time bees makes a decision, it increments an event
counter, so there are _many_ event counters.
Events are grouped by prefix in their event names, e.g. `block` is block
I/O, `dedup` is deduplication requests, `tmp` is temporary files, etc.
Events with the suffix `_ms` count total milliseconds spent performing
the operation. These are counted separately for each thread, so there
can be more than 1000 ms per second.
There is considerable overlap between some events, e.g. `example_try`
denotes an event that is counted when an action is attempted,
`example_hit` is counted when the attempt succeeds and has a desired
outcome, and `example_miss` is counted when the attempt succeeds but
the desired outcome is not achieved. In most cases `example_try =
example_hit + example_miss + (`example failed and threw an exception`)`,
but some event groups defy such simplistic equations.
addr
----
The `addr` event group consists of operations related to translating `(root,
inode, offset)` tuples (i.e. logical position within a file) into btrfs
virtual block addresses (i.e. physical position on disk).
* `addr_block`: The address of a block was computed.
* `addr_compressed`: Obsolete implementation of `addr_compressed_offset`.
* `addr_compressed_offset`: The address of a compressed block was computed.
* `addr_delalloc`: The address of a block could not be computed due to
delayed allocation. Only possible when using obsolete `FIEMAP` code.
* `addr_eof_e`: The address of a block at EOF that was not block-aligned was computed.
* `addr_from_fd`: The address of a block was computed using a `fd`
(open to the file in question) and `offset` pair.
* `addr_from_root_fd`: The address of a block was computed using
the filesystem root `fd` instead of the open file `fd` for the
`TREE_SEARCH_V2` ioctl. This is obsolete and should probably be removed
at some point.
* `addr_hole`: The address of a block in a hole was computed.
* `addr_magic`: The address of a block cannot be determined in a way
that bees can use (unrecognized flags or flags known to be incompatible
with bees).
* `addr_uncompressed`: The address of an uncompressed block was computed.
* `addr_unrecognized`: The address of a block with unrecognized flags
(i.e. kernel version newer than bees) was computed.
* `addr_unusable`: The address of a block with unusable flags (i.e. flags
that are known to be incompatible with bees) was computed.
adjust
------
The `adjust` event group consists of operations related to translating stored virtual block addresses (i.e. physical position on disk) to `(root, inode, offset)` tuples (i.e. logical positions within files). `BeesResolver::adjust_offset` determines if a single candidate reference from the `LOGICAL_INO` ioctl corresponds to the requested btrfs virtual block address.
* `adjust_compressed_offset_correct`: A block address corresponding to a compressed block was retrieved from the hash table and resolved to a physical block containing data that matches another block bees has already read.
* `adjust_compressed_offset_wrong`: A block address corresponding to a compressed block was retrieved from the hash table and resolved to a physical block containing data that matches the hash but not the data from another block bees has already read (i.e. there was a hash collision).
* `adjust_eof_fail`: A block address corresponding to a block at EOF that was not aligned to a block boundary matched another block bees already read, but the length of the unaligned data in both blocks was not equal. This is usually caused by stale entries in the hash table pointing to blocks that have been overwritten since the hash table entries were created. It can also be caused by hash collisions, but hashes are not yet computed at this point in the code, so this event does not correlate to the `hash_collision` counter.
* `adjust_eof_haystack`: A block address from the hash table corresponding to a block at EOF that was not aligned to a block boundary was processed.
* `adjust_eof_hit`: A block address corresponding to a block at EOF that was not aligned to a block boundary matched a similarly unaligned block that bees already read.
* `adjust_eof_miss`: A block address from the hash table corresponding to a block at EOF that was not aligned to a block boundary did not match a similarly unaligned block that bees already read.
* `adjust_eof_needle`: A block address from scanning the disk corresponding to a block at EOF that was not aligned to a block boundary was processed.
* `adjust_exact`: A block address from the hash table corresponding to an uncompressed data block was processed to find its `(root, inode, offset)` references.
* `adjust_exact_correct`: A block address corresponding to an uncompressed block was retrieved from the hash table and resolved to a physical block containing data that matches another block bees has already read.
* `adjust_exact_wrong`: A block address corresponding to an uncompressed block was retrieved from the hash table and resolved to a physical block containing data that matches the hash but not the data from another block bees has already read (i.e. there was a hash collision).
* `adjust_hit`: A block address was retrieved from the hash table and resolved to a physical block in an uncompressed extent containing data that matches the data from another block bees has already read (i.e. a duplicate match was found).
* `adjust_miss`: A block address was retrieved from the hash table and resolved to a physical block containing a hash that does not match the hash from another block bees has already read (i.e. the hash table contained a stale entry and the data it referred to has since been overwritten in the filesystem).
* `adjust_needle_too_long`: A block address was retrieved from the hash table, but when the corresponding extent item was retrieved, its offset or length were out of range to be a match (i.e. the hash table contained a stale entry and the data it referred to has since been overwritten in the filesystem).
* `adjust_no_match`: A hash collision occurred (i.e. a block on disk was located with the same hash as the hash table entry but different data). Effectively an alias for `hash_collision` as it is not possible to have one event without the other.
* `adjust_offset_high`: The `LOGICAL_INO` ioctl gave an extent item that does not overlap with the desired block because the extent item ends before the desired block in the extent data.
* `adjust_offset_hit`: A block address was retrieved from the hash table and resolved to a physical block in a compressed extent containing data that matches the data from another block bees has already read (i.e. a duplicate match was found).
* `adjust_offset_low`: The `LOGICAL_INO` ioctl gave an extent item that does not overlap with the desired block because the extent item begins after the desired block in the extent data.
* `adjust_try`: A block address and extent item candidate were passed to `BeesResolver::adjust_offset` for processing.
block
-----
The `block` event group consists of operations related to reading data blocks from the filesystem.
* `block_bytes`: Number of data bytes read.
* `block_hash`: Number of block hashes computed.
* `block_ms`: Total time reading data blocks.
* `block_read`: Number of data blocks read.
* `block_zero`: Number of data blocks read with zero contents (i.e. candidates for replacement with a hole).
bug
---
The `bug` event group consists of known bugs in bees.
* `bug_bad_max_transid`: A bad `max_transid` was found and removed in `beescrawl.dat`.
* `bug_bad_min_transid`: A bad `min_transid` was found and removed in `beescrawl.dat`.
* `bug_dedup_same_physical`: `BeesContext::dedup` detected that the physical extent was the same for `src` and `dst`. This has no effect on space usage so it is a waste of time, and also carries the risk of creating a toxic extent.
* `bug_grow_pair_overlaps`: Two identical blocks were found, and while searching matching adjacent extents, the potential `src` grew to overlap the potential `dst`. This would create a cycle where bees keeps trying to eliminate blocks but instead just moves them around.
* `bug_hash_duplicate_cell`: Two entries in the hash table were identical. This only happens due to data corruption or a bug.
* `bug_hash_magic_addr`: An entry in the hash table contains an address with magic. Magic addresses cannot be deduplicated so they should not be stored in the hash table.
chase
-----
The `chase` event group consists of operations connecting btrfs virtual block addresses with `(root, inode, offset)` tuples. `resolve` is the top level, `adjust` is the bottom level, and `chase` is the middle level. `BeesResolver::chase_extent_ref` iterates over `(root, inode, offset)` tuples from `LOGICAL_INO` and attempts to find a single matching block in the filesystem given a candidate block from an earlier `scan` operation.
* `chase_corrected`: A matching block was resolved to a `(root, inode, offset)` tuple, but the offset of a block matching data did not match the offset given by `LOGICAL_INO`.
* `chase_hit`: A block address was successfully and correctly translated to a `(root, inode, offset)` tuple.
* `chase_no_data`: A block address was not successfully translated to a `(root, inode, offset)` tuple.
* `chase_no_fd`: A `(root, inode)` tuple could not be opened (i.e. the file was deleted on the filesystem).
* `chase_try`: A block address translation attempt started.
* `chase_uncorrected`: A matching block was resolved to a `(root, inode, offset)` tuple, and the offset of a block matching data did match the offset given by `LOGICAL_INO`.
* `chase_wrong_addr`: The btrfs virtual address (i.e. physical block address) found at a candidate `(root, inode, offset)` tuple did not match the expected btrfs virtual address (i.e. the filesystem was modified during the resolve operation).
* `chase_wrong_magic`: The extent item at a candidate `(root, inode, offset)` tuple has magic bits and cannot match any btrfs virtual address in the hash table (i.e. the filesystem was modified during the resolve operation).
crawl
-----
The `crawl` event group consists of operations related to scanning btrfs trees to find new extent refs to scan for dedupe.
* `crawl_again`: An inode crawl was restarted because the extent was already locked by another running crawl.
* `crawl_blacklisted`: An extent was not scanned because it belongs to a blacklisted file.
* `crawl_deferred_inode`: Two tasks attempted to scan the same inode at the same time, so one was deferred.
* `crawl_done`: One pass over a subvol was completed.
* `crawl_discard_high`: An extent that was too large for the crawler's size tier was discarded.
* `crawl_discard_low`: An extent that was too small for the crawler's size tier was discarded.
* `crawl_empty`: A `TREE_SEARCH_V2` ioctl call failed or returned an empty set (usually because all data in the subvol was scanned).
* `crawl_extent`: The extent crawler queued all references to an extent for processing.
* `crawl_fail`: A `TREE_SEARCH_V2` ioctl call failed.
* `crawl_flop`: Small extent items were not skipped because the next extent started at or before the end of the previous extent.
* `crawl_gen_high`: An extent item in the search results refers to an extent that is newer than the current crawl's `max_transid` allows.
* `crawl_gen_low`: An extent item in the search results refers to an extent that is older than the current crawl's `min_transid` allows.
* `crawl_hole`: An extent item in the search results refers to a hole.
* `crawl_inline`: An extent item in the search results contains an inline extent.
* `crawl_items`: An item in the `TREE_SEARCH_V2` data was processed.
* `crawl_ms`: Time spent running the `TREE_SEARCH_V2` ioctl.
* `crawl_no_empty`: Attempted to delete the last crawler. Should never happen.
* `crawl_nondata`: An item in the search results is not data.
* `crawl_prealloc`: An extent item in the search results refers to a `PREALLOC` extent.
* `crawl_push`: An extent item in the search results is suitable for scanning and deduplication.
* `crawl_scan`: An extent item in the search results is submitted to `BeesContext::scan_forward` for scanning and deduplication.
* `crawl_skip`: Small extent items were skipped because no extent of sufficient size was found within the minimum search distance.
* `crawl_skip_ms`: Time spent skipping small extent items.
* `crawl_search`: A `TREE_SEARCH_V2` ioctl call was successful.
* `crawl_throttled`: Extent scan created too many work queue items and was prevented from creating any more.
* `crawl_tree_block`: Extent scan found and skipped a metadata tree block.
* `crawl_unknown`: An extent item in the search results has an unrecognized type.
* `crawl_unthrottled`: Extent scan allowed to create work queue items again.
dedup
-----
The `dedup` (sic) event group consists of operations that deduplicate data.
* `dedup_bytes`: Total bytes in extent references deduplicated.
* `dedup_copy`: Total bytes copied to eliminate unique data in extents containing a mix of unique and duplicate data.
* `dedup_hit`: Total number of pairs of identical extent references.
* `dedup_miss`: Total number of pairs of non-identical extent references.
* `dedup_ms`: Total time spent running the `FILE_EXTENT_SAME` (aka `FI_DEDUPERANGE` or `dedupe_file_range`) ioctl.
* `dedup_prealloc_bytes`: Total bytes in eliminated `PREALLOC` extent references.
* `dedup_prealloc_hit`: Total number of successfully eliminated `PREALLOC` extent references.
* `dedup_prealloc_miss`: Total number of unsuccessfully eliminated `PREALLOC` extent references (i.e. filesystem data changed between scan and dedupe).
* `dedup_try`: Total number of pairs of extent references submitted for deduplication.
* `dedup_workaround_btrfs_send`: Total number of extent reference pairs submitted for deduplication that were discarded to workaround `btrfs send` bugs.
exception
---------
The `exception` event group consists of C++ exceptions. C++ exceptions are thrown due to IO errors and internal constraint check failures.
* `exception_caught`: Total number of C++ exceptions thrown and caught by a generic exception handler.
* `exception_caught_silent`: Total number of "silent" C++ exceptions thrown and caught by a generic exception handler. These are exceptions which are part of the correct and normal operation of bees. The exceptions are logged at a lower log level.
extent
------
The `extent` event group consists of events that occur within the extent scanner.
* `extent_deferred_inode`: A lock conflict was detected when two worker threads attempted to manipulate the same inode at the same time.
* `extent_empty`: A complete list of references to an extent was created but the list was empty, e.g. because all refs are in deleted inodes or snapshots.
* `extent_fail`: An ioctl call to `LOGICAL_INO` failed.
* `extent_forward`: An extent reference was submitted for scanning.
* `extent_mapped`: A complete map of references to an extent was created and added to the crawl queue.
* `extent_ok`: An ioctl call to `LOGICAL_INO` completed successfully.
* `extent_overflow`: A complete map of references to an extent exceeded `BEES_MAX_EXTENT_REF_COUNT`, so the extent was dropped.
* `extent_ref_missing`: An extent reference reported by `LOGICAL_INO` was not found by later `TREE_SEARCH_V2` calls.
* `extent_ref_ok`: One extent reference was queued for scanning.
* `extent_restart`: An extent reference was requeued to be scanned again after an active extent lock is released.
* `extent_retry`: An extent reference was requeued to be scanned again after an active inode lock is released.
* `extent_skip`: A 4K extent with more than 1000 refs was skipped.
* `extent_zero`: An ioctl call to `LOGICAL_INO` succeeded, but reported an empty list of extents.
hash
----
The `hash` event group consists of operations related to the bees hash table.
* `hash_already`: A `(hash, address)` pair was already present in the hash table during a `BeesHashTable::push_random_hash_addr` operation.
* `hash_bump`: An existing `(hash, address)` pair was moved forward in the hash table by a `BeesHashTable::push_random_hash_addr` operation.
* `hash_collision`: A pair of data blocks was found with identical hashes but different data.
* `hash_erase`: A `(hash, address)` pair in the hash table was removed because a matching data block could not be found in the filesystem (i.e. the hash table entry is out of date).
* `hash_erase_miss`: A `(hash, address)` pair was reported missing from the filesystem but no such entry was found in the hash table (i.e. race between scanning threads or pair already evicted).
* `hash_evict`: A `(hash, address)` pair was evicted from the hash table to accommodate a new hash table entry.
* `hash_extent_in`: A hash table extent was read.
* `hash_extent_out`: A hash table extent was written.
* `hash_front`: A `(hash, address)` pair was pushed to the front of the list because it matched a duplicate block.
* `hash_front_already`: A `(hash, address)` pair was pushed to the front of the list because it matched a duplicate block, but the pair was already at the front of the list so no change occurred.
* `hash_insert`: A `(hash, address)` pair was inserted by `BeesHashTable::push_random_hash_addr`.
* `hash_lookup`: The hash table was searched for `(hash, address)` pairs matching a given `hash`.
open
----
The `open` event group consists of operations related to translating `(root, inode)` tuples into open file descriptors (i.e. `open_by_handle` emulation for btrfs).
* `open_clear`: The open FD cache was cleared to avoid keeping file descriptors open too long.
* `open_fail_enoent`: A file could not be opened because it no longer exists (i.e. it was deleted or renamed during the lookup/resolve operations).
* `open_fail_error`: A file could not be opened for other reasons (e.g. IO error, permission denied, out of resources).
* `open_file`: A file was successfully opened. This counts only the `open()` system call, not other reasons why the opened FD might not be usable.
* `open_hit`: A file was successfully opened and the FD was acceptable.
* `open_ino_ms`: Total time spent executing the `open()` system call.
* `open_lookup_empty`: No paths were found for the inode in the `INO_PATHS` ioctl.
* `open_lookup_enoent`: The `INO_PATHS` ioctl returned ENOENT.
* `open_lookup_error`: The `INO_PATHS` ioctl returned a different error.
* `open_lookup_ok`: The `INO_PATHS` ioctl successfully returned a list of one or more filenames.
* `open_no_path`: All attempts to open a file by `(root, inode)` pair failed.
* `open_no_root`: An attempt to open a file by `(root, inode)` pair failed because the `root` could not be opened.
* `open_root_ms`: Total time spent opening subvol root FDs.
* `open_wrong_dev`: A FD returned by `open()` did not match the device belonging to the filesystem subvol.
* `open_wrong_flags`: A FD returned by `open()` had incompatible flags (`NODATASUM` / `NODATACOW`).
* `open_wrong_ino`: A FD returned by `open()` did not match the expected inode (i.e. the file was renamed or replaced during the lookup/resolve operations).
* `open_wrong_root`: A FD returned by `open()` did not match the expected subvol ID (i.e. `root`).
pairbackward
------------
The `pairbackward` event group consists of events related to extending matching block ranges backward starting from the initial block match found using the hash table.
* `pairbackward_bof_first`: A matching pair of block ranges could not be extended backward because the beginning of the first (src) file was reached.
* `pairbackward_bof_second`: A matching pair of block ranges could not be extended backward because the beginning of the second (dst) file was reached.
* `pairbackward_hit`: A pair of matching block ranges was extended backward by one block.
* `pairbackward_miss`: A pair of matching block ranges could not be extended backward by one block because the pair of blocks before the first block in the range did not contain identical data.
* `pairbackward_ms`: Total time spent extending matching block ranges backward from the first matching block found by hash table lookup.
* `pairbackward_overlap`: A pair of matching block ranges could not be extended backward by one block because this would cause the two block ranges to overlap.
* `pairbackward_same`: A pair of matching block ranges could not be extended backward by one block because this would cause the two block ranges to refer to the same btrfs data extent.
* `pairbackward_stop`: Stopped extending a pair of matching block ranges backward for any of the reasons listed here.
* `pairbackward_toxic_addr`: A pair of matching block ranges was abandoned because the extended range would include a data block with a toxic address.
* `pairbackward_toxic_hash`: A pair of matching block ranges was abandoned because the extended range would include a data block with a toxic hash.
* `pairbackward_try`: Started extending a pair of matching block ranges backward.
* `pairbackward_zero`: A pair of matching block ranges could not be extended backward by one block because the src block contained all zeros and was not compressed.
pairforward
-----------
The `pairforward` event group consists of events related to extending matching block ranges forward starting from the initial block match found using the hash table.
* `pairforward_eof_first`: A matching pair of block ranges could not be extended forward because the end of the first (src) file was reached.
* `pairforward_eof_malign`: A matching pair of block ranges could not be extended forward because the end of the second (dst) file was not aligned to a 4K boundary nor the end of the first (src) file.
* `pairforward_eof_second`: A matching pair of block ranges could not be extended forward because the end of the second (dst) file was reached.
* `pairforward_hit`: A pair of matching block ranges was extended forward by one block.
* `pairforward_hole`: A pair of matching block ranges was extended forward by one block, and the block was a hole in the second (dst) file.
* `pairforward_miss`: A pair of matching block ranges could not be extended forward by one block because the pair of blocks after the last block in the range did not contain identical data.
* `pairforward_ms`: Total time spent extending matching block ranges forward from the first matching block found by hash table lookup.
* `pairforward_overlap`: A pair of matching block ranges could not be extended forward by one block because this would cause the two block ranges to overlap.
* `pairforward_same`: A pair of matching block ranges could not be extended forward by one block because this would cause the two block ranges to refer to the same btrfs data extent.
* `pairforward_stop`: Stopped extending a pair of matching block ranges forward for any of the reasons listed here.
* `pairforward_toxic_addr`: A pair of matching block ranges was abandoned because the extended range would include a data block with a toxic address.
* `pairforward_toxic_hash`: A pair of matching block ranges was abandoned because the extended range would include a data block with a toxic hash.
* `pairforward_try`: Started extending a pair of matching block ranges forward.
* `pairforward_zero`: A pair of matching block ranges could not be extended forward by one block because the src block contained all zeros and was not compressed.
progress
--------
The `progress` event group consists of events related to progress estimation.
* `progress_no_data_bg`: Failed to retrieve any data block groups from the filesystem.
* `progress_not_created`: A crawler for one size tier had not been created for the extent scanner.
* `progress_complete`: A crawler for one size tier has completed a scan.
* `progress_not_found`: The extent position for a crawler does not correspond to any block group.
* `progress_out_of_bg`: The extent position for a crawler does not correspond to any data block group.
* `progress_ok`: Table of progress and ETA created successfully.
readahead
---------
The `readahead` event group consists of events related to data prefetching (formerly calls to `posix_fadvise` or `readahead`, but now emulated in userspace).
* `readahead_bytes`: Number of bytes prefetched.
* `readahead_count`: Number of read calls.
* `readahead_clear`: Number of times the duplicate read cache was cleared.
* `readahead_fail`: Number of read errors during prefetch.
* `readahead_ms`: Total time spent emulating readahead in user-space (kernel readahead is not measured).
* `readahead_skip`: Number of times a duplicate read was identified in the cache and skipped.
* `readahead_unread_ms`: Total time spent running `posix_fadvise(..., POSIX_FADV_DONTNEED)`.
replacedst
----------
The `replacedst` event group consists of events related to replacing a single reference to a dst extent using any suitable src extent (i.e. eliminating a single duplicate extent ref during a crawl).
* `replacedst_dedup_hit`: A duplicate extent reference was identified and removed.
* `replacedst_dedup_miss`: A duplicate extent reference was identified, but src and dst extents did not match (i.e. the filesystem changed in the meantime).
* `replacedst_grown`: A duplicate block was identified, and adjacent blocks were duplicate as well.
* `replacedst_overlaps`: A pair of duplicate block ranges was identified, but the pair was not usable for dedupe because the two ranges overlap.
* `replacedst_same`: A pair of duplicate block ranges was identified, but the pair was not usable for dedupe because the physical block ranges were the same.
* `replacedst_try`: A duplicate block was identified and an attempt was made to remove it (i.e. this is the total number of replacedst calls).
replacesrc
----------
The `replacesrc` event group consists of events related to replacing every reference to a src extent using a temporary copy of the extent's data (i.e. eliminating leftover unique data in a partially duplicate extent during a crawl).
* `replacesrc_dedup_hit`: A duplicate extent reference was identified and removed.
* `replacesrc_dedup_miss`: A duplicate extent reference was identified, but src and dst extents did not match (i.e. the filesystem changed in the meantime).
* `replacesrc_grown`: A duplicate block was identified, and adjacent blocks were duplicate as well.
* `replacesrc_overlaps`: A pair of duplicate block ranges was identified, but the pair was not usable for dedupe because the two ranges overlap.
* `replacesrc_try`: A duplicate block was identified and an attempt was made to remove it (i.e. this is the total number of replacesrc calls).
resolve
-------
The `resolve` event group consists of operations related to translating a btrfs virtual block address (i.e. physical block address) to a `(root, inode, offset)` tuple (i.e. locating and opening the file containing a matching block). `resolve` is the top level, `chase` and `adjust` are the lower two levels.
* `resolve_empty`: The `LOGICAL_INO` ioctl returned successfully with an empty reference list (0 items).
* `resolve_fail`: The `LOGICAL_INO` ioctl returned an error.
* `resolve_large`: The `LOGICAL_INO` ioctl returned more than 2730 results (the limit of the v1 ioctl).
* `resolve_ms`: Total time spent in the `LOGICAL_INO` ioctl (i.e. wallclock time, not kernel CPU time).
* `resolve_ok`: The `LOGICAL_INO` ioctl returned success.
* `resolve_overflow`: The `LOGICAL_INO` ioctl returned 9999 or more extents (the limit configured in `bees.h`).
* `resolve_toxic`: The `LOGICAL_INO` ioctl took more than 0.1 seconds of kernel CPU time.
root
----
The `root` event group consists of operations related to translating a btrfs root ID (i.e. subvol ID) into an open file descriptor by navigating the btrfs root tree.
* `root_clear`: The root FD cache was cleared.
* `root_found`: A root FD was successfully opened.
* `root_notfound`: A root FD could not be opened because all candidate paths could not be opened, or there were no paths available.
* `root_ok`: A root FD was opened and its correctness verified.
* `root_open_fail`: A root FD `open()` attempt returned an error.
* `root_parent_open_fail`: A recursive call to open the parent of a subvol failed.
* `root_parent_open_ok`: A recursive call to open the parent of a subvol succeeded.
* `root_parent_open_try`: A recursive call to open the parent of a subvol was attempted.
* `root_parent_path_empty`: No path could be found to connect a parent root FD to its child.
* `root_parent_path_fail`: The `INO_PATH` ioctl failed to find a name for a child subvol relative to its parent.
* `root_parent_path_open_fail`: The `open()` call in a recursive call to open the parent of a subvol returned an error.
* `root_workaround_btrfs_send`: A subvol was determined to be read-only and disabled to implement the btrfs send workaround.
scan
----
The `scan` event group consists of operations related to scanning incoming data. This is where bees finds duplicate data and populates the hash table.
* `scan_blacklisted`: A blacklisted extent was passed to `scan_forward` and dropped.
* `scan_block`: A block of data was scanned.
* `scan_compressed_no_dedup`: An extent that was compressed contained non-zero, non-duplicate data.
* `scan_dup_block`: Number of duplicate block references deduped.
* `scan_dup_hit`: A pair of duplicate block ranges was found.
* `scan_dup_miss`: A pair of duplicate blocks was found in the hash table but not in the filesystem.
* `scan_extent`: An extent was scanned (`scan_one_extent`).
* `scan_forward`: A logical byte range was scanned (`scan_forward`).
* `scan_found`: An entry was found in the hash table matching a scanned block from the filesystem.
* `scan_hash_hit`: A block was found on the filesystem corresponding to a block found in the hash table.
* `scan_hash_miss`: A block was not found on the filesystem corresponding to a block found in the hash table.
* `scan_hash_preinsert`: A non-zero data block's hash was prepared for possible insertion into the hash table.
* `scan_hash_insert`: A non-zero data block's hash was inserted into the hash table.
* `scan_hole`: A hole extent was found during scan and ignored.
* `scan_interesting`: An extent had flags that were not recognized by bees and was ignored.
* `scan_lookup`: A hash was looked up in the hash table.
* `scan_malign`: A block being scanned matched a hash at EOF in the hash table, but the EOF was not aligned to a block boundary and the two blocks did not have the same length.
* `scan_push_front`: An entry in the hash table matched a duplicate block, so the entry was moved to the head of its LRU list.
* `scan_reinsert`: A copied block's hash and block address was inserted into the hash table.
* `scan_resolve_hit`: A block address in the hash table was successfully resolved to an open FD and offset pair.
* `scan_resolve_zero`: A block address in the hash table was not resolved to any subvol/inode pair, so the corresponding hash table entry was removed.
* `scan_rewrite`: A range of bytes in a file was copied, then the copy deduped over the original data.
* `scan_root_dead`: A deleted subvol was detected.
* `scan_seen_clear`: The list of recently scanned extents reached maximum size and was cleared.
* `scan_seen_erase`: An extent reference was modified by scan, so all future references to the extent must be scanned.
* `scan_seen_hit`: A scan was skipped because the same extent had recently been scanned.
* `scan_seen_insert`: An extent reference was not modified by scan and its hashes have been inserted into the hash table, so all future references to the extent can be ignored.
* `scan_seen_miss`: A scan was not skipped because the same extent had not recently been scanned (i.e. the extent was scanned normally).
* `scan_skip_bytes`: Nuisance dedupe or hole-punching would save less than half of the data in an extent.
* `scan_skip_ops`: Nuisance dedupe or hole-punching would require too many dedupe/copy/hole-punch operations in an extent.
* `scan_toxic_hash`: A scanned block has the same hash as a hash table entry that is marked toxic.
* `scan_toxic_match`: A hash table entry points to a block that is discovered to be toxic.
* `scan_twice`: Two references to the same block have been found in the hash table.
* `scan_zero`: A data block containing only zero bytes was detected.
scanf
-----
The `scanf` event group consists of operations related to `BeesContext::scan_forward`. This is the entry point where `crawl` schedules new data for scanning.
* `scanf_deferred_extent`: Two tasks attempted to scan the same extent at the same time, so one was deferred.
* `scanf_eof`: Scan past EOF was attempted.
* `scanf_extent`: A btrfs extent item was scanned.
* `scanf_extent_ms`: Total thread-seconds spent scanning btrfs extent items.
* `scanf_no_fd`: References to a block from the hash table were found, but a FD could not be opened.
* `scanf_total`: A logical byte range of a file was scanned.
* `scanf_total_ms`: Total thread-seconds spent scanning logical byte ranges.
Note that in current versions of bees, `scan_forward` is passed extents
that correspond exactly to btrfs extent items, so the `scanf_extent` and
`scanf_total` numbers can only be different if the filesystem changes
between crawl time and scan time.
sync
----
The `sync` event group consists of operations related to the `fsync` workarounds in bees.
* `sync_count`: `fsync()` was called on a temporary file.
* `sync_ms`: Total time spent executing `fsync()`.
tmp
---
The `tmp` event group consists of operations related to temporary files and the data within them.
* `tmp_aligned`: A temporary extent was allocated on a block boundary.
* `tmp_block`: Total number of temporary blocks copied.
* `tmp_block_zero`: Total number of temporary hole blocks copied.
* `tmp_bytes`: Total number of temporary bytes copied.
* `tmp_copy`: Total number of extents copied.
* `tmp_copy_ms`: Total time spent copying extents.
* `tmp_create`: Total number of temporary files created.
* `tmp_create_ms`: Total time spent creating temporary files.
* `tmp_hole`: Total number of hole extents created.
* `tmp_realign`: A temporary extent was not aligned to a block boundary.
* `tmp_resize`: A temporary file was resized with `ftruncate()`
* `tmp_resize_ms`: Total time spent in `ftruncate()`
* `tmp_trunc`: The temporary file size limit was exceeded, triggering a new temporary file creation.

View File

@@ -1,230 +0,0 @@
bees Gotchas
============
C++ Exceptions
--------------
bees is very paranoid about the data it gets from btrfs, and if btrfs
does anything bees does not expect, bees will throw an exception and move
on without touching the offending data. This will trigger a stack trace
to the log containing data which is useful for developers to understand
what happened.
In all cases C++ exceptions in bees are harmless to data on the
filesystem. bees handles most exceptions by aborting processing of
the current extent and moving to the next extent. In some cases an
exception may occur in a critical bees thread, which will stop the bees
process from making any further progress; however, these cases are rare
and are typically caused by unusual filesystem conditions (e.g. [freshly
formatted filesystem with no
data](https://github.com/Zygo/bees/issues/93)) or lack of memory or
other resources.
The following are common cases that users may encounter:
* If a snapshot is deleted, bees will generate a burst of exceptions for
references to files in the snapshot that no longer exist. This lasts
until the FD caches are cleared, usually a few minutes with default
btrfs mount options. These generally look like:
`std::system_error: BTRFS_IOC_TREE_SEARCH_V2: [path] at fs.cc:844: No such file or directory`
* If data is modified at the same time it is being scanned, bees will get
an inconsistent version of the data layout in the filesystem, causing
the `ExtentWalker` class to throw various constraint-check exceptions.
The exception causes bees to retry the extent in a later filesystem scan
(hopefully when the file is no longer being modified). The exception
text is similar to:
`std::runtime_error: fm.rbegin()->flags() = 776 failed constraint check (fm.rbegin()->flags() & FIEMAP_EXTENT_LAST) at extentwalker.cc:229`
but the line number or specific code fragment may vary.
* If there are too many possible matching blocks within a pair of extents,
bees will loop billions of times considering all possibilities. This is
a waste of time, so an exception is currently used to break out of such
loops early. The exception text in this case is:
`FIXME: too many duplicate candidates, bailing out here`
Terminating bees with SIGTERM
-----------------------------
bees is designed to survive host crashes, so it is safe to terminate bees
using SIGKILL; however, when bees next starts up, it will repeat some
work that was performed between the last bees crawl state save point
and the SIGKILL (up to 15 minutes), and a large hash table may not be
completely written back to disk, so some duplicate matches will be lost.
If bees is stopped and started less than once per week, then this is not
a problem as the proportional impact is quite small; however, users who
stop and start bees daily or even more often may prefer to have a clean
shutdown with SIGTERM so bees can restart faster.
The shutdown procedure performs these steps:
1. Crawl state is saved to `$BEESHOME`. This is the most
important bees state to save to disk as it directly impacts
restart time, so it is done as early as possible
2. Hash table is written to disk. Normally the hash table is
trickled back to disk at a rate of about 128KiB per second;
however, SIGTERM causes bees to attempt to flush the whole table
immediately. The time spent here depends on the size of RAM, speed
of disks, and aggressiveness of competing filesystem workloads.
It can trigger `vm.dirty_bytes` limits and block other processes
writing to the filesystem for a while.
 3. The bees process calls `_exit`, which terminates all running
 worker threads and closes and deletes all temporary files. This
 can take a while _after_ the bees process exits, especially on
 slow spinning disks.
Balances
--------
A btrfs balance relocates data on disk by making a new copy of the
data, replacing all references to the old data with references to the
new copy, and deleting the old copy. To bees, this is the same as any
other combination of new and deleted data (e.g. from defrag, or ordinary
file operations): some new data has appeared (to be scanned) and some
old data has disappeared (to be removed from the hash table when it is
detected).
As bees scans the newly balanced data, it will get hits on the hash
table pointing to the old data (it's identical data, so it would look
like a duplicate). These old hash table entries will not be valid any
more, so when bees tries to compare new data with old data, it will not
be able to find the old data at the old address, and bees will delete
the hash table entries. If no other duplicates are found, bees will
then insert new hash table entries pointing to the new data locations.
The erase is performed before the insert, so the new data simply replaces
the old and there is (little or) no impact on hash table entry lifetimes
(depending on how overcommitted the hash table is). Each block is
processed one at a time, which can be slow if there are many of them.
Routine btrfs maintenance balances rarely need to relocate more than 0.1%
of the total filesystem data, so the impact on bees is small even after
taking into account the extra work bees has to do.
If the filesystem must undergo a full balance (e.g. because disks were
added or removed, or to change RAID profiles), then every data block on
the filesystem will be relocated to a new address, which invalidates all
the data in the bees hash table at once. In such cases it is a good idea to:
1. Stop bees before the full balance starts,
2. Wipe the `$BEESHOME` directory (or delete and recreate `beeshash.dat`),
3. Restart bees after the full balance is finished.
bees will perform a full filesystem scan automatically after the balance
since all the data has "new" btrfs transids. bees won't waste any time
invalidating stale hash table data after the balance if the hash table
is empty. This can considerably improve the performance of both bees
(since it has no stale hash table entries to invalidate) and btrfs balance
(since it's not competing with bees for iops).
Snapshots
---------
bees can dedupe filesystems with many snapshots, but bees only does
well in this situation if bees was running on the filesystem from
the beginning.
Each time bees dedupes an extent that is referenced by a snapshot,
the entire metadata page in the snapshot subvol (16KB by default) must
be CoWed in btrfs. Since all references must be removed at the same
time, this CoW operation is repeated in every snapshot containing the
duplicate data. This can result in a substantial increase in btrfs
metadata size if there are many snapshots on a filesystem.
Normally, metadata is small (less than 1% of the filesystem) and dedupe
hit rates are large (10-40% of the filesystem), so the increase in
metadata size is offset by much larger reductions in data size and the
total space used by the entire filesystem is reduced.
If a subvol is deduped _before_ a snapshot is created, the snapshot will
have the same deduplication as the subvol. This does _not_ result in
unusually large metadata sizes. If a snapshot is made after bees has
fully scanned the origin subvol, bees can avoid scanning most of the
data in the snapshot subvol, as it will be provably identical to the
origin subvol that was already scanned.
If a subvol is deduped _after_ a snapshot is created, the origin and
snapshot subvols must be deduplicated separately. In the worst case, this
will double the amount of reading the bees scanner must perform, and will
also double the amount of btrfs metadata used for the snapshot; however,
the "worst case" is a dedupe hit rate of 1% or more, so a doubling of
metadata size is certain for all but the most unique data sets. Also,
bees will not be able to free any space until the last snapshot has been
scanned and deduped, so payoff in data space savings is deferred until
the metadata has almost finished expanding.
If a subvol is deduped after _many_ snapshots have been created, all
subvols must be deduplicated individually. In the worst case, this will
multiply the scanning work and metadata size by the number of snapshots.
For 100 snapshots this can mean a 100x growth in metadata size and
bees scanning time, which typically exceeds the possible savings from
reducing the data size by dedupe. In such cases using bees will result
in a net increase in disk space usage that persists until the snapshots
are deleted.
Snapshot case studies
---------------------
* bees running on an empty filesystem
* filesystem is mkfsed
* bees is installed and starts running
* data is written to the filesystem
* bees dedupes the data as it appears
* a snapshot is made of the data
* The snapshot will already be 99% deduped, so the metadata will
not expand very much because only 1% of the data in the snapshot
must be deduped.
* more snapshots are made of the data
* as long as dedupe has been completed on the origin subvol,
bees will quickly scan each new snapshot because it can skip
all the previously scanned data. Metadata usage remains low
(it may even shrink because there are fewer csums).
* bees installed on a non-empty filesystem with snapshots
* filesystem is mkfsed
* data is written to the filesystem
* multiple snapshots are made of the data
* bees is installed and starts running
* bees dedupes each snapshot individually
* The snapshot metadata will no longer be shared, resulting in
substantial growth of metadata usage.
* Disk space savings do not occur until bees processes the
last snapshot reference to data.
Other Gotchas
-------------
* bees avoids the [slow backrefs kernel bug](btrfs-kernel.md) by
measuring the time required to perform `LOGICAL_INO` operations.
If an extent requires over 5.0 kernel CPU seconds to perform a
`LOGICAL_INO` ioctl, then bees blacklists the extent and avoids
referencing it in future operations. In most cases, fewer than 0.1%
of extents in a filesystem must be avoided this way. This results
in short write latency spikes as btrfs will not allow writes to the
filesystem while `LOGICAL_INO` is running. Generally the CPU spends
most of the runtime of the `LOGICAL_INO` ioctl running the kernel,
so on a single-core CPU the entire system can freeze up for a second
during operations on toxic extents. Note this only occurs on older
kernels. See [the slow backrefs kernel bug section](btrfs-kernel.md).
* If a process holds a directory FD open, the subvol containing the
directory cannot be deleted (`btrfs sub del` will start the deletion
process, but it will not proceed past the first open directory FD).
`btrfs-cleaner` will simply skip over the directory *and all of its
children* until the FD is closed. bees avoids this gotcha by closing
all of the FDs in its directory FD cache every btrfs transaction.
* If a file is deleted while bees is caching an open FD to the file,
bees continues to scan the file. For very large files (e.g. VM
images), the deletion of the file can be delayed indefinitely.
To limit this delay, bees closes all FDs in its file FD cache every
btrfs transaction.

View File

@@ -1,102 +0,0 @@
How bees Works
--------------
bees is a daemon designed to run continuously and maintain its state
across crashes and reboots.
bees uses checkpoints for persistence to eliminate the IO overhead of a
transactional data store. On restart, bees will dedupe any data that
was added to the filesystem since the last checkpoint. Checkpoints
occur every 15 minutes for scan progress, stored in `beescrawl.dat`.
The hash table trickle-writes to disk at 128KiB/s to `beeshash.dat`,
but will flush immediately if bees is terminated by SIGTERM.
There are no special requirements for bees hash table storage--`.beeshome`
could be stored on a different btrfs filesystem, ext4, or even CIFS (but
not MS-DOS--beeshome does need filenames longer than 8.3).
bees uses a persistent dedupe hash table with a fixed size configured
by the user. Any size of hash table can be dedicated to dedupe. If a
fast dedupe with low hit rate is desired, bees can use a hash table as
small as 128KB.
The bees hash table is loaded into RAM at startup and `mlock`ed so it
will not be swapped out by the kernel (if swap is permitted, performance
degrades to nearly zero, for both bees and the swap device).
bees scans the filesystem in a single pass which removes duplicate
extents immediately after they are detected. There are no distinct
scanning and dedupe phases, so bees can start recovering free space
immediately after startup.
Once a filesystem scan has been completed, bees uses the `min_transid`
parameter of the `TREE_SEARCH_V2` ioctl to avoid rescanning old data
on future scans and quickly scan new data. An incremental data scan
can complete in less than a millisecond on an idle filesystem.
Once a duplicate data block is identified, bees examines the nearby
blocks in the files where the matched block appears. This allows bees
to find long runs of adjacent duplicate block pairs if it has an entry
for any one of the blocks in its hash table. On typical data sets,
this means most of the blocks in the hash table are redundant and can
be discarded without significant impact on dedupe hit rate.
Hash table entries are grouped together into LRU lists. As each block
is scanned, its hash table entry is inserted into the LRU list at a
random position. If the LRU list is full, the entry at the end of the
list is deleted. If a hash table entry is used to discover duplicate
blocks, the entry is moved to the beginning of the list. This makes bees
unable to detect a small number of duplicates, but it dramatically
improves efficiency on filesystems with many small files.
Once the hash table fills up, old entries are evicted by new entries.
This means that the optimum hash table size is determined by the
distance between duplicate blocks on the filesystem rather than the
filesystem unique data size. Even if the hash table is too small
to find all duplicates, it may still find _most_ of them, especially
during incremental scans where the data in many workloads tends to be
more similar.
When a duplicate block pair is found in two btrfs extents, bees will
attempt to match all other blocks in the newer extent with blocks in
the older extent (i.e. the goal is to keep the extent referenced in the
hash table and remove the most recently scanned extent). If this is
possible, then the new extent will be replaced with a reference to the
old extent. If this is not possible, then bees will create a temporary
copy of the unmatched data in the new extent so that the entire new
extent can be removed by deduplication. This must be done because btrfs
cannot partially overwrite extents--the _entire_ extent must be replaced.
The temporary copy is then scanned during the next pass bees makes over
the filesystem for potential duplication of other extents.
When a block containing all-zero bytes is found, bees dedupes the extent
against a temporary file containing a hole, possibly creating temporary
copies of any non-zero data in the extent for later deduplication as
described above. If the extent is compressed, bees avoids splitting
the extent in the middle as this generally has a negative impact on
compression ratio (and also triggers a [kernel bug](btrfs-kernel.md)).
bees does not store any information about filesystem structure, so
its performance is linear in the number or size of files. The hash
table stores physical block numbers which are converted into paths
and FDs on demand through btrfs `SEARCH_V2` and `LOGICAL_INO` ioctls.
This eliminates the storage required to maintain the equivalents
of these functions in userspace, at the expense of encountering [some
kernel bugs in `LOGICAL_INO` performance](btrfs-kernel.md).
bees uses only the data-safe `FILE_EXTENT_SAME` (aka `FIDEDUPERANGE`)
kernel ioctl to manipulate user data, so it can dedupe live data
(e.g. build servers, sqlite databases, VM disk images). bees does not
modify file attributes or timestamps in deduplicated files.
When bees has scanned all of the data, bees will pause until a new
transaction has completed in the btrfs filesystem. bees tracks
the current btrfs transaction ID over time so that it polls less often
on quiescent filesystems and more often on busy filesystems.
Scanning and deduplication work is performed by worker threads. If the
[`--loadavg-target` option](options.md) is used, bees adjusts the number
of worker threads up or down as required to have a user-specified load
impact on the system. The maximum and minimum number of threads is
configurable. If the system load is too high then bees will stop until
the load falls to acceptable levels.

View File

@@ -1,74 +0,0 @@
BEES
====
Best-Effort Extent-Same, a btrfs deduplication agent.
About bees
----------
bees is a block-oriented userspace deduplication agent designed to scale
up to large btrfs filesystems. It is an offline dedupe combined with
an incremental data scan capability to minimize time data spends on disk
from write to dedupe.
Strengths
---------
* Space-efficient hash table - can use as little as 1 GB hash table per 10 TB unique data (0.1GB/TB)
* Daemon mode - incrementally dedupes new data as it appears
* Largest extents first - recover more free space during fixed maintenance windows
* Works with btrfs compression - dedupe any combination of compressed and uncompressed files
* Whole-filesystem dedupe - scans data only once, even with snapshots and reflinks
* Persistent hash table for rapid restart after shutdown
* Constant hash table size - no increased RAM usage if data set becomes larger
* Works on live data - no scheduled downtime required
* Automatic self-throttling - reduces system load
* btrfs support - recovers more free space from btrfs than naive dedupers
Weaknesses
----------
* Whole-filesystem dedupe - has no include/exclude filters, does not accept file lists
* Requires root privilege (`CAP_SYS_ADMIN` plus the usual filesystem read/modify caps)
* [First run may increase metadata space usage if many snapshots exist](gotchas.md)
* Constant hash table size - no decreased RAM usage if data set becomes smaller
* btrfs only
Installation and Usage
----------------------
* [Installation](install.md)
* [Configuration](config.md)
* [Running](running.md)
* [Command Line Options](options.md)
Recommended Reading
-------------------
* [bees Gotchas](gotchas.md)
* [btrfs kernel bugs](btrfs-kernel.md) - especially DATA CORRUPTION WARNING for old kernels
* [bees vs. other btrfs features](btrfs-other.md)
* [What to do when something goes wrong](wrong.md)
More Information
----------------
* [How bees works](how-it-works.md)
* [Missing bees features](missing.md)
* [Event counter descriptions](event-counters.md)
Bug Reports and Contributions
-----------------------------
Email bug reports and patches to Zygo Blaxell <bees@furryterror.org>.
You can also use Github:
https://github.com/Zygo/bees
Copyright & License
-------------------
Copyright 2015-2025 Zygo Blaxell <bees@furryterror.org>.
GPL (version 3 or later).

View File

@@ -1,91 +0,0 @@
Building bees
=============
Dependencies
------------
* C++11 compiler (tested with GCC 8.1.0, 12.2.0)
Sorry. I really like closures and shared_ptr, so support
for earlier compiler versions is unlikely.
Note that the C++ standard--and GCC's implementation of it--is evolving.
There may be problems when building with newer compiler versions.
Build failure reports welcome!
* btrfs-progs
Needed at runtime by the service wrapper script.
* [Linux kernel version](btrfs-kernel.md) gets its own page.
* markdown to build the documentation
* util-linux version that provides `blkid` command for the helper
script `scripts/beesd` to work
Installation
============
bees can be installed by following one these instructions:
Arch package
------------
bees is available for Arch Linux in the community repository. Install with:
`$ pacman -S bees`
or build a live version from git master using AUR:
`$ git clone https://aur.archlinux.org/bees-git.git && cd bees-git && makepkg -si`
Gentoo package
--------------
bees is officially available in Gentoo Portage. Just emerge a stable
version:
`$ emerge --ask bees`
or build a live version from git master:
`$ emerge --ask =bees-9999`
You can opt-out of building the support tools with
`USE="-tools" emerge ...`
If you want to start hacking on bees and contribute changes, just emerge
the live version which automatically pulls in all required development
packages.
Build from source
-----------------
Build with `make`. The build produces `bin/bees` which must be copied
to somewhere in `$PATH` on the target system.
It will also generate `scripts/beesd@.service` for systemd users. This
service makes use of a helper script `scripts/beesd` to boot the service.
Both of the latter use the filesystem UUID to mount the root subvolume
within a temporary runtime directory.
### Ubuntu 16.04 - 17.04:
`$ apt -y install build-essential btrfs-tools markdown && make`
### Ubuntu 18.10:
`$ apt -y install build-essential btrfs-progs markdown && make`
Packaging
---------
See 'Dependencies' above. Package maintainers can pick ideas for building and
configuring the source package from the Gentoo ebuild:
<https://github.com/gentoo/gentoo/tree/master/sys-fs/bees>
You can configure some build options by creating a file `localconf` and
adjust settings for your distribution environment there.
Please also review the Makefile for additional hints.

View File

@@ -1,42 +0,0 @@
Features You Might Expect That bees Doesn't Have
------------------------------------------------
* There's no configuration file (patches welcome!). There are
some tunables hardcoded in the source (`src/bees.h`) that could eventually
become configuration options. There's also an incomplete option parser
(patches welcome!).
* The bees process doesn't fork and writes its log to stdout/stderr.
A shell wrapper is required to make it behave more like a daemon.
* There's no facility to exclude any part of a filesystem or focus on
specific files (patches welcome).
* PREALLOC extents and extents containing blocks filled with zeros will
be replaced by holes. There is no way to turn this off.
* The fundamental unit of deduplication is the extent _reference_, when
it should be the _extent_ itself. This is an architectural limitation
that results in excess reads of extent data, even in the Extent scan mode.
* Block reads are currently more allocation- and CPU-intensive than they
should be, especially for filesystems on SSD where the IO overhead is
much smaller. This is a problem for CPU-power-constrained environments
(e.g. laptops running from battery, or ARM devices with slow CPU).
* bees can currently fragment extents when required to remove duplicate
blocks, but has no defragmentation capability yet. When possible, bees
will attempt to work with existing extent boundaries and choose the
largest fragments available, but it will not aggregate blocks together
from multiple extents to create larger ones.
* When bees fragments an extent, the copied data is compressed. There
is currently no way (other than by modifying the source) to select a
compression method or not compress the data (patches welcome!).
* It is theoretically possible to resize the hash table without starting
over with a new full-filesystem scan; however, this feature has not been
implemented yet.
* btrfs maintains csums of data blocks which bees could use to improve
scan speeds, but bees doesn't use them yet.

View File

@@ -1,124 +0,0 @@
# bees Command Line Options
## Load management options
* `--thread-count COUNT` or `-c`
Specify maximum number of worker threads. Overrides `--thread-factor`
(`-C`), default/autodetected values, and the hardcoded thread limit.
* `--thread-factor FACTOR` or `-C`
Specify ratio of worker threads to detected CPU cores. Overridden by
`--thread-count` (`-c`).
Default is 1.0, i.e. 1 worker thread per detected CPU. Use values
below 1.0 to leave some cores idle, or above 1.0 if there are more
disks than CPUs in the filesystem.
* `--loadavg-target LOADAVG` or `-g`
Specify load average target for dynamic worker threads. Default is
to run the maximum number of worker threads all the time.
Worker threads will be started or stopped subject to the upper limit
imposed by `--thread-factor`, `--thread-min` and `--thread-count`
until the load average is within +/- 0.5 of `LOADAVG`.
* `--thread-min COUNT` or `-G`
Specify minimum number of dynamic worker threads. This can be used
to force a minimum number of threads to continue running while using
`--loadavg-target` to manage load.
Default is 0, i.e. all bees worker threads will stop when the system
load exceeds the target.
Has no effect unless `--loadavg-target` is used to specify a target load.
* `--throttle-factor FACTOR`
In order to avoid saturating btrfs deferred work queues, bees tracks
the time that operations with delayed effect (dedupe and tmpfile copy)
and operations with long run times (`LOGICAL_INO`) run. If an operation
finishes before the average run time for that operation, bees will
sleep for the remainder of the average run time, so that operations
are submitted to btrfs at a rate similar to the rate that btrfs can
complete them.
The `FACTOR` is multiplied by the average run time for each operation
to calculate the target delay time.
`FACTOR` 0 is the default, which adds no delays. bees will attempt
to saturate btrfs delayed work queues as quickly as possible, which
may impact other processes on the same filesystem, or even slow down
bees itself.
`FACTOR` 1.0 will attempt to keep btrfs delayed work queues filled at
a steady average rate.
`FACTOR` more than 1.0 will add delays longer than the average
run time (e.g. 10.0 will delay all operations that take less than 10x
the average run time). High values of `FACTOR` may be desirable when
using bees with other applications on the same filesystem.
The maximum delay per operation is 60 seconds.
## Filesystem tree traversal options
* `--scan-mode MODE` or `-m`
Specify extent scanning algorithm.
**EXPERIMENTAL** feature that may go away.
* Mode 0: lockstep
* Mode 1: independent
* Mode 2: sequential
* Mode 3: recent
* Mode 4: extent
For details of the different scanning modes and the default value of
this option, see [bees configuration](config.md).
## Workarounds
* `--workaround-btrfs-send` or `-a`
_This option is obsolete and should not be used any more._
Pretend that read-only snapshots are empty and silently discard any
request to dedupe files referenced through them. This is a workaround
for [problems with old kernels running `btrfs send` and `btrfs send
-p`](btrfs-kernel.md) which make these btrfs features unusable with bees.
This option was used to avoid breaking `btrfs send` on old kernels.
The affected kernels are now too old to be recommended for use with bees.
bees now waits for `btrfs send` to finish. There is no need for an
option to enable this.
**Note:** There is a _significant_ space tradeoff when using this option:
it is likely no space will be recovered--and possibly significant extra
space used--until the read-only snapshots are deleted.
## Logging options
* `--timestamps` or `-t`
Enable timestamps in log output.
* `--no-timestamps` or `-T`
Disable timestamps in log output.
* `--absolute-paths` or `-p`
Paths in log output will be absolute.
* `--strip-paths` or `-P`
Paths in log output will have the working directory at bees startup stripped.
* `--verbose` or `-v`
Set log verbosity (0 = no output, 8 = all output, default 8).

View File

@@ -1,91 +0,0 @@
Running bees
============
Setup
-----
If you don't want to use the helper script `scripts/beesd` to setup and
configure bees, here's how you manually setup bees.
Create a directory for bees state files:
export BEESHOME=/some/path
mkdir -p "$BEESHOME"
Create an empty hash table ([your choice of size](config.md), but it
must be a multiple of 128KB). This example creates a 1GB hash table:
truncate -s 1g "$BEESHOME/beeshash.dat"
chmod 700 "$BEESHOME/beeshash.dat"
bees can _only_ process the root subvol of a btrfs with nothing mounted
over top. If the bees argument is not the root subvol directory, bees
will just throw an exception and stop.
Use a separate mount point, and let only bees access it:
UUID=3399e413-695a-4b0b-9384-1b0ef8f6c4cd
mkdir -p /var/lib/bees/$UUID
mount /dev/disk/by-uuid/$UUID /var/lib/bees/$UUID -osubvol=/
If you don't set BEESHOME, the path "`.beeshome`" will be used relative
to the root subvol of the filesystem. For example:
btrfs sub create /var/lib/bees/$UUID/.beeshome
truncate -s 1g /var/lib/bees/$UUID/.beeshome/beeshash.dat
chmod 700 /var/lib/bees/$UUID/.beeshome/beeshash.dat
You can use any relative path in `BEESHOME`. The path will be taken
relative to the root of the deduped filesystem (in other words it can
be the name of a subvol):
export BEESHOME=@my-beeshome
btrfs sub create /var/lib/bees/$UUID/$BEESHOME
truncate -s 1g /var/lib/bees/$UUID/$BEESHOME/beeshash.dat
chmod 700 /var/lib/bees/$UUID/$BEESHOME/beeshash.dat
Configuration
-------------
There are some runtime configurable options using environment variables:
* BEESHOME: Directory containing bees state files:
* beeshash.dat | persistent hash table. Must be a multiple of 128KB, and must be created before bees starts.
* beescrawl.dat | state of SEARCH_V2 crawlers. ASCII text. bees will create this.
* beesstats.txt | statistics and performance counters. ASCII text. bees will create this.
* BEESSTATUS: File containing a snapshot of current bees state: performance
counters and current status of each thread. The file is meant to be
human readable, but understanding it probably requires reading the source.
You can watch bees run in realtime with a command like:
watch -n1 cat $BEESSTATUS
Other options (e.g. interval between filesystem crawls) can be configured
in `src/bees.h` or [on the command line](options.md).
Running
-------
Reduce CPU and IO priority to be kinder to other applications sharing
this host (or raise them for more aggressive disk space recovery). If you
use cgroups, put `bees` in its own cgroup, then reduce the `blkio.weight`
and `cpu.shares` parameters. You can also use `schedtool` and `ionice`
in the shell script that launches `bees`:
schedtool -D -n20 $$
ionice -c3 -p $$
You can also use the [load management options](options.md) to further
control the impact of bees on the rest of the system.
Let the bees fly:
for fs in /var/lib/bees/*-*-*-*-*/; do
bees "$fs" >> "$fs/.beeshome/bees.log" 2>&1 &
done
You'll probably want to arrange for `/var/log/bees.log` to be rotated
periodically. You may also want to set umask to 077 to prevent disclosure
of information about the contents of the filesystem through the log file.
There are also some shell wrappers in the `scripts/` directory.

View File

@@ -1,167 +0,0 @@
What to do when something goes wrong with bees
==============================================
Hangs and excessive slowness
----------------------------
### Use load-throttling options
If bees is just more aggressive than you would like, consider using
[load throttling options](options.md). These are usually more effective
than `ionice`, `schedtool`, and the `blkio` cgroup (though you can
certainly use those too) because they limit work that bees queues up
for later execution inside btrfs.
### Check `$BEESSTATUS`
If bees or the filesystem seems to be stuck, check the contents of
`$BEESSTATUS`. bees describes what it is doing (and how long it has
been trying to do it) through this file.
Sample:
<pre>
THREADS (work queue 68 tasks):
tid 20939: crawl_5986: dedup BeesRangePair: 512K src[0x9933f000..0x993bf000] dst[0x9933f000..0x993bf000]
src = 147 /run/bees/ede84fbd-cb59-0c60-9ea7-376fa4984887/data/home/builder/linux/.git/objects/pack/pack-09f06f8759ac7fd163df320b7f7671f06ac2a747.pack
dst = 15 /run/bees/ede84fbd-cb59-0c60-9ea7-376fa4984887/data.new/home/builder/linux/.git/objects/pack/pack-09f06f8759ac7fd163df320b7f7671f06ac2a747.pack
tid 20940: crawl_5986: dedup BeesRangePair: 512K src[0x992bf000..0x9933f000] dst[0x992bf000..0x9933f000]
src = 147 /run/bees/ede84fbd-cb59-0c60-9ea7-376fa4984887/data/home/builder/linux/.git/objects/pack/pack-09f06f8759ac7fd163df320b7f7671f06ac2a747.pack
dst = 15 /run/bees/ede84fbd-cb59-0c60-9ea7-376fa4984887/data.new/home/builder/linux/.git/objects/pack/pack-09f06f8759ac7fd163df320b7f7671f06ac2a747.pack
tid 21177: crawl_5986: dedup BeesRangePair: 512K src[0x9923f000..0x992bf000] dst[0x9923f000..0x992bf000]
src = 147 /run/bees/ede84fbd-cb59-0c60-9ea7-376fa4984887/data/home/builder/linux/.git/objects/pack/pack-09f06f8759ac7fd163df320b7f7671f06ac2a747.pack
dst = 15 /run/bees/ede84fbd-cb59-0c60-9ea7-376fa4984887/data.new/home/builder/linux/.git/objects/pack/pack-09f06f8759ac7fd163df320b7f7671f06ac2a747.pack
tid 21677: bees: [68493.1s] main
tid 21689: crawl_transid: [236.508s] waiting 332.575s for next 10 transid RateEstimator { count = 87179, raw = 969.066 / 32229.2, ratio = 969.066 / 32465.7, rate = 0.0298489, duration(1) = 33.5021, seconds_for(1) = 1 }
tid 21690: status: writing status to file '/run/bees.status'
tid 21691: crawl_writeback: [203.456s] idle, dirty
tid 21692: hash_writeback: [12.466s] flush rate limited after extent #17 of 64 extents
tid 21693: hash_prefetch: [2896.61s] idle 3600s
</pre>
The time in square brackets indicates how long the thread has been
executing the current task (if this time is below 5 seconds then it
is omitted). We can see here that the main thread (and therefore the
bees process as a whole) has been running for 68493.1 seconds, the
last hash table write was 12.5 seconds ago, and the last transid poll
was 236.5 seconds ago. Three worker threads are currently performing
dedupe on extents.
Thread names of note:
* `bees`: main thread (doesn't do anything after startup, but its task execution time is that of the whole bees process)
* `crawl_master`: task that finds new extents in the filesystem and populates the work queue
* `crawl_transid`: btrfs transid (generation number) tracker and polling thread
* `status`: the thread that writes the status reports to `$BEESSTATUS`
* `crawl_writeback`: writes the scanner progress to `beescrawl.dat`
* `hash_writeback`: trickle-writes the hash table back to `beeshash.dat`
* `hash_prefetch`: prefetches the hash table at startup and updates `beesstats.txt` hourly
Most other threads have names that are derived from the current dedupe
task that they are executing:
* `ref_205ad76b1000_24K_50`: extent scan performing dedupe of btrfs extent bytenr `205ad76b1000`, which is 24 KiB long and has 50 references
* `extent_250_32M_16E`: extent scan searching for extents between 32 MiB + 1 and 16 EiB bytes long, tracking scan position in virtual subvol `250`.
* `crawl_378_18916`: subvol scan searching for extent refs in subvol `378`, inode `18916`.
### Dump kernel stacks of hung processes
Check the kernel stacks of all blocked kernel processes:
ps xar | while read -r x y; do ps "$x"; head -50 --verbose /proc/"$x"/task/*/stack; done | tee lockup-stacks.txt
Submit the above information in your bug report.
### Check dmesg for btrfs stack dumps
Sometimes these are relevant too.
bees Crashes
------------
* If you have a core dump, run these commands in gdb and include
the output in your report (you may need to post it as a compressed
attachment, as it can be quite large):
(gdb) set pagination off
(gdb) info shared
(gdb) bt
(gdb) thread apply all bt
(gdb) thread apply all bt full
The last line generates megabytes of output and will often crash gdb.
Submit whatever output gdb can produce.
**Note that this output may include filenames or data from your
filesystem.**
* If you have `systemd-coredump` installed, you can use `coredumpctl`:
(echo set pagination off;
echo info shared;
echo bt;
echo thread apply all bt;
echo thread apply all bt full) | coredumpctl gdb bees
* If the crash happens often (or you don't want to use coredumpctl),
you can automate the gdb data collection with this wrapper script:
<pre>
#!/bin/sh
set -x
# Move aside old core files for analysis
for x in core*; do
if [ -e "$x" ]; then
mv -vf "$x" "old-$x.$(date +%Y-%m-%d-%H-%M-%S)"
fi
done
# Delete old core files after a week
find old-core* -type f -mtime +7 -exec rm -vf {} + &
# Turn on the cores (FIXME: may need to change other system parameters
# that capture or redirect core files)
ulimit -c unlimited
# Run the command
"$@"
rv="$?"
# Don't clobber our core when gdb crashes
ulimit -c 0
# If there were core files, generate reports for them
for x in core*; do
if [ -e "$x" ]; then
gdb --core="$x" \
--eval-command='set pagination off' \
--eval-command='info shared' \
--eval-command='bt' \
--eval-command='thread apply all bt' \
--eval-command='thread apply all bt full' \
--eval-command='quit' \
--args "$@" 2>&1 | tee -a "$x.txt"
fi
done
# Return process exit status to caller
exit "$rv"
</pre>
To use the wrapper script, insert it just before the `bees` command,
as in:
gdb-wrapper bees /path/to/fs/
Kernel crashes, corruption, and filesystem damage
-------------------------------------------------
bees doesn't do anything that _should_ cause corruption or data loss;
however, [btrfs has kernel bugs](btrfs-kernel.md), so corruption is
not impossible.
Issues with the btrfs filesystem kernel code or other block device layers
should be reported to their respective maintainers.

13
include/crucible/bool.h Normal file
View File

@@ -0,0 +1,13 @@
#ifndef CRUCIBLE_BOOL_H
#define CRUCIBLE_BOOL_H
namespace crucible {
// Drop-in replacement for a bool member that always starts with a known
// value (false unless another default is given), so flags are never
// accidentally left uninitialized by an enclosing class.
struct DefaultBool {
// The wrapped flag; public so the struct behaves like a plain bool member.
bool m_b;
// Implicit conversion from bool; value-initializes to false when omitted.
DefaultBool(bool init = false) : m_b(init) {}
// Implicit conversion to bool so instances read like ordinary flags.
operator bool() const { return m_b; }
// Assignment from bool. Note it returns bool& (the stored value),
// not DefaultBool&, so chained assignment yields the raw bool.
bool &operator=(const bool &that) { return m_b = that; }
};
}
#endif // CRUCIBLE_BOOL_H

View File

@@ -1,216 +0,0 @@
#ifndef CRUCIBLE_BTRFS_TREE_H
#define CRUCIBLE_BTRFS_TREE_H
#include "crucible/fd.h"
#include "crucible/fs.h"
#include "crucible/bytevector.h"
namespace crucible {
using namespace std;
/// One item returned from a btrfs tree search: the (objectid, type,
/// offset) key, the transid that produced it, and the raw item payload
/// in m_data. The typed accessors below decode the payload; their
/// definitions (and the "wrong item type throws" behavior) live out of
/// line.
class BtrfsTreeItem {
uint64_t m_objectid = 0;
uint64_t m_offset = 0;
uint64_t m_transid = 0;
ByteVector m_data;
uint8_t m_type = 0;
public:
uint64_t objectid() const { return m_objectid; }
uint64_t offset() const { return m_offset; }
uint64_t transid() const { return m_transid; }
uint8_t type() const { return m_type; }
const ByteVector data() const { return m_data; }
BtrfsTreeItem() = default;
// Construct/assign directly from a search ioctl result header.
BtrfsTreeItem(const BtrfsIoctlSearchHeader &bish);
BtrfsTreeItem& operator=(const BtrfsIoctlSearchHeader &bish);
// True when the item is empty/null (e.g. a failed lookup).
bool operator!() const;
/// Member access methods. Invoking a method on the
/// wrong type of item will throw an exception.
/// @{ Block group items
uint64_t block_group_flags() const;
uint64_t block_group_used() const;
/// @}
/// @{ Chunk items
uint64_t chunk_length() const;
uint64_t chunk_type() const;
/// @}
/// @{ Dev extent items (physical byte ranges)
uint64_t dev_extent_chunk_offset() const;
uint64_t dev_extent_length() const;
/// @}
/// @{ Dev items (devices)
uint64_t dev_item_total_bytes() const;
uint64_t dev_item_bytes_used() const;
/// @}
/// @{ Inode items
uint64_t inode_size() const;
/// @}
/// @{ Extent refs (EXTENT_DATA)
uint64_t file_extent_logical_bytes() const;
uint64_t file_extent_generation() const;
uint64_t file_extent_offset() const;
uint64_t file_extent_bytenr() const;
uint8_t file_extent_type() const;
btrfs_compression_type file_extent_compression() const;
/// @}
/// @{ Extent items (EXTENT_ITEM)
uint64_t extent_begin() const;
uint64_t extent_end() const;
uint64_t extent_flags() const;
uint64_t extent_generation() const;
/// @}
/// @{ Root items
uint64_t root_flags() const;
uint64_t root_refs() const;
/// @}
/// @{ Root backref items.
uint64_t root_ref_dirid() const;
string root_ref_name() const;
uint64_t root_ref_parent_rootid() const;
/// @}
};
// Debug/log formatting for tree items.
ostream &operator<<(ostream &os, const BtrfsTreeItem &bti);
/// Base class for searching one btrfs tree through the search-key
/// ioctl. Subclasses map a 64-bit "logical" position onto search keys
/// via the fill_sk/next_sk/hdr_* hooks; this class then offers
/// positioned lookups (prev/next/at/lower_bound/rlower_bound) on top.
class BtrfsTreeFetcher {
protected:
Fd m_fd;
BtrfsIoctlSearchKey m_sk;
uint64_t m_tree = 0;
uint64_t m_min_transid = 0;
uint64_t m_max_transid = numeric_limits<uint64_t>::max();
uint64_t m_block_size = 0;
uint64_t m_lookbehind_size = 0;
uint64_t m_scale_size = 0;
uint8_t m_type = 0;
// Presumably convert between byte positions and scaled units (see
// scale_size()) -- bodies are not visible here, confirm in the .cc.
uint64_t scale_logical(uint64_t logical) const;
uint64_t unscale_logical(uint64_t logical) const;
const static uint64_t s_max_logical = numeric_limits<uint64_t>::max();
uint64_t scaled_max_logical() const;
// Hooks for subclasses: seed the search key for a position, advance
// it past a returned header, and extract/classify returned headers.
virtual void fill_sk(BtrfsIoctlSearchKey &key, uint64_t object);
virtual void next_sk(BtrfsIoctlSearchKey &key, const BtrfsIoctlSearchHeader &hdr);
virtual uint64_t hdr_logical(const BtrfsIoctlSearchHeader &hdr) = 0;
virtual bool hdr_match(const BtrfsIoctlSearchHeader &hdr) = 0;
virtual bool hdr_stop(const BtrfsIoctlSearchHeader &hdr) = 0;
Fd fd() const;
void fd(Fd fd);
public:
virtual ~BtrfsTreeFetcher() = default;
BtrfsTreeFetcher(Fd new_fd);
// Item type filter (BTRFS_*_KEY) and tree to search; getter/setter pairs.
void type(uint8_t type);
uint8_t type();
void tree(uint64_t tree);
uint64_t tree();
// Restrict results to items last modified in [min_transid, max_transid].
void transid(uint64_t min_transid, uint64_t max_transid = numeric_limits<uint64_t>::max());
/// Block size (sectorsize) of filesystem
uint64_t block_size() const;
/// Fetch last object < logical, null if not found
BtrfsTreeItem prev(uint64_t logical);
/// Fetch first object > logical, null if not found
BtrfsTreeItem next(uint64_t logical);
/// Fetch object at exactly logical, null if not found
BtrfsTreeItem at(uint64_t);
/// Fetch first object >= logical
BtrfsTreeItem lower_bound(uint64_t logical);
/// Fetch last object <= logical
BtrfsTreeItem rlower_bound(uint64_t logical);
/// Estimated distance between objects
virtual uint64_t lookbehind_size() const;
virtual void lookbehind_size(uint64_t);
/// Scale size (normally block size but must be set to 1 for fs trees)
uint64_t scale_size() const;
void scale_size(uint64_t);
};
/// Fetcher variant keyed by item objectid (per its name); supplies only
/// the hook overrides, inheriting the lookup API unchanged.
class BtrfsTreeObjectFetcher : public BtrfsTreeFetcher {
protected:
virtual void fill_sk(BtrfsIoctlSearchKey &key, uint64_t logical) override;
virtual uint64_t hdr_logical(const BtrfsIoctlSearchHeader &hdr) override;
virtual bool hdr_match(const BtrfsIoctlSearchHeader &hdr) override;
virtual bool hdr_stop(const BtrfsIoctlSearchHeader &hdr) override;
public:
using BtrfsTreeFetcher::BtrfsTreeFetcher;
};
/// Fetcher variant that fixes one objectid (set via objectid()) and
/// advances through items by key offset.
class BtrfsTreeOffsetFetcher : public BtrfsTreeFetcher {
protected:
uint64_t m_objectid = 0;
virtual void fill_sk(BtrfsIoctlSearchKey &key, uint64_t offset) override;
virtual uint64_t hdr_logical(const BtrfsIoctlSearchHeader &hdr) override;
virtual bool hdr_match(const BtrfsIoctlSearchHeader &hdr) override;
virtual bool hdr_stop(const BtrfsIoctlSearchHeader &hdr) override;
public:
using BtrfsTreeFetcher::BtrfsTreeFetcher;
void objectid(uint64_t objectid);
uint64_t objectid() const;
};
/// Reads the csum tree: exposes the filesystem's checksum type and
/// per-block checksum size, and streams raw checksums for a logical
/// byte range through the get_sums() callback.
class BtrfsCsumTreeFetcher : public BtrfsTreeOffsetFetcher {
public:
// Sentinel outside the 16-bit csum-type value range, meaning
// "type not determined yet".
const uint32_t BTRFS_CSUM_TYPE_UNKNOWN = uint32_t(1) << 16;
private:
size_t m_sum_size = 0;
uint32_t m_sum_type = BTRFS_CSUM_TYPE_UNKNOWN;
public:
BtrfsCsumTreeFetcher(const Fd &fd);
uint32_t sum_type() const;
size_t sum_size() const;
void get_sums(uint64_t logical, size_t count, function<void(uint64_t logical, const uint8_t *buf, size_t count)> output);
};
/// Fetch extent items from extent tree.
/// Does not filter out metadata! See BtrfsDataExtentTreeFetcher for that.
class BtrfsExtentItemFetcher : public BtrfsTreeObjectFetcher {
public:
BtrfsExtentItemFetcher(const Fd &fd);
};
/// Fetch extent refs from an inode. Caller must set the tree and objectid.
class BtrfsExtentDataFetcher : public BtrfsTreeOffsetFetcher {
public:
BtrfsExtentDataFetcher(const Fd &fd);
};
/// Fetch raw inode items
class BtrfsInodeFetcher : public BtrfsTreeObjectFetcher {
public:
BtrfsInodeFetcher(const Fd &fd);
BtrfsTreeItem stat(uint64_t subvol, uint64_t inode);
};
/// Fetch a root (subvol) item
class BtrfsRootFetcher : public BtrfsTreeObjectFetcher {
public:
BtrfsRootFetcher(const Fd &fd);
BtrfsTreeItem root(uint64_t subvol);
BtrfsTreeItem root_backref(uint64_t subvol);
};
/// Fetch data extent items from extent tree, skipping metadata-only block groups
class BtrfsDataExtentTreeFetcher : public BtrfsExtentItemFetcher {
// Cached block group item and a chunk-tree fetcher, presumably used
// by next_sk() to skip over metadata block groups -- confirm in .cc.
BtrfsTreeItem m_current_bg;
BtrfsTreeOffsetFetcher m_chunk_tree;
protected:
virtual void next_sk(BtrfsIoctlSearchKey &key, const BtrfsIoctlSearchHeader &hdr) override;
public:
BtrfsDataExtentTreeFetcher(const Fd &fd);
};
}
#endif

View File

@@ -13,22 +13,18 @@
// __u64 typedef and friends
#include <linux/types.h>
// the btrfs headers
#include <linux/btrfs.h>
#include <linux/btrfs_tree.h>
// try Linux headers first
#include <btrfs/ioctl.h>
// And now all the things that have been missing in some version of
// the headers.
// Supply any missing definitions
#define mutex not_mutex
#include <btrfs/ctree.h>
// Repair the damage
#undef min
#undef max
#undef mutex
enum btrfs_compression_type {
BTRFS_COMPRESS_NONE,
BTRFS_COMPRESS_ZLIB,
BTRFS_COMPRESS_LZO,
BTRFS_COMPRESS_ZSTD,
};
// BTRFS_CSUM_ITEM_KEY is not defined in include/uapi
#ifndef BTRFS_CSUM_ITEM_KEY
#ifndef BTRFS_FIRST_FREE_OBJECTID
#define BTRFS_ROOT_TREE_OBJECTID 1ULL
#define BTRFS_EXTENT_TREE_OBJECTID 2ULL
@@ -78,6 +74,9 @@ enum btrfs_compression_type {
#define BTRFS_SHARED_BLOCK_REF_KEY 182
#define BTRFS_SHARED_DATA_REF_KEY 184
#define BTRFS_BLOCK_GROUP_ITEM_KEY 192
#define BTRFS_FREE_SPACE_INFO_KEY 198
#define BTRFS_FREE_SPACE_EXTENT_KEY 199
#define BTRFS_FREE_SPACE_BITMAP_KEY 200
#define BTRFS_DEV_EXTENT_KEY 204
#define BTRFS_DEV_ITEM_KEY 216
#define BTRFS_CHUNK_ITEM_KEY 228
@@ -94,18 +93,6 @@ enum btrfs_compression_type {
#endif
#ifndef BTRFS_FREE_SPACE_INFO_KEY
#define BTRFS_FREE_SPACE_INFO_KEY 198
#define BTRFS_FREE_SPACE_EXTENT_KEY 199
#define BTRFS_FREE_SPACE_BITMAP_KEY 200
#define BTRFS_FREE_SPACE_OBJECTID -11ULL
#endif
#ifndef BTRFS_BLOCK_GROUP_RAID1C4
#define BTRFS_BLOCK_GROUP_RAID1C3 (1ULL << 9)
#define BTRFS_BLOCK_GROUP_RAID1C4 (1ULL << 10)
#endif
#ifndef BTRFS_DEFRAG_RANGE_START_IO
// For some reason uapi has BTRFS_DEFRAG_RANGE_COMPRESS and
@@ -143,7 +130,7 @@ enum btrfs_compression_type {
};
#endif
#ifndef BTRFS_IOC_CLONE_RANGE
struct btrfs_ioctl_clone_range_args {
@@ -171,7 +158,7 @@ enum btrfs_compression_type {
__u64 bytes_deduped; /* out - total # of bytes we were able
* to dedupe from this file */
/* status of this dedupe operation:
* 0 if dedupe succeeds
* 0 if dedup succeeds
* < 0 for error
* == BTRFS_SAME_DATA_DIFFERS if data differs
*/
@@ -215,51 +202,4 @@ enum btrfs_compression_type {
struct btrfs_ioctl_search_args_v2)
#endif
#ifndef BTRFS_IOC_LOGICAL_INO_V2
#define BTRFS_IOC_LOGICAL_INO_V2 _IOWR(BTRFS_IOCTL_MAGIC, 59, struct btrfs_ioctl_logical_ino_args)
#define BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET (1ULL << 0)
#endif
#ifndef BTRFS_FS_INFO_FLAG_CSUM_INFO
/* Request information about checksum type and size */
#define BTRFS_FS_INFO_FLAG_CSUM_INFO (1 << 0)
#endif
#ifndef BTRFS_FS_INFO_FLAG_GENERATION
/* Request information about filesystem generation */
#define BTRFS_FS_INFO_FLAG_GENERATION (1 << 1)
#endif
#ifndef BTRFS_FS_INFO_FLAG_METADATA_UUID
/* Request information about filesystem metadata UUID */
#define BTRFS_FS_INFO_FLAG_METADATA_UUID (1 << 2)
#endif
// BTRFS_CSUM_TYPE_CRC32 was a #define from 2008 to 2019.
// After that, it's an enum with the other 3 types.
// So if we do _not_ have CRC32 defined, it means we have the other 3;
// if we _do_ have CRC32 defined, it means we need the other 3.
// This seems likely to break some day.
#ifdef BTRFS_CSUM_TYPE_CRC32
#define BTRFS_CSUM_TYPE_XXHASH 1
#define BTRFS_CSUM_TYPE_SHA256 2
#define BTRFS_CSUM_TYPE_BLAKE2 3
#endif
struct btrfs_ioctl_fs_info_args_v3 {
__u64 max_id; /* out */
__u64 num_devices; /* out */
__u8 fsid[BTRFS_FSID_SIZE]; /* out */
__u32 nodesize; /* out */
__u32 sectorsize; /* out */
__u32 clone_alignment; /* out */
/* See BTRFS_FS_INFO_FLAG_* */
__u16 csum_type; /* out */
__u16 csum_size; /* out */
__u64 flags; /* in/out */
__u64 generation; /* out */
__u8 metadata_uuid[BTRFS_FSID_SIZE]; /* out */
__u8 reserved[944]; /* pad to 1k */
};
#endif // CRUCIBLE_BTRFS_H

View File

@@ -1,80 +0,0 @@
#ifndef _CRUCIBLE_BYTEVECTOR_H_
#define _CRUCIBLE_BYTEVECTOR_H_
#include <crucible/error.h>
#include <memory>
#include <mutex>
#include <ostream>
#include <cstdint>
#include <cstdlib>
namespace crucible {
using namespace std;
// new[] is a little slower than malloc
// shared_ptr is about 2x slower than unique_ptr
// vector<uint8_t> is ~160x slower
// so we won't bother with unique_ptr because we can't do shared copies with it
// Byte buffer whose storage is held through shared_ptr<uint8_t>
// (Pointer), so copies of a ByteVector presumably alias the same
// underlying bytes instead of duplicating them -- confirm against the
// out-of-line copy constructor in the implementation file.
class ByteVector {
public:
using Pointer = shared_ptr<uint8_t>;
using value_type = Pointer::element_type;
using iterator = value_type*;
ByteVector() = default;
ByteVector(const ByteVector &that);
ByteVector& operator=(const ByteVector &that);
// Allocate a buffer of the given size.
ByteVector(size_t size);
// Sub-range of another ByteVector (shared view or copy -- see the
// implementation for which).
ByteVector(const ByteVector &that, size_t start, size_t length);
ByteVector(iterator begin, iterator end, size_t min_size = 0);
// Sub-range as a new ByteVector; bounds-checked variant of operator[].
ByteVector at(size_t start, size_t length) const;
value_type& at(size_t) const;
iterator begin() const;
void clear();
value_type* data() const;
bool empty() const;
iterator end() const;
value_type& operator[](size_t) const;
size_t size() const;
bool operator==(const ByteVector &that) const;
// this version of erase only works at the beginning or end of the buffer, else throws exception
void erase(iterator first);
void erase(iterator first, iterator last);
// An important use case is ioctls that have a fixed-size header struct
// followed by a buffer for further arguments. These templates avoid
// doing reinterpret_casts every time.
template <class T> ByteVector(const T& object, size_t min_size);
template <class T> T* get() const;
private:
Pointer m_ptr;
size_t m_size = 0;
mutable mutex m_mutex;
};
// Copy an object into a freshly malloc'd buffer of
// max(min_size, sizeof(T)) bytes, with free() as the shared_ptr
// deleter. Only sizeof(T) bytes are copied, so when
// min_size > sizeof(T) the tail of the buffer is left uninitialized
// (malloc, not calloc).
// NOTE(review): memcpy needs <cstring> and max needs <algorithm>;
// neither is included directly here -- presumably pulled in via
// crucible/error.h, but worth confirming.
template <class T>
ByteVector::ByteVector(const T& object, size_t min_size)
{
const auto size = max(min_size, sizeof(T));
m_ptr = Pointer(static_cast<value_type*>(malloc(size)), free);
memcpy(m_ptr.get(), &object, sizeof(T));
m_size = size;
}
// Reinterpret the buffer as T*, throwing out_of_range if the buffer is
// smaller than sizeof(T).
template <class T>
T*
ByteVector::get() const
{
THROW_CHECK2(out_of_range, size(), sizeof(T), size() >= sizeof(T));
return reinterpret_cast<T*>(data());
}
// Debug/log formatting for byte buffers.
ostream& operator<<(ostream &os, const ByteVector &bv);
}
#endif // _CRUCIBLE_BYTEVECTOR_H_

View File

@@ -3,8 +3,8 @@
#include "crucible/lockset.h"
#include <algorithm>
#include <functional>
#include <list>
#include <map>
#include <mutex>
#include <tuple>
@@ -17,26 +17,17 @@ namespace crucible {
public:
using Key = tuple<Arguments...>;
using Func = function<Return(Arguments...)>;
using Time = unsigned;
using Value = pair<Time, Return>;
private:
struct Value {
Key key;
Return ret;
};
using ListIter = typename list<Value>::iterator;
Func m_fn;
list<Value> m_list;
map<Key, ListIter> m_map;
LockSet<Key> m_lockset;
size_t m_max_size;
mutable mutex m_mutex;
Func m_fn;
Time m_ctr;
map<Key, Value> m_map;
LockSet<Key> m_lockset;
size_t m_max_size;
mutex m_mutex;
void check_overflow();
void recent_use(ListIter vp);
void erase_item(ListIter vp);
void erase_key(const Key &k);
Return insert_item(Func fn, Arguments... args);
public:
LRUCache(Func f = Func(), size_t max_size = 100);
@@ -46,115 +37,35 @@ namespace crucible {
Return operator()(Arguments... args);
Return refresh(Arguments... args);
void expire(Arguments... args);
void prune(function<bool(const Return &)> predicate);
void insert(const Return &r, Arguments... args);
void clear();
size_t size() const;
};
template <class Return, class... Arguments>
LRUCache<Return, Arguments...>::LRUCache(Func f, size_t max_size) :
m_fn(f),
m_ctr(0),
m_max_size(max_size)
{
}
template <class Return, class... Arguments>
Return
LRUCache<Return, Arguments...>::insert_item(Func fn, Arguments... args)
{
Key k(args...);
// Do we have it cached?
unique_lock<mutex> lock(m_mutex);
auto found = m_map.find(k);
if (found == m_map.end()) {
// No, release cache lock and acquire key lock
lock.unlock();
auto key_lock = m_lockset.make_lock(k);
// Did item appear in cache while we were waiting for key?
lock.lock();
found = m_map.find(k);
if (found == m_map.end()) {
// No, we now hold key and cache locks, but item not in cache.
// Release cache lock and call the function
lock.unlock();
// Create new value
Value v {
.key = k,
.ret = fn(args...),
};
// Reacquire cache lock
lock.lock();
// Make room
check_overflow();
// Insert return value at back of LRU list (hot end)
auto new_item = m_list.insert(m_list.end(), v);
// Insert return value in map
bool inserted = false;
tie(found, inserted) = m_map.insert(make_pair(v.key, new_item));
// We (should be) holding a lock on this key so we are the ones to insert it
THROW_CHECK0(runtime_error, inserted);
}
// Item should be in cache now
THROW_CHECK0(runtime_error, found != m_map.end());
} else {
// Move to end of LRU
recent_use(found->second);
}
// Return cached object
return found->second->ret;
}
template <class Return, class... Arguments>
void
LRUCache<Return, Arguments...>::erase_item(ListIter vp)
{
if (vp != m_list.end()) {
m_map.erase(vp->key);
m_list.erase(vp);
}
}
template <class Return, class... Arguments>
void
LRUCache<Return, Arguments...>::erase_key(const Key &k)
{
auto map_item = m_map.find(k);
if (map_item != m_map.end()) {
auto list_item = map_item->second;
m_map.erase(map_item);
m_list.erase(list_item);
}
}
template <class Return, class... Arguments>
void
LRUCache<Return, Arguments...>::check_overflow()
{
// Erase items at front of LRU list (cold end) until max size reached or list empty
while (m_map.size() >= m_max_size && !m_list.empty()) {
erase_item(m_list.begin());
if (m_map.size() <= m_max_size) return;
vector<pair<Key, Time>> map_contents;
map_contents.reserve(m_map.size());
for (auto i : m_map) {
map_contents.push_back(make_pair(i.first, i.second.first));
}
sort(map_contents.begin(), map_contents.end(), [](const pair<Key, Time> &a, const pair<Key, Time> &b) {
return a.second < b.second;
});
for (size_t i = 0; i < map_contents.size() / 2; ++i) {
m_map.erase(map_contents[i].first);
}
}
template <class Return, class... Arguments>
void
LRUCache<Return, Arguments...>::recent_use(ListIter vp)
{
// Splice existing items at back of LRU list (hot end)
auto next_vp = vp;
++next_vp;
m_list.splice(m_list.end(), m_list, vp, next_vp);
}
template <class Return, class... Arguments>
@@ -163,9 +74,6 @@ namespace crucible {
{
unique_lock<mutex> lock(m_mutex);
m_max_size = new_max_size;
// FIXME: this really reduces the cache size to new_max_size - 1
// because every other time we call this method, it is immediately
// followed by insert.
check_overflow();
}
@@ -181,37 +89,80 @@ namespace crucible {
void
LRUCache<Return, Arguments...>::clear()
{
// Move the map and list onto the stack, then destroy it after we've released the lock
// so that we don't block other threads if the list's destructors are expensive
decltype(m_list) new_list;
decltype(m_map) new_map;
unique_lock<mutex> lock(m_mutex);
m_list.swap(new_list);
m_map.swap(new_map);
lock.unlock();
m_map.clear();
}
template <class Return, class... Arguments>
size_t
LRUCache<Return, Arguments...>::size() const
void
LRUCache<Return, Arguments...>::prune(function<bool(const Return &)> pred)
{
unique_lock<mutex> lock(m_mutex);
return m_map.size();
for (auto it = m_map.begin(); it != m_map.end(); ) {
auto next_it = ++it;
if (pred(it.second.second)) {
m_map.erase(it);
}
it = next_it;
}
}
template<class Return, class... Arguments>
Return
LRUCache<Return, Arguments...>::operator()(Arguments... args)
{
return insert_item(m_fn, args...);
Key k(args...);
bool inserted = false;
// Do we have it cached?
unique_lock<mutex> lock(m_mutex);
auto found = m_map.find(k);
if (found == m_map.end()) {
// No, release cache lock and acquire key lock
lock.unlock();
typename LockSet<Key>::Lock key_lock(m_lockset, k);
// Did item appear in cache while we were waiting for key?
lock.lock();
found = m_map.find(k);
if (found == m_map.end()) {
// No, we hold key and cache locks, but item not in cache.
// Release cache lock and call function
auto ctr_copy = m_ctr++;
lock.unlock();
Value v(ctr_copy, m_fn(args...));
// Reacquire cache lock and insert return value
lock.lock();
tie(found, inserted) = m_map.insert(make_pair(k, v));
// We hold a lock on this key so we are the ones to insert it
THROW_CHECK0(runtime_error, inserted);
// Release key lock and clean out overflow
key_lock.unlock();
check_overflow();
}
}
// Item should be in cache now
THROW_CHECK0(runtime_error, found != m_map.end());
// We are using this object so update the timestamp
if (!inserted) {
found->second.first = m_ctr++;
}
return found->second.second;
}
template<class Return, class... Arguments>
void
LRUCache<Return, Arguments...>::expire(Arguments... args)
{
Key k(args...);
unique_lock<mutex> lock(m_mutex);
erase_key(Key(args...));
m_map.erase(k);
}
template<class Return, class... Arguments>
@@ -226,7 +177,44 @@ namespace crucible {
void
LRUCache<Return, Arguments...>::insert(const Return &r, Arguments... args)
{
insert_item([&](Arguments...) -> Return { return r; }, args...);
Key k(args...);
bool inserted = false;
// Do we have it cached?
unique_lock<mutex> lock(m_mutex);
auto found = m_map.find(k);
if (found == m_map.end()) {
// No, release cache lock and acquire key lock
lock.unlock();
typename LockSet<Key>::Lock key_lock(m_lockset, k);
// Did item appear in cache while we were waiting for key?
lock.lock();
found = m_map.find(k);
if (found == m_map.end()) {
// No, we hold key and cache locks, but item not in cache.
// Release cache lock and insert the provided return value
auto ctr_copy = m_ctr++;
Value v(ctr_copy, r);
tie(found, inserted) = m_map.insert(make_pair(k, v));
// We hold a lock on this key so we are the ones to insert it
THROW_CHECK0(runtime_error, inserted);
// Release key lock and clean out overflow
key_lock.unlock();
check_overflow();
}
}
// Item should be in cache now
THROW_CHECK0(runtime_error, found != m_map.end());
// We are using this object so update the timestamp
if (!inserted) {
found->second.first = m_ctr++;
}
}
}

View File

@@ -8,8 +8,6 @@
#include <string>
#include <typeinfo>
#include <syslog.h>
/** \brief Chatter wraps a std::ostream reference with a destructor that
writes a newline, and inserts timestamp, pid, and tid prefixes on output.
@@ -35,22 +33,18 @@ namespace crucible {
using namespace std;
class Chatter {
int m_loglevel;
string m_name;
ostream &m_os;
ostringstream m_oss;
public:
Chatter(int loglevel, string name, ostream &os = cerr);
Chatter(string name, ostream &os = cerr);
Chatter(Chatter &&c);
ostream &get_os() { return m_oss; }
template <class T> Chatter &operator<<(const T& arg);
~Chatter();
static void enable_timestamp(bool prefix_timestamp);
static void enable_level(bool prefix_level);
};
template <class Argument>
@@ -92,6 +86,16 @@ namespace crucible {
}
};
template <>
struct ChatterTraits<ostream &> {
Chatter &
operator()(Chatter &c, ostream & arg)
{
c.get_os() << arg;
return c;
}
};
class ChatterBox {
string m_file;
int m_line;
@@ -107,7 +111,7 @@ namespace crucible {
template <class T> Chatter operator<<(const T &t)
{
Chatter c(LOG_NOTICE, m_pretty_function, m_os);
Chatter c(m_pretty_function, m_os);
c << t;
return c;
}

View File

@@ -1,113 +0,0 @@
// Copyright (c) 2011 Google, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//
// CityHash, by Geoff Pike and Jyrki Alakuijala
//
// http://code.google.com/p/cityhash/
//
// This file provides a few functions for hashing strings. All of them are
// high-quality functions in the sense that they pass standard tests such
// as Austin Appleby's SMHasher. They are also fast.
//
// For 64-bit x86 code, on short strings, we don't know of anything faster than
// CityHash64 that is of comparable quality. We believe our nearest competitor
// is Murmur3. For 64-bit x86 code, CityHash64 is an excellent choice for hash
// tables and most other hashing (excluding cryptography).
//
// For 64-bit x86 code, on long strings, the picture is more complicated.
// On many recent Intel CPUs, such as Nehalem, Westmere, Sandy Bridge, etc.,
// CityHashCrc128 appears to be faster than all competitors of comparable
// quality. CityHash128 is also good but not quite as fast. We believe our
// nearest competitor is Bob Jenkins' Spooky. We don't have great data for
// other 64-bit CPUs, but for long strings we know that Spooky is slightly
// faster than CityHash on some relatively recent AMD x86-64 CPUs, for example.
// Note that CityHashCrc128 is declared in citycrc.h [which has been removed
// for bees].
//
// For 32-bit x86 code, we don't know of anything faster than CityHash32 that
// is of comparable quality. We believe our nearest competitor is Murmur3A.
// (On 64-bit CPUs, it is typically faster to use the other CityHash variants.)
//
// Functions in the CityHash family are not suitable for cryptography.
//
// Please see CityHash's README file for more details on our performance
// measurements and so on.
//
// WARNING: This code has been only lightly tested on big-endian platforms!
// It is known to work well on little-endian platforms that have a small penalty
// for unaligned reads, such as current Intel and AMD moderate-to-high-end CPUs.
// It should work on all 32-bit and 64-bit platforms that allow unaligned reads;
// bug reports are welcome.
//
// By the way, for some hash functions, given strings a and b, the hash
// of a+b is easily derived from the hashes of a and b. This property
// doesn't hold for any hash functions in this file.
#ifndef CITY_HASH_H_
#define CITY_HASH_H_
#include <stdlib.h> // for size_t.
#include <stdint.h>
#include <utility>
// Short fixed-width aliases used by the CityHash API.
// NOTE(review): these typedefs live in the global namespace.
typedef uint8_t uint8;
typedef uint32_t uint32;
typedef uint64_t uint64;
// 128-bit value as a (low 64 bits, high 64 bits) pair; the accessors
// immediately below fix the ordering.
typedef std::pair<uint64, uint64> uint128;
inline uint64 Uint128Low64(const uint128& x) { return x.first; }
inline uint64 Uint128High64(const uint128& x) { return x.second; }
// Hash function for a byte array.
uint64 CityHash64(const char *buf, size_t len);
// Hash function for a byte array. For convenience, a 64-bit seed is also
// hashed into the result.
uint64 CityHash64WithSeed(const char *buf, size_t len, uint64 seed);
// Hash function for a byte array. For convenience, two seeds are also
// hashed into the result.
uint64 CityHash64WithSeeds(const char *buf, size_t len,
uint64 seed0, uint64 seed1);
// Hash function for a byte array.
uint128 CityHash128(const char *s, size_t len);
// Hash function for a byte array. For convenience, a 128-bit seed is also
// hashed into the result.
uint128 CityHash128WithSeed(const char *s, size_t len, uint128 seed);
// Hash function for a byte array. Most useful in 32-bit binaries.
uint32 CityHash32(const char *buf, size_t len);
// Hash 128 input bits down to 64 bits of output.
// This is intended to be a reasonably good hash function.
inline uint64 Hash128to64(const uint128& x) {
// Murmur-inspired hashing: xor-fold the two halves, then two rounds
// of multiply-by-constant and xor-shift to diffuse the bits.
const uint64 kMul = 0x9ddfea08eb382d69ULL;
uint64 a = (Uint128Low64(x) ^ Uint128High64(x)) * kMul;
a ^= (a >> 47);
uint64 b = (Uint128High64(x) ^ a) * kMul;
b ^= (b >> 47);
b *= kMul;
return b;
}
#endif // CITY_HASH_H_

View File

@@ -1,18 +0,0 @@
#ifndef CRUCIBLE_CLEANUP_H
#define CRUCIBLE_CLEANUP_H
#include <functional>
namespace crucible {
using namespace std;
class Cleanup {
function<void()> m_cleaner;
public:
Cleanup(function<void()> func);
~Cleanup();
};
}
#endif // CRUCIBLE_CLEANUP_H

View File

@@ -3,11 +3,11 @@
#include <cstdint>
#include <cstdlib>
#include <cstring>
namespace crucible {
namespace Digest {
namespace CRC {
uint64_t crc64(const char *s);
uint64_t crc64(const void *p, size_t len);
};
};

View File

@@ -1,58 +0,0 @@
#ifndef CRUCIBLE_ENDIAN_H
#define CRUCIBLE_ENDIAN_H
#include <cstdint>
#include <endian.h>
namespace crucible {
template<class T>
struct le_to_cpu_helper {
T operator()(const T v);
};
template<> struct le_to_cpu_helper<uint64_t> {
uint64_t operator()(const uint64_t v) { return le64toh(v); }
};
#if __SIZEOF_LONG__ == 8
// uint64_t is unsigned long on LP64 platforms
template<> struct le_to_cpu_helper<unsigned long long> {
unsigned long long operator()(const unsigned long long v) { return le64toh(v); }
};
#endif
template<> struct le_to_cpu_helper<uint32_t> {
uint32_t operator()(const uint32_t v) { return le32toh(v); }
};
template<> struct le_to_cpu_helper<uint16_t> {
uint16_t operator()(const uint16_t v) { return le16toh(v); }
};
template<> struct le_to_cpu_helper<uint8_t> {
uint8_t operator()(const uint8_t v) { return v; }
};
template<class T>
T
le_to_cpu(const T v)
{
return le_to_cpu_helper<T>()(v);
}
template<class T>
T
get_unaligned(const void *const p)
{
struct not_aligned {
T v;
} __attribute__((packed));
const not_aligned *const nap = reinterpret_cast<const not_aligned*>(p);
return nap->v;
}
}
#endif // CRUCIBLE_ENDIAN_H

View File

@@ -81,25 +81,31 @@ namespace crucible {
// macro for throwing an error
#define THROW_ERROR(type, expr) do { \
std::ostringstream _te_oss; \
_te_oss << expr << " at " << __FILE__ << ":" << __LINE__; \
_te_oss << expr; \
throw type(_te_oss.str()); \
} while (0)
// macro for throwing a system_error with errno
#define THROW_ERRNO(expr) do { \
std::ostringstream _te_oss; \
_te_oss << expr << " at " << __FILE__ << ":" << __LINE__; \
_te_oss << expr; \
throw std::system_error(std::error_code(errno, std::system_category()), _te_oss.str()); \
} while (0)
// macro for throwing a system_error with some other variable
#define THROW_ERRNO_VALUE(value, expr) do { \
std::ostringstream _te_oss; \
_te_oss << expr << " at " << __FILE__ << ":" << __LINE__; \
_te_oss << expr; \
throw std::system_error(std::error_code((value), std::system_category()), _te_oss.str()); \
} while (0)
// macros for checking a constraint
#define CHECK_CONSTRAINT(value, expr) do { \
if (!(expr)) { \
THROW_ERROR(out_of_range, #value << " = " << value << " failed constraint check (" << #expr << ")"); \
} \
} while(0)
#define THROW_CHECK0(type, expr) do { \
if (!(expr)) { \
THROW_ERROR(type, "failed constraint check (" << #expr << ")"); \
@@ -126,13 +132,6 @@ namespace crucible {
} \
} while(0)
#define THROW_CHECK4(type, value1, value2, value3, value4, expr) do { \
if (!(expr)) { \
THROW_ERROR(type, #value1 << " = " << (value1) << ", " #value2 << " = " << (value2) << ", " #value3 << " = " << (value3) << ", " #value4 << " = " << (value4) \
<< " failed constraint check (" << #expr << ")"); \
} \
} while(0)
#define THROW_CHECK_BIN_OP(type, value1, op, value2) do { \
if (!((value1) op (value2))) { \
THROW_ERROR(type, "failed constraint check " << #value1 << " (" << (value1) << ") " << #op << " " << #value2 << " (" << (value2) << ")"); \

View File

@@ -0,0 +1,28 @@
#ifndef CRUCIBLE_EXECPIPE_H
#define CRUCIBLE_EXECPIPE_H
#include "crucible/fd.h"
#include <functional>
#include <limits>
#include <string>
namespace crucible {
using namespace std;
void redirect_stdin(const Fd &child_fd);
void redirect_stdin_stdout(const Fd &child_fd);
void redirect_stdin_stdout_stderr(const Fd &child_fd);
void redirect_stdout(const Fd &child_fd);
void redirect_stdout_stderr(const Fd &child_fd);
// Open a pipe (actually socketpair) to child process, then execute code in that process.
// e.g. popen([] () { system("echo Hello, World!"); });
// Forked process will exit when function returns.
Fd popen(function<int()> f, function<void(const Fd &child_fd)> import_fd_fn = redirect_stdin_stdout);
// Read all the data from fd into a string
string read_all(Fd fd, size_t max_bytes = numeric_limits<size_t>::max(), size_t chunk_bytes = 4096);
};
#endif // CRUCIBLE_EXECPIPE_H

View File

@@ -8,15 +8,15 @@ namespace crucible {
// FIXME: ExtentCursor is probably a better name
struct Extent {
off_t m_begin = 0;
off_t m_end = 0;
uint64_t m_physical = 0;
uint64_t m_flags = 0;
off_t m_begin;
off_t m_end;
uint64_t m_physical;
uint64_t m_flags;
// Btrfs extent reference details
off_t m_physical_len = 0;
off_t m_logical_len = 0;
off_t m_offset = 0;
off_t m_physical_len;
off_t m_logical_len;
off_t m_offset;
// fiemap flags are uint32_t, so bits 32..63 are OK for us
@@ -38,10 +38,11 @@ namespace crucible {
off_t physical_len() const { return m_physical_len; }
off_t logical_len() const { return m_logical_len; }
off_t offset() const { return m_offset; }
bool compressed() const;
uint64_t bytenr() const;
bool operator==(const Extent &that) const;
bool operator!=(const Extent &that) const { return !(*this == that); }
Extent();
Extent(const Extent &e) = default;
};
class ExtentWalker {
@@ -55,6 +56,10 @@ namespace crucible {
virtual Vec get_extent_map(off_t pos);
static const unsigned sc_extent_fetch_max = 64;
static const unsigned sc_extent_fetch_min = 4;
static const off_t sc_step_size = 0x1000 * (sc_extent_fetch_max / 2);
private:
Vec m_extents;
Itr m_current;
@@ -62,10 +67,6 @@ namespace crucible {
Itr find_in_cache(off_t pos);
void run_fiemap(off_t pos);
#ifdef EXTENTWALKER_DEBUG
ostringstream m_log;
#endif
public:
ExtentWalker(Fd fd = Fd());
ExtentWalker(Fd fd, off_t initial_pos);

View File

@@ -1,8 +1,7 @@
#ifndef CRUCIBLE_FD_H
#define CRUCIBLE_FD_H
#include "crucible/bytevector.h"
#include "crucible/namedptr.h"
#include "crucible/resource.h"
#include <cstring>
@@ -14,10 +13,6 @@
#include <sys/stat.h>
#include <fcntl.h>
// ioctl
#include <sys/ioctl.h>
#include <linux/fs.h>
// socket
#include <sys/socket.h>
@@ -27,91 +22,76 @@
namespace crucible {
using namespace std;
/// File descriptor owner object. It closes them when destroyed.
/// Most of the functions here don't use it because these functions don't own FDs.
/// All good names for such objects are taken.
// IOHandle is a file descriptor owner object. It closes them when destroyed.
// Most of the functions here don't use it because these functions don't own FDs.
// All good names for such objects are taken.
class IOHandle {
IOHandle(const IOHandle &) = delete;
IOHandle(IOHandle &&) = delete;
IOHandle& operator=(IOHandle &&) = delete;
IOHandle& operator=(const IOHandle &) = delete;
protected:
int m_fd;
void close();
IOHandle& operator=(int that) { m_fd = that; return *this; }
public:
virtual ~IOHandle();
IOHandle(int fd = -1);
int get_fd() const;
IOHandle(int fd);
IOHandle();
void close();
int get_fd() const { return m_fd; }
int release_fd();
};
/// Copyable file descriptor.
class Fd {
static NamedPtr<IOHandle, int> s_named_ptr;
shared_ptr<IOHandle> m_handle;
public:
using resource_type = IOHandle;
Fd();
Fd(int fd);
Fd &operator=(int fd);
Fd &operator=(const shared_ptr<IOHandle> &);
operator int() const;
bool operator!() const;
shared_ptr<IOHandle> operator->() const;
};
template <>
struct ResourceTraits<int, IOHandle> {
int get_key(const IOHandle &res) const { return res.get_fd(); }
shared_ptr<IOHandle> make_resource(int fd) const { return make_shared<IOHandle>(fd); }
bool is_null_key(const int &key) const { return key < 0; }
int get_null_key() const { return -1; }
};
void set_relative_path(string path);
string relative_path();
typedef ResourceHandle<int, IOHandle> Fd;
// Functions named "foo_or_die" throw exceptions on failure.
/// Attempt to open the file with the given mode, throw exception on failure.
// Attempt to open the file with the given mode
int open_or_die(const string &file, int flags = O_RDONLY, mode_t mode = 0777);
/// Attempt to open the file with the given mode, throw exception on failure.
int openat_or_die(int dir_fd, const string &file, int flags = O_RDONLY, mode_t mode = 0777);
/// Decode open flags
// Decode open parameters
string o_flags_ntoa(int flags);
/// Decode open mode
string o_mode_ntoa(mode_t mode);
/// mmap with its one weird error case
// mmap with its one weird error case
void *mmap_or_die(void *addr, size_t length, int prot, int flags, int fd, off_t offset);
/// Decode mmap prot
// Decode mmap parameters
string mmap_prot_ntoa(int prot);
/// Decode mmap flags
string mmap_flags_ntoa(int flags);
/// Rename, throw exception on failure.
// Unlink, rename
void unlink_or_die(const string &file);
void rename_or_die(const string &from, const string &to);
/// Rename, throw exception on failure.
void renameat_or_die(int fromfd, const string &frompath, int tofd, const string &topath);
/// Truncate, throw exception on failure.
void ftruncate_or_die(int fd, off_t size);
// Read or write structs:
// There is a template specialization to read or write strings
// Three-arg version of read_or_die/write_or_die throws an error on incomplete read/writes
// Four-arg version returns number of bytes read/written through reference arg
/// Attempt read by pointer and length, throw exception on IO error or short read.
void read_or_die(int fd, void *buf, size_t size);
/// Attempt read of a POD struct, throw exception on IO error or short read.
template <class T> void read_or_die(int fd, T& buf)
{
return read_or_die(fd, static_cast<void *>(&buf), sizeof(buf));
}
/// Attempt read by pointer and length, throw exception on IO error but not short read.
void read_partial_or_die(int fd, void *buf, size_t size_wanted, size_t &size_read);
/// Attempt read of a POD struct, throw exception on IO error but not short read.
template <class T> void read_partial_or_die(int fd, T& buf, size_t &size_read)
{
return read_partial_or_die(fd, static_cast<void *>(&buf), sizeof(buf), size_read);
}
/// Attempt read at position by pointer and length, throw exception on IO error but not short read.
void pread_or_die(int fd, void *buf, size_t size, off_t offset);
/// Attempt read at position of a POD struct, throw exception on IO error but not short read.
template <class T> void pread_or_die(int fd, T& buf, off_t offset)
{
return pread_or_die(fd, static_cast<void *>(&buf), sizeof(buf), offset);
@@ -138,23 +118,17 @@ namespace crucible {
// Specialization for strings which reads/writes the string content, not the struct string
template<> void write_or_die<string>(int fd, const string& str);
template<> void pread_or_die<string>(int fd, string& str, off_t offset);
template<> void pwrite_or_die<string>(int fd, const string& str, off_t offset);
template<> void pread_or_die<ByteVector>(int fd, ByteVector& str, off_t offset);
template<> void pwrite_or_die<ByteVector>(int fd, const ByteVector& str, off_t offset);
// Deprecated
template<> void pread_or_die<vector<uint8_t>>(int fd, vector<uint8_t>& str, off_t offset) = delete;
template<> void pwrite_or_die<vector<uint8_t>>(int fd, const vector<uint8_t>& str, off_t offset) = delete;
template<> void pread_or_die<vector<char>>(int fd, vector<char>& str, off_t offset) = delete;
template<> void pwrite_or_die<vector<char>>(int fd, const vector<char>& str, off_t offset) = delete;
template<> void pread_or_die<vector<char>>(int fd, vector<char>& str, off_t offset);
template<> void pread_or_die<vector<uint8_t>>(int fd, vector<uint8_t>& str, off_t offset);
/// Read a simple string.
// A different approach to reading a simple string
string read_string(int fd, size_t size);
/// A lot of Unix API wants you to initialize a struct and call
/// one function to fill it, another function to throw it away,
/// and has some unknown third thing you have to do when there's
/// an error. That's also a C++ object with an exception-throwing
/// constructor.
// A lot of Unix API wants you to initialize a struct and call
// one function to fill it, another function to throw it away,
// and has some unknown third thing you have to do when there's
// an error. That's also a C++ object with an exception-throwing
// constructor.
struct Stat : public stat {
Stat();
Stat(int f);
@@ -163,22 +137,19 @@ namespace crucible {
Stat &lstat(const string &filename);
};
int ioctl_iflags_get(int fd);
void ioctl_iflags_set(int fd, int attr);
string st_mode_ntoa(mode_t mode);
/// Because it's not trivial to do correctly
// Because it's not trivial to do correctly
string readlink_or_die(const string &path);
/// Determine the name of a FD by readlink through /proc/self/fd/
// Determine the name of a FD by readlink through /proc/self/fd/
string name_fd(int fd);
/// Returns Fd objects because it does own them.
// Returns Fd objects because it does own them.
pair<Fd, Fd> socketpair_or_die(int domain = AF_UNIX, int type = SOCK_STREAM, int protocol = 0);
/// like unique_lock but for flock instead of mutexes...and not trying
/// to hide the many and subtle differences between those two things *at all*.
// like unique_lock but for flock instead of mutexes...and not trying
// to hide the many and subtle differences between those two things *at all*.
class Flock {
int m_fd;
bool m_locked;
@@ -199,7 +170,7 @@ namespace crucible {
int fd();
};
/// Doesn't use Fd objects because it's usually just used to replace stdin/stdout/stderr.
// Doesn't use Fd objects because it's usually just used to replace stdin/stdout/stderr.
void dup2_or_die(int fd_in, int fd_out);
}

View File

@@ -1,8 +1,6 @@
#ifndef CRUCIBLE_FS_H
#define CRUCIBLE_FS_H
#include "crucible/bytevector.h"
#include "crucible/endian.h"
#include "crucible/error.h"
// Terribly Linux-specific FS-wrangling functions
@@ -15,7 +13,6 @@
#include <cstdint>
#include <iosfwd>
#include <set>
#include <vector>
#include <fcntl.h>
@@ -27,16 +24,23 @@ namespace crucible {
// wrapper around fallocate(...FALLOC_FL_PUNCH_HOLE...)
void punch_hole(int fd, off_t offset, off_t len);
struct BtrfsExtentSame {
struct BtrfsExtentInfo : public btrfs_ioctl_same_extent_info {
BtrfsExtentInfo(int dst_fd, off_t dst_offset);
};
struct BtrfsExtentSame : public btrfs_ioctl_same_args {
virtual ~BtrfsExtentSame();
BtrfsExtentSame(int src_fd, off_t src_offset, off_t src_length);
void add(int fd, uint64_t offset);
void add(int fd, off_t offset);
virtual void do_ioctl();
uint64_t m_logical_offset = 0;
uint64_t m_length = 0;
int m_fd;
vector<btrfs_ioctl_same_extent_info> m_info;
vector<BtrfsExtentInfo> m_info;
};
struct BtrfsExtentSameByClone : public BtrfsExtentSame {
using BtrfsExtentSame::BtrfsExtentSame;
void do_ioctl() override;
};
ostream & operator<<(ostream &os, const btrfs_ioctl_same_extent_info *info);
@@ -51,51 +55,26 @@ namespace crucible {
ostream & operator<<(ostream &os, const BtrfsInodeOffsetRoot &p);
struct BtrfsDataContainer {
struct BtrfsDataContainer : public btrfs_data_container {
BtrfsDataContainer(size_t size = 64 * 1024);
void *prepare(size_t size);
void *prepare();
size_t get_size() const;
decltype(btrfs_data_container::bytes_left) get_bytes_left() const;
decltype(btrfs_data_container::bytes_missing) get_bytes_missing() const;
decltype(btrfs_data_container::elem_cnt) get_elem_cnt() const;
decltype(btrfs_data_container::elem_missed) get_elem_missed() const;
decltype(bytes_left) get_bytes_left() const;
decltype(bytes_missing) get_bytes_missing() const;
decltype(elem_cnt) get_elem_cnt() const;
decltype(elem_missed) get_elem_missed() const;
ByteVector m_data;
vector<char> m_data;
};
struct BtrfsIoctlLogicalInoArgs {
BtrfsIoctlLogicalInoArgs(uint64_t logical, size_t buf_size = 16 * 1024 * 1024);
struct BtrfsIoctlLogicalInoArgs : public btrfs_ioctl_logical_ino_args {
BtrfsIoctlLogicalInoArgs(uint64_t logical, size_t buf_size = 64 * 1024);
virtual void do_ioctl(int fd);
virtual bool do_ioctl_nothrow(int fd);
uint64_t get_flags() const;
void set_flags(uint64_t new_flags);
void set_logical(uint64_t new_logical);
void set_size(uint64_t new_size);
void do_ioctl(int fd);
bool do_ioctl_nothrow(int fd);
struct BtrfsInodeOffsetRootSpan {
using iterator = BtrfsInodeOffsetRoot*;
using const_iterator = const BtrfsInodeOffsetRoot*;
size_t size() const;
iterator begin() const;
iterator end() const;
const_iterator cbegin() const;
const_iterator cend() const;
iterator data() const;
void clear();
private:
iterator m_begin = nullptr;
iterator m_end = nullptr;
friend struct BtrfsIoctlLogicalInoArgs;
} m_iors;
private:
size_t m_container_size;
BtrfsDataContainer m_container;
uint64_t m_logical;
uint64_t m_flags = 0;
friend ostream & operator<<(ostream &os, const BtrfsIoctlLogicalInoArgs *p);
vector<BtrfsInodeOffsetRoot> m_iors;
};
ostream & operator<<(ostream &os, const BtrfsIoctlLogicalInoArgs &p);
@@ -105,7 +84,7 @@ namespace crucible {
virtual void do_ioctl(int fd);
virtual bool do_ioctl_nothrow(int fd);
size_t m_container_size;
BtrfsDataContainer m_container;
vector<string> m_paths;
};
@@ -127,6 +106,15 @@ namespace crucible {
ostream & operator<<(ostream &os, const BtrfsIoctlDefragRangeArgs *p);
// in btrfs/ctree.h, but that's a nightmare to #include here
typedef enum {
BTRFS_COMPRESS_NONE = 0,
BTRFS_COMPRESS_ZLIB = 1,
BTRFS_COMPRESS_LZO = 2,
BTRFS_COMPRESS_TYPES = 2,
BTRFS_COMPRESS_LAST = 3,
} btrfs_compression_type;
struct FiemapExtent : public fiemap_extent {
FiemapExtent();
FiemapExtent(const fiemap_extent &that);
@@ -135,26 +123,16 @@ namespace crucible {
off_t end() const;
};
struct Fiemap {
// because fiemap.h insists on giving FIEMAP_MAX_OFFSET
// a different type from the struct fiemap members
static const uint64_t s_fiemap_max_offset = FIEMAP_MAX_OFFSET;
struct Fiemap : public fiemap {
// Get entire file
Fiemap(uint64_t start = 0, uint64_t length = s_fiemap_max_offset);
Fiemap(uint64_t start = 0, uint64_t length = FIEMAP_MAX_OFFSET);
void do_ioctl(int fd);
vector<FiemapExtent> m_extents;
decltype(fiemap::fm_extent_count) m_min_count = (4096 - sizeof(fiemap)) / sizeof(fiemap_extent);
decltype(fiemap::fm_extent_count) m_max_count = 16 * 1024 * 1024 / sizeof(fiemap_extent);
uint64_t m_start;
uint64_t m_length;
// FIEMAP is slow and full of lies.
// This makes FIEMAP even slower, but reduces the lies a little.
decltype(fiemap::fm_flags) m_flags = FIEMAP_FLAG_SYNC;
friend ostream &operator<<(ostream &, const Fiemap &);
uint64_t m_min_count = (4096 - sizeof(fiemap)) / sizeof(fiemap_extent);
uint64_t m_max_count = 16 * 1024 * 1024 / sizeof(fiemap_extent);
};
ostream & operator<<(ostream &os, const fiemap_extent *info);
@@ -170,70 +148,79 @@ namespace crucible {
struct BtrfsIoctlSearchHeader : public btrfs_ioctl_search_header {
BtrfsIoctlSearchHeader();
ByteVector m_data;
size_t set_data(const ByteVector &v, size_t offset);
bool operator<(const BtrfsIoctlSearchHeader &that) const;
vector<char> m_data;
size_t set_data(const vector<char> &v, size_t offset);
};
// Perf blames this function for a few percent overhead; move it here so it can be inline
inline bool BtrfsIoctlSearchHeader::operator<(const BtrfsIoctlSearchHeader &that) const
{
return tie(objectid, type, offset, len, transid) < tie(that.objectid, that.type, that.offset, that.len, that.transid);
}
ostream & operator<<(ostream &os, const btrfs_ioctl_search_header &hdr);
ostream & operator<<(ostream &os, const BtrfsIoctlSearchHeader &hdr);
struct BtrfsIoctlSearchKey : public btrfs_ioctl_search_key {
BtrfsIoctlSearchKey(size_t buf_size = 1024);
bool do_ioctl_nothrow(int fd);
void do_ioctl(int fd);
BtrfsIoctlSearchKey(size_t buf_size = 1024 * 1024);
virtual bool do_ioctl_nothrow(int fd);
virtual void do_ioctl(int fd);
// Copy objectid/type/offset so we move forward
void next_min(const BtrfsIoctlSearchHeader& ref);
// move forward to next object of a single type
void next_min(const BtrfsIoctlSearchHeader& ref, const uint8_t type);
size_t m_buf_size;
set<BtrfsIoctlSearchHeader> m_result;
static thread_local size_t s_calls;
static thread_local size_t s_loops;
static thread_local size_t s_loops_empty;
static thread_local shared_ptr<ostream> s_debug_ostream;
vector<BtrfsIoctlSearchHeader> m_result;
};
ostream & operator<<(ostream &os, const btrfs_ioctl_search_key &key);
ostream & operator<<(ostream &os, const BtrfsIoctlSearchKey &key);
string btrfs_chunk_type_ntoa(uint64_t type);
string btrfs_search_type_ntoa(unsigned type);
string btrfs_search_objectid_ntoa(uint64_t objectid);
string btrfs_compress_type_ntoa(uint8_t type);
string btrfs_search_objectid_ntoa(unsigned objectid);
uint64_t btrfs_get_root_id(int fd);
uint64_t btrfs_get_root_transid(int fd);
template<class T, class V>
template<class T>
const T*
get_struct_ptr(const V &v, size_t offset = 0)
get_struct_ptr(vector<char> &v, size_t offset = 0)
{
THROW_CHECK2(out_of_range, v.size(), offset + sizeof(T), offset + sizeof(T) <= v.size());
const uint8_t *const data_ptr = v.data();
return reinterpret_cast<const T*>(data_ptr + offset);
// OK so sometimes btrfs overshoots a little
if (offset + sizeof(T) > v.size()) {
v.resize(offset + sizeof(T), 0);
}
THROW_CHECK2(invalid_argument, v.size(), offset + sizeof(T), offset + sizeof(T) <= v.size());
return reinterpret_cast<const T*>(v.data() + offset);
}
template<class S, class T, class V>
T
btrfs_get_member(T S::* member, V &v, size_t offset = 0)
template<class A, class R>
R
call_btrfs_get(R (*func)(const A*), vector<char> &v, size_t offset = 0)
{
const S *const sp = nullptr;
const T *const spm = &(sp->*member);
const auto member_offset = reinterpret_cast<const uint8_t *>(spm) - reinterpret_cast<const uint8_t *>(sp);
const void *struct_ptr = get_struct_ptr<T>(v, offset + member_offset);
const T unaligned_t = get_unaligned<T>(struct_ptr);
return le_to_cpu(unaligned_t);
return func(get_struct_ptr<A>(v, offset));
}
template <class T> struct btrfs_get_le;
template<> struct btrfs_get_le<__le64> {
uint64_t operator()(const void *p) { return get_unaligned_le64(p); }
};
template<> struct btrfs_get_le<__le32> {
uint32_t operator()(const void *p) { return get_unaligned_le32(p); }
};
template<> struct btrfs_get_le<__le16> {
uint16_t operator()(const void *p) { return get_unaligned_le16(p); }
};
template<> struct btrfs_get_le<__le8> {
uint8_t operator()(const void *p) { return get_unaligned_le8(p); }
};
template<class S, class T>
T
btrfs_get_member(T S::* member, vector<char> &v, size_t offset = 0)
{
const S *sp = reinterpret_cast<const S*>(NULL);
const T *spm = &(sp->*member);
auto member_offset = reinterpret_cast<const char *>(spm) - reinterpret_cast<const char *>(sp);
return btrfs_get_le<T>()(get_struct_ptr<S>(v, offset + member_offset));
}
struct Statvfs : public statvfs {
@@ -245,14 +232,12 @@ namespace crucible {
unsigned long available() const;
};
struct BtrfsIoctlFsInfoArgs : public btrfs_ioctl_fs_info_args_v3 {
ostream &hexdump(ostream &os, const vector<char> &v);
struct BtrfsIoctlFsInfoArgs : public btrfs_ioctl_fs_info_args {
BtrfsIoctlFsInfoArgs();
void do_ioctl(int fd);
bool do_ioctl_nothrow(int fd);
uint16_t csum_type() const;
uint16_t csum_size() const;
uint64_t generation() const;
vector<uint8_t> fsid() const;
string uuid() const;
};
ostream & operator<<(ostream &os, const BtrfsIoctlFsInfoArgs &a);

View File

@@ -1,38 +0,0 @@
#ifndef CRUCIBLE_HEXDUMP_H
#define CRUCIBLE_HEXDUMP_H
#include "crucible/string.h"
#include <ostream>
namespace crucible {
using namespace std;
template <class V>
ostream &
hexdump(ostream &os, const V &v)
{
const auto v_size = v.size();
const uint8_t* const v_data = reinterpret_cast<const uint8_t*>(v.data());
os << "V { size = " << v_size << ", data:\n";
for (size_t i = 0; i < v_size; i += 8) {
string hex, ascii;
for (size_t j = i; j < i + 8; ++j) {
if (j < v_size) {
const uint8_t c = v_data[j];
char buf[8];
sprintf(buf, "%02x ", c);
hex += buf;
ascii += (c < 32 || c > 126) ? '.' : c;
} else {
hex += " ";
ascii += ' ';
}
}
os << astringprintf("\t%08x %s %s\n", i, hex.c_str(), ascii.c_str());
}
return os << "}";
}
};
#endif // CRUCIBLE_HEXDUMP_H

106
include/crucible/interp.h Normal file
View File

@@ -0,0 +1,106 @@
#ifndef CRUCIBLE_INTERP_H
#define CRUCIBLE_INTERP_H
#include "crucible/error.h"
#include <map>
#include <memory>
#include <string>
#include <vector>
namespace crucible {
using namespace std;
struct ArgList : public vector<string> {
ArgList(const char **argv);
// using vector<string>::vector ... doesn't work:
// error: std::vector<std::basic_string<char> >::vector names constructor
// Still doesn't work in 4.9 because it can't manage a conversion
ArgList(const vector<string> &&that);
};
struct ArgActor {
struct ArgActorBase {
virtual void predicate(void *obj, string arg);
};
template <class T>
struct ArgActorDerived {
function<void(T, string)> m_func;
ArgActorDerived(decltype(m_func) func) :
m_func(func)
{
}
void predicate(void *obj, string arg) override
{
T &op = *(reinterpret_cast<T*>(obj));
m_func(op, obj);
}
};
template <class T>
ArgActor(T, function<void(T, string)> func) :
m_actor(make_shared(ArgActorDerived<T>(func)))
{
}
ArgActor() = default;
void predicate(void *t, string arg)
{
if (m_actor) {
m_actor->predicate(t, arg);
} else {
THROW_ERROR(invalid_argument, "null m_actor for predicate arg '" << arg << "'");
}
}
private:
shared_ptr<ArgActorBase> m_actor;
};
struct ArgParser {
~ArgParser();
ArgParser();
void add_opt(string opt, ArgActor actor);
template <class T>
void
parse(T t, const ArgList &args)
{
void *vt = &t;
parse_backend(vt, args);
}
private:
void parse_backend(void *t, const ArgList &args);
map<string, ArgActor> m_string_opts;
};
struct Command {
virtual ~Command();
virtual int exec(const ArgList &args) = 0;
};
struct Proc : public Command {
int exec(const ArgList &args) override;
Proc(const function<int(const ArgList &)> &f);
private:
function<int(const ArgList &)> m_cmd;
};
struct Interp {
virtual ~Interp();
Interp(const map<string, shared_ptr<Command> > &cmdlist);
void add_command(const string &name, const shared_ptr<Command> &command);
int exec(const ArgList &args);
private:
Interp(const Interp &) = delete;
map<string, shared_ptr<Command> > m_commands;
};
};
#endif // CRUCIBLE_INTERP_H

View File

@@ -1,17 +1,14 @@
#ifndef CRUCIBLE_LOCKSET_H
#define CRUCIBLE_LOCKSET_H
#include "crucible/error.h"
#include "crucible/process.h"
#include <crucible/error.h>
#include <cassert>
#include <condition_variable>
#include <iostream>
#include <limits>
#include <map>
#include <memory>
#include <mutex>
#include <set>
namespace crucible {
using namespace std;
@@ -20,36 +17,14 @@ namespace crucible {
class LockSet {
public:
using set_type = map<T, pid_t>;
using key_type = typename set_type::key_type;
using key_type = T;
using set_type = set<T>;
private:
set_type m_set;
mutex m_mutex;
condition_variable m_condvar;
size_t m_max_size = numeric_limits<size_t>::max();
bool full();
bool locked(const key_type &name);
class Lock {
LockSet &m_lockset;
key_type m_name;
bool m_locked;
Lock() = delete;
Lock(const Lock &) = delete;
Lock& operator=(const Lock &) = delete;
Lock(Lock &&that) = delete;
Lock& operator=(Lock &&that) = delete;
public:
~Lock();
Lock(LockSet &lockset, const key_type &name, bool start_locked = true);
void lock();
void unlock();
bool try_lock();
};
public:
~LockSet();
@@ -61,21 +36,26 @@ namespace crucible {
size_t size();
bool empty();
set_type copy();
void wait_unlock(double interval);
void max_size(size_t max);
class LockHandle {
shared_ptr<Lock> m_lock;
class Lock {
LockSet &m_lockset;
key_type m_name;
bool m_locked;
Lock() = delete;
Lock(const Lock &) = delete;
Lock& operator=(const Lock &) = delete;
public:
LockHandle(LockSet &lockset, const key_type &name, bool start_locked = true) :
m_lock(make_shared<Lock>(lockset, name, start_locked)) {}
void lock() { m_lock->lock(); }
void unlock() { m_lock->unlock(); }
bool try_lock() { return m_lock->try_lock(); }
~Lock();
Lock(LockSet &lockset, const key_type &m_name, bool start_locked = true);
Lock(Lock &&that);
Lock& operator=(Lock &&that);
void lock();
void unlock();
bool try_lock();
};
LockHandle make_lock(const key_type &name, bool start_locked = true);
};
template <class T>
@@ -88,36 +68,15 @@ namespace crucible {
assert(m_set.empty());
}
template <class T>
bool
LockSet<T>::full()
{
return m_set.size() >= m_max_size;
}
template <class T>
bool
LockSet<T>::locked(const key_type &name)
{
return m_set.count(name);
}
template <class T>
void
LockSet<T>::max_size(size_t s)
{
m_max_size = s;
}
template <class T>
void
LockSet<T>::lock(const key_type &name)
{
unique_lock<mutex> lock(m_mutex);
while (full() || locked(name)) {
while (m_set.count(name)) {
m_condvar.wait(lock);
}
auto rv = m_set.insert(make_pair(name, gettid()));
auto rv = m_set.insert(name);
THROW_CHECK0(runtime_error, rv.second);
}
@@ -126,10 +85,10 @@ namespace crucible {
LockSet<T>::try_lock(const key_type &name)
{
unique_lock<mutex> lock(m_mutex);
if (full() || locked(name)) {
if (m_set.count(name)) {
return false;
}
auto rv = m_set.insert(make_pair(name, gettid()));
auto rv = m_set.insert(name);
THROW_CHECK1(runtime_error, name, rv.second);
return true;
}
@@ -139,11 +98,20 @@ namespace crucible {
LockSet<T>::unlock(const key_type &name)
{
unique_lock<mutex> lock(m_mutex);
auto erase_count = m_set.erase(name);
m_condvar.notify_all();
auto erase_count = m_set.erase(name);
THROW_CHECK1(invalid_argument, erase_count, erase_count == 1);
}
template <class T>
void
LockSet<T>::wait_unlock(double interval)
{
unique_lock<mutex> lock(m_mutex);
if (m_set.empty()) return;
m_condvar.wait_for(lock, chrono::duration<double>(interval));
}
template <class T>
size_t
LockSet<T>::size()
@@ -165,10 +133,7 @@ namespace crucible {
LockSet<T>::copy()
{
unique_lock<mutex> lock(m_mutex);
// Make temporary copy of set while protected by mutex
auto rv = m_set;
// Return temporary copy after releasing lock
return rv;
return m_set;
}
template <class T>
@@ -218,10 +183,26 @@ namespace crucible {
}
template <class T>
typename LockSet<T>::LockHandle
LockSet<T>::make_lock(const key_type &name, bool start_locked)
LockSet<T>::Lock::Lock(Lock &&that) :
m_lockset(that.lockset),
m_name(that.m_name),
m_locked(that.m_locked)
{
return LockHandle(*this, name, start_locked);
that.m_locked = false;
}
template <class T>
typename LockSet<T>::Lock &
LockSet<T>::Lock::operator=(Lock &&that)
{
THROW_CHECK2(invalid_argument, &m_lockset, &that.m_lockset, &m_lockset == &that.m_lockset);
if (m_locked && that.m_name != m_name) {
unlock();
}
m_name = that.m_name;
m_locked = that.m_locked;
that.m_locked = false;
return *this;
}
}

View File

@@ -1,42 +0,0 @@
#ifndef CRUCIBLE_MULTILOCK_H
#define CRUCIBLE_MULTILOCK_H
#include <condition_variable>
#include <map>
#include <memory>
#include <mutex>
#include <string>
namespace crucible {
using namespace std;
/// Coordinates exclusion between categories of work identified by a
/// string "type".  Declaration-only header: the member functions are
/// defined elsewhere, so the behavioral notes below are inferred from
/// names and member types -- confirm against the implementation.
class MultiLocker {
// Guards m_counters and m_do_locking (inferred from pairing with m_cv).
mutex m_mutex;
// Presumably signalled when a lock is released so waiters can retry.
condition_variable m_cv;
// Outstanding lock count per type string.
map<string, size_t> m_counters;
// When false, locking is presumably bypassed entirely
// (see enable_locking below) -- TODO confirm in the implementation.
bool m_do_locking = true;
/// RAII handle for one acquired lock: records its type and parent,
/// and releases the lock on destruction (via set_locked).
class LockHandle {
const string m_type;
MultiLocker &m_parent;
// Tracks whether this handle currently holds the lock.
bool m_locked = false;
// Acquire (true) or release (false) this handle's lock in m_parent.
void set_locked(bool state);
public:
~LockHandle();
LockHandle(const string &type, MultiLocker &parent);
friend class MultiLocker;
};
friend class LockHandle;
// NOTE(review): implementations not visible here; the three helpers
// below appear to be the internal acquire/release protocol.
bool is_lock_available(const string &type);
void put_lock(const string &type);
shared_ptr<LockHandle> get_lock_private(const string &type);
public:
/// Acquire a lock of the given type.  Static interface suggests a
/// process-wide singleton instance -- verify in the implementation.
static shared_ptr<LockHandle> get_lock(const string &type);
/// Globally enable or disable locking.
static void enable_locking(bool enabled);
};
}
#endif // CRUCIBLE_MULTILOCK_H

View File

@@ -1,225 +0,0 @@
#ifndef CRUCIBLE_NAMEDPTR_H
#define CRUCIBLE_NAMEDPTR_H
#include "crucible/lockset.h"
#include <functional>
#include <map>
#include <memory>
#include <mutex>
#include <tuple>
namespace crucible {
using namespace std;
/// A thread-safe container for RAII of shared resources with unique names.
/// Each distinct argument tuple maps to at most one live Return object;
/// concurrent requests for the same name share one object, and the map
/// entry is erased automatically when the last shared_ptr is destroyed.
template <class Return, class... Arguments>
class NamedPtr {
public:
/// The name in "NamedPtr"
using Key = tuple<Arguments...>;
/// A shared pointer to the named object with ownership
/// tracking that erases the object's stored name when
/// the last shared pointer is destroyed.
using Ptr = shared_ptr<Return>;
/// A function that translates a name into a shared pointer to an object.
using Func = function<Ptr(Arguments...)>;
private:
struct Value;
using WeakPtr = weak_ptr<Value>;
using MapType = map<Key, WeakPtr>;
/// The Key->WeakPtr map and the mutex that guards it, bundled so
/// that Value destructors can keep the map alive via shared_ptr.
struct MapRep {
MapType m_map;
mutex m_mutex;
};
using MapPtr = shared_ptr<MapRep>;
/// Container for Return pointers. Destructor removes entry from map.
struct Value {
Ptr m_ret_ptr;
MapPtr m_map_rep;
Key m_ret_key;
~Value();
Value(Ptr&& ret_ptr, const Key &key, const MapPtr &map_rep);
};
// Function used to create a missing object from its name.
Func m_fn;
// Shared with every Value so the map outlives this NamedPtr if needed.
MapPtr m_map_rep = make_shared<MapRep>();
// Per-Key lock: ensures only one thread runs m_fn for a given name.
LockSet<Key> m_lockset;
Ptr lookup_item(const Key &k);
Ptr insert_item(Func fn, Arguments... args);
public:
NamedPtr(Func f = Func());
/// (Re)define the name-to-pointer function.
void func(Func f);
/// Look up (or create via the configured function) the object for a name.
Ptr operator()(Arguments... args);
/// Insert an externally constructed object under a name.
Ptr insert(const Ptr &r, Arguments... args);
};
/// Construct NamedPtr map and define a function to turn a name into a pointer.
template <class Return, class... Arguments>
NamedPtr<Return, Arguments...>::NamedPtr(Func f) :
m_fn(f)
{
}
/// Construct a Value wrapper: the value to store, the argument key to store the value under,
/// and a pointer to the map. Everything needed to remove the key from the map when the
/// last NamedPtr is deleted. NamedPtr then releases its own pointer to the value, which
/// may or may not trigger deletion there.
template <class Return, class... Arguments>
NamedPtr<Return, Arguments...>::Value::Value(Ptr&& ret_ptr, const Key &key, const MapPtr &map_rep) :
m_ret_ptr(ret_ptr),
m_map_rep(map_rep),
m_ret_key(key)
{
}
/// Destroy a Value wrapper: remove a dead Key from the map, then let the member destructors
/// do the rest. The Key might be in the map and not dead, so leave it alone in that case.
template <class Return, class... Arguments>
NamedPtr<Return, Arguments...>::Value::~Value()
{
unique_lock<mutex> lock(m_map_rep->m_mutex);
// We are called from the shared_ptr destructor, so we
// know that the weak_ptr in the map has already expired;
// however, if another thread already noticed that the
// map entry expired while we were waiting for the lock,
// the other thread will have already replaced the map
// entry with a pointer to some other object, and that
// object now owns the map entry. So we do a key lookup
// here instead of storing a map iterator, and only erase
// "our" map entry if it exists and is expired. The other
// thread would have done the same for us if the race had
// a different winner.
const auto found = m_map_rep->m_map.find(m_ret_key);
if (found != m_map_rep->m_map.end() && found->second.expired()) {
m_map_rep->m_map.erase(found);
}
}
/// Find a Return by key and fetch a strong Return pointer.
/// Ignore Keys that have expired weak pointers.
template <class Return, class... Arguments>
typename NamedPtr<Return, Arguments...>::Ptr
NamedPtr<Return, Arguments...>::lookup_item(const Key &k)
{
// Must be called with lock held
const auto found = m_map_rep->m_map.find(k);
if (found != m_map_rep->m_map.end()) {
// Get the strong pointer back
const auto rv = found->second.lock();
if (rv) {
// Have strong pointer. Return value that shares map entry.
// (aliasing constructor: shares the Value's refcount but
// points at the contained Return object)
return shared_ptr<Return>(rv, rv->m_ret_ptr.get());
}
// Have expired weak pointer. Another thread is trying to delete it,
// but we got the lock first. Leave the map entry alone here.
// The other thread will erase it, or we will put a different entry
// in the same map entry.
}
return Ptr();
}
/// Insert the Return value of calling Func(Arguments...).
/// If the value already exists in the map, return the existing value.
/// If another thread is already running Func(Arguments...) then this thread
/// will block until the other thread finishes inserting the Return in the
/// map, and both threads will return the same Return value.
template <class Return, class... Arguments>
typename NamedPtr<Return, Arguments...>::Ptr
NamedPtr<Return, Arguments...>::insert_item(Func fn, Arguments... args)
{
Key k(args...);
// Is it already in the map?
unique_lock<mutex> lock_lookup(m_map_rep->m_mutex);
auto rv = lookup_item(k);
if (rv) {
return rv;
}
// Release map lock and acquire key lock
// (lock ordering: never hold the map lock while waiting on a key)
lock_lookup.unlock();
const auto key_lock = m_lockset.make_lock(k);
// Did item appear in map while we were waiting for key?
lock_lookup.lock();
rv = lookup_item(k);
if (rv) {
return rv;
}
// We now hold key and index locks, but item not in map (or expired).
// Release map lock so other threads can use the map
lock_lookup.unlock();
// Call the function and create a new Value outside of the map
// (fn may be slow or reentrant; it must not run under the map lock)
const auto new_value_ptr = make_shared<Value>(fn(args...), k, m_map_rep);
// Function must return a non-null pointer
THROW_CHECK0(runtime_error, new_value_ptr->m_ret_ptr);
// Reacquire index lock for map insertion. We still hold the key lock.
// Use a different lock object to make exceptions unlock in the right order
unique_lock<mutex> lock_insert(m_map_rep->m_mutex);
// Insert return value in map or overwrite existing
// empty or expired weak_ptr value.
WeakPtr &new_item_ref = m_map_rep->m_map[k];
// We searched the map while holding both locks and
// found no entry or an expired weak_ptr; therefore, no
// other thread could have inserted a new non-expired
// weak_ptr, and the weak_ptr in the map is expired
// or was default-constructed as a nullptr. So if the
// new_item_ref is not expired, we have a bug we need
// to find and fix.
assert(new_item_ref.expired());
// Update the map slot we are sure is empty
new_item_ref = new_value_ptr;
// Return shared_ptr to Return using strong pointer's reference counter
return shared_ptr<Return>(new_value_ptr, new_value_ptr->m_ret_ptr.get());
// Release map lock, then key lock
}
/// (Re)define a function to turn a name into a pointer.
template <class Return, class... Arguments>
void
NamedPtr<Return, Arguments...>::func(Func func)
{
unique_lock<mutex> lock(m_map_rep->m_mutex);
m_fn = func;
}
/// Convert a name into a pointer using the configured function.
template<class Return, class... Arguments>
typename NamedPtr<Return, Arguments...>::Ptr
NamedPtr<Return, Arguments...>::operator()(Arguments... args)
{
return insert_item(m_fn, args...);
}
/// Insert a pointer that has already been created under the
/// given name. Useful for inserting a pointer to a derived
/// class when the name doesn't contain all of the information
/// required for the object, or when the Return is already known by
/// some cheaper method than calling the function.
template<class Return, class... Arguments>
typename NamedPtr<Return, Arguments...>::Ptr
NamedPtr<Return, Arguments...>::insert(const Ptr &r, Arguments... args)
{
THROW_CHECK0(invalid_argument, r);
// Reuse insert_item's dedup/locking by wrapping r in a generator.
return insert_item([&](Arguments...) { return r; }, args...);
}
}
#endif // CRUCIBLE_NAMEDPTR_H

View File

@@ -7,12 +7,12 @@ namespace crucible {
using namespace std;
struct bits_ntoa_table {
unsigned long long n;
unsigned long long mask;
unsigned long n;
unsigned long mask;
const char *a;
};
string bits_ntoa(unsigned long long n, const bits_ntoa_table *a);
string bits_ntoa(unsigned long n, const bits_ntoa_table *a);
};
@@ -20,9 +20,9 @@ namespace crucible {
#define NTOA_TABLE_ENTRY_BITS(x) { .n = (x), .mask = (x), .a = (#x) }
// Enumerations (entire value matches all bits)
#define NTOA_TABLE_ENTRY_ENUM(x) { .n = (x), .mask = ~0ULL, .a = (#x) }
#define NTOA_TABLE_ENTRY_ENUM(x) { .n = (x), .mask = ~0UL, .a = (#x) }
// End of table (sorry, C++ didn't get C99's compound literals, so we have to write out all the member names)
// End of table (sorry, gcc doesn't implement this)
#define NTOA_TABLE_ENTRY_END() { .n = 0, .mask = 0, .a = nullptr }
#endif // CRUCIBLE_NTOA_H

View File

@@ -1,52 +0,0 @@
#ifndef CRUCIBLE_OPENAT2_H
#define CRUCIBLE_OPENAT2_H
#include <cstdlib>
// Compatibility for building on old libc for new kernel
#include <linux/version.h>
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0)
// Kernel headers are new enough to ship the openat2(2) UAPI directly.
#include <linux/openat2.h>
#else
// Older kernel headers: supply the struct and RESOLVE_* flags ourselves.
// Flag values mirror the kernel UAPI (see openat2(2)).
#include <linux/types.h>
#ifndef RESOLVE_NO_XDEV
#define RESOLVE_NO_XDEV 1
// RESOLVE_NO_XDEV was there from the beginning of openat2,
// so if that's missing, so is open_how
struct open_how {
__u64 flags;
__u64 mode;
__u64 resolve;
};
#endif
#ifndef RESOLVE_NO_MAGICLINKS
#define RESOLVE_NO_MAGICLINKS 2
#endif
#ifndef RESOLVE_NO_SYMLINKS
#define RESOLVE_NO_SYMLINKS 4
#endif
#ifndef RESOLVE_BENEATH
#define RESOLVE_BENEATH 8
#endif
#ifndef RESOLVE_IN_ROOT
#define RESOLVE_IN_ROOT 16
#endif
#endif // Linux version >= v5.6
extern "C" {
/// Weak symbol to support libc with no syscall wrapper
/// (defined elsewhere in this project; presumably falls back to
/// syscall(2) when glibc lacks openat2 -- confirm at definition site)
int openat2(int dirfd, const char *pathname, struct open_how *how, size_t size) throw();
};
#endif // CRUCIBLE_OPENAT2_H

View File

@@ -1,185 +0,0 @@
#ifndef CRUCIBLE_POOL_H
#define CRUCIBLE_POOL_H
#include "crucible/error.h"
#include <functional>
#include <list>
#include <memory>
#include <mutex>
namespace crucible {
using namespace std;
/// Storage for reusable anonymous objects that are too expensive to create and/or destroy frequently
template <class T>
class Pool {
public:
using Ptr = shared_ptr<T>;
using Generator = function<Ptr()>;
using Checker = function<void(Ptr)>;
~Pool();
Pool(Generator f = Generator(), Checker checkin = Checker(), Checker checkout = Checker());
/// Function to create new objects when Pool is empty
void generator(Generator f);
/// Optional function called when objects exit the pool (user handle is created and returned to user)
void checkout(Checker f);
/// Optional function called when objects enter the pool (last user handle is destroyed)
void checkin(Checker f);
/// Pool() returns a handle to an object of type shared_ptr<T>
Ptr operator()();
/// Destroy all objects in Pool that are not in use
void clear();
private:
/// Shared state: the free list, its mutex, and the checkin hook.
/// Held by shared_ptr so Handles can outlive the Pool itself.
struct PoolRep {
list<Ptr> m_list;
mutex m_mutex;
Checker m_checkin;
PoolRep(Checker checkin);
};
/// Refcounting proxy for one checked-out object; its destructor
/// returns the object to the pool (if the pool still exists).
struct Handle {
// weak_ptr: a destroyed Pool must not be revived by a Handle.
weak_ptr<PoolRep> m_list_rep;
Ptr m_ret_ptr;
Handle(shared_ptr<PoolRep> list_rep, Ptr ret_ptr);
~Handle();
};
Generator m_fn;
Checker m_checkout;
shared_ptr<PoolRep> m_list_rep;
};
template <class T>
Pool<T>::PoolRep::PoolRep(Checker checkin) :
m_checkin(checkin)
{
}
template <class T>
Pool<T>::Pool(Generator f, Checker checkin, Checker checkout) :
m_fn(f),
m_checkout(checkout),
m_list_rep(make_shared<PoolRep>(checkin))
{
}
template <class T>
Pool<T>::~Pool()
{
// Take the mutex before dropping our PoolRep reference so we don't
// pull the state out from under a Handle destructor in progress.
auto list_rep = m_list_rep;
unique_lock<mutex> lock(list_rep->m_mutex);
m_list_rep.reset();
}
template <class T>
Pool<T>::Handle::Handle(shared_ptr<PoolRep> list_rep, Ptr ret_ptr) :
m_list_rep(list_rep),
m_ret_ptr(ret_ptr)
{
}
template <class T>
Pool<T>::Handle::~Handle()
{
// Checkin prepares the object for storage and reuse.
// Neither of those will happen if there is no Pool.
// If the Pool was destroyed, just let m_ret_ptr expire.
auto list_rep = m_list_rep.lock();
if (!list_rep) {
return;
}
unique_lock<mutex> lock(list_rep->m_mutex);
// If a checkin function is defined, call it
// (outside the lock: user code must not run under the pool mutex)
auto checkin = list_rep->m_checkin;
if (checkin) {
lock.unlock();
checkin(m_ret_ptr);
lock.lock();
}
// Place object back in pool
list_rep->m_list.push_front(m_ret_ptr);
}
template <class T>
typename Pool<T>::Ptr
Pool<T>::operator()()
{
Ptr rv;
// Do we have an object in the pool we can return instead?
unique_lock<mutex> lock(m_list_rep->m_mutex);
if (m_list_rep->m_list.empty()) {
// No, release cache lock and call the function
lock.unlock();
// Create new value
rv = m_fn();
} else {
rv = m_list_rep->m_list.front();
m_list_rep->m_list.pop_front();
// Release lock so we don't deadlock with Handle destructor
lock.unlock();
}
// rv now points to a T object that is not in the list.
THROW_CHECK0(runtime_error, rv);
// Construct a shared_ptr for Handle which will refcount the Handle objects
// and reinsert the T into the Pool when the last Handle is destroyed.
auto hv = make_shared<Handle>(m_list_rep, rv);
// If a checkout function is defined, call it
if (m_checkout) {
m_checkout(rv);
}
// Return an aliasing shared_ptr to the T, using the Handle's refcount.
return Ptr(hv, rv.get());
}
template <class T>
void
Pool<T>::generator(Generator func)
{
unique_lock<mutex> lock(m_list_rep->m_mutex);
m_fn = func;
}
template <class T>
void
Pool<T>::checkin(Checker func)
{
unique_lock<mutex> lock(m_list_rep->m_mutex);
m_list_rep->m_checkin = func;
}
template <class T>
void
Pool<T>::checkout(Checker func)
{
unique_lock<mutex> lock(m_list_rep->m_mutex);
m_checkout = func;
}
template <class T>
void
Pool<T>::clear()
{
unique_lock<mutex> lock(m_list_rep->m_mutex);
m_list_rep->m_list.clear();
}
}
#endif // CRUCIBLE_POOL_H

View File

@@ -10,10 +10,6 @@
#include <sys/wait.h>
#include <unistd.h>
extern "C" {
pid_t gettid() throw();
};
namespace crucible {
using namespace std;
@@ -77,10 +73,6 @@ namespace crucible {
typedef ResourceHandle<Process::id, Process> Pid;
double getloadavg1();
double getloadavg5();
double getloadavg15();
string signal_ntoa(int sig);
pid_t gettid();
}
#endif // CRUCIBLE_PROCESS_H

View File

@@ -1,140 +0,0 @@
#ifndef CRUCIBLE_PROGRESS_H
#define CRUCIBLE_PROGRESS_H
#include "crucible/error.h"
#include <functional>
#include <memory>
#include <mutex>
#include <set>
#include <cassert>
namespace crucible {
using namespace std;
/// A class to track progress of multiple workers using only two points:
/// the first and last incomplete state. The first incomplete
/// state can be recorded as a checkpoint to resume later on.
/// The last completed state is the starting point for workers that
/// need something to do.
template <class T>
class ProgressTracker {
struct ProgressTrackerState;
class ProgressHolderState;
public:
using value_type = T;
using ProgressHolder = shared_ptr<ProgressHolderState>;
/// Create ProgressTracker with initial begin and end state 'v'.
ProgressTracker(const value_type &v);
/// The first incomplete state. This is not "sticky",
/// it will revert to the end state if there are no
/// items in progress.
value_type begin() const;
/// The last incomplete state. This is "sticky",
/// it can only increase and never decrease.
value_type end() const;
/// Register value 'v' as in-progress for the lifetime of the
/// returned holder; releasing the holder completes the item.
ProgressHolder hold(const value_type &v);
friend class ProgressHolderState;
private:
/// Shared state guarded by m_mutex; the set orders in-progress
/// items by (value, holder address) so duplicates of the same
/// value from different holders can coexist.
struct ProgressTrackerState {
using key_type = pair<value_type, ProgressHolderState *>;
mutex m_mutex;
set<key_type> m_in_progress;
value_type m_begin;
value_type m_end;
};
/// RAII record of one in-progress item: inserts itself on
/// construction, removes itself and updates m_begin on destruction.
class ProgressHolderState {
shared_ptr<ProgressTrackerState> m_state;
const value_type m_value;
using key_type = typename ProgressTrackerState::key_type;
public:
ProgressHolderState(shared_ptr<ProgressTrackerState> state, const value_type &v);
~ProgressHolderState();
/// The value this holder registered (immutable, no lock needed).
value_type get() const;
};
shared_ptr<ProgressTrackerState> m_state;
};
template <class T>
typename ProgressTracker<T>::value_type
ProgressTracker<T>::begin() const
{
unique_lock<mutex> lock(m_state->m_mutex);
return m_state->m_begin;
}
template <class T>
typename ProgressTracker<T>::value_type
ProgressTracker<T>::end() const
{
unique_lock<mutex> lock(m_state->m_mutex);
return m_state->m_end;
}
template <class T>
typename ProgressTracker<T>::value_type
ProgressTracker<T>::ProgressHolderState::get() const
{
return m_value;
}
template <class T>
ProgressTracker<T>::ProgressTracker(const ProgressTracker::value_type &t) :
m_state(make_shared<ProgressTrackerState>())
{
m_state->m_begin = t;
m_state->m_end = t;
}
template <class T>
ProgressTracker<T>::ProgressHolderState::ProgressHolderState(shared_ptr<ProgressTrackerState> state, const value_type &v) :
m_state(state),
m_value(v)
{
unique_lock<mutex> lock(m_state->m_mutex);
const auto rv = m_state->m_in_progress.insert(key_type(m_value, this));
// The (value, this) pair must be unique; a duplicate means the same
// holder was inserted twice.
THROW_CHECK1(runtime_error, m_value, rv.second);
// Set the beginning to the first existing in-progress item
m_state->m_begin = m_state->m_in_progress.begin()->first;
// If this value is past the end, move the end, but don't go backwards
if (m_state->m_end < m_value) {
m_state->m_end = m_value;
}
}
template <class T>
ProgressTracker<T>::ProgressHolderState::~ProgressHolderState()
{
unique_lock<mutex> lock(m_state->m_mutex);
const auto rv = m_state->m_in_progress.erase(key_type(m_value, this));
// THROW_CHECK2(runtime_error, m_value, rv, rv == 1);
// assert instead of throw: destructors must not throw
assert(rv == 1);
if (m_state->m_in_progress.empty()) {
// If we made the list empty, then m_begin == m_end
m_state->m_begin = m_state->m_end;
} else {
// If we deleted the first element, then m_begin = current first element
m_state->m_begin = m_state->m_in_progress.begin()->first;
}
}
template <class T>
shared_ptr<typename ProgressTracker<T>::ProgressHolderState>
ProgressTracker<T>::hold(const value_type &v)
{
return make_shared<ProgressHolderState>(m_state, v);
}
}
#endif // CRUCIBLE_PROGRESS_H

View File

@@ -8,7 +8,6 @@
#include <memory>
#include <mutex>
#include <iostream>
#include <stdexcept>
namespace crucible {
using namespace std;
@@ -45,29 +44,36 @@ namespace crucible {
private:
using traits_type = ResourceTraits<Key, Resource>;
using weak_ptr_type = weak_ptr<Resource>;
using map_type = map<key_type, weak_ptr_type>;
class ResourceHolder {
resource_ptr_type m_ptr;
public:
~ResourceHolder();
ResourceHolder(resource_ptr_type that);
ResourceHolder(const ResourceHolder &that) = default;
ResourceHolder(ResourceHolder &&that) = default;
ResourceHolder& operator=(ResourceHolder &&that) = default;
ResourceHolder& operator=(const ResourceHolder &that) = default;
resource_ptr_type get_resource_ptr() const;
};
using holder_ptr_type = shared_ptr<ResourceHolder>;
using weak_holder_ptr_type = weak_ptr<ResourceHolder>;
using map_type = map<key_type, weak_holder_ptr_type>;
// The only instance variable
resource_ptr_type m_ptr;
holder_ptr_type m_ptr;
// A bunch of static variables and functions
static mutex s_map_mutex;
static map_type s_map;
static resource_ptr_type insert(const key_type &key);
static resource_ptr_type insert(const resource_ptr_type &res);
static void clean_locked();
static mutex &s_mutex();
static shared_ptr<map_type> s_map();
static holder_ptr_type insert(const key_type &key);
static holder_ptr_type insert(const resource_ptr_type &res);
static void erase(const key_type &key);
static ResourceTraits<Key, Resource> s_traits;
public:
// Exceptions
struct duplicate_resource : public invalid_argument {
key_type m_key;
key_type get_key() const;
duplicate_resource(const key_type &key);
};
// test for resource. A separate operator because key_type could be confused with bool.
bool operator!() const;
@@ -83,15 +89,8 @@ namespace crucible {
ResourceHandle(const resource_ptr_type &res);
ResourceHandle& operator=(const resource_ptr_type &res);
// default construct/assign/move is public and mostly harmless
// default constructor is public
ResourceHandle() = default;
ResourceHandle(const ResourceHandle &that) = default;
ResourceHandle(ResourceHandle &&that) = default;
ResourceHandle& operator=(const ResourceHandle &that) = default;
ResourceHandle& operator=(ResourceHandle &&that) = default;
// Nontrivial destructor
~ResourceHandle();
// forward anything else to the Resource constructor
// if we can do so unambiguously
@@ -110,7 +109,7 @@ namespace crucible {
// get pointer to Resource object (nothrow, result may be null)
resource_ptr_type get_resource_ptr() const;
// this version throws
// this version throws and is probably not thread safe
resource_ptr_type operator->() const;
// dynamic casting of the resource (throws if cast fails)
@@ -146,94 +145,139 @@ namespace crucible {
}
template <class Key, class Resource>
ResourceHandle<Key, Resource>::duplicate_resource::duplicate_resource(const key_type &key) :
invalid_argument("duplicate resource"),
m_key(key)
ResourceHandle<Key, Resource>::ResourceHolder::ResourceHolder(resource_ptr_type that) :
m_ptr(that)
{
// Cannot insert ourselves here since our shared_ptr does not exist yet.
}
template <class Key, class Resource>
auto
ResourceHandle<Key, Resource>::duplicate_resource::get_key() const -> key_type
mutex &
ResourceHandle<Key, Resource>::s_mutex()
{
return m_key;
static mutex gcc_won_t_instantiate_this_either;
return gcc_won_t_instantiate_this_either;
}
template <class Key, class Resource>
shared_ptr<typename ResourceHandle<Key, Resource>::map_type>
ResourceHandle<Key, Resource>::s_map()
{
static shared_ptr<map_type> gcc_won_t_instantiate_the_damn_static_vars;
if (!gcc_won_t_instantiate_the_damn_static_vars) {
gcc_won_t_instantiate_the_damn_static_vars = make_shared<map_type>();
}
return gcc_won_t_instantiate_the_damn_static_vars;
}
template <class Key, class Resource>
void
ResourceHandle<Key, Resource>::clean_locked()
ResourceHandle<Key, Resource>::erase(const key_type &key)
{
// Must be called with lock held
for (auto i = s_map.begin(); i != s_map.end(); ) {
auto this_i = i;
++i;
if (this_i->second.expired()) {
s_map.erase(this_i);
unique_lock<mutex> lock(s_mutex());
// Resources are allowed to set their Keys to null.
if (s_traits.is_null_key(key)) {
// Clean out any dead weak_ptr objects.
for (auto i = s_map()->begin(); i != s_map()->end(); ) {
if (! (*i).second.lock()) {
i = s_map()->erase(i);
} else {
++i;
}
}
return;
}
auto erased = s_map()->erase(key);
if (erased != 1) {
cerr << __PRETTY_FUNCTION__ << ": WARNING: s_map()->erase(" << key << ") returned " << erased << " != 1" << endl;
}
}
template <class Key, class Resource>
typename ResourceHandle<Key, Resource>::resource_ptr_type
ResourceHandle<Key, Resource>::ResourceHolder::~ResourceHolder()
{
if (!m_ptr) {
// Probably something harmless like a failed constructor.
cerr << __PRETTY_FUNCTION__ << ": WARNING: destroying null m_ptr" << endl;
return;
}
Key key = s_traits.get_key(*m_ptr);
ResourceHandle::erase(key);
}
template <class Key, class Resource>
typename ResourceHandle<Key, Resource>::holder_ptr_type
ResourceHandle<Key, Resource>::insert(const key_type &key)
{
// no Resources for null keys
if (s_traits.is_null_key(key)) {
return resource_ptr_type();
return holder_ptr_type();
}
unique_lock<mutex> lock(s_map_mutex);
auto found = s_map.find(key);
if (found != s_map.end()) {
resource_ptr_type rv = found->second.lock();
unique_lock<mutex> lock(s_mutex());
// find ResourceHolder for non-null key
auto found = s_map()->find(key);
if (found != s_map()->end()) {
holder_ptr_type rv = (*found).second.lock();
// a weak_ptr may have expired
if (rv) {
// Use existing Resource
return rv;
} else {
// It's OK for the map to temporarily contain an expired weak_ptr to some dead Resource
clean_locked();
}
}
// not found or expired, throw any existing ref away and make a new one
resource_ptr_type rpt = s_traits.make_resource(key);
holder_ptr_type hpt = make_shared<ResourceHolder>(rpt);
// store weak_ptr in map
s_map[key] = rpt;
(*s_map())[key] = hpt;
// return shared_ptr
return rpt;
return hpt;
};
template <class Key, class Resource>
typename ResourceHandle<Key, Resource>::resource_ptr_type
typename ResourceHandle<Key, Resource>::holder_ptr_type
ResourceHandle<Key, Resource>::insert(const resource_ptr_type &res)
{
// no Resources for null keys
// no Resource, no ResourceHolder.
if (!res) {
return resource_ptr_type();
return holder_ptr_type();
}
// no ResourceHolders for null keys either.
key_type key = s_traits.get_key(*res);
if (s_traits.is_null_key(key)) {
return resource_ptr_type();
return holder_ptr_type();
}
unique_lock<mutex> lock(s_map_mutex);
// find Resource for non-null key
auto found = s_map.find(key);
if (found != s_map.end()) {
resource_ptr_type rv = found->second.lock();
// It's OK for the map to temporarily contain an expired weak_ptr to some dead Resource...
unique_lock<mutex> lock(s_mutex());
// find ResourceHolder for non-null key
auto found = s_map()->find(key);
if (found != s_map()->end()) {
holder_ptr_type rv = (*found).second.lock();
// The map doesn't own the ResourceHolders, the ResourceHandles do.
// It's OK for the map to contain an expired weak_ptr to some dead ResourceHolder...
if (rv) {
// ...but not a duplicate Resource.
if (rv.owner_before(res) || res.owner_before(rv)) {
throw duplicate_resource(key);
// found ResourceHolder, look at pointer
resource_ptr_type rp = rv->get_resource_ptr();
// We do not store references to null Resources.
assert(rp);
// Key retrieved for an existing object must match key searched or be null.
key_type found_key = s_traits.get_key(*rp);
bool found_key_is_null = s_traits.is_null_key(found_key);
assert(found_key_is_null || found_key == key);
if (!found_key_is_null) {
// We do not store references to duplicate resources.
if (rp.owner_before(res) || res.owner_before(rp)) {
cerr << "inserting new Resource with existing Key " << key << " not allowed at " << __PRETTY_FUNCTION__ << endl;;
abort();
// THROW_ERROR(out_of_range, "inserting new Resource with existing Key " << key << " not allowed at " << __PRETTY_FUNCTION__);
}
// rv is good, return it
return rv;
}
// Use the existing Resource (discard the caller's).
return rv;
} else {
// Clean out expired weak_ptrs
clean_locked();
}
}
// not found or expired, make a new one or replace old one
s_map[key] = res;
return res;
// not found or expired, make a new one
holder_ptr_type rv = make_shared<ResourceHolder>(res);
s_map()->insert(make_pair(key, weak_holder_ptr_type(rv)));
// no need to check s_map result, we are either replacing a dead weak_ptr or adding a new one
return rv;
};
template <class Key, class Resource>
@@ -265,47 +309,31 @@ namespace crucible {
}
template <class Key, class Resource>
ResourceHandle<Key, Resource>::~ResourceHandle()
typename ResourceHandle<Key, Resource>::resource_ptr_type
ResourceHandle<Key, Resource>::ResourceHolder::get_resource_ptr() const
{
// No pointer, nothing to do
if (!m_ptr) {
return;
}
// Save key so we can clean the map
auto key = s_traits.get_key(*m_ptr);
// Save a weak_ptr so we can tell if we need to clean the map
weak_ptr_type wp = m_ptr;
// Drop shared_ptr
m_ptr.reset();
// If there are still other references to the shared_ptr, we can stop now
if (!wp.expired()) {
return;
}
// Remove weak_ptr from map if it has expired
// (and not been replaced in the meantime)
unique_lock<mutex> lock_map(s_map_mutex);
auto found = s_map.find(key);
// Map entry may have been replaced, so check for expiry again
if (found != s_map.end() && found->second.expired()) {
s_map.erase(key);
}
return m_ptr;
}
template <class Key, class Resource>
typename ResourceHandle<Key, Resource>::resource_ptr_type
ResourceHandle<Key, Resource>::get_resource_ptr() const
{
return m_ptr;
if (!m_ptr) {
return resource_ptr_type();
}
return m_ptr->get_resource_ptr();
}
template <class Key, class Resource>
typename ResourceHandle<Key, Resource>::resource_ptr_type
ResourceHandle<Key, Resource>::operator->() const
{
if (!m_ptr) {
resource_ptr_type rp = get_resource_ptr();
if (!rp) {
THROW_ERROR(out_of_range, __PRETTY_FUNCTION__ << " called on null Resource");
}
return m_ptr;
return rp;
}
template <class Key, class Resource>
@@ -314,10 +342,11 @@ namespace crucible {
ResourceHandle<Key, Resource>::cast() const
{
shared_ptr<T> dp;
if (!m_ptr) {
resource_ptr_type rp = get_resource_ptr();
if (!rp) {
return dp;
}
dp = dynamic_pointer_cast<T>(m_ptr);
dp = dynamic_pointer_cast<T>(rp);
if (!dp) {
throw bad_cast();
}
@@ -328,10 +357,11 @@ namespace crucible {
typename ResourceHandle<Key, Resource>::key_type
ResourceHandle<Key, Resource>::get_key() const
{
if (!m_ptr) {
resource_ptr_type rp = get_resource_ptr();
if (!rp) {
return s_traits.get_null_key();
} else {
return s_traits.get_key(*m_ptr);
return s_traits.get_key(*rp);
}
}
@@ -348,15 +378,9 @@ namespace crucible {
return s_traits.is_null_key(operator key_type());
}
// Apparently GCC wants these to be used before they are defined.
template <class Key, class Resource>
ResourceTraits<Key, Resource> ResourceHandle<Key, Resource>::s_traits;
template <class Key, class Resource>
mutex ResourceHandle<Key, Resource>::s_map_mutex;
template <class Key, class Resource>
typename ResourceHandle<Key, Resource>::map_type ResourceHandle<Key, Resource>::s_map;
}

View File

@@ -1,158 +0,0 @@
#ifndef _CRUCIBLE_SEEKER_H_
#define _CRUCIBLE_SEEKER_H_
#include "crucible/error.h"
#include <algorithm>
#include <limits>
// Debug stream
#include <memory>
#include <iostream>
#include <sstream>
#include <cstdint>
namespace crucible {
using namespace std;
extern thread_local shared_ptr<ostream> tl_seeker_debug_str;
#define SEEKER_DEBUG_LOG(__x) do { \
if (tl_seeker_debug_str) { \
(*tl_seeker_debug_str) << __x << "\n"; \
} \
} while (false)
// Requirements for Container<Pos> Fetch(Pos lower, Pos upper):
// - fetches objects in Pos order, starting from lower (must be >= lower)
// - must return upper if present, may or may not return objects after that
// - returns a container of Pos objects with begin(), end(), rbegin(), rend()
// - container must iterate over objects in Pos order
// - uniqueness of Pos objects not required
// - should store the underlying data as a side effect
//
// Requirements for Pos:
// - should behave like an unsigned integer type
// - must have specializations in numeric_limits<T> for digits, max(), min()
// - must support +, -, -=, and related operators
// - must support <, <=, ==, and related operators
// - must support Pos / 2 (only)
//
// Requirements for seek_backward:
// - calls Fetch to search Pos space near target_pos
// - if no key exists with value <= target_pos, returns the minimum Pos value
// - returns the highest key value <= target_pos
// - returned key value may not be part of most recent Fetch result
// - 1 loop iteration when target_pos exists
// Find the highest key that is <= target_pos, given only a "fetch"
// callback that returns an ordered container of keys >= its probe
// argument.  Returns begin_pos (numeric_limits<Pos>::min()) when no such
// key exists.  min_step is the smallest backward step used while probing
// for the initial lower bound; max_loops caps the total number of fetch
// calls.  If the loop budget is exhausted the function falls through to
// the THROW_ERROR at the bottom, which indicates a misbehaving fetch.
template <class Fetch, class Pos = uint64_t>
Pos
seek_backward(Pos const target_pos, Fetch fetch, Pos min_step = 1, size_t max_loops = numeric_limits<size_t>::max())
{
static const Pos end_pos = numeric_limits<Pos>::max();
// TBH this probably won't work if begin_pos != 0, i.e. any signed type
static const Pos begin_pos = numeric_limits<Pos>::min();
// Run a binary search looking for the highest key below target_pos.
// Initial upper bound of the search is target_pos.
// Find initial lower bound by doubling the size of the range until a key below target_pos
// is found, or the lower bound reaches the beginning of the search space.
// If the lower bound search reaches the beginning of the search space without finding a key,
// return the beginning of the search space; otherwise, perform a binary search between
// the bounds now established.
Pos lower_bound = 0;
Pos upper_bound = target_pos;
bool found_low = false;
Pos probe_pos = target_pos;
// We need one loop for each bit of the search space to find the lower bound,
// one loop for each bit of the search space to find the upper bound,
// and one extra loop to confirm the boundary is correct.
for (size_t loop_count = min((1 + numeric_limits<Pos>::digits) * size_t(2), max_loops); loop_count; --loop_count) {
SEEKER_DEBUG_LOG("fetch(probe_pos = " << probe_pos << ", target_pos = " << target_pos << ")");
auto result = fetch(probe_pos, target_pos);
const Pos low_pos = result.empty() ? end_pos : *result.begin();
const Pos high_pos = result.empty() ? end_pos : *result.rbegin();
SEEKER_DEBUG_LOG(" = " << low_pos << ".." << high_pos);
// check for correct behavior of the fetch function
THROW_CHECK2(out_of_range, high_pos, probe_pos, probe_pos <= high_pos);
THROW_CHECK2(out_of_range, low_pos, probe_pos, probe_pos <= low_pos);
THROW_CHECK2(out_of_range, low_pos, high_pos, low_pos <= high_pos);
if (!found_low) {
// if target_pos == end_pos then we will find it in every empty result set,
// so in that case we force the lower bound to be lower than end_pos
if ((target_pos == end_pos) ? (low_pos < target_pos) : (low_pos <= target_pos)) {
// found a lower bound, set the low bound there and switch to binary search
found_low = true;
lower_bound = low_pos;
SEEKER_DEBUG_LOG("found_low = true, lower_bound = " << lower_bound);
} else {
// still looking for lower bound
// if probe_pos was begin_pos then we can stop with no result
if (probe_pos == begin_pos) {
SEEKER_DEBUG_LOG("return: probe_pos == begin_pos " << begin_pos);
return begin_pos;
}
// double the range size, or use the distance between objects found so far
THROW_CHECK2(out_of_range, upper_bound, probe_pos, probe_pos <= upper_bound);
// already checked low_pos <= high_pos above
const Pos want_delta = max(upper_bound - probe_pos, min_step);
// avoid underflowing the beginning of the search space
const Pos have_delta = min(want_delta, probe_pos - begin_pos);
THROW_CHECK2(out_of_range, want_delta, have_delta, have_delta <= want_delta);
// move probe and try again
probe_pos = probe_pos - have_delta;
SEEKER_DEBUG_LOG("probe_pos " << probe_pos << " = probe_pos - have_delta " << have_delta << " (want_delta " << want_delta << ")");
continue;
}
}
if (low_pos <= target_pos && target_pos <= high_pos) {
// have keys on either side of target_pos in result
// search from the high end until we find the highest key below target
for (auto i = result.rbegin(); i != result.rend(); ++i) {
// more correctness checking for fetch
THROW_CHECK2(out_of_range, *i, probe_pos, probe_pos <= *i);
if (*i <= target_pos) {
SEEKER_DEBUG_LOG("return: *i " << *i << " <= target_pos " << target_pos);
return *i;
}
}
// if the list is empty then low_pos = high_pos = end_pos
// if target_pos = end_pos also, then we will execute the loop
// above but not find any matching entries.
THROW_CHECK0(runtime_error, result.empty());
}
if (target_pos <= low_pos) {
// results are all too high, so probe_pos..low_pos is too high
// lower the high bound to the probe pos, low_pos cannot be lower
SEEKER_DEBUG_LOG("upper_bound = probe_pos " << probe_pos);
upper_bound = probe_pos;
}
if (high_pos < target_pos) {
// results are all too low, so probe_pos..high_pos is too low
// raise the low bound to high_pos but not above upper_bound
const auto next_pos = min(high_pos, upper_bound);
SEEKER_DEBUG_LOG("lower_bound = next_pos " << next_pos);
lower_bound = next_pos;
}
// compute a new probe pos at the middle of the range and try again
// we can't have a zero-size range here because we would not have set found_low yet
THROW_CHECK2(out_of_range, lower_bound, upper_bound, lower_bound <= upper_bound);
const Pos delta = (upper_bound - lower_bound) / 2;
probe_pos = lower_bound + delta;
if (delta < 1) {
// nothing can exist in the range (lower_bound, upper_bound)
// and an object is known to exist at lower_bound
SEEKER_DEBUG_LOG("return: probe_pos == lower_bound " << lower_bound);
return lower_bound;
}
THROW_CHECK2(out_of_range, lower_bound, probe_pos, lower_bound <= probe_pos);
THROW_CHECK2(out_of_range, upper_bound, probe_pos, probe_pos <= upper_bound);
SEEKER_DEBUG_LOG("loop bottom: lower_bound " << lower_bound << ", probe_pos " << probe_pos << ", upper_bound " << upper_bound);
}
THROW_ERROR(runtime_error, "FIXME: should not reach this line: "
"lower_bound..upper_bound " << lower_bound << ".." << upper_bound << ", "
"found_low " << found_low);
}
}
#endif // _CRUCIBLE_SEEKER_H_

View File

@@ -11,6 +11,23 @@
namespace crucible {
using namespace std;
// Fill a trivially-copyable object (typically a C struct used as a
// base class) with zero bytes.
template <class Base>
void
memset_zero(Base *that)
{
	memset(static_cast<void *>(that), 0, sizeof(*that));
}
// Serialize a trivially-copyable object (typically a C struct used as a
// base class) into a vector<char> holding its raw bytes.
template <class Base>
std::vector<char>
vector_copy_struct(Base *that)
{
	const char *first = reinterpret_cast<const char *>(static_cast<const Base *>(that));
	const char *last = first + sizeof(Base);
	return std::vector<char>(first, last);
}
// int->hex conversion with sprintf
string to_hex(uint64_t i);
@@ -43,7 +60,7 @@ namespace crucible {
ptrdiff_t
pointer_distance(const P1 *a, const P2 *b)
{
return reinterpret_cast<const uint8_t *>(a) - reinterpret_cast<const uint8_t *>(b);
return reinterpret_cast<const char *>(a) - reinterpret_cast<const char *>(b);
}
};

View File

@@ -1,106 +0,0 @@
#ifndef CRUCIBLE_TABLE_H
#define CRUCIBLE_TABLE_H
#include <functional>
#include <limits>
#include <map>
#include <memory>
#include <ostream>
#include <sstream>
#include <string>
#include <vector>
namespace crucible {
namespace Table {
using namespace std;
using Content = function<string(size_t width, size_t height)>;
const size_t endpos = numeric_limits<size_t>::max();
Content Fill(const char c);
Content Text(const string& s);
template <class T>
Content Number(const T& num)
{
ostringstream oss;
oss << num;
return Text(oss.str());
}
class Cell {
Content m_content;
public:
Cell(const Content &fn = [](size_t, size_t) { return string(); } );
Cell& operator=(const Content &fn);
string text(size_t width, size_t height) const;
};
class Dimension {
size_t m_next_pos = 0;
vector<size_t> m_elements;
friend class Table;
size_t at(size_t) const;
public:
size_t size() const;
size_t insert(size_t pos);
void erase(size_t pos);
};
class Table {
Dimension m_rows, m_cols;
map<pair<size_t, size_t>, Cell> m_cells;
string m_left = "|";
string m_mid = "|";
string m_right = "|";
public:
Dimension &rows();
const Dimension& rows() const;
Dimension &cols();
const Dimension& cols() const;
Cell& at(size_t row, size_t col);
const Cell& at(size_t row, size_t col) const;
template <class T> void insert_row(size_t pos, const T& container);
template <class T> void insert_col(size_t pos, const T& container);
void left(const string &s);
void mid(const string &s);
void right(const string &s);
const string& left() const;
const string& mid() const;
const string& right() const;
};
ostream& operator<<(ostream &os, const Table &table);
// Insert a new row at pos, filling its cells from the given container.
// Columns are created on demand when the container has more elements
// than the table currently has columns.
template <class T>
void
Table::insert_row(size_t pos, const T& container)
{
	const auto inserted_row = m_rows.insert(pos);
	size_t column = 0;
	for (const auto &cell : container) {
		if (column >= cols().size()) {
			cols().insert(column);
		}
		at(inserted_row, column) = cell;
		++column;
	}
}
// Insert a new column at pos, filling its cells from the given container.
// Rows are created on demand when the container has more elements than
// the table currently has rows.
template <class T>
void
Table::insert_col(size_t pos, const T& container)
{
	const auto inserted_col = m_cols.insert(pos);
	size_t row_index = 0;
	for (const auto &cell : container) {
		if (row_index >= rows().size()) {
			rows().insert(row_index);
		}
		at(row_index, inserted_col) = cell;
		++row_index;
	}
}
}
}
#endif // CRUCIBLE_TABLE_H

View File

@@ -1,188 +0,0 @@
#ifndef CRUCIBLE_TASK_H
#define CRUCIBLE_TASK_H
#include <functional>
#include <memory>
#include <mutex>
#include <ostream>
#include <string>
namespace crucible {
using namespace std;
class TaskState;
using TaskId = uint64_t;
/// A unit of work to be scheduled by TaskMaster.
class Task {
shared_ptr<TaskState> m_task_state;
Task(shared_ptr<TaskState> pts);
public:
/// Create empty Task object.
Task() = default;
/// Create Task object containing closure and description.
Task(string title, function<void()> exec_fn);
/// Schedule Task for at most one future execution.
/// May run Task in current thread or in other thread.
/// May run Task before or after returning.
/// Schedules Task at the end of the global execution queue.
///
/// Only one instance of a Task may execute at a time.
/// If a Task is already scheduled, run() does nothing.
/// If a Task is already running when a new instance reaches
/// the front of the queue, the new instance will execute
/// after the current instance exits.
void run() const;
/// Schedule task to run when no other Task is available.
void idle() const;
/// Schedule Task to run after this Task has run or
/// been destroyed.
void append(const Task &task) const;
/// Schedule Task to run after this Task has run or
/// been destroyed, in Task ID order.
void insert(const Task &task) const;
/// Describe Task as text.
string title() const;
/// Returns currently executing task if called from exec_fn.
/// Usually used to reschedule the currently executing Task.
static Task current_task();
/// Returns number of currently existing Task objects.
/// Good for spotting leaks.
static size_t instance_count();
/// Ordering operator for containers
bool operator<(const Task &that) const;
/// Null test
operator bool() const;
/// Unique non-repeating(ish) ID for task
TaskId id() const;
};
ostream &operator<<(ostream &os, const Task &task);
class TaskMaster {
public:
/// Blocks until the running thread count reaches this number
static void set_thread_count(size_t threads);
/// Sets minimum thread count when load average tracking enabled
static void set_thread_min_count(size_t min_threads);
/// Calls set_thread_count with default
static void set_thread_count();
/// Creates thread to track load average and adjust thread count dynamically
static void set_loadavg_target(double target);
/// Writes the current non-executing Task queue
static ostream & print_queue(ostream &);
/// Writes the current executing Task for each worker
static ostream & print_workers(ostream &);
/// Gets the current number of queued Tasks
static size_t get_queue_count();
/// Gets the current number of active workers
static size_t get_thread_count();
/// Gets the current load tracking statistics
struct LoadStats {
/// Current load extracted from last two 5-second load average samples
double current_load;
/// Target thread count computed from previous thread count and current load
double thread_target;
/// Load average for last 60 seconds
double loadavg;
};
static LoadStats get_current_load();
/// Drop the current queue and discard new Tasks without
/// running them. Currently executing tasks are not
/// affected (use set_thread_count(0) to wait for those
/// to complete).
static void cancel();
/// Stop running any new Tasks. All existing
/// Consumer threads will exit. Does not affect queue.
/// Does not wait for threads to exit. Reversible.
static void pause(bool paused = true);
};
class BarrierState;
/// Barrier delays the execution of one or more Tasks.
/// The Tasks are executed when the last shared reference to the
/// BarrierState is released. Copies of Barrier objects refer
/// to the same Barrier state.
class Barrier {
shared_ptr<BarrierState> m_barrier_state;
public:
Barrier();
/// Schedule a task for execution when last Barrier is released.
void insert_task(Task t);
/// Release this reference to the barrier state.
/// Last released reference executes the task.
/// Barrier can only be released once, after which the
/// object can no longer be used.
void release();
};
class ExclusionLock {
shared_ptr<Task> m_owner;
ExclusionLock(shared_ptr<Task> owner);
friend class Exclusion;
public:
/// Explicit default constructor because we have other kinds
ExclusionLock() = default;
/// Release this Lock immediately and permanently
void release();
/// Test for locked state
operator bool() const;
};
class Exclusion {
mutex m_mutex;
weak_ptr<Task> m_owner;
public:
/// Attempt to obtain a Lock. If successful, current Task
/// owns the Lock until the ExclusionLock is released
/// (it is the ExclusionLock that owns the lock, so it can
/// be passed to other Tasks or threads, but this is not
/// recommended practice).
/// If not successful, the argument Task is appended to the
/// task that currently holds the lock. Current task is
/// expected to immediately release any other ExclusionLock
/// objects it holds, and exit its Task function.
ExclusionLock try_lock(const Task &task);
};
/// Wrapper around pthread_setname_np which handles length limits
void pthread_setname(const string &name);
/// Wrapper around pthread_getname_np for symmetry
string pthread_getname();
}
#endif // CRUCIBLE_TASK_H

View File

@@ -4,8 +4,6 @@
#include "crucible/error.h"
#include <chrono>
#include <condition_variable>
#include <limits>
#include <mutex>
#include <ostream>
@@ -19,9 +17,10 @@ namespace crucible {
public:
Timer();
double age() const;
chrono::high_resolution_clock::time_point get() const;
double report(int precision = 1000) const;
void reset();
void set(const chrono::high_resolution_clock::time_point &start);
void set(double delta);
double lap();
bool operator<(double d) const;
bool operator>(double d) const;
@@ -33,78 +32,18 @@ namespace crucible {
Timer m_timer;
double m_rate;
double m_burst;
double m_tokens = 0.0;
mutable mutex m_mutex;
double m_tokens;
mutex m_mutex;
void update_tokens();
RateLimiter() = delete;
public:
RateLimiter(double rate, double burst);
RateLimiter(double rate);
void sleep_for(double cost = 1.0);
double sleep_time(double cost = 1.0);
bool is_ready();
void borrow(double cost = 1.0);
void rate(double new_rate);
double rate() const;
};
class RateEstimator {
mutable mutex m_mutex;
mutable condition_variable m_condvar;
Timer m_timer;
double m_num = 0.0;
double m_den = 0.0;
uint64_t m_last_count = numeric_limits<uint64_t>::max();
Timer m_last_update;
const double m_decay = 0.99;
Timer m_last_decay;
double m_min_delay;
double m_max_delay;
chrono::duration<double> duration_unlocked(uint64_t relative_count) const;
chrono::high_resolution_clock::time_point time_point_unlocked(uint64_t absolute_count) const;
double rate_unlocked() const;
pair<double, double> ratio_unlocked() const;
void update_unlocked(uint64_t new_count);
public:
RateEstimator(double min_delay = 1, double max_delay = 3600);
// Block until count reached
void wait_for(uint64_t new_count_relative) const;
void wait_until(uint64_t new_count_absolute) const;
// Computed rates and ratios
double rate() const;
pair<double, double> ratio() const;
// Inspect raw num/den
pair<double, double> raw() const;
// Write count
void update(uint64_t new_count);
// Ignore counts that go backwards
void update_monotonic(uint64_t new_count);
// Read count
uint64_t count() const;
/// Increment count (like update(count() + more), but atomic)
void increment(uint64_t more = 1);
// Convert counts to chrono types
chrono::high_resolution_clock::time_point time_point(uint64_t absolute_count) const;
chrono::duration<double> duration(uint64_t relative_count) const;
// Polling delay until count reached (limited by min/max delay)
double seconds_for(uint64_t new_count_relative) const;
double seconds_until(uint64_t new_count_absolute) const;
};
ostream &
operator<<(ostream &os, const RateEstimator &re);
}
#endif // CRUCIBLE_TIME_H

View File

@@ -0,0 +1,188 @@
#ifndef CRUCIBLE_TIMEQUEUE_H
#define CRUCIBLE_TIMEQUEUE_H
#include <crucible/error.h>
#include <crucible/time.h>
#include <condition_variable>
#include <limits>
#include <list>
#include <memory>
#include <mutex>
#include <set>
namespace crucible {
using namespace std;
// Queue of Tasks ordered by wakeup timestamp, with blocking push/pop and
// an optional maximum depth.  Ties on timestamp are broken by insertion
// order (see Item::operator<), so equal-time tasks are popped FIFO.
template <class Task>
class TimeQueue {
public:
using Timestamp = chrono::high_resolution_clock::time_point;
private:
// One queued task plus its scheduled time and a sequence number used
// as a tie-breaker between items with equal timestamps.
struct Item {
Timestamp m_time;
unsigned m_id;
Task m_task;
// Order by time first, then by insertion id for equal times.
bool operator<(const Item &that) const {
if (m_time < that.m_time) return true;
if (that.m_time < m_time) return false;
return m_id < that.m_id;
}
// Shared sequence counter.  NOTE(review): incremented in the Item
// constructor with no locking of its own; appears safe only because
// Items are constructed under m_mutex in push()/push_nowait() --
// confirm there are no other construction sites.
static unsigned s_id;
Item(const Timestamp &time, const Task& task) :
m_time(time),
m_id(++s_id),
m_task(task)
{
}
};
set<Item> m_set; // pending items, ordered by (time, id); guarded by m_mutex
mutable mutex m_mutex;
condition_variable m_cond_full, m_cond_empty;
size_t m_max_queue_depth; // push() blocks while size exceeds this
public:
~TimeQueue();
TimeQueue(size_t max_queue_depth = numeric_limits<size_t>::max());
// Enqueue task to become due `delay` seconds from now; blocks while full.
void push(const Task &task, double delay = 0);
// Enqueue without honoring m_max_queue_depth.
void push_nowait(const Task &task, double delay = 0);
// Block until the earliest item is due, then remove and return it.
Task pop();
// Non-blocking pop; returns false when nothing is due yet.
bool pop_nowait(Task &t);
// Seconds until the earliest item is due (infinity when empty).
double when() const;
size_t size() const;
bool empty() const;
// Copy up to count tasks in due order without removing them.
list<Task> peek(size_t count) const;
};
template <class Task> unsigned TimeQueue<Task>::Item::s_id = 0;
template <class Task>
TimeQueue<Task>::~TimeQueue()
{
if (!m_set.empty()) {
cerr << "ERROR: " << m_set.size() << " locked items still in TimeQueue at destruction" << endl;
}
}
template <class Task>
void
TimeQueue<Task>::push(const Task &task, double delay)
{
Timestamp time = chrono::high_resolution_clock::now() +
chrono::duration_cast<chrono::high_resolution_clock::duration>(chrono::duration<double>(delay));
unique_lock<mutex> lock(m_mutex);
while (m_set.size() > m_max_queue_depth) {
m_cond_full.wait(lock);
}
m_set.insert(Item(time, task));
m_cond_empty.notify_all();
}
template <class Task>
void
TimeQueue<Task>::push_nowait(const Task &task, double delay)
{
Timestamp time = chrono::high_resolution_clock::now() +
chrono::duration_cast<chrono::high_resolution_clock::duration>(chrono::duration<double>(delay));
unique_lock<mutex> lock(m_mutex);
m_set.insert(Item(time, task));
m_cond_empty.notify_all();
}
// Block until the earliest queued item's timestamp has passed, then
// remove it from the set and return its task.
template <class Task>
Task
TimeQueue<Task>::pop()
{
unique_lock<mutex> lock(m_mutex);
while (1) {
while (m_set.empty()) {
m_cond_empty.wait(lock);
}
Timestamp now = chrono::high_resolution_clock::now();
// NOTE(review): strict '>' means an item whose timestamp equals
// 'now' exactly is not yet eligible and forces one more
// wait_until pass before it can be returned.
if (now > m_set.begin()->m_time) {
Task rv = m_set.begin()->m_task;
m_set.erase(m_set.begin());
m_cond_full.notify_all();
return rv;
}
// Sleep until the head item is due; pushes notify m_cond_empty,
// so an earlier-due item inserted meanwhile wakes us sooner.
m_cond_empty.wait_until(lock, m_set.begin()->m_time);
}
}
template <class Task>
bool
TimeQueue<Task>::pop_nowait(Task &t)
{
unique_lock<mutex> lock(m_mutex);
if (m_set.empty()) {
return false;
}
Timestamp now = chrono::high_resolution_clock::now();
if (now <= m_set.begin()->m_time) {
return false;
}
t = m_set.begin()->m_task;
m_set.erase(m_set.begin());
m_cond_full.notify_all();
return true;
}
template <class Task>
double
TimeQueue<Task>::when() const
{
unique_lock<mutex> lock(m_mutex);
if (m_set.empty()) {
return numeric_limits<double>::infinity();
}
return chrono::duration<double>(m_set.begin()->m_time - chrono::high_resolution_clock::now()).count();
}
template <class Task>
size_t
TimeQueue<Task>::size() const
{
unique_lock<mutex> lock(m_mutex);
return m_set.size();
}
template <class Task>
bool
TimeQueue<Task>::empty() const
{
unique_lock<mutex> lock(m_mutex);
return m_set.empty();
}
template <class Task>
list<Task>
TimeQueue<Task>::peek(size_t count) const
{
unique_lock<mutex> lock(m_mutex);
list<Task> rv;
auto it = m_set.begin();
while (count-- && it != m_set.end()) {
rv.push_back(it->m_task);
++it;
}
return rv;
}
template <class Task>
TimeQueue<Task>::TimeQueue(size_t max_depth) :
m_max_queue_depth(max_depth)
{
}
}
#endif // CRUCIBLE_TIMEQUEUE_H

View File

@@ -1,14 +0,0 @@
#ifndef CRUCIBLE_UNAME_H
#define CRUCIBLE_UNAME_H
#include <sys/utsname.h>
namespace crucible {
using namespace std;
// Thin wrapper over struct utsname.  The constructor is defined
// elsewhere and presumably populates the inherited fields via
// uname(2) -- TODO confirm against the implementation file.
struct Uname : public utsname {
Uname();
};
}
#endif

14
include/crucible/uuid.h Normal file
View File

@@ -0,0 +1,14 @@
#ifndef CRUCIBLE_UUID_H
#define CRUCIBLE_UUID_H
#include <string>
#include <uuid/uuid.h>
namespace crucible {
using namespace std;
// Convert a 16-byte binary UUID to a string.  Presumably wraps
// libuuid's uuid_unparse (see <uuid/uuid.h> above); the definition is
// not visible here -- confirm against the implementation file.
string uuid_unparse(const unsigned char a[16]);
}
#endif // CRUCIBLE_UUID_H

View File

@@ -1,8 +0,0 @@
#ifndef CRUCIBLE_VERSION_H
#define CRUCIBLE_VERSION_H
namespace crucible {
	// Project version string; the definition is provided elsewhere in
	// the build (not visible in this header).
	extern const char *VERSION;
}
// Fix: ISO C++ forbids extra tokens after #endif; the repeated guard
// name must live in a comment (gcc/clang warn with -Wendif-labels).
#endif // CRUCIBLE_VERSION_H

View File

@@ -0,0 +1,189 @@
#ifndef CRUCIBLE_WORKQUEUE_H
#define CRUCIBLE_WORKQUEUE_H
#include <crucible/error.h>
#include <condition_variable>
#include <limits>
#include <list>
#include <memory>
#include <mutex>
#include <set>
namespace crucible {
using namespace std;
template <class Task>
class WorkQueue {
public:
using set_type = set<Task>;
using key_type = Task;
private:
set_type m_set;
mutable mutex m_mutex;
condition_variable m_cond_full, m_cond_empty;
size_t m_max_queue_depth;
public:
~WorkQueue();
template <class... Args> WorkQueue(size_t max_queue_depth, Args... args);
template <class... Args> WorkQueue(Args... args);
void push(const key_type &name);
void push_wait(const key_type &name, size_t limit);
void push_nowait(const key_type &name);
key_type pop();
bool pop_nowait(key_type &rv);
key_type peek();
size_t size() const;
bool empty();
set_type copy();
list<Task> peek(size_t count) const;
};
template <class Task>
WorkQueue<Task>::~WorkQueue()
{
if (!m_set.empty()) {
cerr << "ERROR: " << m_set.size() << " locked items still in WorkQueue " << this << " at destruction" << endl;
}
}
// Insert name into the queue, blocking while the queue is over capacity.
// A name already present in the set is admitted without blocking (the
// set insert is a no-op for duplicates anyway).
template <class Task>
void
WorkQueue<Task>::push(const key_type &name)
{
unique_lock<mutex> lock(m_mutex);
// NOTE(review): this waits while size() > m_max_queue_depth, but
// push_wait() uses >= limit -- one of the two looks off by one;
// confirm the intended capacity semantics.
while (!m_set.count(name) && m_set.size() > m_max_queue_depth) {
m_cond_full.wait(lock);
}
m_set.insert(name);
m_cond_empty.notify_all();
}
template <class Task>
void
WorkQueue<Task>::push_wait(const key_type &name, size_t limit)
{
unique_lock<mutex> lock(m_mutex);
while (!m_set.count(name) && m_set.size() >= limit) {
m_cond_full.wait(lock);
}
m_set.insert(name);
m_cond_empty.notify_all();
}
template <class Task>
void
WorkQueue<Task>::push_nowait(const key_type &name)
{
unique_lock<mutex> lock(m_mutex);
m_set.insert(name);
m_cond_empty.notify_all();
}
template <class Task>
typename WorkQueue<Task>::key_type
WorkQueue<Task>::pop()
{
unique_lock<mutex> lock(m_mutex);
while (m_set.empty()) {
m_cond_empty.wait(lock);
}
key_type rv = *m_set.begin();
m_set.erase(m_set.begin());
m_cond_full.notify_all();
return rv;
}
template <class Task>
bool
WorkQueue<Task>::pop_nowait(key_type &rv)
{
unique_lock<mutex> lock(m_mutex);
if (m_set.empty()) {
return false;
}
rv = *m_set.begin();
m_set.erase(m_set.begin());
m_cond_full.notify_all();
return true;
}
template <class Task>
typename WorkQueue<Task>::key_type
WorkQueue<Task>::peek()
{
unique_lock<mutex> lock(m_mutex);
if (m_set.empty()) {
return key_type();
} else {
return *m_set.begin();
}
}
template <class Task>
size_t
WorkQueue<Task>::size() const
{
unique_lock<mutex> lock(m_mutex);
return m_set.size();
}
template <class Task>
bool
WorkQueue<Task>::empty()
{
unique_lock<mutex> lock(m_mutex);
return m_set.empty();
}
template <class Task>
typename WorkQueue<Task>::set_type
WorkQueue<Task>::copy()
{
unique_lock<mutex> lock(m_mutex);
return m_set;
}
template <class Task>
list<Task>
WorkQueue<Task>::peek(size_t count) const
{
unique_lock<mutex> lock(m_mutex);
list<Task> rv;
for (auto i : m_set) {
if (count--) {
rv.push_back(i);
} else {
break;
}
}
return rv;
}
template <class Task>
template <class... Args>
WorkQueue<Task>::WorkQueue(Args... args) :
m_set(args...),
m_max_queue_depth(numeric_limits<size_t>::max())
{
}
template <class Task>
template <class... Args>
WorkQueue<Task>::WorkQueue(size_t max_depth, Args... args) :
m_set(args...),
m_max_queue_depth(max_depth)
{
}
}
#endif // CRUCIBLE_WORKQUEUE_H

1
lib/.gitignore vendored
View File

@@ -1 +0,0 @@
.version.*

View File

@@ -1,45 +1,37 @@
default: libcrucible.a
%.a: Makefile
default: libcrucible.so
CRUCIBLE_OBJS = \
bytevector.o \
btrfs-tree.o \
chatter.o \
city.o \
cleanup.o \
OBJS = \
crc64.o \
chatter.o \
error.o \
execpipe.o \
extentwalker.o \
fd.o \
fs.o \
multilock.o \
interp.o \
ntoa.o \
openat2.o \
path.o \
process.o \
seeker.o \
string.o \
table.o \
task.o \
time.o \
uname.o \
uuid.o \
include ../makeflags
-include ../localconf
include ../Defines.mk
BEES_LDFLAGS = $(LDFLAGS)
LDFLAGS = -shared -luuid
configure.h: configure.h.in
$(TEMPLATE_COMPILER)
depends.mk: *.c *.cc
for x in *.c; do $(CC) $(CFLAGS) -M "$$x"; done > depends.mk.new
for x in *.cc; do $(CXX) $(CXXFLAGS) -M "$$x"; done >> depends.mk.new
mv -fv depends.mk.new depends.mk
%.dep: %.cc configure.h Makefile
$(CXX) $(BEES_CXXFLAGS) -M -MF $@ -MT $(<:.cc=.o) $<
-include depends.mk
include $(CRUCIBLE_OBJS:%.o=%.dep)
%.o: %.c
$(CC) $(CFLAGS) -o $@ -c $<
%.o: %.cc ../makeflags
$(CXX) $(BEES_CXXFLAGS) -o $@ -c $<
%.o: %.cc ../include/crucible/%.h
$(CXX) $(CXXFLAGS) -o $@ -c $<
libcrucible.a: $(CRUCIBLE_OBJS)
$(AR) rcs $@ $^
libcrucible.so: $(OBJS) Makefile
$(CXX) $(LDFLAGS) -o $@ $(OBJS)

View File

@@ -1,786 +0,0 @@
#include "crucible/btrfs-tree.h"
#include "crucible/btrfs.h"
#include "crucible/error.h"
#include "crucible/fs.h"
#include "crucible/hexdump.h"
#include "crucible/seeker.h"
#define CRUCIBLE_BTRFS_TREE_DEBUG(x) do { \
if (BtrfsIoctlSearchKey::s_debug_ostream) { \
(*BtrfsIoctlSearchKey::s_debug_ostream) << x; \
} \
} while (false)
namespace crucible {
using namespace std;
uint64_t
BtrfsTreeItem::extent_begin() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_EXTENT_ITEM_KEY);
return m_objectid;
}
uint64_t
BtrfsTreeItem::extent_end() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_EXTENT_ITEM_KEY);
return m_objectid + m_offset;
}
uint64_t
BtrfsTreeItem::extent_flags() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_EXTENT_ITEM_KEY);
return btrfs_get_member(&btrfs_extent_item::flags, m_data);
}
uint64_t
BtrfsTreeItem::extent_generation() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_EXTENT_ITEM_KEY);
return btrfs_get_member(&btrfs_extent_item::generation, m_data);
}
uint64_t
BtrfsTreeItem::root_ref_dirid() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_ROOT_BACKREF_KEY);
return btrfs_get_member(&btrfs_root_ref::dirid, m_data);
}
// Extract the subvolume name from a ROOT_BACKREF item: the name is
// stored as raw bytes immediately after the fixed-size btrfs_root_ref
// header within the item data.
string
BtrfsTreeItem::root_ref_name() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_ROOT_BACKREF_KEY);
const auto name_len = btrfs_get_member(&btrfs_root_ref::name_len, m_data);
const auto name_start = sizeof(struct btrfs_root_ref);
const auto name_end = name_len + name_start;
// Bounds check: the item data must be large enough to contain the name.
THROW_CHECK2(runtime_error, m_data.size(), name_end, m_data.size() >= name_end);
return string(m_data.data() + name_start, m_data.data() + name_end);
}
uint64_t
BtrfsTreeItem::root_ref_parent_rootid() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_ROOT_BACKREF_KEY);
return offset();
}
uint64_t
BtrfsTreeItem::root_flags() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_ROOT_ITEM_KEY);
return btrfs_get_member(&btrfs_root_item::flags, m_data);
}
uint64_t
BtrfsTreeItem::root_refs() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_ROOT_ITEM_KEY);
return btrfs_get_member(&btrfs_root_item::refs, m_data);
}
ostream &
operator<<(ostream &os, const BtrfsTreeItem &bti)
{
os << "BtrfsTreeItem {"
<< " objectid = " << to_hex(bti.objectid())
<< ", type = " << btrfs_search_type_ntoa(bti.type())
<< ", offset = " << to_hex(bti.offset())
<< ", transid = " << bti.transid()
<< ", data = ";
hexdump(os, bti.data());
return os;
}
uint64_t
BtrfsTreeItem::block_group_flags() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_BLOCK_GROUP_ITEM_KEY);
return btrfs_get_member(&btrfs_block_group_item::flags, m_data);
}
uint64_t
BtrfsTreeItem::block_group_used() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_BLOCK_GROUP_ITEM_KEY);
return btrfs_get_member(&btrfs_block_group_item::used, m_data);
}
uint64_t
BtrfsTreeItem::chunk_length() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_CHUNK_ITEM_KEY);
return btrfs_get_member(&btrfs_chunk::length, m_data);
}
uint64_t
BtrfsTreeItem::chunk_type() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_CHUNK_ITEM_KEY);
return btrfs_get_member(&btrfs_chunk::type, m_data);
}
uint64_t
BtrfsTreeItem::dev_extent_chunk_offset() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_DEV_EXTENT_KEY);
return btrfs_get_member(&btrfs_dev_extent::chunk_offset, m_data);
}
uint64_t
BtrfsTreeItem::dev_extent_length() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_DEV_EXTENT_KEY);
return btrfs_get_member(&btrfs_dev_extent::length, m_data);
}
uint64_t
BtrfsTreeItem::dev_item_total_bytes() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_DEV_ITEM_KEY);
return btrfs_get_member(&btrfs_dev_item::total_bytes, m_data);
}
uint64_t
BtrfsTreeItem::dev_item_bytes_used() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_DEV_ITEM_KEY);
return btrfs_get_member(&btrfs_dev_item::bytes_used, m_data);
}
uint64_t
BtrfsTreeItem::inode_size() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_INODE_ITEM_KEY);
return btrfs_get_member(&btrfs_inode_item::size, m_data);
}
uint64_t
BtrfsTreeItem::file_extent_logical_bytes() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_EXTENT_DATA_KEY);
const auto file_extent_item_type = btrfs_get_member(&btrfs_file_extent_item::type, m_data);
switch (file_extent_item_type) {
case BTRFS_FILE_EXTENT_INLINE:
return btrfs_get_member(&btrfs_file_extent_item::ram_bytes, m_data);
case BTRFS_FILE_EXTENT_PREALLOC:
case BTRFS_FILE_EXTENT_REG:
return btrfs_get_member(&btrfs_file_extent_item::num_bytes, m_data);
default:
THROW_ERROR(runtime_error, "unknown btrfs_file_extent_item type " << file_extent_item_type);
}
}
uint64_t
BtrfsTreeItem::file_extent_offset() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_EXTENT_DATA_KEY);
const auto file_extent_item_type = btrfs_get_member(&btrfs_file_extent_item::type, m_data);
switch (file_extent_item_type) {
case BTRFS_FILE_EXTENT_INLINE:
THROW_ERROR(invalid_argument, "extent is inline " << *this);
case BTRFS_FILE_EXTENT_PREALLOC:
case BTRFS_FILE_EXTENT_REG:
return btrfs_get_member(&btrfs_file_extent_item::offset, m_data);
default:
THROW_ERROR(runtime_error, "unknown btrfs_file_extent_item type " << file_extent_item_type << " in " << *this);
}
}
uint64_t
BtrfsTreeItem::file_extent_generation() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_EXTENT_DATA_KEY);
return btrfs_get_member(&btrfs_file_extent_item::generation, m_data);
}
uint64_t
BtrfsTreeItem::file_extent_bytenr() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_EXTENT_DATA_KEY);
auto file_extent_item_type = btrfs_get_member(&btrfs_file_extent_item::type, m_data);
switch (file_extent_item_type) {
case BTRFS_FILE_EXTENT_INLINE:
THROW_ERROR(invalid_argument, "extent is inline " << *this);
case BTRFS_FILE_EXTENT_PREALLOC:
case BTRFS_FILE_EXTENT_REG:
return btrfs_get_member(&btrfs_file_extent_item::disk_bytenr, m_data);
default:
THROW_ERROR(runtime_error, "unknown btrfs_file_extent_item type " << file_extent_item_type << " in " << *this);
}
}
uint8_t
BtrfsTreeItem::file_extent_type() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_EXTENT_DATA_KEY);
return btrfs_get_member(&btrfs_file_extent_item::type, m_data);
}
btrfs_compression_type
BtrfsTreeItem::file_extent_compression() const
{
THROW_CHECK1(invalid_argument, btrfs_search_type_ntoa(m_type), m_type == BTRFS_EXTENT_DATA_KEY);
return static_cast<btrfs_compression_type>(btrfs_get_member(&btrfs_file_extent_item::compression, m_data));
}
BtrfsTreeItem::BtrfsTreeItem(const BtrfsIoctlSearchHeader &bish) :
m_objectid(bish.objectid),
m_offset(bish.offset),
m_transid(bish.transid),
m_data(bish.m_data),
m_type(bish.type)
{
}
BtrfsTreeItem &
BtrfsTreeItem::operator=(const BtrfsIoctlSearchHeader &bish)
{
m_objectid = bish.objectid;
m_offset = bish.offset;
m_transid = bish.transid;
m_data = bish.m_data;
m_type = bish.type;
return *this;
}
bool
BtrfsTreeItem::operator!() const
{
return m_transid == 0 && m_objectid == 0 && m_offset == 0 && m_type == 0;
}
uint64_t
BtrfsTreeFetcher::block_size() const
{
return m_block_size;
}
BtrfsTreeFetcher::BtrfsTreeFetcher(Fd new_fd) :
m_fd(new_fd)
{
BtrfsIoctlFsInfoArgs bifia;
bifia.do_ioctl(fd());
m_block_size = bifia.sectorsize;
THROW_CHECK1(runtime_error, m_block_size, m_block_size > 0);
// We don't believe sector sizes that aren't multiples of 4K
THROW_CHECK1(runtime_error, m_block_size, (m_block_size % 4096) == 0);
m_lookbehind_size = 128 * 1024;
m_scale_size = m_block_size;
}
Fd
BtrfsTreeFetcher::fd() const
{
return m_fd;
}
void
BtrfsTreeFetcher::fd(Fd fd)
{
m_fd = fd;
}
void
BtrfsTreeFetcher::type(uint8_t type)
{
m_type = type;
}
uint8_t
BtrfsTreeFetcher::type()
{
return m_type;
}
void
BtrfsTreeFetcher::tree(uint64_t tree)
{
m_tree = tree;
}
uint64_t
BtrfsTreeFetcher::tree()
{
return m_tree;
}
void
BtrfsTreeFetcher::transid(uint64_t min_transid, uint64_t max_transid)
{
m_min_transid = min_transid;
m_max_transid = max_transid;
}
uint64_t
BtrfsTreeFetcher::lookbehind_size() const
{
return m_lookbehind_size;
}
void
BtrfsTreeFetcher::lookbehind_size(uint64_t lookbehind_size)
{
m_lookbehind_size = lookbehind_size;
}
uint64_t
BtrfsTreeFetcher::scale_size() const
{
return m_scale_size;
}
void
BtrfsTreeFetcher::scale_size(uint64_t scale_size)
{
m_scale_size = scale_size;
}
// Initialize the common fields of a search key: tree ID, item type,
// and transid window.  Subclasses override to also set the
// objectid/offset ranges from `object`.
void
BtrfsTreeFetcher::fill_sk(BtrfsIoctlSearchKey &sk, uint64_t object)
{
	(void)object;	// consumed only by subclass overrides
	// btrfs allows tree ID 0 meaning the current tree, but we do not.
	THROW_CHECK0(invalid_argument, m_tree != 0);
	sk.tree_id = m_tree;
	sk.min_type = m_type;
	sk.max_type = m_type;
	sk.min_transid = m_min_transid;
	sk.max_transid = m_max_transid;
	sk.nr_items = 1;
}

// Advance the search key to the first possible key after hdr.
void
BtrfsTreeFetcher::next_sk(BtrfsIoctlSearchKey &key, const BtrfsIoctlSearchHeader &hdr)
{
	key.next_min(hdr, m_type);
}
// Exact-match lookup: return the matching item whose position equals
// `logical`, or an empty item if there is none.
BtrfsTreeItem
BtrfsTreeFetcher::at(uint64_t logical)
{
	CRUCIBLE_BTRFS_TREE_DEBUG("at " << logical);
	BtrfsIoctlSearchKey &sk = m_sk;
	fill_sk(sk, logical);
	// Exact match, should return 0 or 1 items
	sk.max_type = sk.min_type;
	sk.nr_items = 1;
	sk.do_ioctl(fd());
	THROW_CHECK1(runtime_error, sk.m_result.size(), sk.m_result.size() < 2);
	for (const auto &i : sk.m_result) {
		if (hdr_logical(i) == logical && hdr_match(i)) {
			return i;
		}
	}
	return BtrfsTreeItem();
}
// Convert a byte position to scaled units.  Positions must be aligned
// to the scale size; the sentinel s_max_logical is accepted as-is.
uint64_t
BtrfsTreeFetcher::scale_logical(const uint64_t logical) const
{
	THROW_CHECK1(invalid_argument, logical, (logical % m_scale_size) == 0 || logical == s_max_logical);
	return logical / m_scale_size;
}

// Largest scaled position (the scaled sentinel).
uint64_t
BtrfsTreeFetcher::scaled_max_logical() const
{
	return scale_logical(s_max_logical);
}

// Convert a scaled position back to bytes; the scaled sentinel maps
// back to s_max_logical exactly (it is not a multiple of scale_size).
uint64_t
BtrfsTreeFetcher::unscale_logical(const uint64_t logical) const
{
	THROW_CHECK1(invalid_argument, logical, logical <= scaled_max_logical());
	if (logical == scaled_max_logical()) {
		return s_max_logical;
	}
	return logical * scale_size();
}
// Reverse lower bound: return the last matching item at or below
// `logical`, or an empty item if none exists.  Implemented with
// seek_backward, probing windows of lookbehind_size() so we don't have
// to scan the whole tree backwards.
BtrfsTreeItem
BtrfsTreeFetcher::rlower_bound(uint64_t logical)
{
#if 0
	static bool btfrlb_debug = getenv("BTFLRB_DEBUG");
	#define BTFRLB_DEBUG(x) do { if (btfrlb_debug) cerr << x; } while (false)
#else
	#define BTFRLB_DEBUG(x) CRUCIBLE_BTRFS_TREE_DEBUG(x)
#endif
	BtrfsTreeItem closest_item;
	uint64_t closest_logical = 0;
	BtrfsIoctlSearchKey &sk = m_sk;
	size_t loops = 0;
	BTFRLB_DEBUG("rlower_bound: " << to_hex(logical) << " in tree " << tree() << endl);
	// The lambda probes [lower_bound, upper_bound] (scaled units) and
	// returns the scaled positions it saw, so seek_backward can decide
	// how much further back to jump.
	seek_backward(scale_logical(logical), [&](uint64_t const lower_bound, uint64_t const upper_bound) {
		++loops;
		fill_sk(sk, unscale_logical(min(scaled_max_logical(), lower_bound)));
		set<uint64_t> rv;
		bool too_far = false;
		do {
			sk.nr_items = 4;
			sk.do_ioctl(fd());
			BTFRLB_DEBUG("fetch: loop " << loops << " lower_bound..upper_bound " << to_hex(lower_bound) << ".." << to_hex(upper_bound));
			for (auto &i : sk.m_result) {
				next_sk(sk, i);
				// If hdr_stop or !hdr_match, don't inspect the item
				if (hdr_stop(i)) {
					too_far = true;
					rv.insert(numeric_limits<uint64_t>::max());
					BTFRLB_DEBUG("(stop)");
					break;
				}
				if (!hdr_match(i)) {
					BTFRLB_DEBUG("(no match)");
					continue;
				}
				const auto this_logical = hdr_logical(i);
				BTFRLB_DEBUG(" " << to_hex(this_logical) << " " << i);
				const auto scaled_hdr_logical = scale_logical(this_logical);
				BTFRLB_DEBUG(" " << "(match)");
				if (scaled_hdr_logical > upper_bound) {
					too_far = true;
					BTFRLB_DEBUG("(" << to_hex(scaled_hdr_logical) << " >= " << to_hex(upper_bound) << ")");
					break;
				}
				// Best candidate so far: at or below the target
				if (this_logical <= logical && this_logical > closest_logical) {
					closest_logical = this_logical;
					closest_item = i;
					BTFRLB_DEBUG("(closest)");
				}
				rv.insert(scaled_hdr_logical);
				BTFRLB_DEBUG("(cont'd)");
			}
			BTFRLB_DEBUG(endl);
			// We might get a search result that contains only non-matching items.
			// Keep looping until we find any matching item or we run out of tree.
		} while (!too_far && rv.empty() && !sk.m_result.empty());
		return rv;
	}, scale_logical(lookbehind_size()));
	return closest_item;
	#undef BTFRLB_DEBUG
}
// Forward lower bound: return the first matching item at or above
// `logical`, or an empty item if the rest of the tree has none (or a
// subclass's hdr_stop fires first).
BtrfsTreeItem
BtrfsTreeFetcher::lower_bound(uint64_t logical)
{
	BtrfsIoctlSearchKey &sk = m_sk;
	fill_sk(sk, logical);
	do {
		assert(sk.max_offset == s_max_logical);
		sk.do_ioctl(fd());
		for (const auto &i : sk.m_result) {
			if (hdr_match(i)) {
				return i;
			}
			if (hdr_stop(i)) {
				return BtrfsTreeItem();
			}
			next_sk(sk, i);
		}
	} while (!sk.m_result.empty());
	return BtrfsTreeItem();
}
// First matching item strictly after `logical` (in scaled units), or
// an empty item when already at the end of the address space.
BtrfsTreeItem
BtrfsTreeFetcher::next(uint64_t logical)
{
	CRUCIBLE_BTRFS_TREE_DEBUG("next " << logical);
	const auto scaled = scale_logical(logical);
	if (scaled + 1 <= scaled_max_logical()) {
		return lower_bound(unscale_logical(scaled + 1));
	}
	return BtrfsTreeItem();
}

// Last matching item strictly before `logical` (in scaled units), or
// an empty item when already at position zero.
BtrfsTreeItem
BtrfsTreeFetcher::prev(uint64_t logical)
{
	CRUCIBLE_BTRFS_TREE_DEBUG("prev " << logical);
	const auto scaled = scale_logical(logical);
	if (scaled >= 1) {
		return rlower_bound(unscale_logical(scaled - 1));
	}
	return BtrfsTreeItem();
}
// Object-keyed trees iterate over objectids: search every offset for
// each objectid starting at `object`.
void
BtrfsTreeObjectFetcher::fill_sk(BtrfsIoctlSearchKey &sk, uint64_t object)
{
	BtrfsTreeFetcher::fill_sk(sk, object);
	sk.min_offset = 0;
	sk.max_offset = numeric_limits<decltype(sk.max_offset)>::max();
	sk.min_objectid = object;
	sk.max_objectid = numeric_limits<decltype(sk.max_objectid)>::max();
}

// Position of an item in an object-keyed tree is its objectid.
uint64_t
BtrfsTreeObjectFetcher::hdr_logical(const BtrfsIoctlSearchHeader &hdr)
{
	return hdr.objectid;
}

// An item matches iff it has the configured item type.
bool
BtrfsTreeObjectFetcher::hdr_match(const BtrfsIoctlSearchHeader &hdr)
{
	// If you're calling this method without overriding it, you should have set type first
	assert(m_type);
	return hdr.type == m_type;
}
// Object fetchers scan to the end of the tree: there is no early-stop
// condition, so this always returns false.
bool
BtrfsTreeObjectFetcher::hdr_stop(const BtrfsIoctlSearchHeader &hdr)
{
	// Parameter is required by the interface but deliberately unused.
	// (Previously this cast sat *after* the return statement, where it
	// was unreachable and suppressed nothing.)
	(void)hdr;
	return false;
}
// Position of an item in an offset-keyed tree is its offset.
uint64_t
BtrfsTreeOffsetFetcher::hdr_logical(const BtrfsIoctlSearchHeader &hdr)
{
	return hdr.offset;
}

// An item matches iff it has the configured type and objectid.
bool
BtrfsTreeOffsetFetcher::hdr_match(const BtrfsIoctlSearchHeader &hdr)
{
	assert(m_type);
	return hdr.type == m_type && hdr.objectid == m_objectid;
}

// Stop as soon as the search walks past the configured objectid or
// type; keys are ordered (objectid, type, offset) so no later item
// can match.
bool
BtrfsTreeOffsetFetcher::hdr_stop(const BtrfsIoctlSearchHeader &hdr)
{
	assert(m_type);
	return hdr.objectid > m_objectid || hdr.type > m_type;
}

// Fixed objectid whose offsets are enumerated
// (e.g. BTRFS_EXTENT_CSUM_OBJECTID for the csum tree).
void
BtrfsTreeOffsetFetcher::objectid(uint64_t objectid)
{
	m_objectid = objectid;
}

uint64_t
BtrfsTreeOffsetFetcher::objectid() const
{
	return m_objectid;
}

// Offset-keyed trees iterate over offsets within the fixed objectid.
void
BtrfsTreeOffsetFetcher::fill_sk(BtrfsIoctlSearchKey &sk, uint64_t offset)
{
	BtrfsTreeFetcher::fill_sk(sk, offset);
	sk.min_offset = offset;
	sk.max_offset = numeric_limits<decltype(sk.max_offset)>::max();
	sk.min_objectid = m_objectid;
	sk.max_objectid = m_objectid;
}
// Stream checksums for `count` blocks starting at byte `logical`.
// For each csum-tree item overlapping the requested range, invoke
// `output` with the logical address of the first covered block, a
// pointer into the item's raw checksum bytes, and the number of
// checksum bytes.  Stops when the csum tree runs out of items in
// range (regions without checksums, e.g. holes, produce no output).
void
BtrfsCsumTreeFetcher::get_sums(uint64_t const logical, size_t count, function<void(uint64_t logical, const uint8_t *buf, size_t bytes)> output)
{
#if 0
	static bool bctfgs_debug = getenv("BCTFGS_DEBUG");
	#define BCTFGS_DEBUG(x) do { if (bctfgs_debug) cerr << x; } while (false)
#else
	#define BCTFGS_DEBUG(x) CRUCIBLE_BTRFS_TREE_DEBUG(x)
#endif
	const uint64_t logical_end = logical + count * block_size();
	// Start with the last csum item at or below logical, since that
	// item may cover the beginning of the range.
	BtrfsTreeItem bti = rlower_bound(logical);
	size_t __attribute__((unused)) loops = 0;
	BCTFGS_DEBUG("get_sums " << to_hex(logical) << ".." << to_hex(logical_end) << endl);
	while (!!bti) {
		BCTFGS_DEBUG("get_sums[" << loops << "]: " << bti << endl);
		++loops;
		// Reject wrong type or objectid
		THROW_CHECK1(runtime_error, bti.type(), bti.type() == BTRFS_EXTENT_CSUM_KEY);
		THROW_CHECK1(runtime_error, bti.objectid(), bti.objectid() == BTRFS_EXTENT_CSUM_OBJECTID);
		// Is this object in range?
		const uint64_t data_logical = bti.offset();
		if (data_logical >= logical_end) {
			// csum object is past end of range, we are done
			return;
		}
		// Figure out how long this csum item is in various units
		const size_t csum_byte_count = bti.data().size();
		THROW_CHECK1(runtime_error, csum_byte_count, (csum_byte_count % m_sum_size) == 0);
		THROW_CHECK1(runtime_error, csum_byte_count, csum_byte_count > 0);
		const size_t csum_count = csum_byte_count / m_sum_size;
		const uint64_t data_byte_count = csum_count * block_size();
		const uint64_t data_logical_end = data_logical + data_byte_count;
		if (data_logical_end <= logical) {
			// too low, look at next item
			bti = lower_bound(logical);
			continue;
		}
		// There is some overlap?
		const uint64_t overlap_begin = max(logical, data_logical);
		const uint64_t overlap_end = min(logical_end, data_logical_end);
		THROW_CHECK2(runtime_error, overlap_begin, overlap_end, overlap_begin < overlap_end);
		const uint64_t overlap_offset = overlap_begin - data_logical;
		THROW_CHECK1(runtime_error, overlap_offset, (overlap_offset % block_size()) == 0);
		// Byte index into the item's checksum array for the overlap
		const uint64_t overlap_index = overlap_offset * m_sum_size / block_size();
		const uint64_t overlap_byte_count = overlap_end - overlap_begin;
		const uint64_t overlap_csum_byte_count = overlap_byte_count * m_sum_size / block_size();
		// Can't be bigger than a btrfs item
		THROW_CHECK1(runtime_error, overlap_index, overlap_index < 65536);
		THROW_CHECK1(runtime_error, overlap_csum_byte_count, overlap_csum_byte_count < 65536);
		// Yes, process the overlap
		output(overlap_begin, bti.data().data() + overlap_index, overlap_csum_byte_count);
		// Advance
		bti = lower_bound(overlap_end);
	}
	#undef BCTFGS_DEBUG
}
// Kernel checksum algorithm code (e.g. BTRFS_CSUM_TYPE_CRC32).
uint32_t
BtrfsCsumTreeFetcher::sum_type() const
{
	return m_sum_type;
}

// Size of one checksum in bytes.
size_t
BtrfsCsumTreeFetcher::sum_size() const
{
	return m_sum_size;
}
// Bind to the csum tree and learn the filesystem's checksum algorithm
// and checksum size from the FS_INFO ioctl.
BtrfsCsumTreeFetcher::BtrfsCsumTreeFetcher(const Fd &new_fd) :
	BtrfsTreeOffsetFetcher(new_fd)
{
	type(BTRFS_EXTENT_CSUM_KEY);
	tree(BTRFS_CSUM_TREE_OBJECTID);
	objectid(BTRFS_EXTENT_CSUM_OBJECTID);
	BtrfsIoctlFsInfoArgs bifia;
	bifia.do_ioctl(fd());
	// csum_type() already yields a checksum-type code; the previous
	// static_cast to btrfs_compression_type was the wrong enum
	// (copy/paste error) and served no purpose for a uint32_t member.
	m_sum_type = bifia.csum_type();
	m_sum_size = bifia.csum_size();
	if (m_sum_type == BTRFS_CSUM_TYPE_CRC32 && m_sum_size == 0) {
		// Older kernel versions don't fill in this field
		m_sum_size = 4;
	}
	THROW_CHECK1(runtime_error, m_sum_size, m_sum_size > 0);
}
// Iterates EXTENT_ITEMs (keyed by bytenr) in the global extent tree.
BtrfsExtentItemFetcher::BtrfsExtentItemFetcher(const Fd &new_fd) :
	BtrfsTreeObjectFetcher(new_fd)
{
	tree(BTRFS_EXTENT_TREE_OBJECTID);
	type(BTRFS_EXTENT_ITEM_KEY);
}

// Iterates EXTENT_DATA (file extent) items by file offset.  Caller
// still selects the subvol tree and the inode objectid.
BtrfsExtentDataFetcher::BtrfsExtentDataFetcher(const Fd &new_fd) :
	BtrfsTreeOffsetFetcher(new_fd)
{
	type(BTRFS_EXTENT_DATA_KEY);
}

// Fetches INODE_ITEMs; scale is 1 because inode numbers are not
// block-size-aligned.
BtrfsInodeFetcher::BtrfsInodeFetcher(const Fd &fd) :
	BtrfsTreeObjectFetcher(fd)
{
	type(BTRFS_INODE_ITEM_KEY);
	scale_size(1);
}
// Fetch the inode item for (subvol, inode), or an empty item if the
// inode does not exist.  Sanity-checks the returned key fields.
BtrfsTreeItem
BtrfsInodeFetcher::stat(uint64_t subvol, uint64_t inode)
{
	tree(subvol);
	const auto item = at(inode);
	if (!!item) {
		THROW_CHECK2(runtime_error, item.objectid(), inode, inode == item.objectid());
		THROW_CHECK2(runtime_error, item.type(), BTRFS_INODE_ITEM_KEY, item.type() == BTRFS_INODE_ITEM_KEY);
	}
	return item;
}
// Fetches items from the root tree; scale is 1 because subvol IDs are
// not block-size-aligned.
BtrfsRootFetcher::BtrfsRootFetcher(const Fd &fd) :
	BtrfsTreeObjectFetcher(fd)
{
	tree(BTRFS_ROOT_TREE_OBJECTID);
	scale_size(1);
}

// Fetch the ROOT_ITEM for a subvol, or an empty item if none exists.
// Sanity-checks the returned key.
BtrfsTreeItem
BtrfsRootFetcher::root(const uint64_t subvol)
{
	const auto my_type = BTRFS_ROOT_ITEM_KEY;
	type(my_type);
	const auto item = at(subvol);
	if (!!item) {
		THROW_CHECK2(runtime_error, item.objectid(), subvol, subvol == item.objectid());
		THROW_CHECK2(runtime_error, item.type(), my_type, item.type() == my_type);
	}
	return item;
}

// Fetch the ROOT_BACKREF for a subvol, or an empty item if none
// exists.  Sanity-checks the returned key.
BtrfsTreeItem
BtrfsRootFetcher::root_backref(const uint64_t subvol)
{
	const auto my_type = BTRFS_ROOT_BACKREF_KEY;
	type(my_type);
	const auto item = at(subvol);
	if (!!item) {
		THROW_CHECK2(runtime_error, item.objectid(), subvol, subvol == item.objectid());
		THROW_CHECK2(runtime_error, item.type(), my_type, item.type() == my_type);
	}
	return item;
}
// Extent-item fetcher that also tracks chunk (block group) boundaries
// via a secondary chunk-tree fetcher, so next_sk can skip over
// non-data block groups entirely.
BtrfsDataExtentTreeFetcher::BtrfsDataExtentTreeFetcher(const Fd &fd) :
	BtrfsExtentItemFetcher(fd),
	m_chunk_tree(fd)
{
	tree(BTRFS_EXTENT_TREE_OBJECTID);
	type(BTRFS_EXTENT_ITEM_KEY);
	m_chunk_tree.tree(BTRFS_CHUNK_TREE_OBJECTID);
	m_chunk_tree.type(BTRFS_CHUNK_ITEM_KEY);
	m_chunk_tree.objectid(BTRFS_FIRST_CHUNK_TREE_OBJECTID);
}
// Advance the search key one scale-size step past hdr, then, using the
// cached chunk-tree item, skip the key forward over any block groups
// that do not hold data (metadata/system chunks contain no data
// EXTENT_ITEMs worth scanning).
void
BtrfsDataExtentTreeFetcher::next_sk(BtrfsIoctlSearchKey &key, const BtrfsIoctlSearchHeader &hdr)
{
	key.min_type = key.max_type = type();
	key.max_objectid = key.max_offset = numeric_limits<uint64_t>::max();
	key.min_offset = 0;
	key.min_objectid = hdr.objectid;
	const auto step = scale_size();
	// Saturating add: don't wrap past the top of the address space
	if (key.min_objectid < numeric_limits<uint64_t>::max() - step) {
		key.min_objectid += step;
	} else {
		key.min_objectid = numeric_limits<uint64_t>::max();
	}
	// If we're still in our current block group, check here
	if (!!m_current_bg) {
		const auto bg_begin = m_current_bg.offset();
		const auto bg_end = bg_begin + m_current_bg.chunk_length();
		// If we are still in our current block group, return early
		if (key.min_objectid >= bg_begin && key.min_objectid < bg_end) return;
	}
	// We don't have a current block group or we're out of range
	// Find the chunk that this bytenr belongs to
	m_current_bg = m_chunk_tree.rlower_bound(key.min_objectid);
	// Make sure it's a data block group
	while (!!m_current_bg) {
		// Data block group, stop here
		if (m_current_bg.chunk_type() & BTRFS_BLOCK_GROUP_DATA) break;
		// Not a data block group, skip to end
		key.min_objectid = m_current_bg.offset() + m_current_bg.chunk_length();
		m_current_bg = m_chunk_tree.lower_bound(key.min_objectid);
	}
	if (!m_current_bg) {
		// Ran out of data block groups, stop here
		return;
	}
	// Check to see if bytenr is in the current data block group
	const auto bg_begin = m_current_bg.offset();
	if (key.min_objectid < bg_begin) {
		// Move forward to start of data block group
		key.min_objectid = bg_begin;
	}
}
}

View File

@@ -1,189 +0,0 @@
#include "crucible/bytevector.h"
#include "crucible/error.h"
#include "crucible/hexdump.h"
#include "crucible/string.h"
#include <cassert>
namespace crucible {
using namespace std;
// Iterator to the first byte (null when empty).
ByteVector::iterator
ByteVector::begin() const
{
	unique_lock<mutex> lock(m_mutex);
	return m_ptr.get();
}

// Iterator one past the last byte.
ByteVector::iterator
ByteVector::end() const
{
	unique_lock<mutex> lock(m_mutex);
	return m_ptr.get() + m_size;
}

// Number of bytes.
// NOTE(review): reads m_size without taking m_mutex, unlike the other
// accessors -- presumably tolerated as a benign race; confirm intent.
size_t
ByteVector::size() const
{
	return m_size;
}

// True when there is no buffer or it has zero length (also unlocked).
bool
ByteVector::empty() const
{
	return !m_ptr || !m_size;
}

// Drop the buffer reference and reset to the empty state.
void
ByteVector::clear()
{
	unique_lock<mutex> lock(m_mutex);
	m_ptr.reset();
	m_size = 0;
}

// Unchecked element access (see at() for the bounds-checked variant).
ByteVector::value_type&
ByteVector::operator[](size_t index) const
{
	unique_lock<mutex> lock(m_mutex);
	return m_ptr.get()[index];
}
// Copy constructor: shares the underlying buffer (shallow copy via the
// shared pointer) while holding the source's lock.
ByteVector::ByteVector(const ByteVector &that)
{
	unique_lock<mutex> lock(that.m_mutex);
	m_ptr = that.m_ptr;
	m_size = that.m_size;
}

// Copy assignment: shares the underlying buffer.  Locks both objects
// deadlock-free via std::lock, unless assigning to self.
ByteVector&
ByteVector::operator=(const ByteVector &that)
{
	// If &that == this, there's no need to do anything, but
	// especially don't try to lock the same mutex twice.
	if (&m_mutex != &that.m_mutex) {
		unique_lock<mutex> lock_this(m_mutex, defer_lock);
		unique_lock<mutex> lock_that(that.m_mutex, defer_lock);
		lock(lock_this, lock_that);
		m_ptr = that.m_ptr;
		m_size = that.m_size;
	}
	return *this;
}
// Slice constructor: a view of `length` bytes starting at `start`,
// sharing ownership of that's buffer via an aliasing shared pointer
// (no copy is made).
ByteVector::ByteVector(const ByteVector &that, size_t start, size_t length)
{
	THROW_CHECK0(out_of_range, that.m_ptr);
	THROW_CHECK2(out_of_range, start, that.m_size, start <= that.m_size);
	// NOTE(review): written with "+ length" on both sides instead of
	// the plain "start + length <= that.m_size" -- presumably to keep
	// the check meaningful if start + length wraps; confirm intent.
	THROW_CHECK2(out_of_range, start + length, that.m_size + length, start + length <= that.m_size + length);
	m_ptr = Pointer(that.m_ptr, that.m_ptr.get() + start);
	m_size = length;
}

// Checked slice accessor; returns a buffer-sharing view (slice ctor).
ByteVector
ByteVector::at(size_t start, size_t length) const
{
	return ByteVector(*this, start, length);
}

// Bounds-checked element access; throws out_of_range on a bad index.
ByteVector::value_type&
ByteVector::at(size_t size) const
{
	unique_lock<mutex> lock(m_mutex);
	THROW_CHECK0(out_of_range, m_ptr);
	THROW_CHECK2(out_of_range, size, m_size, size < m_size);
	return m_ptr.get()[size];
}
// Allocate a raw buffer: zeroed under BEES_VALGRIND so valgrind does
// not report uninitialized reads, plain malloc otherwise.
static
void *
bv_allocate(size_t size)
{
#ifdef BEES_VALGRIND
	// XXX: only do this to shut up valgrind
	return calloc(1, size);
#else
	return malloc(size);
#endif
}

// Construct an uninitialized vector of the given size.
ByteVector::ByteVector(size_t size)
{
	m_ptr = Pointer(static_cast<value_type*>(bv_allocate(size)), free);
	// bad_alloc doesn't fit THROW_CHECK's template
	THROW_CHECK0(runtime_error, m_ptr);
	m_size = size;
}

// Construct by copying [begin, end), allocating at least min_size
// bytes; any tail beyond the copied range is left uninitialized.
ByteVector::ByteVector(iterator begin, iterator end, size_t min_size)
{
	const size_t size = end - begin;
	const size_t alloc_size = max(size, min_size);
	m_ptr = Pointer(static_cast<value_type*>(bv_allocate(alloc_size)), free);
	THROW_CHECK0(runtime_error, m_ptr);
	m_size = alloc_size;
	memcpy(m_ptr.get(), begin, size);
}
// Deep equality: equal when both are empty, or when sizes match and
// the bytes compare equal.  Aliases of the same buffer short-circuit
// to true without a memcmp.  Locks both sides deadlock-free.
bool
ByteVector::operator==(const ByteVector &that) const
{
	unique_lock<mutex> lock_this(m_mutex, defer_lock);
	unique_lock<mutex> lock_that(that.m_mutex, defer_lock);
	lock(lock_this, lock_that);
	if (!m_ptr) {
		return !that.m_ptr;
	}
	if (!that.m_ptr) {
		return false;
	}
	if (m_size != that.m_size) {
		return false;
	}
	if (m_ptr.get() == that.m_ptr.get()) {
		return true;
	}
	return !memcmp(m_ptr.get(), that.m_ptr.get(), m_size);
}
// Remove [begin, end) from the vector.  Only ranges touching one end
// of the buffer are supported (prefix or suffix erase); erasing the
// whole buffer releases it.
void
ByteVector::erase(iterator begin, iterator end)
{
	unique_lock<mutex> lock(m_mutex);
	const size_t size = end - begin;
	if (!size) return;
	THROW_CHECK0(out_of_range, m_ptr);
	const iterator my_begin = m_ptr.get();
	const iterator my_end = my_begin + m_size;
	THROW_CHECK4(out_of_range, my_begin, begin, my_end, end, my_begin == begin || my_end == end);
	if (begin == my_begin) {
		if (end == my_end) {
			m_size = 0;
			m_ptr.reset();
			return;
		}
		// Prefix erase: advance the aliasing pointer past the range
		m_ptr = Pointer(m_ptr, end);
	}
	m_size -= size;
}

// Remove a single byte.
void
ByteVector::erase(iterator begin)
{
	erase(begin, begin + 1);
}
// Raw pointer to the buffer (null when empty).
ByteVector::value_type*
ByteVector::data() const
{
	unique_lock<mutex> lock(m_mutex);
	return m_ptr.get();
}

// Stream as a hex dump.
ostream&
operator<<(ostream &os, const ByteVector &bv) {
	hexdump(os, bv);
	return os;
}
}

View File

@@ -15,10 +15,8 @@
namespace crucible {
using namespace std;
static shared_ptr<set<string>> chatter_names;
static auto_ptr<set<string>> chatter_names;
static const char *SPACETAB = " \t";
static bool add_prefix_timestamp = true;
static bool add_prefix_level = true;
static
void
@@ -45,52 +43,28 @@ namespace crucible {
}
}
Chatter::Chatter(int loglevel, string name, ostream &os)
: m_loglevel(loglevel), m_name(name), m_os(os)
Chatter::Chatter(string name, ostream &os)
: m_name(name), m_os(os)
{
}
void
Chatter::enable_timestamp(bool prefix_timestamp)
{
add_prefix_timestamp = prefix_timestamp;
}
void
Chatter::enable_level(bool prefix_level)
{
add_prefix_level = prefix_level;
}
Chatter::~Chatter()
{
ostringstream header_stream;
if (add_prefix_timestamp) {
time_t ltime;
DIE_IF_MINUS_ONE(time(&ltime));
struct tm ltm;
DIE_IF_ZERO(localtime_r(&ltime, &ltm));
time_t ltime;
DIE_IF_MINUS_ONE(time(&ltime));
struct tm ltm;
DIE_IF_ZERO(localtime_r(&ltime, &ltm));
char buf[1024];
DIE_IF_ZERO(strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", &ltm));
char buf[1024];
DIE_IF_ZERO(strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", &ltm));
header_stream << buf;
header_stream << " " << getpid() << "." << gettid();
if (add_prefix_level) {
header_stream << "<" << m_loglevel << ">";
}
if (!m_name.empty()) {
header_stream << " " << m_name;
}
} else {
if (add_prefix_level) {
header_stream << "<" << m_loglevel << ">";
}
header_stream << (m_name.empty() ? "thread" : m_name);
header_stream << "[" << gettid() << "]";
header_stream << buf;
header_stream << " " << getpid() << "." << gettid();
if (!m_name.empty()) {
header_stream << " " << m_name;
}
header_stream << ": ";
string out = m_oss.str();
@@ -112,7 +86,7 @@ namespace crucible {
}
Chatter::Chatter(Chatter &&c)
: m_loglevel(c.m_loglevel), m_name(c.m_name), m_os(c.m_os), m_oss(c.m_oss.str())
: m_name(c.m_name), m_os(c.m_os), m_oss(c.m_oss.str())
{
c.m_oss.str("");
}
@@ -136,7 +110,6 @@ namespace crucible {
} else if (!chatter_names->empty()) {
cerr << "CRUCIBLE_CHATTER does not list '" << m_file << "' or '" << m_pretty_function << "'" << endl;
}
(void)m_line; // not implemented yet
// cerr << "ChatterBox " << reinterpret_cast<void*>(this) << " constructed" << endl;
}
@@ -159,7 +132,7 @@ namespace crucible {
ChatterUnwinder::~ChatterUnwinder()
{
if (current_exception()) {
if (uncaught_exception()) {
m_func();
}
}

View File

@@ -1,513 +0,0 @@
// Copyright (c) 2011 Google, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//
// CityHash, by Geoff Pike and Jyrki Alakuijala
//
// This file provides CityHash64() and related functions.
//
// It's probably possible to create even faster hash functions by
// writing a program that systematically explores some of the space of
// possible hash functions, by using SIMD instructions, or by
// compromising on hash quality.
#include "crucible/city.h"
#include <algorithm>
#include <string.h> // for memcpy and memset
using namespace std;
// Load a 64-bit value from an arbitrarily aligned pointer.
static uint64 UNALIGNED_LOAD64(const char *p) {
  uint64 result;
  memcpy(&result, p, sizeof(result));
  return result;
}

// Load a 32-bit value from an arbitrarily aligned pointer.
static uint32 UNALIGNED_LOAD32(const char *p) {
  uint32 result;
  memcpy(&result, p, sizeof(result));
  return result;
}

// Select a byte-swap implementation for the host platform.
#ifdef _MSC_VER

#include <stdlib.h>
#define bswap_32(x) _byteswap_ulong(x)
#define bswap_64(x) _byteswap_uint64(x)

#elif defined(__APPLE__)

// Mac OS X / Darwin features
#include <libkern/OSByteOrder.h>
#define bswap_32(x) OSSwapInt32(x)
#define bswap_64(x) OSSwapInt64(x)

#elif defined(__sun) || defined(sun)

#include <sys/byteorder.h>
#define bswap_32(x) BSWAP_32(x)
#define bswap_64(x) BSWAP_64(x)

#elif defined(__FreeBSD__)

#include <sys/endian.h>
#define bswap_32(x) bswap32(x)
#define bswap_64(x) bswap64(x)

#elif defined(__OpenBSD__)

#include <sys/types.h>
#define bswap_32(x) swap32(x)
#define bswap_64(x) swap64(x)

#elif defined(__NetBSD__)

#include <sys/types.h>
#include <machine/bswap.h>
#if defined(__BSWAP_RENAME) && !defined(__bswap_32)
#define bswap_32(x) bswap32(x)
#define bswap_64(x) bswap64(x)
#endif

#else

#include <byteswap.h>

#endif

// CityHash is specified in terms of little-endian loads; swap on
// big-endian hosts.
#ifdef WORDS_BIGENDIAN
#define uint32_in_expected_order(x) (bswap_32(x))
#define uint64_in_expected_order(x) (bswap_64(x))
#else
#define uint32_in_expected_order(x) (x)
#define uint64_in_expected_order(x) (x)
#endif

#if !defined(LIKELY)
#if HAVE_BUILTIN_EXPECT
#define LIKELY(x) (__builtin_expect(!!(x), 1))
#else
#define LIKELY(x) (x)
#endif
#endif

static uint64 Fetch64(const char *p) {
  return uint64_in_expected_order(UNALIGNED_LOAD64(p));
}

static uint32 Fetch32(const char *p) {
  return uint32_in_expected_order(UNALIGNED_LOAD32(p));
}

// Some primes between 2^63 and 2^64 for various uses.
static const uint64 k0 = 0xc3a5c85c97cb3127ULL;
static const uint64 k1 = 0xb492b66fbe98f273ULL;
static const uint64 k2 = 0x9ae16a3b2f90404fULL;

// Magic numbers for 32-bit hashing.  Copied from Murmur3.
static const uint32 c1 = 0xcc9e2d51;
static const uint32 c2 = 0x1b873593;

// A 32-bit to 32-bit integer hash copied from Murmur3.
static uint32 fmix(uint32 h)
{
  h ^= h >> 16;
  h *= 0x85ebca6b;
  h ^= h >> 13;
  h *= 0xc2b2ae35;
  h ^= h >> 16;
  return h;
}

static uint32 Rotate32(uint32 val, int shift) {
  // Avoid shifting by 32: doing so yields an undefined result.
  return shift == 0 ? val : ((val >> shift) | (val << (32 - shift)));
}

// Rotate three values: (a, b, c) -> (c, a, b).
#undef PERMUTE3
#define PERMUTE3(a, b, c) do { std::swap(a, b); std::swap(a, c); } while (0)

static uint32 Mur(uint32 a, uint32 h) {
  // Helper from Murmur3 for combining two 32-bit values.
  a *= c1;
  a = Rotate32(a, 17);
  a *= c2;
  h ^= a;
  h = Rotate32(h, 19);
  return h * 5 + 0xe6546b64;
}
// 32-bit hash for inputs of length 13..24.
static uint32 Hash32Len13to24(const char *s, size_t len) {
  uint32 a = Fetch32(s - 4 + (len >> 1));
  uint32 b = Fetch32(s + 4);
  uint32 c = Fetch32(s + len - 8);
  uint32 d = Fetch32(s + (len >> 1));
  uint32 e = Fetch32(s);
  uint32 f = Fetch32(s + len - 4);
  uint32 h = len;

  return fmix(Mur(f, Mur(e, Mur(d, Mur(c, Mur(b, Mur(a, h)))))));
}

// 32-bit hash for inputs of length 0..4.
static uint32 Hash32Len0to4(const char *s, size_t len) {
  uint32 b = 0;
  uint32 c = 9;
  for (size_t i = 0; i < len; i++) {
    signed char v = s[i];
    b = b * c1 + v;
    c ^= b;
  }
  return fmix(Mur(b, Mur(len, c)));
}

// 32-bit hash for inputs of length 5..12.
static uint32 Hash32Len5to12(const char *s, size_t len) {
  uint32 a = len, b = len * 5, c = 9, d = b;
  a += Fetch32(s);
  b += Fetch32(s + len - 4);
  c += Fetch32(s + ((len >> 1) & 4));
  return fmix(Mur(c, Mur(b, Mur(a, d))));
}
// 32-bit CityHash of s[0..len-1].
uint32 CityHash32(const char *s, size_t len) {
  if (len <= 24) {
    return len <= 12 ?
        (len <= 4 ? Hash32Len0to4(s, len) : Hash32Len5to12(s, len)) :
        Hash32Len13to24(s, len);
  }

  // len > 24
  uint32 h = len, g = c1 * len, f = g;
  uint32 a0 = Rotate32(Fetch32(s + len - 4) * c1, 17) * c2;
  uint32 a1 = Rotate32(Fetch32(s + len - 8) * c1, 17) * c2;
  uint32 a2 = Rotate32(Fetch32(s + len - 16) * c1, 17) * c2;
  uint32 a3 = Rotate32(Fetch32(s + len - 12) * c1, 17) * c2;
  uint32 a4 = Rotate32(Fetch32(s + len - 20) * c1, 17) * c2;
  h ^= a0;
  h = Rotate32(h, 19);
  h = h * 5 + 0xe6546b64;
  h ^= a2;
  h = Rotate32(h, 19);
  h = h * 5 + 0xe6546b64;
  g ^= a1;
  g = Rotate32(g, 19);
  g = g * 5 + 0xe6546b64;
  g ^= a3;
  g = Rotate32(g, 19);
  g = g * 5 + 0xe6546b64;
  f += a4;
  f = Rotate32(f, 19);
  f = f * 5 + 0xe6546b64;
  // Main loop: consume 20 bytes per iteration.
  size_t iters = (len - 1) / 20;
  do {
    uint32 a0 = Rotate32(Fetch32(s) * c1, 17) * c2;
    uint32 a1 = Fetch32(s + 4);
    uint32 a2 = Rotate32(Fetch32(s + 8) * c1, 17) * c2;
    uint32 a3 = Rotate32(Fetch32(s + 12) * c1, 17) * c2;
    uint32 a4 = Fetch32(s + 16);
    h ^= a0;
    h = Rotate32(h, 18);
    h = h * 5 + 0xe6546b64;
    f += a1;
    f = Rotate32(f, 19);
    f = f * c1;
    g += a2;
    g = Rotate32(g, 18);
    g = g * 5 + 0xe6546b64;
    h ^= a3 + a1;
    h = Rotate32(h, 19);
    h = h * 5 + 0xe6546b64;
    g ^= a4;
    g = bswap_32(g) * 5;
    h += a4 * 5;
    h = bswap_32(h);
    f += a0;
    PERMUTE3(f, h, g);
    s += 20;
  } while (--iters != 0);
  // Final mix of the three lanes.
  g = Rotate32(g, 11) * c1;
  g = Rotate32(g, 17) * c1;
  f = Rotate32(f, 11) * c1;
  f = Rotate32(f, 17) * c1;
  h = Rotate32(h + g, 19);
  h = h * 5 + 0xe6546b64;
  h = Rotate32(h, 17) * c1;
  h = Rotate32(h + f, 19);
  h = h * 5 + 0xe6546b64;
  h = Rotate32(h, 17) * c1;
  return h;
}
// Bitwise right rotate.  Normally this will compile to a single
// instruction, especially if the shift is a manifest constant.
static uint64 Rotate(uint64 val, int shift) {
  // Avoid shifting by 64: doing so yields an undefined result.
  return shift == 0 ? val : ((val >> shift) | (val << (64 - shift)));
}

static uint64 ShiftMix(uint64 val) {
  return val ^ (val >> 47);
}

// Combine two 64-bit values into one via the 128-to-64 reduction.
static uint64 HashLen16(uint64 u, uint64 v) {
  return Hash128to64(uint128(u, v));
}

static uint64 HashLen16(uint64 u, uint64 v, uint64 mul) {
  // Murmur-inspired hashing.
  uint64 a = (u ^ v) * mul;
  a ^= (a >> 47);
  uint64 b = (v ^ a) * mul;
  b ^= (b >> 47);
  b *= mul;
  return b;
}

// 64-bit hash for inputs of length 0..16.
static uint64 HashLen0to16(const char *s, size_t len) {
  if (len >= 8) {
    uint64 mul = k2 + len * 2;
    uint64 a = Fetch64(s) + k2;
    uint64 b = Fetch64(s + len - 8);
    uint64 c = Rotate(b, 37) * mul + a;
    uint64 d = (Rotate(a, 25) + b) * mul;
    return HashLen16(c, d, mul);
  }
  if (len >= 4) {
    uint64 mul = k2 + len * 2;
    uint64 a = Fetch32(s);
    return HashLen16(len + (a << 3), Fetch32(s + len - 4), mul);
  }
  if (len > 0) {
    uint8 a = s[0];
    uint8 b = s[len >> 1];
    uint8 c = s[len - 1];
    uint32 y = static_cast<uint32>(a) + (static_cast<uint32>(b) << 8);
    uint32 z = len + (static_cast<uint32>(c) << 2);
    return ShiftMix(y * k2 ^ z * k0) * k2;
  }
  return k2;
}
// This probably works well for 16-byte strings as well, but it may be overkill
// in that case.
static uint64 HashLen17to32(const char *s, size_t len) {
  uint64 mul = k2 + len * 2;
  uint64 a = Fetch64(s) * k1;
  uint64 b = Fetch64(s + 8);
  uint64 c = Fetch64(s + len - 8) * mul;
  uint64 d = Fetch64(s + len - 16) * k2;
  return HashLen16(Rotate(a + b, 43) + Rotate(c, 30) + d,
                   a + Rotate(b + k2, 18) + c, mul);
}

// Return a 16-byte hash for 48 bytes.  Quick and dirty.
// Callers do best to use "random-looking" values for a and b.
static pair<uint64, uint64> WeakHashLen32WithSeeds(
    uint64 w, uint64 x, uint64 y, uint64 z, uint64 a, uint64 b) {
  a += w;
  b = Rotate(b + a + z, 21);
  uint64 c = a;
  a += x;
  a += y;
  b += Rotate(a, 44);
  return make_pair(a + z, b + c);
}

// Return a 16-byte hash for s[0] ... s[31], a, and b.  Quick and dirty.
static pair<uint64, uint64> WeakHashLen32WithSeeds(
    const char* s, uint64 a, uint64 b) {
  return WeakHashLen32WithSeeds(Fetch64(s),
                                Fetch64(s + 8),
                                Fetch64(s + 16),
                                Fetch64(s + 24),
                                a,
                                b);
}

// Return an 8-byte hash for 33 to 64 bytes.
static uint64 HashLen33to64(const char *s, size_t len) {
  uint64 mul = k2 + len * 2;
  uint64 a = Fetch64(s) * k2;
  uint64 b = Fetch64(s + 8);
  uint64 c = Fetch64(s + len - 24);
  uint64 d = Fetch64(s + len - 32);
  uint64 e = Fetch64(s + 16) * k2;
  uint64 f = Fetch64(s + 24) * 9;
  uint64 g = Fetch64(s + len - 8);
  uint64 h = Fetch64(s + len - 16) * mul;
  uint64 u = Rotate(a + g, 43) + (Rotate(b, 30) + c) * 9;
  uint64 v = ((a + g) ^ d) + f + 1;
  uint64 w = bswap_64((u + v) * mul) + h;
  uint64 x = Rotate(e + f, 42) + c;
  uint64 y = (bswap_64((v + w) * mul) + g) * mul;
  uint64 z = e + f + c;
  a = bswap_64((x + z) * mul + y) + b;
  b = ShiftMix((z + a) * mul + d + h) * mul;
  return b + x;
}
// 64-bit CityHash of s[0..len-1].
uint64 CityHash64(const char *s, size_t len) {
  if (len <= 32) {
    if (len <= 16) {
      return HashLen0to16(s, len);
    } else {
      return HashLen17to32(s, len);
    }
  } else if (len <= 64) {
    return HashLen33to64(s, len);
  }

  // For strings over 64 bytes we hash the end first, and then as we
  // loop we keep 56 bytes of state: v, w, x, y, and z.
  uint64 x = Fetch64(s + len - 40);
  uint64 y = Fetch64(s + len - 16) + Fetch64(s + len - 56);
  uint64 z = HashLen16(Fetch64(s + len - 48) + len, Fetch64(s + len - 24));
  pair<uint64, uint64> v = WeakHashLen32WithSeeds(s + len - 64, len, z);
  pair<uint64, uint64> w = WeakHashLen32WithSeeds(s + len - 32, y + k1, x);
  x = x * k1 + Fetch64(s);

  // Decrease len to the nearest multiple of 64, and operate on 64-byte chunks.
  len = (len - 1) & ~static_cast<size_t>(63);
  do {
    x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1;
    y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1;
    x ^= w.second;
    y += v.first + Fetch64(s + 40);
    z = Rotate(z + w.first, 33) * k1;
    v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first);
    w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16));
    std::swap(z, x);
    s += 64;
    len -= 64;
  } while (len != 0);
  return HashLen16(HashLen16(v.first, w.first) + ShiftMix(y) * k1 + z,
                   HashLen16(v.second, w.second) + x);
}

// 64-bit CityHash with one 64-bit seed.
uint64 CityHash64WithSeed(const char *s, size_t len, uint64 seed) {
  return CityHash64WithSeeds(s, len, k2, seed);
}

// 64-bit CityHash with two 64-bit seeds.
uint64 CityHash64WithSeeds(const char *s, size_t len,
                           uint64 seed0, uint64 seed1) {
  return HashLen16(CityHash64(s, len) - seed0, seed1);
}
// A subroutine for CityHash128().  Returns a decent 128-bit hash for strings
// of any length representable in signed long.  Based on City and Murmur.
static uint128 CityMurmur(const char *s, size_t len, uint128 seed) {
  uint64 a = Uint128Low64(seed);
  uint64 b = Uint128High64(seed);
  uint64 c = 0;
  uint64 d = 0;
  signed long l = len - 16;
  if (l <= 0) {  // len <= 16
    a = ShiftMix(a * k1) * k1;
    c = b * k1 + HashLen0to16(s, len);
    d = ShiftMix(a + (len >= 8 ? Fetch64(s) : c));
  } else {  // len > 16
    c = HashLen16(Fetch64(s + len - 8) + k1, a);
    d = HashLen16(b + len, c + Fetch64(s + len - 16));
    a += d;
    // Consume 16 bytes per iteration.
    do {
      a ^= ShiftMix(Fetch64(s) * k1) * k1;
      a *= k1;
      b ^= a;
      c ^= ShiftMix(Fetch64(s + 8) * k1) * k1;
      c *= k1;
      d ^= c;
      s += 16;
      l -= 16;
    } while (l > 0);
  }
  a = HashLen16(a, c);
  b = HashLen16(d, b);
  return uint128(a ^ b, HashLen16(b, a));
}
// 128-bit CityHash of s[0..len) mixed with a caller-supplied 128-bit seed.
// Short inputs delegate to CityMurmur; longer inputs run a main loop that
// consumes 128 bytes per iteration over 56 bytes of rolling state
// (v, w, x, y, z), then folds in up to four 32-byte chunks from the tail,
// and finishes with two 56-byte-to-8-byte mixes.
uint128 CityHash128WithSeed(const char *s, size_t len, uint128 seed) {
  if (len < 128) {
    // The main loop below requires at least one full 128-byte group.
    return CityMurmur(s, len, seed);
  }

  // We expect len >= 128 to be the common case.  Keep 56 bytes of state:
  // v, w, x, y, and z.
  pair<uint64, uint64> v, w;
  uint64 x = Uint128Low64(seed);
  uint64 y = Uint128High64(seed);
  uint64 z = len * k1;
  v.first = Rotate(y ^ k1, 49) * k1 + Fetch64(s);
  v.second = Rotate(v.first, 42) * k1 + Fetch64(s + 8);
  w.first = Rotate(y + z, 35) * k1 + x;
  w.second = Rotate(x + Fetch64(s + 88), 53) * k1;

  // This is the same inner loop as CityHash64(), manually unrolled.
  // Each iteration of the do/while consumes two 64-byte halves (128 bytes).
  do {
    x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1;
    y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1;
    x ^= w.second;
    y += v.first + Fetch64(s + 40);
    z = Rotate(z + w.first, 33) * k1;
    v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first);
    w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16));
    std::swap(z, x);
    s += 64;
    x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1;
    y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1;
    x ^= w.second;
    y += v.first + Fetch64(s + 40);
    z = Rotate(z + w.first, 33) * k1;
    v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first);
    w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16));
    std::swap(z, x);
    s += 64;
    len -= 128;
  } while (LIKELY(len >= 128));
  x += Rotate(v.first + z, 49) * k0;
  y = y * k0 + Rotate(w.second, 37);
  z = z * k0 + Rotate(w.first, 27);
  w.first *= 9;
  v.first *= k0;

  // If 0 < len < 128, hash up to 4 chunks of 32 bytes each from the end of s.
  // (len was reduced by the loop above; s still points at the last full group,
  // so the tail is addressed relative to s + len.)
  for (size_t tail_done = 0; tail_done < len; ) {
    tail_done += 32;
    y = Rotate(x + y, 42) * k0 + v.second;
    w.first += Fetch64(s + len - tail_done + 16);
    x = x * k0 + w.first;
    z += w.second + Fetch64(s + len - tail_done);
    w.second += v.first;
    v = WeakHashLen32WithSeeds(s + len - tail_done, v.first + z, v.second);
    v.first *= k0;
  }

  // At this point our 56 bytes of state should contain more than
  // enough information for a strong 128-bit hash.  We use two
  // different 56-byte-to-8-byte hashes to get a 16-byte final result.
  x = HashLen16(x, v.first);
  y = HashLen16(y + z, w.first);
  return uint128(HashLen16(x + v.second, w.second) + y,
                 HashLen16(x + w.second, y + v.second));
}
// Hash s[0..len) to 128 bits.  When the input is long enough, its first
// 16 bytes are consumed as the seed and the remainder is hashed; otherwise
// a fixed constant seed is used over the whole input.
uint128 CityHash128(const char *s, size_t len) {
  if (len >= 16) {
    // Derive the seed from the leading 16 bytes, hash the rest.
    return CityHash128WithSeed(s + 16, len - 16,
                               uint128(Fetch64(s), Fetch64(s + 8) + k0));
  }
  // Too short to carve out a seed; fall back to fixed constants.
  return CityHash128WithSeed(s, len, uint128(k0, k1));
}

View File

@@ -1,17 +0,0 @@
#include "crucible/cleanup.h"
namespace crucible {

	// Capture the callable that will run when this guard goes out of scope.
	Cleanup::Cleanup(function<void()> func) :
		m_cleaner(func)
	{
	}

	// Invoke the stored callable (if one was provided) exactly once,
	// at destruction time.
	Cleanup::~Cleanup()
	{
		if (!m_cleaner) {
			return;
		}
		m_cleaner();
	}

}

View File

@@ -1,6 +0,0 @@
#ifndef _CONFIGURE_H
#define ETC_PREFIX "@ETC_PREFIX@"
#define _CONFIGURE_H
#endif

View File

@@ -1,31 +1,3 @@
/* crc64.c -- compute CRC-64
* Copyright (C) 2013 Mark Adler
* Version 1.4 16 Dec 2013 Mark Adler
*/
/*
This software is provided 'as-is', without any express or implied
warranty. In no event will the author be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
Mark Adler
madler@alumni.caltech.edu
*/
/* Substantially modified by Paul Jones for usage in bees */
#include "crucible/crc64.h"
#define POLY64REV 0xd800000000000000ULL
@@ -33,16 +5,13 @@
namespace crucible {
static bool init = false;
static uint64_t CRCTable[8][256];
static uint64_t CRCTable[256];
static void init_crc64_table()
{
if (!init) {
uint64_t crc;
// Generate CRCs for all single byte sequences
for (int n = 0; n < 256; n++) {
uint64_t part = n;
for (int i = 0; i <= 255; i++) {
uint64_t part = i;
for (int j = 0; j < 8; j++) {
if (part & 1) {
part = (part >> 1) ^ POLY64REV;
@@ -50,53 +19,37 @@ namespace crucible {
part >>= 1;
}
}
CRCTable[0][n] = part;
}
// Generate nested CRC table for slice-by-8 lookup
for (int n = 0; n < 256; n++) {
crc = CRCTable[0][n];
for (int k = 1; k < 8; k++) {
crc = CRCTable[0][crc & 0xff] ^ (crc >> 8);
CRCTable[k][n] = crc;
}
CRCTable[i] = part;
}
init = true;
}
}
uint64_t
Digest::CRC::crc64(const char *s)
{
init_crc64_table();
uint64_t crc = 0;
for (; *s; s++) {
uint64_t temp1 = crc >> 8;
uint64_t temp2 = CRCTable[(crc ^ static_cast<uint64_t>(*s)) & 0xff];
crc = temp1 ^ temp2;
}
return crc;
}
uint64_t
Digest::CRC::crc64(const void *p, size_t len)
{
init_crc64_table();
const unsigned char *next = static_cast<const unsigned char *>(p);
uint64_t crc = 0;
// Process individual bytes until we reach an 8-byte aligned pointer
while (len && (reinterpret_cast<uintptr_t>(next) & 7) != 0) {
crc = CRCTable[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
len--;
}
// Fast middle processing, 8 bytes (aligned!) per loop
while (len >= 8) {
crc ^= *(reinterpret_cast< const uint64_t *>(next));
crc = CRCTable[7][crc & 0xff] ^
CRCTable[6][(crc >> 8) & 0xff] ^
CRCTable[5][(crc >> 16) & 0xff] ^
CRCTable[4][(crc >> 24) & 0xff] ^
CRCTable[3][(crc >> 32) & 0xff] ^
CRCTable[2][(crc >> 40) & 0xff] ^
CRCTable[1][(crc >> 48) & 0xff] ^
CRCTable[0][crc >> 56];
next += 8;
len -= 8;
}
// Process remaining bytes (can't be larger than 8)
while (len) {
crc = CRCTable[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
len--;
for (const unsigned char *s = static_cast<const unsigned char *>(p); len; --len) {
uint64_t temp1 = crc >> 8;
uint64_t temp2 = CRCTable[(crc ^ *s++) & 0xff];
crc = temp1 ^ temp2;
}
return crc;

View File

@@ -32,7 +32,7 @@ namespace crucible {
// FIXME: could probably avoid some of these levels of indirection
static
function<void(string s)> current_catch_explainer = [](string s) {
function<void(string s)> current_catch_explainer = [&](string s) {
cerr << s << endl;
};

104
lib/execpipe.cc Normal file
View File

@@ -0,0 +1,104 @@
#include "crucible/execpipe.h"

#include "crucible/chatter.h"
#include "crucible/error.h"
#include "crucible/process.h"

#include <vector>

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/wait.h>
#include <unistd.h>
namespace crucible {
using namespace std;
// Install fd as the child's standard input.
void
redirect_stdin(const Fd &fd)
{
	dup2_or_die(fd, STDIN_FILENO);
}
// Install fd as both standard output and standard input.
void
redirect_stdin_stdout(const Fd &fd)
{
	dup2_or_die(fd, STDOUT_FILENO);
	dup2_or_die(fd, STDIN_FILENO);
}
// Install fd as standard error, standard output, and standard input.
void
redirect_stdin_stdout_stderr(const Fd &fd)
{
	dup2_or_die(fd, STDERR_FILENO);
	dup2_or_die(fd, STDOUT_FILENO);
	dup2_or_die(fd, STDIN_FILENO);
}
// Install fd as both standard error and standard output.
void
redirect_stdout_stderr(const Fd &fd)
{
	dup2_or_die(fd, STDERR_FILENO);
	dup2_or_die(fd, STDOUT_FILENO);
}
// Install fd as the child's standard output.
void
redirect_stdout(const Fd &fd)
{
	dup2_or_die(fd, STDOUT_FILENO);
}
// Install fd as the child's standard error.
void
redirect_stderr(const Fd &fd)
{
	dup2_or_die(fd, STDERR_FILENO);
}
// fork() a child that runs f() and terminates with its return value.
// A fresh socketpair is created; the child's end is handed to import_fd_fn
// (typically one of the redirect_* helpers above) so it can be installed on
// the child's standard descriptors.  Returns the parent's end of the
// socketpair; the child's end is closed in the parent.
Fd popen(function<int()> f, function<void(const Fd &child_fd)> import_fd_fn)
{
	Fd parent_fd, child_fd;
	{
		// Inner scope so the temporary pair drops its Fd references early.
		pair<Fd, Fd> fd_pair = socketpair_or_die();
		parent_fd = fd_pair.first;
		child_fd = fd_pair.second;
	}
	pid_t fv;
	DIE_IF_MINUS_ONE(fv = fork());
	if (fv) {
		// Parent: drop the child's end and hand back our end.
		child_fd->close();
		return parent_fd;
	} else {
		// Child: close the parent's end, install our end via the
		// callback, then run the payload.  catch_all keeps exceptions
		// from escaping the forked child; rv stays EXIT_FAILURE if
		// f() never ran.
		int rv = EXIT_FAILURE;
		catch_all([&]() {
			parent_fd->close();
			import_fd_fn(child_fd);
			// system("ls -l /proc/$$/fd/ >&2");
			rv = f();
		});
		_exit(rv);
		// NOTE(review): the lines below are unreachable after _exit();
		// presumably a defensive trap in case _exit somehow returns.
		cerr << "PID " << getpid() << " TID " << gettid() << "STILL ALIVE" << endl;
		system("ls -l /proc/$$/task/ >&2");
		exit(EXIT_FAILURE);
	}
}
// Read from fd until EOF, in chunks of chunk_bytes, and return everything
// read as a single string.  Throws out_of_range if the accumulated data
// would exceed max_bytes.
string
read_all(Fd fd, size_t max_bytes, size_t chunk_bytes)
{
	// Heap buffer instead of a variable-length array: VLAs are not
	// standard C++, and a large chunk_bytes could overflow the stack.
	vector<char> buf(chunk_bytes);
	string str;
	size_t rv;
	while (1) {
		read_partial_or_die(fd, static_cast<void *>(buf.data()), chunk_bytes, rv);
		if (rv == 0) {
			// EOF
			break;
		}
		// Written this way to avoid overflow in str.size() + rv.
		if (max_bytes - str.size() < rv) {
			THROW_ERROR(out_of_range, "Output size limit " << max_bytes << " exceeded by appending " << rv << " bytes read to " << str.size() << " already in string");
		}
		str.append(buf.data(), rv);
	}
	return str;
}
}

View File

@@ -6,40 +6,26 @@
#include "crucible/limits.h"
#include "crucible/string.h"
namespace crucible {
using namespace std;
const off_t ExtentWalker::sc_step_size;
// fm_start, fm_length, fm_flags, m_extents
// fe_logical, fe_physical, fe_length, fe_flags
static const off_t MAX_OFFSET = numeric_limits<off_t>::max();
static const off_t FIEMAP_BLOCK_SIZE = 4096;
// Maximum number of extents from TREE_SEARCH.
static const unsigned sc_extent_fetch_max = 16;
static bool __ew_do_log = getenv("EXTENTWALKER_DEBUG");
// Minimum number of extents from TREE_SEARCH.
// If we don't get this number, we'll binary search backward
// until we reach the beginning of the file or find at least this
// number of extents.
static const unsigned sc_extent_fetch_min = 4;
// This is a guess that tries to land at least one extent
// before the target extent, so we don't have to search backward as often.
static const off_t sc_back_step_size = 64 * 1024;
#ifdef EXTENTWALKER_DEBUG
#define EWLOG(x) do { \
m_log << x << endl; \
if (__ew_do_log) { \
CHATTER(x); \
} \
} while (0)
#define EWTRACE(x) do { \
CHATTER_UNWIND(x); \
} while (0)
#else
#define EWLOG(x) do {} while (0)
#define EWTRACE(x) do {} while (0)
#endif
ostream &
operator<<(ostream &os, const Extent &e)
{
@@ -57,7 +43,9 @@ namespace crucible {
if (e.m_flags & Extent::OBSCURED) {
os << "Extent::OBSCURED|";
}
os << fiemap_extent_flags_ntoa(e.m_flags & ~(Extent::HOLE | Extent::PREALLOC | Extent::OBSCURED));
if (e.m_flags & ~(Extent::HOLE | Extent::PREALLOC | Extent::OBSCURED)) {
os << fiemap_extent_flags_ntoa(e.m_flags & ~(Extent::HOLE | Extent::PREALLOC | Extent::OBSCURED));
}
if (e.m_physical_len) {
os << ", physical_len = " << to_hex(e.m_physical_len);
}
@@ -83,16 +71,23 @@ namespace crucible {
ostream &
operator<<(ostream &os, const ExtentWalker &ew)
{
os << "ExtentWalker {"
return os << "ExtentWalker {"
<< " fd = " << name_fd(ew.m_fd)
<< ", stat.st_size = " << to_hex(ew.m_stat.st_size)
<< ", extents = " << ew.m_extents
<< ", current = [" << ew.m_current - ew.m_extents.begin()
<< "] ";
#ifdef EXTENTWALKER_DEBUG
os << "\nLog:\n" << ew.m_log.str() << "\nEnd log";
#endif
return os << "}";
<< "] }";
}
Extent::Extent() :
m_begin(0),
m_end(0),
m_physical(0),
m_flags(0),
m_physical_len(0),
m_logical_len(0),
m_offset(0)
{
}
Extent::operator bool() const
@@ -114,18 +109,6 @@ namespace crucible {
return m_begin == that.m_begin && m_end == that.m_end && m_physical == that.m_physical && m_flags == that.m_flags;
}
bool
Extent::compressed() const
{
return m_flags & FIEMAP_EXTENT_ENCODED;
}
uint64_t
Extent::bytenr() const
{
return compressed() ? m_physical : m_physical - m_offset;
}
ExtentWalker::ExtentWalker(Fd fd) :
m_fd(fd),
m_current(m_extents.begin())
@@ -178,7 +161,8 @@ namespace crucible {
void
ExtentWalker::run_fiemap(off_t pos)
{
EWTRACE("Log of run_fiemap: " << m_log.str());
ostringstream log;
CHATTER_UNWIND("Log of run_fiemap: " << log.str());
EWLOG("pos = " << to_hex(pos));
@@ -186,24 +170,18 @@ namespace crucible {
Vec fm;
// Start backward search by dropping lowest bit
off_t step_size = (pos > 0) ? (pos ^ (pos & (pos - 1))) * 2 : 0;
// Start first pass through loop just a little before the target extent,
// because the first iteration will be wasted if we have an exact match.
off_t begin = pos - min(pos, sc_back_step_size);
off_t step_size = pos;
off_t begin = pos - min(pos, sc_step_size);
// This loop should not run forever
int loop_count = 0;
const int loop_limit = 99;
int loop_limit = 99;
while (true) {
#ifdef EXTENTWALKER_DEBUG
if (loop_count >= loop_limit) {
cerr << "Too many loops!" << endl << m_log.str() << endl;
abort();
if (loop_count == 90) {
EWLOG(log.str());
}
#endif
THROW_CHECK2(runtime_error, *this, loop_count, loop_count < loop_limit);
THROW_CHECK1(runtime_error, loop_count, loop_count < loop_limit);
++loop_count;
// Get file size every time in case it changes under us
@@ -211,16 +189,7 @@ namespace crucible {
// Get fiemap begin..EOF
fm = get_extent_map(begin);
EWLOG("fiemap result loop count #" << loop_count << " begin " << to_hex(begin) << " pos "
<< to_hex(pos) << " step_size " << to_hex(step_size) << ":\n" << fm);
// Sanity check on the data: in order, not overlapping, not empty, not before pos
off_t sanity_pos = begin;
for (auto const &i : fm) {
THROW_CHECK1(runtime_error, fm, i.begin() >= sanity_pos);
THROW_CHECK1(runtime_error, fm, i.end() > i.begin());
sanity_pos = i.end();
}
EWLOG("fiemap result loop count #" << loop_count << ":" << fm);
// This algorithm seeks at least three extents: one before,
// one after, and one containing pos. Files which contain
@@ -228,15 +197,15 @@ namespace crucible {
// so handle those cases separately.
// FIEMAP lies, and we catch it in a lie about the size of the
// second extent. To work around this, sc_extent_fetch_min is at least 4.
// second extent. To work around this, try getting more than 3.
// 0..2(ish) extents
if (fm.size() < sc_extent_fetch_min) {
// If we are not at beginning of file, move backward by zeroing the lowest bit
// If we are not at beginning of file, move backward
if (begin > 0) {
step_size = (begin > 0) ? (begin ^ (begin & (begin - 1))) : 0;
step_size /= 2;
auto next_begin = (begin - min(step_size, begin)) & ~(FIEMAP_BLOCK_SIZE - 1);
EWLOG("step backward " << to_hex(begin) << " -> " << to_hex(next_begin));
EWLOG("step backward " << to_hex(begin) << " -> " << to_hex(next_begin) << " extents size " << fm.size());
if (begin == next_begin) {
EWLOG("step backward stopped");
break;
@@ -264,18 +233,18 @@ namespace crucible {
// We have at least three extents, so there is now a first and last.
// We want pos to be between first and last. There doesn't have
// to be an extent between these (it could be a hole).
auto &first_extent = *fm.begin();
auto &first_extent = fm.at(sc_extent_fetch_min - 2);
auto &last_extent = *fm.rbegin();
EWLOG("first_extent = " << first_extent);
EWLOG("last_extent = " << last_extent);
// First extent must end on or before pos; otherwise, go further back
// First extent must end on or before pos
if (first_extent.end() > pos) {
// Can we move backward?
if (begin > 0) {
step_size = (begin > 0) ? (begin ^ (begin & (begin - 1))) : 0;
step_size /= 2;
auto next_begin = (begin - min(step_size, begin)) & ~(FIEMAP_BLOCK_SIZE - 1);
EWLOG("step backward " << to_hex(begin) << " -> " << to_hex(next_begin));
EWLOG("step backward " << to_hex(begin) << " -> " << to_hex(next_begin) << " extents size " << fm.size());
if (begin == next_begin) {
EWLOG("step backward stopped");
break;
@@ -285,29 +254,38 @@ namespace crucible {
}
// We are as far back as we can go, so there must be no
// extent before pos (i.e. file starts with a hole
// or first extent starts at pos 0).
// extent before pos (i.e. file starts with a hole).
EWLOG("no extent before pos");
break;
}
// If last extent is EOF then we cannot more any further forward.
// First extent ends on or before pos.
// If last extent is EOF then we have the entire file in the buffer.
// pos could be in last extent, so skip the later checks that
// insist pos be located prior to the last extent.
if (last_extent.flags() & FIEMAP_EXTENT_LAST) {
break;
}
// Don't have EOF, must have an extent after pos; otherwise, go forward
// Don't have EOF, must have an extent after pos.
if (last_extent.begin() <= pos) {
// Set the bit just below the one we last cleared
step_size /= 2;
auto new_begin = (begin + max(FIEMAP_BLOCK_SIZE, step_size)) & ~(FIEMAP_BLOCK_SIZE - 1);
auto new_begin = (begin + step_size) & ~(FIEMAP_BLOCK_SIZE - 1);
EWLOG("step forward " << to_hex(begin) << " -> " << to_hex(new_begin));
if (begin == new_begin) {
EWLOG("step forward stopped");
break;
}
begin = new_begin;
continue;
}
// Last extent begins after pos, first extent ends on or before pos.
// All other cases should have been handled before here.
THROW_CHECK2(runtime_error, pos, first_extent, first_extent.end() <= pos);
THROW_CHECK2(runtime_error, pos, last_extent, last_extent.begin() > pos);
// We should probably stop now
break;
}
@@ -322,11 +300,6 @@ namespace crucible {
while (fmi != fm.end()) {
Extent new_extent(*fmi);
THROW_CHECK2(runtime_error, ipos, new_extent.m_begin, ipos <= new_extent.m_begin);
// Don't map extents past EOF, we can't read them
if (new_extent.m_begin >= m_stat.st_size) {
last_extent_is_last = true;
break;
}
if (new_extent.m_begin > ipos) {
Extent hole_extent;
hole_extent.m_begin = ipos;
@@ -354,14 +327,12 @@ namespace crucible {
hole_extent.m_flags |= FIEMAP_EXTENT_LAST;
}
new_vec.push_back(hole_extent);
ipos += hole_extent.size();
ipos += new_vec.size();
}
// Extent list must now be non-empty, at least a hole
THROW_CHECK1(runtime_error, new_vec.size(), !new_vec.empty());
// ipos must match end of last extent
THROW_CHECK3(runtime_error, ipos, new_vec.rbegin()->m_end, m_stat.st_size, ipos == new_vec.rbegin()->m_end);
// Allow last extent to extend beyond desired range (e.g. at EOF)
THROW_CHECK2(runtime_error, ipos, new_vec.rbegin()->m_end, ipos <= new_vec.rbegin()->m_end);
// If we have the last extent in the file, truncate it to the file size.
if (ipos >= m_stat.st_size) {
THROW_CHECK2(runtime_error, new_vec.rbegin()->m_begin, m_stat.st_size, m_stat.st_size > new_vec.rbegin()->m_begin);
@@ -369,10 +340,9 @@ namespace crucible {
new_vec.rbegin()->m_end = m_stat.st_size;
}
// Verify at least one Extent
// Verify contiguous, ascending order, at least one Extent
THROW_CHECK1(runtime_error, new_vec, !new_vec.empty());
// Verify contiguous, ascending order, only extent with FIEMAP_EXTENT_LAST flag is the last extent
ipos = new_vec.begin()->m_begin;
bool last_flag_last = false;
for (auto e : new_vec) {
@@ -382,6 +352,7 @@ namespace crucible {
ipos += e.size();
last_flag_last = e.m_flags & FIEMAP_EXTENT_LAST;
}
THROW_CHECK1(runtime_error, new_vec, !last_extent_is_last || new_vec.rbegin()->m_end == ipos);
m_extents = new_vec;
m_current = m_extents.begin();
@@ -397,7 +368,7 @@ namespace crucible {
void
ExtentWalker::seek(off_t pos)
{
EWTRACE("seek " << to_hex(pos));
CHATTER_UNWIND("seek " << to_hex(pos));
THROW_CHECK1(out_of_range, pos, pos >= 0);
Itr rv = find_in_cache(pos);
if (rv != m_extents.end()) {
@@ -406,28 +377,29 @@ namespace crucible {
}
run_fiemap(pos);
m_current = find_in_cache(pos);
THROW_CHECK2(runtime_error, *this, to_hex(pos), m_current != m_extents.end());
}
Extent
ExtentWalker::current()
{
THROW_CHECK2(invalid_argument, *this, m_extents.size(), m_current != m_extents.end());
CHATTER_UNWIND("current " << *m_current);
return *m_current;
}
bool
ExtentWalker::next()
{
EWTRACE("next");
CHATTER_UNWIND("next");
THROW_CHECK1(invalid_argument, (m_current != m_extents.end()), m_current != m_extents.end());
if (current().m_end >= m_stat.st_size) {
EWTRACE("next EOF");
CHATTER_UNWIND("next EOF");
return false;
}
auto next_pos = current().m_end;
if (next_pos >= m_stat.st_size) {
EWTRACE("next next_pos = " << next_pos << " m_stat.st_size = " << m_stat.st_size);
CHATTER_UNWIND("next next_pos = " << next_pos << " m_stat.st_size = " << m_stat.st_size);
return false;
}
seek(next_pos);
@@ -445,16 +417,16 @@ namespace crucible {
bool
ExtentWalker::prev()
{
EWTRACE("prev");
CHATTER_UNWIND("prev");
THROW_CHECK1(invalid_argument, (m_current != m_extents.end()), m_current != m_extents.end());
auto prev_iter = m_current;
if (prev_iter->m_begin == 0) {
EWTRACE("prev BOF");
CHATTER_UNWIND("prev BOF");
return false;
}
THROW_CHECK1(invalid_argument, (prev_iter != m_extents.begin()), prev_iter != m_extents.begin());
--prev_iter;
EWTRACE("prev seeking to " << *prev_iter << "->m_begin");
CHATTER_UNWIND("prev seeking to " << *prev_iter << "->m_begin");
auto prev_end = current().m_begin;
seek(prev_iter->m_begin);
THROW_CHECK1(runtime_error, (m_current != m_extents.end()), m_current != m_extents.end());
@@ -513,7 +485,7 @@ namespace crucible {
sk.min_type = sk.max_type = BTRFS_EXTENT_DATA_KEY;
sk.nr_items = sc_extent_fetch_max;
EWTRACE("sk " << sk << " root_fd " << name_fd(m_root_fd));
CHATTER_UNWIND("sk " << sk << " root_fd " << name_fd(m_root_fd));
sk.do_ioctl(m_root_fd);
Vec rv;
@@ -539,38 +511,37 @@ namespace crucible {
Extent e;
e.m_begin = i.offset;
auto compressed = btrfs_get_member(&btrfs_file_extent_item::compression, i.m_data);
auto compressed = call_btrfs_get(btrfs_stack_file_extent_compression, i.m_data);
// FIEMAP told us about compressed extents and we can too
if (compressed) {
e.m_flags |= FIEMAP_EXTENT_ENCODED;
}
auto type = btrfs_get_member(&btrfs_file_extent_item::type, i.m_data);
auto type = call_btrfs_get(btrfs_stack_file_extent_type, i.m_data);
off_t len = -1;
switch (type) {
default:
switch (type) {
default:
cerr << "Unhandled file extent type " << type << " in root " << m_tree_id << " ino " << m_stat.st_ino << endl;
break;
case BTRFS_FILE_EXTENT_INLINE:
len = ranged_cast<off_t>(btrfs_get_member(&btrfs_file_extent_item::ram_bytes, i.m_data));
case BTRFS_FILE_EXTENT_INLINE:
len = ranged_cast<off_t>(call_btrfs_get(btrfs_stack_file_extent_ram_bytes, i.m_data));
e.m_flags |= FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED;
// Inline extents are never obscured, so don't bother filling in m_physical_len, etc.
break;
case BTRFS_FILE_EXTENT_PREALLOC:
break;
case BTRFS_FILE_EXTENT_PREALLOC:
e.m_flags |= Extent::PREALLOC;
// fallthrough
case BTRFS_FILE_EXTENT_REG: {
e.m_physical = btrfs_get_member(&btrfs_file_extent_item::disk_bytenr, i.m_data);
case BTRFS_FILE_EXTENT_REG: {
e.m_physical = call_btrfs_get(btrfs_stack_file_extent_disk_bytenr, i.m_data);
// This is the length of the full extent (decompressed)
off_t ram = ranged_cast<off_t>(btrfs_get_member(&btrfs_file_extent_item::ram_bytes, i.m_data));
off_t ram = ranged_cast<off_t>(call_btrfs_get(btrfs_stack_file_extent_ram_bytes, i.m_data));
// This is the length of the part of the extent appearing in the file (decompressed)
len = ranged_cast<off_t>(btrfs_get_member(&btrfs_file_extent_item::num_bytes, i.m_data));
len = ranged_cast<off_t>(call_btrfs_get(btrfs_stack_file_extent_num_bytes, i.m_data));
// This is the offset from start of on-disk extent to the part we see in the file (decompressed)
// May be negative due to the kind of bug we're stuck with forever, so no cast range check
off_t offset = btrfs_get_member(&btrfs_file_extent_item::offset, i.m_data);
off_t offset = call_btrfs_get(btrfs_stack_file_extent_offset, i.m_data);
// If there is a physical address there must be size too
if (e.m_physical) {
@@ -623,7 +594,7 @@ namespace crucible {
e.m_flags |= FIEMAP_EXTENT_LAST;
}
// FIXME: no FIEMAP_EXTENT_SHARED
// WONTFIX: non-trivial to replicate LOGICAL_INO
// WONTFIX: non-trivial to replicate LOGIAL_INO
rv.push_back(e);
}
}
@@ -639,8 +610,9 @@ namespace crucible {
ExtentWalker::Vec
ExtentWalker::get_extent_map(off_t pos)
{
EWLOG("get_extent_map(" << to_hex(pos) << ")");
Fiemap fm(ranged_cast<uint64_t>(pos), ranged_cast<uint64_t>(numeric_limits<off_t>::max() - pos));
Fiemap fm;
fm.fm_start = ranged_cast<uint64_t>(pos);
fm.fm_length = ranged_cast<uint64_t>(numeric_limits<off_t>::max() - pos);
fm.m_max_count = fm.m_min_count = sc_extent_fetch_max;
fm.do_ioctl(m_fd);
Vec rv;
@@ -651,9 +623,7 @@ namespace crucible {
e.m_physical = i.fe_physical;
e.m_flags = i.fe_flags;
rv.push_back(e);
EWLOG("push_back(" << e << ")");
}
EWLOG("get_extent_map(" << to_hex(pos) << ") returning " << rv.size() << " extents");
return rv;
}

152
lib/fd.cc
View File

@@ -107,6 +107,12 @@ namespace crucible {
}
}
IOHandle::IOHandle() :
m_fd(-1)
{
CHATTER_TRACE("open fd " << m_fd << " in " << this);
}
IOHandle::IOHandle(int fd) :
m_fd(fd)
{
@@ -114,52 +120,12 @@ namespace crucible {
}
int
IOHandle::get_fd() const
IOHandle::release_fd()
{
return m_fd;
}
NamedPtr<IOHandle, int> Fd::s_named_ptr([](int fd) { return make_shared<IOHandle>(fd); });
Fd::Fd() :
m_handle(s_named_ptr(-1))
{
}
Fd::Fd(int fd) :
m_handle(s_named_ptr(fd < 0 ? -1 : fd))
{
}
Fd &
Fd::operator=(int const fd)
{
m_handle = s_named_ptr(fd < 0 ? -1 : fd);
return *this;
}
Fd &
Fd::operator=(const shared_ptr<IOHandle> &handle)
{
m_handle = s_named_ptr.insert(handle, handle->get_fd());
return *this;
}
Fd::operator int() const
{
return m_handle->get_fd();
}
bool
Fd::operator!() const
{
return m_handle->get_fd() < 0;
}
shared_ptr<IOHandle>
Fd::operator->() const
{
return m_handle;
CHATTER_TRACE("release fd " << m_fd << " in " << this);
int rv = m_fd;
m_fd = -1;
return rv;
}
// XXX: necessary? useful?
@@ -208,13 +174,11 @@ namespace crucible {
static const struct bits_ntoa_table mmap_flags_table[] = {
NTOA_TABLE_ENTRY_BITS(MAP_SHARED),
NTOA_TABLE_ENTRY_BITS(MAP_PRIVATE),
#ifdef MAP_32BIT
NTOA_TABLE_ENTRY_BITS(MAP_32BIT),
#endif
NTOA_TABLE_ENTRY_BITS(MAP_ANONYMOUS),
NTOA_TABLE_ENTRY_BITS(MAP_DENYWRITE),
NTOA_TABLE_ENTRY_BITS(MAP_EXECUTABLE),
#ifdef MAP_FILE
#if MAP_FILE
NTOA_TABLE_ENTRY_BITS(MAP_FILE),
#endif
NTOA_TABLE_ENTRY_BITS(MAP_FIXED),
@@ -266,14 +230,6 @@ namespace crucible {
}
}
void
ftruncate_or_die(int fd, off_t size)
{
if (::ftruncate(fd, size)) {
THROW_ERRNO("ftruncate: " << name_fd(fd) << " size " << size);
}
}
string
socket_domain_ntoa(int domain)
{
@@ -361,11 +317,8 @@ namespace crucible {
THROW_ERROR(invalid_argument, "pwrite: trying to write on a closed file descriptor");
}
int rv = ::pwrite(fd, buf, size, offset);
if (rv < 0) {
THROW_ERRNO("pwrite: could not write " << size << " bytes at fd " << name_fd(fd) << " offset " << offset);
}
if (rv != static_cast<ssize_t>(size)) {
THROW_ERROR(runtime_error, "pwrite: only " << rv << " of " << size << " bytes written at fd " << name_fd(fd) << " offset " << offset);
if (rv != static_cast<int>(size)) {
THROW_ERROR(runtime_error, "pwrite: only " << rv << " of " << size << " bytes written at offset " << offset);
}
}
@@ -395,7 +348,7 @@ namespace crucible {
}
THROW_ERRNO("read: " << size << " bytes");
}
if (rv > static_cast<ssize_t>(size)) {
if (rv > static_cast<int>(size)) {
THROW_ERROR(runtime_error, "read: somehow read more bytes (" << rv << ") than requested (" << size << ")");
}
if (rv == 0) break;
@@ -444,8 +397,8 @@ namespace crucible {
}
THROW_ERRNO("pread: " << size << " bytes");
}
if (rv != static_cast<ssize_t>(size)) {
THROW_ERROR(runtime_error, "pread: " << size << " bytes at fd " << name_fd(fd) << " offset " << offset << " returned " << rv);
if (rv != static_cast<int>(size)) {
THROW_ERROR(runtime_error, "pread: " << size << " bytes at offset " << offset << " returned " << rv);
}
break;
}
@@ -461,28 +414,21 @@ namespace crucible {
template<>
void
pread_or_die<ByteVector>(int fd, ByteVector &text, off_t offset)
pread_or_die<vector<char>>(int fd, vector<char> &text, off_t offset)
{
return pread_or_die(fd, text.data(), text.size(), offset);
}
template<>
void
pwrite_or_die<ByteVector>(int fd, const ByteVector &text, off_t offset)
pread_or_die<vector<uint8_t>>(int fd, vector<uint8_t> &text, off_t offset)
{
return pwrite_or_die(fd, text.data(), text.size(), offset);
return pread_or_die(fd, text.data(), text.size(), offset);
}
template<>
void
pwrite_or_die<string>(int fd, const string &text, off_t offset)
{
return pwrite_or_die(fd, text.data(), text.size(), offset);
}
Stat::Stat() :
stat( (stat) { } )
Stat::Stat()
{
memset_zero<stat>(this);
}
Stat &
@@ -501,39 +447,18 @@ namespace crucible {
return *this;
}
Stat::Stat(int fd) :
stat( (stat) { } )
Stat::Stat(int fd)
{
memset_zero<stat>(this);
fstat(fd);
}
Stat::Stat(const string &filename) :
stat( (stat) { } )
Stat::Stat(const string &filename)
{
memset_zero<stat>(this);
lstat(filename);
}
int
ioctl_iflags_get(int fd)
{
int attr = 0;
DIE_IF_MINUS_ONE(ioctl(fd, FS_IOC_GETFLAGS, &attr));
return attr;
}
void
ioctl_iflags_set(int fd, int attr)
{
// This bit of nonsense brought to you by Valgrind.
union {
int attr;
long zero;
} u;
u.zero = 0;
u.attr = attr;
DIE_IF_MINUS_ONE(ioctl(fd, FS_IOC_SETFLAGS, &u.attr));
}
string
readlink_or_die(const string &path)
{
@@ -559,24 +484,6 @@ namespace crucible {
THROW_ERROR(runtime_error, "readlink: maximum buffer size exceeded");
}
static string __relative_path;
string
relative_path()
{
return __relative_path;
}
void
set_relative_path(string path)
{
path = path + "/";
for (string::size_type i = path.find("//"); i != string::npos; i = path.find("//")) {
path.erase(i, 1);
}
__relative_path = path;
}
// Turn a FD into a human-recognizable filename OR an error message.
string
name_fd(int fd)
@@ -584,12 +491,7 @@ namespace crucible {
try {
ostringstream oss;
oss << "/proc/self/fd/" << fd;
string path = readlink_or_die(oss.str());
if (!__relative_path.empty() && 0 == path.find(__relative_path))
{
path.erase(0, __relative_path.length());
}
return path;
return readlink_or_die(oss.str());
} catch (exception &e) {
return string(e.what());
}

599
lib/fs.cc
View File

@@ -2,10 +2,10 @@
#include "crucible/error.h"
#include "crucible/fd.h"
#include "crucible/hexdump.h"
#include "crucible/limits.h"
#include "crucible/ntoa.h"
#include "crucible/string.h"
#include "crucible/uuid.h"
// FS_IOC_FIEMAP
#include <linux/fs.h>
@@ -33,11 +33,19 @@ namespace crucible {
#endif
}
BtrfsExtentInfo::BtrfsExtentInfo(int dst_fd, off_t dst_offset)
{
memset_zero<btrfs_ioctl_same_extent_info>(this);
fd = dst_fd;
logical_offset = dst_offset;
}
BtrfsExtentSame::BtrfsExtentSame(int src_fd, off_t src_offset, off_t src_length) :
m_logical_offset(src_offset),
m_length(src_length),
m_fd(src_fd)
{
memset_zero<btrfs_ioctl_same_args>(this);
logical_offset = src_offset;
length = src_length;
}
BtrfsExtentSame::~BtrfsExtentSame()
@@ -45,12 +53,9 @@ namespace crucible {
}
void
BtrfsExtentSame::add(int const fd, uint64_t const offset)
BtrfsExtentSame::add(int fd, off_t offset)
{
m_info.push_back( (btrfs_ioctl_same_extent_info) {
.fd = fd,
.logical_offset = offset,
});
m_info.push_back(BtrfsExtentInfo(fd, offset));
}
ostream &
@@ -107,8 +112,11 @@ namespace crucible {
os << " '" << fd_name << "'";
});
}
os << ", .logical_offset = " << to_hex(bes.m_logical_offset);
os << ", .length = " << to_hex(bes.m_length);
os << ", .logical_offset = " << to_hex(bes.logical_offset);
os << ", .length = " << to_hex(bes.length);
os << ", .dest_count = " << bes.dest_count;
os << ", .reserved1 = " << bes.reserved1;
os << ", .reserved2 = " << bes.reserved2;
os << ", .info[] = {";
for (size_t i = 0; i < bes.m_info.size(); ++i) {
os << " [" << i << "] = " << &(bes.m_info[i]) << ",";
@@ -119,25 +127,67 @@ namespace crucible {
void
btrfs_clone_range(int src_fd, off_t src_offset, off_t src_length, int dst_fd, off_t dst_offset)
{
btrfs_ioctl_clone_range_args args ( (btrfs_ioctl_clone_range_args) {
.src_fd = src_fd,
.src_offset = ranged_cast<uint64_t, off_t>(src_offset),
.src_length = ranged_cast<uint64_t, off_t>(src_length),
.dest_offset = ranged_cast<uint64_t, off_t>(dst_offset),
} );
struct btrfs_ioctl_clone_range_args args;
memset_zero(&args);
args.src_fd = src_fd;
args.src_offset = src_offset;
args.src_length = src_length;
args.dest_offset = dst_offset;
DIE_IF_MINUS_ONE(ioctl(dst_fd, BTRFS_IOC_CLONE_RANGE, &args));
}
// Userspace emulation of extent-same ioctl to work around kernel bugs
// (a memory leak, a deadlock, inability to cope with unaligned EOF, and a length limit)
// The emulation is incomplete: no locking, and we always change ctime
void
BtrfsExtentSameByClone::do_ioctl()
{
if (length <= 0) {
throw out_of_range(string("length = 0 in ") + __PRETTY_FUNCTION__);
}
vector<char> cmp_buf_common(length);
vector<char> cmp_buf_iter(length);
pread_or_die(m_fd, cmp_buf_common.data(), length, logical_offset);
for (auto i = m_info.begin(); i != m_info.end(); ++i) {
i->status = -EIO;
i->bytes_deduped = 0;
// save atime/ctime for later
Stat target_stat(i->fd);
pread_or_die(i->fd, cmp_buf_iter.data(), length, i->logical_offset);
if (cmp_buf_common == cmp_buf_iter) {
// This never happens, so stop checking.
// assert(!memcmp(cmp_buf_common.data(), cmp_buf_iter.data(), length));
btrfs_clone_range(m_fd, logical_offset, length, i->fd, i->logical_offset);
i->status = 0;
i->bytes_deduped = length;
// The extent-same ioctl does not change mtime (as of patch v4)
struct timespec restore_ts[2] = {
target_stat.st_atim,
target_stat.st_mtim
};
// Ignore futimens failure as the real extent-same ioctl would never raise it
futimens(i->fd, restore_ts);
} else {
assert(memcmp(cmp_buf_common.data(), cmp_buf_iter.data(), length));
i->status = BTRFS_SAME_DATA_DIFFERS;
}
}
}
void
BtrfsExtentSame::do_ioctl()
{
const size_t buf_size = sizeof(btrfs_ioctl_same_args) + m_info.size() * sizeof(btrfs_ioctl_same_extent_info);
ByteVector ioctl_arg( (btrfs_ioctl_same_args) {
.logical_offset = m_logical_offset,
.length = m_length,
.dest_count = ranged_cast<decltype(btrfs_ioctl_same_args::dest_count)>(m_info.size()),
}, buf_size);
btrfs_ioctl_same_args *const ioctl_ptr = ioctl_arg.get<btrfs_ioctl_same_args>();
dest_count = m_info.size();
vector<char> ioctl_arg = vector_copy_struct<btrfs_ioctl_same_args>(this);
ioctl_arg.resize(sizeof(btrfs_ioctl_same_args) + dest_count * sizeof(btrfs_ioctl_same_extent_info), 0);
btrfs_ioctl_same_args *ioctl_ptr = reinterpret_cast<btrfs_ioctl_same_args *>(ioctl_arg.data());
size_t count = 0;
for (auto i = m_info.cbegin(); i != m_info.cend(); ++i) {
ioctl_ptr->info[count] = static_cast<const btrfs_ioctl_same_extent_info &>(m_info[count]);
@@ -159,13 +209,12 @@ namespace crucible {
{
THROW_CHECK1(invalid_argument, src_length, src_length > 0);
while (src_length > 0) {
BtrfsExtentSame bes(src_fd, src_offset, src_length);
off_t length = min(off_t(BTRFS_MAX_DEDUPE_LEN), src_length);
BtrfsExtentSame bes(src_fd, src_offset, length);
bes.add(dst_fd, dst_offset);
bes.do_ioctl();
const auto status = bes.m_info.at(0).status;
auto status = bes.m_info.at(0).status;
if (status == 0) {
const off_t length = bes.m_info.at(0).bytes_deduped;
THROW_CHECK0(invalid_argument, length > 0);
src_offset += length;
dst_offset += length;
src_length -= length;
@@ -184,22 +233,23 @@ namespace crucible {
}
BtrfsDataContainer::BtrfsDataContainer(size_t buf_size) :
m_data(buf_size)
m_data(buf_size, 0)
{
}
void *
BtrfsDataContainer::prepare(size_t container_size)
BtrfsDataContainer::prepare()
{
const size_t min_size = offsetof(btrfs_data_container, val);
btrfs_data_container *p = reinterpret_cast<btrfs_data_container *>(m_data.data());
size_t min_size = offsetof(btrfs_data_container, val);
size_t container_size = m_data.size();
if (container_size < min_size) {
THROW_ERROR(out_of_range, "container size " << container_size << " smaller than minimum " << min_size);
}
if (m_data.size() < container_size) {
m_data = ByteVector(container_size);
}
const auto p = m_data.get<btrfs_data_container>();
*p = (btrfs_data_container) { };
p->bytes_left = 0;
p->bytes_missing = 0;
p->elem_cnt = 0;
p->elem_missed = 0;
return p;
}
@@ -212,29 +262,25 @@ namespace crucible {
decltype(btrfs_data_container::bytes_left)
BtrfsDataContainer::get_bytes_left() const
{
const auto p = m_data.get<btrfs_data_container>();
return p->bytes_left;
return bytes_left;
}
decltype(btrfs_data_container::bytes_missing)
BtrfsDataContainer::get_bytes_missing() const
{
const auto p = m_data.get<btrfs_data_container>();
return p->bytes_missing;
return bytes_missing;
}
decltype(btrfs_data_container::elem_cnt)
BtrfsDataContainer::get_elem_cnt() const
{
const auto p = m_data.get<btrfs_data_container>();
return p->elem_cnt;
return elem_cnt;
}
decltype(btrfs_data_container::elem_missed)
BtrfsDataContainer::get_elem_missed() const
{
const auto p = m_data.get<btrfs_data_container>();
return p->elem_missed;
return elem_missed;
}
ostream &
@@ -244,7 +290,7 @@ namespace crucible {
return os << "BtrfsIoctlLogicalInoArgs NULL";
}
os << "BtrfsIoctlLogicalInoArgs {";
os << " .m_logical = " << to_hex(p->m_logical);
os << " .logical = " << to_hex(p->logical);
os << " .inodes[] = {\n";
unsigned count = 0;
for (auto i = p->m_iors.cbegin(); i != p->m_iors.cend(); ++i) {
@@ -255,134 +301,33 @@ namespace crucible {
}
BtrfsIoctlLogicalInoArgs::BtrfsIoctlLogicalInoArgs(uint64_t new_logical, size_t new_size) :
m_container_size(new_size),
m_container(new_size),
m_logical(new_logical)
m_container(new_size)
{
}
size_t
BtrfsIoctlLogicalInoArgs::BtrfsInodeOffsetRootSpan::size() const
{
return m_end - m_begin;
}
BtrfsIoctlLogicalInoArgs::BtrfsInodeOffsetRootSpan::const_iterator
BtrfsIoctlLogicalInoArgs::BtrfsInodeOffsetRootSpan::cbegin() const
{
return m_begin;
}
BtrfsIoctlLogicalInoArgs::BtrfsInodeOffsetRootSpan::const_iterator
BtrfsIoctlLogicalInoArgs::BtrfsInodeOffsetRootSpan::cend() const
{
return m_end;
}
BtrfsIoctlLogicalInoArgs::BtrfsInodeOffsetRootSpan::iterator
BtrfsIoctlLogicalInoArgs::BtrfsInodeOffsetRootSpan::begin() const
{
return m_begin;
}
BtrfsIoctlLogicalInoArgs::BtrfsInodeOffsetRootSpan::iterator
BtrfsIoctlLogicalInoArgs::BtrfsInodeOffsetRootSpan::end() const
{
return m_end;
}
BtrfsIoctlLogicalInoArgs::BtrfsInodeOffsetRootSpan::iterator
BtrfsIoctlLogicalInoArgs::BtrfsInodeOffsetRootSpan::data() const
{
return m_begin;
}
void
BtrfsIoctlLogicalInoArgs::BtrfsInodeOffsetRootSpan::clear()
{
m_end = m_begin = nullptr;
}
void
BtrfsIoctlLogicalInoArgs::set_flags(uint64_t new_flags)
{
m_flags = new_flags;
}
uint64_t
BtrfsIoctlLogicalInoArgs::get_flags() const
{
// We are still supporting building with old headers that don't have .flags yet
return m_flags;
}
void
BtrfsIoctlLogicalInoArgs::set_logical(uint64_t new_logical)
{
m_logical = new_logical;
}
void
BtrfsIoctlLogicalInoArgs::set_size(uint64_t new_size)
{
m_container_size = new_size;
memset_zero<btrfs_ioctl_logical_ino_args>(this);
logical = new_logical;
}
bool
BtrfsIoctlLogicalInoArgs::do_ioctl_nothrow(int fd)
{
btrfs_ioctl_logical_ino_args args = (btrfs_ioctl_logical_ino_args) {
.logical = m_logical,
.size = m_container_size,
.inodes = reinterpret_cast<uintptr_t>(m_container.prepare(m_container_size)),
};
// We are still supporting building with old headers that don't have .flags yet
*(&args.reserved[0] + 3) = m_flags;
btrfs_ioctl_logical_ino_args *const p = &args;
btrfs_ioctl_logical_ino_args *p = static_cast<btrfs_ioctl_logical_ino_args *>(this);
inodes = reinterpret_cast<uint64_t>(m_container.prepare());
size = m_container.get_size();
m_iors.clear();
static unsigned long bili_version = 0;
if (get_flags() == 0) {
// Could use either V1 or V2
if (bili_version) {
// We tested both versions and came to a decision
if (ioctl(fd, bili_version, p)) {
return false;
}
} else {
// Try V2
if (ioctl(fd, BTRFS_IOC_LOGICAL_INO_V2, p)) {
// V2 failed, try again with V1
if (ioctl(fd, BTRFS_IOC_LOGICAL_INO, p)) {
// both V1 and V2 failed, doesn't tell us which one to choose
return false;
}
// V1 and V2 both tested with same arguments, V1 OK, and V2 failed
bili_version = BTRFS_IOC_LOGICAL_INO;
} else {
// V2 succeeded, don't use V1 any more
bili_version = BTRFS_IOC_LOGICAL_INO_V2;
}
}
} else {
// Flags/size require a V2 feature, no fallback to V1 possible
if (ioctl(fd, BTRFS_IOC_LOGICAL_INO_V2, p)) {
return false;
}
// V2 succeeded so we don't need to probe any more
bili_version = BTRFS_IOC_LOGICAL_INO_V2;
if (ioctl(fd, BTRFS_IOC_LOGICAL_INO, p)) {
return false;
}
btrfs_data_container *const bdc = reinterpret_cast<btrfs_data_container *>(p->inodes);
BtrfsInodeOffsetRoot *const ior_iter = reinterpret_cast<BtrfsInodeOffsetRoot *>(bdc->val);
btrfs_data_container *bdc = reinterpret_cast<btrfs_data_container *>(p->inodes);
BtrfsInodeOffsetRoot *input_iter = reinterpret_cast<BtrfsInodeOffsetRoot *>(bdc->val);
m_iors.reserve(bdc->elem_cnt);
for (auto count = bdc->elem_cnt; count > 2; count -= 3) {
m_iors.push_back(*input_iter++);
}
// elem_cnt counts uint64_t, but BtrfsInodeOffsetRoot is 3x uint64_t
THROW_CHECK1(runtime_error, bdc->elem_cnt, bdc->elem_cnt % 3 == 0);
m_iors.m_begin = ior_iter;
m_iors.m_end = ior_iter + bdc->elem_cnt / 3;
return true;
}
@@ -405,10 +350,9 @@ namespace crucible {
}
BtrfsIoctlInoPathArgs::BtrfsIoctlInoPathArgs(uint64_t inode, size_t new_size) :
btrfs_ioctl_ino_path_args( (btrfs_ioctl_ino_path_args) { } ),
m_container_size(new_size)
m_container(new_size)
{
assert(inum == 0);
memset_zero<btrfs_ioctl_ino_path_args>(this);
inum = inode;
}
@@ -416,9 +360,8 @@ namespace crucible {
BtrfsIoctlInoPathArgs::do_ioctl_nothrow(int fd)
{
btrfs_ioctl_ino_path_args *p = static_cast<btrfs_ioctl_ino_path_args *>(this);
BtrfsDataContainer container(m_container_size);
fspath = reinterpret_cast<uintptr_t>(container.prepare(m_container_size));
size = container.get_size();
fspath = reinterpret_cast<uint64_t>(m_container.prepare());
size = m_container.get_size();
m_paths.clear();
@@ -426,16 +369,16 @@ namespace crucible {
return false;
}
btrfs_data_container *const bdc = reinterpret_cast<btrfs_data_container *>(p->fspath);
btrfs_data_container *bdc = reinterpret_cast<btrfs_data_container *>(p->fspath);
m_paths.reserve(bdc->elem_cnt);
const uint64_t *up = reinterpret_cast<const uint64_t *>(bdc->val);
const char *const cp = reinterpret_cast<const char *>(bdc->val);
const char *cp = reinterpret_cast<const char *>(bdc->val);
for (auto count = bdc->elem_cnt; count > 0; --count) {
const char *const path = cp + *up++;
if (static_cast<size_t>(path - cp) > container.get_size()) {
THROW_ERROR(out_of_range, "offset " << (path - cp) << " > size " << container.get_size() << " in " << __PRETTY_FUNCTION__);
const char *path = cp + *up++;
if (static_cast<size_t>(path - cp) > m_container.get_size()) {
THROW_ERROR(out_of_range, "offset " << (path - cp) << " > size " << m_container.get_size() << " in " << __PRETTY_FUNCTION__);
}
m_paths.push_back(string(path));
}
@@ -468,10 +411,9 @@ namespace crucible {
return os;
}
BtrfsIoctlInoLookupArgs::BtrfsIoctlInoLookupArgs(uint64_t new_objectid) :
btrfs_ioctl_ino_lookup_args( (btrfs_ioctl_ino_lookup_args) { } )
BtrfsIoctlInoLookupArgs::BtrfsIoctlInoLookupArgs(uint64_t new_objectid)
{
assert(objectid == 0);
memset_zero<btrfs_ioctl_ino_lookup_args>(this);
objectid = new_objectid;
}
@@ -489,9 +431,9 @@ namespace crucible {
}
}
BtrfsIoctlDefragRangeArgs::BtrfsIoctlDefragRangeArgs() :
btrfs_ioctl_defrag_range_args( (btrfs_ioctl_defrag_range_args) { } )
BtrfsIoctlDefragRangeArgs::BtrfsIoctlDefragRangeArgs()
{
memset_zero<btrfs_ioctl_defrag_range_args>(this);
}
bool
@@ -521,13 +463,11 @@ namespace crucible {
}
string
btrfs_compress_type_ntoa(uint8_t compress_type)
btrfs_ioctl_defrag_range_compress_type_ntoa(uint32_t compress_type)
{
static const bits_ntoa_table table[] = {
NTOA_TABLE_ENTRY_ENUM(BTRFS_COMPRESS_NONE),
NTOA_TABLE_ENTRY_ENUM(BTRFS_COMPRESS_ZLIB),
NTOA_TABLE_ENTRY_ENUM(BTRFS_COMPRESS_LZO),
NTOA_TABLE_ENTRY_ENUM(BTRFS_COMPRESS_ZSTD),
NTOA_TABLE_ENTRY_END()
};
return bits_ntoa(compress_type, table);
@@ -544,14 +484,14 @@ namespace crucible {
os << " .len = " << p->len;
os << " .flags = " << btrfs_ioctl_defrag_range_flags_ntoa(p->flags);
os << " .extent_thresh = " << p->extent_thresh;
os << " .compress_type = " << btrfs_compress_type_ntoa(p->compress_type);
os << " .compress_type = " << btrfs_ioctl_defrag_range_compress_type_ntoa(p->compress_type);
os << " .unused[4] = { " << p->unused[0] << ", " << p->unused[1] << ", " << p->unused[2] << ", " << p->unused[3] << "} }";
return os;
}
FiemapExtent::FiemapExtent() :
fiemap_extent( (fiemap_extent) { } )
FiemapExtent::FiemapExtent()
{
memset_zero<fiemap_extent>(this);
}
FiemapExtent::FiemapExtent(const fiemap_extent &that)
@@ -658,10 +598,13 @@ namespace crucible {
operator<<(ostream &os, const Fiemap &args)
{
os << "Fiemap {";
os << " .m_start = " << to_hex(args.m_start) << ".." << to_hex(args.m_start + args.m_length);
os << ", .m_length = " << to_hex(args.m_length);
os << ", .m_flags = " << fiemap_flags_ntoa(args.m_flags);
os << ", .fm_extents[" << args.m_extents.size() << "] = {";
os << " .fm_start = " << to_hex(args.fm_start) << ".." << to_hex(args.fm_start + args.fm_length);
os << ", .fm_length = " << to_hex(args.fm_length);
if (args.fm_flags) os << ", .fm_flags = " << fiemap_flags_ntoa(args.fm_flags);
os << ", .fm_mapped_extents = " << args.fm_mapped_extents;
os << ", .fm_extent_count = " << args.fm_extent_count;
if (args.fm_reserved) os << ", .fm_reserved = " << args.fm_reserved;
os << ", .fm_extents[] = {";
size_t count = 0;
for (auto i = args.m_extents.cbegin(); i != args.m_extents.cend(); ++i) {
os << "\n\t[" << count++ << "] = " << &(*i) << ",";
@@ -669,35 +612,41 @@ namespace crucible {
return os << "\n}";
}
Fiemap::Fiemap(uint64_t start, uint64_t length) :
m_start(start),
m_length(length)
Fiemap::Fiemap(uint64_t start, uint64_t length)
{
memset_zero<fiemap>(this);
fm_start = start;
fm_length = length;
// FIEMAP is slow and full of lines.
// This makes FIEMAP even slower, but reduces the lies a little.
fm_flags = FIEMAP_FLAG_SYNC;
}
void
Fiemap::do_ioctl(int fd)
{
THROW_CHECK1(out_of_range, m_min_count, m_min_count <= m_max_count);
THROW_CHECK1(out_of_range, m_min_count, m_min_count > 0);
CHECK_CONSTRAINT(m_min_count, m_min_count <= m_max_count);
const auto extent_count = m_min_count;
ByteVector ioctl_arg(sizeof(fiemap) + extent_count * sizeof(fiemap_extent));
auto extent_count = m_min_count;
vector<char> ioctl_arg = vector_copy_struct<fiemap>(this);
fiemap *const ioctl_ptr = ioctl_arg.get<fiemap>();
ioctl_arg.resize(sizeof(fiemap) + extent_count * sizeof(fiemap_extent), 0);
auto start = m_start;
const auto end = m_start + m_length;
fiemap *ioctl_ptr = reinterpret_cast<fiemap *>(ioctl_arg.data());
auto start = fm_start;
auto end = fm_start + fm_length;
auto orig_start = fm_start;
auto orig_length = fm_length;
vector<FiemapExtent> extents;
while (start < end && extents.size() < m_max_count) {
*ioctl_ptr = (fiemap) {
.fm_start = start,
.fm_length = end - start,
.fm_flags = m_flags,
.fm_extent_count = extent_count,
};
ioctl_ptr->fm_start = start;
ioctl_ptr->fm_length = end - start;
ioctl_ptr->fm_extent_count = extent_count;
ioctl_ptr->fm_mapped_extents = 0;
// cerr << "Before (fd = " << fd << ") : " << ioctl_ptr << endl;
DIE_IF_MINUS_ONE(ioctl(fd, FS_IOC_FIEMAP, ioctl_ptr));
@@ -723,107 +672,68 @@ namespace crucible {
}
}
fiemap *this_ptr = static_cast<fiemap *>(this);
*this_ptr = *ioctl_ptr;
fm_start = orig_start;
fm_length = orig_length;
fm_extent_count = extents.size();
m_extents = extents;
}
BtrfsIoctlSearchKey::BtrfsIoctlSearchKey(size_t buf_size) :
btrfs_ioctl_search_key( (btrfs_ioctl_search_key) {
.max_objectid = numeric_limits<decltype(max_objectid)>::max(),
.max_offset = numeric_limits<decltype(max_offset)>::max(),
.max_transid = numeric_limits<decltype(max_transid)>::max(),
.max_type = numeric_limits<decltype(max_type)>::max(),
.nr_items = 1,
}),
m_buf_size(buf_size)
{
memset_zero<btrfs_ioctl_search_key>(this);
max_objectid = numeric_limits<decltype(max_objectid)>::max();
max_offset = numeric_limits<decltype(max_offset)>::max();
max_transid = numeric_limits<decltype(max_transid)>::max();
max_type = numeric_limits<decltype(max_type)>::max();
nr_items = numeric_limits<decltype(nr_items)>::max();
}
BtrfsIoctlSearchHeader::BtrfsIoctlSearchHeader() :
btrfs_ioctl_search_header( (btrfs_ioctl_search_header) { } )
BtrfsIoctlSearchHeader::BtrfsIoctlSearchHeader()
{
memset_zero<btrfs_ioctl_search_header>(this);
}
size_t
BtrfsIoctlSearchHeader::set_data(const ByteVector &v, size_t offset)
BtrfsIoctlSearchHeader::set_data(const vector<char> &v, size_t offset)
{
THROW_CHECK2(invalid_argument, offset, v.size(), offset + sizeof(btrfs_ioctl_search_header) <= v.size());
memcpy(static_cast<btrfs_ioctl_search_header *>(this), &v[offset], sizeof(btrfs_ioctl_search_header));
memcpy(this, &v[offset], sizeof(btrfs_ioctl_search_header));
offset += sizeof(btrfs_ioctl_search_header);
THROW_CHECK2(invalid_argument, offset + len, v.size(), offset + len <= v.size());
m_data = ByteVector(v, offset, len);
m_data = vector<char>(&v[offset], &v[offset + len]);
return offset + len;
}
thread_local size_t BtrfsIoctlSearchKey::s_calls = 0;
thread_local size_t BtrfsIoctlSearchKey::s_loops = 0;
thread_local size_t BtrfsIoctlSearchKey::s_loops_empty = 0;
thread_local shared_ptr<ostream> BtrfsIoctlSearchKey::s_debug_ostream;
bool
BtrfsIoctlSearchKey::do_ioctl_nothrow(int fd)
{
// It would be really nice if the kernel tells us whether our
// buffer overflowed or how big the overflowing object
// was; instead, we have to guess.
vector<char> ioctl_arg = vector_copy_struct<btrfs_ioctl_search_key>(this);
ioctl_arg.resize(sizeof(btrfs_ioctl_search_args_v2) + m_buf_size, 0);
btrfs_ioctl_search_args_v2 *ioctl_ptr = reinterpret_cast<btrfs_ioctl_search_args_v2 *>(ioctl_arg.data());
ioctl_ptr->buf_size = m_buf_size;
// Don't bother supporting V1. Kernels that old have other problems.
int rv = ioctl(fd, BTRFS_IOC_TREE_SEARCH_V2, ioctl_ptr);
if (rv != 0) {
return false;
}
static_cast<btrfs_ioctl_search_key&>(*this) = ioctl_ptr->key;
m_result.clear();
// Make sure there is space for at least the search key and one (empty) header
size_t buf_size = max(m_buf_size, sizeof(btrfs_ioctl_search_args_v2) + sizeof(btrfs_ioctl_search_header));
ByteVector ioctl_arg;
btrfs_ioctl_search_args_v2 *ioctl_ptr;
do {
// ioctl buffer size does not include search key header or buffer size
ioctl_arg = ByteVector(buf_size + sizeof(btrfs_ioctl_search_args_v2));
ioctl_ptr = ioctl_arg.get<btrfs_ioctl_search_args_v2>();
ioctl_ptr->key = static_cast<const btrfs_ioctl_search_key&>(*this);
ioctl_ptr->buf_size = buf_size;
if (s_debug_ostream) {
(*s_debug_ostream) << "bisk " << (ioctl_ptr->key) << "\n";
}
// Don't bother supporting V1. Kernels that old have other problems.
int rv = ioctl(fd, BTRFS_IOC_TREE_SEARCH_V2, ioctl_arg.data());
++s_calls;
if (rv != 0 && errno == ENOENT) {
// If we are searching a tree that is deleted or no longer exists, just return an empty list
ioctl_ptr->key.nr_items = 0;
break;
}
if (rv != 0 && errno != EOVERFLOW) {
return false;
}
if (rv == 0 && nr_items <= ioctl_ptr->key.nr_items) {
// got all the items we wanted, thanks
m_buf_size = max(m_buf_size, buf_size);
break;
}
// Didn't get all the items we wanted. Increase the buf size and try again.
// These sizes are very common on default-formatted btrfs, so use these
// instead of naive doubling.
if (buf_size < 4096) {
buf_size = 4096;
} else if (buf_size < 16384) {
buf_size = 16384;
} else if (buf_size < 65536) {
buf_size = 65536;
} else {
buf_size *= 2;
}
// don't automatically raise the buf size higher than 64K, the largest possible btrfs item
++s_loops;
if (ioctl_ptr->key.nr_items == 0) {
++s_loops_empty;
}
} while (buf_size < 65536);
// ioctl changes nr_items, this has to be copied back
static_cast<btrfs_ioctl_search_key&>(*this) = ioctl_ptr->key;
m_result.reserve(nr_items);
size_t offset = pointer_distance(ioctl_ptr->buf, ioctl_ptr);
for (decltype(nr_items) i = 0; i < nr_items; ++i) {
BtrfsIoctlSearchHeader item;
offset = item.set_data(ioctl_arg, offset);
m_result.insert(item);
m_result.push_back(item);
}
return true;
}
@@ -831,7 +741,7 @@ namespace crucible {
BtrfsIoctlSearchKey::do_ioctl(int fd)
{
if (!do_ioctl_nothrow(fd)) {
THROW_ERRNO("BTRFS_IOC_TREE_SEARCH_V2: " << name_fd(fd) << ": " << *this);
THROW_ERRNO("BTRFS_IOC_TREE_SEARCH_V2: " << name_fd(fd));
}
}
@@ -842,67 +752,31 @@ namespace crucible {
min_type = ref.type;
min_offset = ref.offset + 1;
if (min_offset < ref.offset) {
// We wrapped, try the next type
++min_type;
assert(min_offset == 0);
if (min_type < ref.type) {
assert(min_type == 0);
// We wrapped, try the next objectid
++min_objectid;
// no advancement possible at end
THROW_CHECK1(runtime_error, min_type, min_type == 0);
}
// We wrapped, try the next objectid
++min_objectid;
}
}
void
BtrfsIoctlSearchKey::next_min(const BtrfsIoctlSearchHeader &ref, const uint8_t type)
ostream &hexdump(ostream &os, const vector<char> &v)
{
if (ref.type < type) {
// forward to type in same object with zero offset
min_objectid = ref.objectid;
min_type = type;
min_offset = 0;
} else if (ref.type > type) {
// skip directly to start of next objectid with target type
min_objectid = ref.objectid + 1;
// no advancement possible at end
THROW_CHECK2(out_of_range, min_objectid, ref.objectid, min_objectid > ref.objectid);
min_type = type;
min_offset = 0;
} else {
// advance within this type
min_objectid = ref.objectid;
min_type = ref.type;
min_offset = ref.offset + 1;
if (min_offset < ref.offset) {
// We wrapped, try the next objectid, same type
++min_objectid;
THROW_CHECK2(out_of_range, min_objectid, ref.objectid, min_objectid > ref.objectid);
min_type = type;
assert(min_offset == 0);
os << "vector<char> { size = " << v.size() << ", data:\n";
for (size_t i = 0; i < v.size(); i += 8) {
string hex, ascii;
for (size_t j = i; j < i + 8; ++j) {
if (j < v.size()) {
unsigned char c = v[j];
char buf[8];
sprintf(buf, "%02x ", c);
hex += buf;
ascii += (c < 32 || c > 126) ? '.' : c;
} else {
hex += " ";
ascii += ' ';
}
}
os << astringprintf("\t%08x %s %s\n", i, hex.c_str(), ascii.c_str());
}
}
string
btrfs_chunk_type_ntoa(uint64_t type)
{
static const bits_ntoa_table table[] = {
NTOA_TABLE_ENTRY_BITS(BTRFS_BLOCK_GROUP_DATA),
NTOA_TABLE_ENTRY_BITS(BTRFS_BLOCK_GROUP_METADATA),
NTOA_TABLE_ENTRY_BITS(BTRFS_BLOCK_GROUP_SYSTEM),
NTOA_TABLE_ENTRY_BITS(BTRFS_BLOCK_GROUP_DUP),
NTOA_TABLE_ENTRY_BITS(BTRFS_BLOCK_GROUP_RAID0),
NTOA_TABLE_ENTRY_BITS(BTRFS_BLOCK_GROUP_RAID1),
NTOA_TABLE_ENTRY_BITS(BTRFS_BLOCK_GROUP_RAID10),
NTOA_TABLE_ENTRY_BITS(BTRFS_BLOCK_GROUP_RAID1C3),
NTOA_TABLE_ENTRY_BITS(BTRFS_BLOCK_GROUP_RAID1C4),
NTOA_TABLE_ENTRY_BITS(BTRFS_BLOCK_GROUP_RAID5),
NTOA_TABLE_ENTRY_BITS(BTRFS_BLOCK_GROUP_RAID6),
NTOA_TABLE_ENTRY_END()
};
return bits_ntoa(type, table);
return os << "}";
}
string
@@ -932,9 +806,15 @@ namespace crucible {
NTOA_TABLE_ENTRY_ENUM(BTRFS_SHARED_BLOCK_REF_KEY),
NTOA_TABLE_ENTRY_ENUM(BTRFS_SHARED_DATA_REF_KEY),
NTOA_TABLE_ENTRY_ENUM(BTRFS_BLOCK_GROUP_ITEM_KEY),
#ifdef BTRFS_FREE_SPACE_INFO_KEY
NTOA_TABLE_ENTRY_ENUM(BTRFS_FREE_SPACE_INFO_KEY),
#endif
#ifdef BTRFS_FREE_SPACE_EXTENT_KEY
NTOA_TABLE_ENTRY_ENUM(BTRFS_FREE_SPACE_EXTENT_KEY),
#endif
#ifdef BTRFS_FREE_SPACE_BITMAP_KEY
NTOA_TABLE_ENTRY_ENUM(BTRFS_FREE_SPACE_BITMAP_KEY),
#endif
NTOA_TABLE_ENTRY_ENUM(BTRFS_DEV_EXTENT_KEY),
NTOA_TABLE_ENTRY_ENUM(BTRFS_DEV_ITEM_KEY),
NTOA_TABLE_ENTRY_ENUM(BTRFS_CHUNK_ITEM_KEY),
@@ -954,7 +834,7 @@ namespace crucible {
}
string
btrfs_search_objectid_ntoa(uint64_t objectid)
btrfs_search_objectid_ntoa(unsigned objectid)
{
static const bits_ntoa_table table[] = {
NTOA_TABLE_ENTRY_ENUM(BTRFS_ROOT_TREE_OBJECTID),
@@ -966,7 +846,9 @@ namespace crucible {
NTOA_TABLE_ENTRY_ENUM(BTRFS_CSUM_TREE_OBJECTID),
NTOA_TABLE_ENTRY_ENUM(BTRFS_QUOTA_TREE_OBJECTID),
NTOA_TABLE_ENTRY_ENUM(BTRFS_UUID_TREE_OBJECTID),
#ifdef BTRFS_FREE_SPACE_TREE_OBJECTID
NTOA_TABLE_ENTRY_ENUM(BTRFS_FREE_SPACE_TREE_OBJECTID),
#endif
NTOA_TABLE_ENTRY_ENUM(BTRFS_BALANCE_OBJECTID),
NTOA_TABLE_ENTRY_ENUM(BTRFS_ORPHAN_OBJECTID),
NTOA_TABLE_ENTRY_ENUM(BTRFS_TREE_LOG_OBJECTID),
@@ -1024,7 +906,7 @@ namespace crucible {
ostream &
operator<<(ostream &os, const BtrfsIoctlSearchHeader &hdr)
{
os << "BtrfsIoctlSearchHeader { "
os << "BtrfsIoctlSearchHeader { "
<< static_cast<const btrfs_ioctl_search_header &>(hdr)
<< ", data = ";
hexdump(os, hdr.m_data);
@@ -1034,7 +916,7 @@ namespace crucible {
ostream &
operator<<(ostream &os, const BtrfsIoctlSearchKey &key)
{
os << "BtrfsIoctlSearchKey { "
os << "BtrfsIoctlSearchKey { "
<< static_cast<const btrfs_ioctl_search_key &>(key)
<< ", buf_size = " << key.m_buf_size
<< ", buf[" << key.m_result.size() << "] = {";
@@ -1079,7 +961,7 @@ namespace crucible {
}
if (i.objectid == root_id && i.type == BTRFS_ROOT_ITEM_KEY) {
rv = max(rv, uint64_t(btrfs_get_member(&btrfs_root_item::generation, i.m_data)));
rv = max(rv, uint64_t(call_btrfs_get(btrfs_root_generation, i.m_data)));
}
}
if (sk.min_offset < numeric_limits<decltype(sk.min_offset)>::max()) {
@@ -1091,9 +973,9 @@ namespace crucible {
return rv;
}
Statvfs::Statvfs() :
statvfs( (statvfs) { } )
Statvfs::Statvfs()
{
memset_zero<statvfs>(this);
}
Statvfs::Statvfs(int fd) :
@@ -1132,6 +1014,7 @@ namespace crucible {
os << "BtrfsIoctlFsInfoArgs {"
<< " max_id = " << a.max_id << ","
<< " num_devices = " << a.num_devices << ","
<< " fsid = " << a.uuid() << ","
#if 0
<< " nodesize = " << a.nodesize << ","
<< " sectorsize = " << a.sectorsize << ","
@@ -1144,54 +1027,24 @@ namespace crucible {
return os << " }";
};
BtrfsIoctlFsInfoArgs::BtrfsIoctlFsInfoArgs() :
btrfs_ioctl_fs_info_args_v3( (btrfs_ioctl_fs_info_args_v3) {
.flags = 0
| BTRFS_FS_INFO_FLAG_CSUM_INFO
| BTRFS_FS_INFO_FLAG_GENERATION
,
})
BtrfsIoctlFsInfoArgs::BtrfsIoctlFsInfoArgs()
{
}
bool
BtrfsIoctlFsInfoArgs::do_ioctl_nothrow(int const fd)
{
btrfs_ioctl_fs_info_args_v3 *p = static_cast<btrfs_ioctl_fs_info_args_v3 *>(this);
return 0 == ioctl(fd, BTRFS_IOC_FS_INFO, p);
memset_zero<btrfs_ioctl_fs_info_args>(this);
}
void
BtrfsIoctlFsInfoArgs::do_ioctl(int const fd)
BtrfsIoctlFsInfoArgs::do_ioctl(int fd)
{
if (!do_ioctl_nothrow(fd)) {
btrfs_ioctl_fs_info_args *p = static_cast<btrfs_ioctl_fs_info_args *>(this);
if (ioctl(fd, BTRFS_IOC_FS_INFO, p)) {
THROW_ERRNO("BTRFS_IOC_FS_INFO: fd " << fd);
}
}
uint16_t
BtrfsIoctlFsInfoArgs::csum_type() const
string
BtrfsIoctlFsInfoArgs::uuid() const
{
return this->btrfs_ioctl_fs_info_args_v3::csum_type;
}
uint16_t
BtrfsIoctlFsInfoArgs::csum_size() const
{
return this->btrfs_ioctl_fs_info_args_v3::csum_size;
}
vector<uint8_t>
BtrfsIoctlFsInfoArgs::fsid() const
{
const auto begin = btrfs_ioctl_fs_info_args_v3::fsid;
return vector<uint8_t>(begin, begin + BTRFS_FSID_SIZE);
}
uint64_t
BtrfsIoctlFsInfoArgs::generation() const
{
return this->btrfs_ioctl_fs_info_args_v3::generation;
return uuid_unparse(fsid);
}
};

96
lib/interp.cc Normal file
View File

@@ -0,0 +1,96 @@
#include "crucible/interp.h"
#include "crucible/chatter.h"
namespace crucible {
using namespace std;
int
Proc::exec(const ArgList &args)
{
return m_cmd(args);
}
Proc::Proc(const function<int(const ArgList &)> &f) :
m_cmd(f)
{
}
Command::~Command()
{
}
ArgList::ArgList(const char **argv)
{
while (argv && *argv) {
push_back(*argv++);
}
}
ArgList::ArgList(const vector<string> &&that) :
vector<string>(that)
{
}
Interp::~Interp()
{
}
Interp::Interp(const map<string, shared_ptr<Command> > &cmdlist) :
m_commands(cmdlist)
{
}
void
Interp::add_command(const string &name, const shared_ptr<Command> &command)
{
m_commands[name] = command;
}
int
Interp::exec(const ArgList &args)
{
auto next_arg = args.begin();
++next_arg;
return m_commands.at(args[0])->exec(vector<string>(next_arg, args.end()));
}
ArgParser::~ArgParser()
{
}
ArgParser::ArgParser()
{
}
void
ArgParser::add_opt(string opt, ArgActor actor)
{
m_string_opts[opt] = actor;
}
void
ArgParser::parse_backend(void *t, const ArgList &args)
{
bool quote_args = false;
for (string arg : args) {
if (quote_args) {
cerr << "arg: '" << arg << "'" << endl;
continue;
}
if (arg == "--") {
quote_args = true;
continue;
}
if (arg.compare(0, 2, "--") == 0) {
auto found = m_string_opts.find(arg.substr(2, string::npos));
if (found != m_string_opts.end()) {
found->second.predicate(t, "foo");
}
(void)t;
}
}
}
};

View File

@@ -1,83 +0,0 @@
#include "crucible/multilock.h"
#include "crucible/error.h"
namespace crucible {
using namespace std;
MultiLocker::LockHandle::LockHandle(const string &type, MultiLocker &parent) :
m_type(type),
m_parent(parent)
{
}
void
MultiLocker::LockHandle::set_locked(const bool state)
{
m_locked = state;
}
MultiLocker::LockHandle::~LockHandle()
{
if (m_locked) {
m_parent.put_lock(m_type);
m_locked = false;
}
}
bool
MultiLocker::is_lock_available(const string &type)
{
for (const auto &i : m_counters) {
if (i.second != 0 && i.first != type) {
return false;
}
}
return true;
}
void
MultiLocker::put_lock(const string &type)
{
unique_lock<mutex> lock(m_mutex);
auto &counter = m_counters[type];
THROW_CHECK2(runtime_error, type, counter, counter > 0);
--counter;
if (counter == 0) {
m_cv.notify_all();
}
}
shared_ptr<MultiLocker::LockHandle>
MultiLocker::get_lock_private(const string &type)
{
unique_lock<mutex> lock(m_mutex);
m_counters.insert(make_pair(type, size_t(0)));
while (!is_lock_available(type)) {
m_cv.wait(lock);
}
const auto rv = make_shared<LockHandle>(type, *this);
++m_counters[type];
rv->set_locked(true);
return rv;
}
static MultiLocker s_process_instance;
shared_ptr<MultiLocker::LockHandle>
MultiLocker::get_lock(const string &type)
{
if (s_process_instance.m_do_locking) {
return s_process_instance.get_lock_private(type);
} else {
return shared_ptr<MultiLocker::LockHandle>();
}
}
void
MultiLocker::enable_locking(const bool enabled)
{
s_process_instance.m_do_locking = enabled;
}
}

View File

@@ -1,17 +1,18 @@
#include "crucible/ntoa.h"
#include "crucible/error.h"
#include "crucible/string.h"
#include <cassert>
#include <sstream>
#include <string>
namespace crucible {
using namespace std;
string bits_ntoa(unsigned long long n, const bits_ntoa_table *table)
string bits_ntoa(unsigned long n, const bits_ntoa_table *table)
{
string out;
while (n && table->a) {
// No bits in n outside of mask
THROW_CHECK2(invalid_argument, table->mask, table->n, ((~table->mask) & table->n) == 0);
assert( ((~table->mask) & table->n) == 0);
if ( (n & table->mask) == table->n) {
if (!out.empty()) {
out += "|";
@@ -22,10 +23,12 @@ namespace crucible {
++table;
}
if (n) {
ostringstream oss;
oss << "0x" << hex << n;
if (!out.empty()) {
out += "|";
}
out += to_hex(n);
out += oss.str();
}
if (out.empty()) {
out = "0";

View File

@@ -1,40 +0,0 @@
#include "crucible/openat2.h"
#include <sys/syscall.h>
// Compatibility for building on old libc for new kernel
#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 6, 0)
// Every arch that defines this uses 437, except Alpha, where 437 is
// mq_getsetattr.
#ifndef SYS_openat2
#ifdef __alpha__
#define SYS_openat2 547
#else
#define SYS_openat2 437
#endif
#endif
#endif // Linux version >= v5.6
#include <fcntl.h>
#include <unistd.h>
extern "C" {
int
__attribute__((weak))
openat2(int const dirfd, const char *const pathname, struct open_how *const how, size_t const size)
throw()
{
#ifdef SYS_openat2
return syscall(SYS_openat2, dirfd, pathname, how, size);
#else
errno = ENOSYS;
return -1;
#endif
}
};

View File

@@ -2,23 +2,16 @@
#include "crucible/chatter.h"
#include "crucible/error.h"
#include "crucible/ntoa.h"
#include <cstdlib>
#include <utility>
// for gettid()
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <unistd.h>
#include <sys/syscall.h>
extern "C" {
pid_t
__attribute__((weak))
gettid() throw()
{
return syscall(SYS_gettid);
}
};
namespace crucible {
using namespace std;
@@ -116,102 +109,13 @@ namespace crucible {
}
}
double
getloadavg1()
template<>
struct ResourceHandle<Process::id, Process>;
pid_t
gettid()
{
double loadavg[1];
const int rv = ::getloadavg(loadavg, 1);
if (rv != 1) {
THROW_ERRNO("getloadavg(..., 1)");
}
return loadavg[0];
}
double
getloadavg5()
{
double loadavg[2];
const int rv = ::getloadavg(loadavg, 2);
if (rv != 2) {
THROW_ERRNO("getloadavg(..., 2)");
}
return loadavg[1];
}
double
getloadavg15()
{
double loadavg[3];
const int rv = ::getloadavg(loadavg, 3);
if (rv != 3) {
THROW_ERRNO("getloadavg(..., 3)");
}
return loadavg[2];
}
static const struct bits_ntoa_table signals_table[] = {
// POSIX.1-1990
NTOA_TABLE_ENTRY_ENUM(SIGHUP),
NTOA_TABLE_ENTRY_ENUM(SIGINT),
NTOA_TABLE_ENTRY_ENUM(SIGQUIT),
NTOA_TABLE_ENTRY_ENUM(SIGILL),
NTOA_TABLE_ENTRY_ENUM(SIGABRT),
NTOA_TABLE_ENTRY_ENUM(SIGFPE),
NTOA_TABLE_ENTRY_ENUM(SIGKILL),
NTOA_TABLE_ENTRY_ENUM(SIGSEGV),
NTOA_TABLE_ENTRY_ENUM(SIGPIPE),
NTOA_TABLE_ENTRY_ENUM(SIGALRM),
NTOA_TABLE_ENTRY_ENUM(SIGTERM),
NTOA_TABLE_ENTRY_ENUM(SIGUSR1),
NTOA_TABLE_ENTRY_ENUM(SIGUSR2),
NTOA_TABLE_ENTRY_ENUM(SIGCHLD),
NTOA_TABLE_ENTRY_ENUM(SIGCONT),
NTOA_TABLE_ENTRY_ENUM(SIGSTOP),
NTOA_TABLE_ENTRY_ENUM(SIGTSTP),
NTOA_TABLE_ENTRY_ENUM(SIGTTIN),
NTOA_TABLE_ENTRY_ENUM(SIGTTOU),
// SUSv2 and POSIX.1-2001
NTOA_TABLE_ENTRY_ENUM(SIGBUS),
NTOA_TABLE_ENTRY_ENUM(SIGPOLL),
NTOA_TABLE_ENTRY_ENUM(SIGPROF),
NTOA_TABLE_ENTRY_ENUM(SIGSYS),
NTOA_TABLE_ENTRY_ENUM(SIGTRAP),
NTOA_TABLE_ENTRY_ENUM(SIGURG),
NTOA_TABLE_ENTRY_ENUM(SIGVTALRM),
NTOA_TABLE_ENTRY_ENUM(SIGXCPU),
NTOA_TABLE_ENTRY_ENUM(SIGXFSZ),
// Other
NTOA_TABLE_ENTRY_ENUM(SIGIOT),
#ifdef SIGEMT
NTOA_TABLE_ENTRY_ENUM(SIGEMT),
#endif
NTOA_TABLE_ENTRY_ENUM(SIGSTKFLT),
NTOA_TABLE_ENTRY_ENUM(SIGIO),
#ifdef SIGCLD
NTOA_TABLE_ENTRY_ENUM(SIGCLD),
#endif
NTOA_TABLE_ENTRY_ENUM(SIGPWR),
#ifdef SIGINFO
NTOA_TABLE_ENTRY_ENUM(SIGINFO),
#endif
#ifdef SIGLOST
NTOA_TABLE_ENTRY_ENUM(SIGLOST),
#endif
NTOA_TABLE_ENTRY_ENUM(SIGWINCH),
#ifdef SIGUNUSED
NTOA_TABLE_ENTRY_ENUM(SIGUNUSED),
#endif
NTOA_TABLE_ENTRY_END(),
};
string
signal_ntoa(int sig)
{
return bits_ntoa(sig, signals_table);
return syscall(SYS_gettid);
}
}

View File

@@ -1,7 +0,0 @@
#include "crucible/seeker.h"
namespace crucible {
thread_local shared_ptr<ostream> tl_seeker_debug_str;
};

View File

@@ -16,7 +16,7 @@ namespace crucible {
uint64_t
from_hex(const string &s)
{
return stoull(s, nullptr, 0);
return stoull(s, 0, 0);
}
vector<string>

View File

@@ -1,254 +0,0 @@
#include "crucible/table.h"
#include "crucible/string.h"
namespace crucible {
namespace Table {
using namespace std;
Content
Fill(const char c)
{
return [=](size_t width, size_t height) -> string {
string rv;
while (height--) {
rv += string(width, c);
if (height) {
rv += "\n";
}
}
return rv;
};
}
Content
Text(const string &s)
{
return [=](size_t width, size_t height) -> string {
const auto lines = split("\n", s);
string rv;
size_t line_count = 0;
for (const auto &i : lines) {
if (line_count++) {
rv += "\n";
}
if (i.length() < width) {
rv += string(width - i.length(), ' ');
}
rv += i;
}
while (line_count < height) {
if (line_count++) {
rv += "\n";
}
rv += string(width, ' ');
}
return rv;
};
}
Content
Number(const string &s)
{
return [=](size_t width, size_t height) -> string {
const auto lines = split("\n", s);
string rv;
size_t line_count = 0;
for (const auto &i : lines) {
if (line_count++) {
rv += "\n";
}
if (i.length() < width) {
rv += string(width - i.length(), ' ');
}
rv += i;
}
while (line_count < height) {
if (line_count++) {
rv += "\n";
}
rv += string(width, ' ');
}
return rv;
};
}
Cell::Cell(const Content &fn) :
m_content(fn)
{
}
Cell&
Cell::operator=(const Content &fn)
{
m_content = fn;
return *this;
}
string
Cell::text(size_t width, size_t height) const
{
return m_content(width, height);
}
size_t
Dimension::size() const
{
return m_elements.size();
}
size_t
Dimension::insert(size_t pos)
{
++m_next_pos;
const auto insert_pos = min(m_elements.size(), pos);
const auto it = m_elements.begin() + insert_pos;
m_elements.insert(it, m_next_pos);
return insert_pos;
}
void
Dimension::erase(size_t pos)
{
const auto it = m_elements.begin() + min(m_elements.size(), pos);
m_elements.erase(it);
}
size_t
Dimension::at(size_t pos) const
{
return m_elements.at(pos);
}
Dimension&
Table::rows()
{
return m_rows;
};
const Dimension&
Table::rows() const
{
return m_rows;
};
Dimension&
Table::cols()
{
return m_cols;
};
const Dimension&
Table::cols() const
{
return m_cols;
};
const Cell&
Table::at(size_t row, size_t col) const
{
const auto row_idx = m_rows.at(row);
const auto col_idx = m_cols.at(col);
const auto found = m_cells.find(make_pair(row_idx, col_idx));
if (found == m_cells.end()) {
static const Cell s_empty(Fill('.'));
return s_empty;
}
return found->second;
};
Cell&
Table::at(size_t row, size_t col)
{
const auto row_idx = m_rows.at(row);
const auto col_idx = m_cols.at(col);
return m_cells[make_pair(row_idx, col_idx)];
};
static
pair<size_t, size_t>
text_size(const string &s)
{
const auto s_split = split("\n", s);
size_t width = 0;
for (const auto &i : s_split) {
width = max(width, i.length());
}
return make_pair(width, s_split.size());
}
ostream& operator<<(ostream &os, const Table &table)
{
const auto rows = table.rows().size();
const auto cols = table.cols().size();
vector<size_t> row_heights(rows, 1);
vector<size_t> col_widths(cols, 1);
// Get the size of all fixed- and minimum-sized content cells
for (size_t row = 0; row < table.rows().size(); ++row) {
vector<string> col_text;
for (size_t col = 0; col < table.cols().size(); ++col) {
col_text.push_back(table.at(row, col).text(0, 0));
const auto tsize = text_size(*col_text.rbegin());
row_heights[row] = max(row_heights[row], tsize.second);
col_widths[col] = max(col_widths[col], tsize.first);
}
}
// Render the table
for (size_t row = 0; row < table.rows().size(); ++row) {
vector<string> lines(row_heights[row], "");
for (size_t col = 0; col < table.cols().size(); ++col) {
const auto& table_cell = table.at(row, col);
const auto table_text = table_cell.text(col_widths[col], row_heights[row]);
auto col_lines = split("\n", table_text);
col_lines.resize(row_heights[row], "");
for (size_t line = 0; line < row_heights[row]; ++line) {
if (col > 0) {
lines[line] += table.mid();
}
lines[line] += col_lines[line];
}
}
for (const auto &line : lines) {
os << table.left() << line << table.right() << "\n";
}
}
return os;
}
void
Table::left(const string &s)
{
m_left = s;
}
void
Table::mid(const string &s)
{
m_mid = s;
}
void
Table::right(const string &s)
{
m_right = s;
}
const string&
Table::left() const
{
return m_left;
}
const string&
Table::mid() const
{
return m_mid;
}
const string&
Table::right() const
{
return m_right;
}
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,13 +1,11 @@
#include "crucible/time.h"
#include "crucible/error.h"
#include "crucible/process.h"
#include <algorithm>
#include <thread>
#include <cmath>
#include <ctime>
#include <thread>
namespace crucible {
@@ -61,10 +59,16 @@ namespace crucible {
m_start = chrono::high_resolution_clock::now();
}
chrono::high_resolution_clock::time_point
Timer::get() const
void
Timer::set(const chrono::high_resolution_clock::time_point &start)
{
return m_start;
m_start = start;
}
void
Timer::set(double delta)
{
m_start += chrono::duration_cast<chrono::high_resolution_clock::duration>(chrono::duration<double>(delta));
}
double
@@ -98,16 +102,12 @@ namespace crucible {
m_rate(rate),
m_burst(burst)
{
THROW_CHECK1(invalid_argument, m_rate, m_rate > 0);
THROW_CHECK1(invalid_argument, m_burst, m_burst >= 0);
}
RateLimiter::RateLimiter(double rate) :
m_rate(rate),
m_burst(rate)
{
THROW_CHECK1(invalid_argument, m_rate, m_rate > 0);
THROW_CHECK1(invalid_argument, m_burst, m_burst >= 0);
}
void
@@ -120,27 +120,23 @@ namespace crucible {
}
}
double
RateLimiter::sleep_time(double cost)
{
THROW_CHECK1(invalid_argument, m_rate, m_rate > 0);
borrow(cost);
unique_lock<mutex> lock(m_mutex);
update_tokens();
if (m_tokens >= 0) {
return 0;
}
return -m_tokens / m_rate;
}
void
RateLimiter::sleep_for(double cost)
{
double time_to_sleep = sleep_time(cost);
if (time_to_sleep > 0.0) {
nanosleep(time_to_sleep);
} else {
return;
borrow(cost);
while (1) {
unique_lock<mutex> lock(m_mutex);
update_tokens();
if (m_tokens >= 0) {
return;
}
double sleep_time(-m_tokens / m_rate);
lock.unlock();
if (sleep_time > 0.0) {
nanosleep(sleep_time);
} else {
return;
}
}
}
@@ -159,211 +155,4 @@ namespace crucible {
m_tokens -= cost;
}
void
RateLimiter::rate(double const new_rate)
{
THROW_CHECK1(invalid_argument, new_rate, new_rate > 0);
unique_lock<mutex> lock(m_mutex);
m_rate = new_rate;
}
double
RateLimiter::rate() const
{
unique_lock<mutex> lock(m_mutex);
return m_rate;
}
RateEstimator::RateEstimator(double min_delay, double max_delay) :
m_min_delay(min_delay),
m_max_delay(max_delay)
{
THROW_CHECK1(invalid_argument, min_delay, min_delay > 0);
THROW_CHECK1(invalid_argument, max_delay, max_delay > 0);
THROW_CHECK2(invalid_argument, min_delay, max_delay, max_delay > min_delay);
}
void
RateEstimator::update_unlocked(uint64_t new_count)
{
// Gradually reduce the effect of previous updates
if (m_last_decay.age() > 1) {
m_num *= m_decay;
m_den *= m_decay;
m_last_decay.reset();
}
// Add units over time to running totals
auto increment = new_count - min(new_count, m_last_count);
auto delta = max(0.0, m_last_update.lap());
m_num += increment;
m_den += delta;
m_last_count = new_count;
// If count increased, wake up any waiters
if (delta > 0) {
m_condvar.notify_all();
}
}
void
RateEstimator::update(uint64_t new_count)
{
unique_lock<mutex> lock(m_mutex);
return update_unlocked(new_count);
}
void
RateEstimator::update_monotonic(uint64_t new_count)
{
unique_lock<mutex> lock(m_mutex);
if (m_last_count == numeric_limits<uint64_t>::max() || new_count > m_last_count) {
return update_unlocked(new_count);
} else {
return update_unlocked(m_last_count);
}
}
void
RateEstimator::increment(const uint64_t more)
{
unique_lock<mutex> lock(m_mutex);
return update_unlocked(m_last_count + more);
}
uint64_t
RateEstimator::count() const
{
unique_lock<mutex> lock(m_mutex);
return m_last_count;
}
pair<double, double>
RateEstimator::ratio_unlocked() const
{
auto num = max(m_num, 1.0);
// auto den = max(m_den, 1.0);
// Rate estimation slows down if there are no new units to count
auto den = max(m_den + m_last_update.age(), 1.0);
auto sec_per_count = den / num;
if (sec_per_count < m_min_delay) {
return make_pair(1.0, m_min_delay);
}
if (sec_per_count > m_max_delay) {
return make_pair(1.0, m_max_delay);
}
return make_pair(num, den);
}
pair<double, double>
RateEstimator::ratio() const
{
unique_lock<mutex> lock(m_mutex);
return ratio_unlocked();
}
pair<double, double>
RateEstimator::raw() const
{
unique_lock<mutex> lock(m_mutex);
return make_pair(m_num, m_den);
}
double
RateEstimator::rate_unlocked() const
{
auto r = ratio_unlocked();
return r.first / r.second;
}
double
RateEstimator::rate() const
{
unique_lock<mutex> lock(m_mutex);
return rate_unlocked();
}
ostream &
operator<<(ostream &os, const RateEstimator &re)
{
os << "RateEstimator { ";
auto ratio = re.ratio();
auto raw = re.raw();
os << "count = " << re.count() << ", raw = " << raw.first << " / " << raw.second << ", ratio = " << ratio.first << " / " << ratio.second << ", rate = " << re.rate() << ", duration(1) = " << re.duration(1).count() << ", seconds_for(1) = " << re.seconds_for(1) << " }";
return os;
}
chrono::duration<double>
RateEstimator::duration_unlocked(uint64_t relative_count) const
{
auto dur = relative_count / rate_unlocked();
dur = min(m_max_delay, dur);
dur = max(m_min_delay, dur);
return chrono::duration<double>(dur);
}
chrono::duration<double>
RateEstimator::duration(uint64_t relative_count) const
{
unique_lock<mutex> lock(m_mutex);
return duration_unlocked(relative_count);
}
chrono::high_resolution_clock::time_point
RateEstimator::time_point_unlocked(uint64_t absolute_count) const
{
auto relative_count = absolute_count - min(m_last_count, absolute_count);
auto relative_duration = duration_unlocked(relative_count);
return m_last_update.get() + chrono::duration_cast<chrono::high_resolution_clock::duration>(relative_duration);
// return chrono::high_resolution_clock::now() + chrono::duration_cast<chrono::high_resolution_clock::duration>(relative_duration);
}
chrono::high_resolution_clock::time_point
RateEstimator::time_point(uint64_t absolute_count) const
{
unique_lock<mutex> lock(m_mutex);
return time_point_unlocked(absolute_count);
}
void
RateEstimator::wait_until(uint64_t new_count_absolute) const
{
unique_lock<mutex> lock(m_mutex);
auto saved_count = m_last_count;
while (saved_count <= m_last_count && m_last_count < new_count_absolute) {
// Stop waiting if clock runs backwards
saved_count = m_last_count;
m_condvar.wait(lock);
}
}
void
RateEstimator::wait_for(uint64_t new_count_relative) const
{
unique_lock<mutex> lock(m_mutex);
auto saved_count = m_last_count;
auto new_count_absolute = m_last_count + new_count_relative;
while (saved_count <= m_last_count && m_last_count < new_count_absolute) {
// Stop waiting if clock runs backwards
saved_count = m_last_count;
m_condvar.wait(lock);
}
}
double
RateEstimator::seconds_for(uint64_t new_count_relative) const
{
unique_lock<mutex> lock(m_mutex);
auto ts = time_point_unlocked(new_count_relative + m_last_count);
auto delta_dur = ts - chrono::high_resolution_clock::now();
return max(min(chrono::duration<double>(delta_dur).count(), m_max_delay), m_min_delay);
}
double
RateEstimator::seconds_until(uint64_t new_count_absolute) const
{
unique_lock<mutex> lock(m_mutex);
auto ts = time_point_unlocked(new_count_absolute);
auto delta_dur = ts - chrono::high_resolution_clock::now();
return max(min(chrono::duration<double>(delta_dur).count(), m_max_delay), m_min_delay);
}
}

View File

@@ -1,11 +0,0 @@
#include "crucible/error.h"
#include "crucible/uname.h"
namespace crucible {
using namespace std;
Uname::Uname()
{
DIE_IF_NON_ZERO(uname(static_cast<utsname*>(this)));
}
}

16
lib/uuid.cc Normal file
View File

@@ -0,0 +1,16 @@
#include "crucible/uuid.h"
namespace crucible {
using namespace std;
const size_t uuid_unparsed_size = 37; // "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx\0"
string
uuid_unparse(const unsigned char in[16])
{
char out[uuid_unparsed_size];
::uuid_unparse(in, out);
return string(out);
}
}

View File

@@ -1,13 +1,4 @@
# Default:
CCFLAGS = -Wall -Wextra -Werror -O3
# Optimized:
# CCFLAGS = -Wall -Wextra -Werror -O3 -march=native
# Debug:
# CCFLAGS = -Wall -Wextra -Werror -O0 -ggdb
CCFLAGS += -I../include -D_FILE_OFFSET_BITS=64
BEES_CFLAGS = $(CCFLAGS) -std=c99 $(CFLAGS)
BEES_CXXFLAGS = $(CCFLAGS) -std=c++11 -Wold-style-cast -Wno-missing-field-initializers $(CXXFLAGS)
CCFLAGS = -Wall -Wextra -Werror -O3 -I../include -ggdb -fpic
# CCFLAGS = -Wall -Wextra -Werror -O0 -I../include -ggdb -fpic
CFLAGS = $(CCFLAGS) -std=c99
CXXFLAGS = $(CCFLAGS) -std=c++11 -Wold-style-cast

View File

@@ -1,34 +0,0 @@
## Config for Bees: /etc/bees/beesd.conf.sample
## https://github.com/Zygo/bees
## It's a default values, change it, if needed
# How to use?
# Copy this file to a new file name and adjust the UUID below
# Which FS will be used
UUID=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
## System Vars
# Change carefully
# WORK_DIR=/run/bees/
# MNT_DIR="$WORK_DIR/mnt/$UUID"
# BEESHOME="$MNT_DIR/.beeshome"
# BEESSTATUS="$WORK_DIR/$UUID.status"
## Options to apply, see `beesd --help` for details
# OPTIONS="--strip-paths --no-timestamps"
## Bees DB size
# Hash Table Sizing
# sHash table entries are 16 bytes each
# (64-bit hash, 52-bit block number, and some metadata bits)
# Each entry represents a minimum of 4K on disk.
# unique data size hash table size average dedupe block size
# 1TB 4GB 4K
# 1TB 1GB 16K
# 1TB 256MB 64K
# 1TB 16MB 1024K
# 64TB 1GB 1024K
#
# Size MUST be multiple of 128KB
# DB_SIZE=$((1024*1024*1024)) # 1G in bytes

View File

@@ -1,146 +0,0 @@
#!/bin/bash
# if not called from systemd try to replicate mount unsharing on ctrl+c
# see: https://github.com/Zygo/bees/issues/281
if [ -z "${SYSTEMD_EXEC_PID}" -a -z "${UNSHARE_DONE}" ]; then
UNSHARE_DONE=true
export UNSHARE_DONE
exec unshare -m --propagation private -- "$0" "$@"
fi
## Helpful functions
INFO(){ echo "INFO:" "$@"; }
ERRO(){ echo "ERROR:" "$@"; exit 1; }
YN(){ [[ "$1" =~ (1|Y|y) ]]; }
## Global vars
export BEESHOME BEESSTATUS
export WORK_DIR CONFIG_DIR
export CONFIG_FILE
export UUID AL16M AL128K
readonly AL128K="$((128*1024))"
readonly AL16M="$((16*1024*1024))"
readonly CONFIG_DIR=@ETC_PREFIX@/bees/
readonly bees_bin=$(realpath @DESTDIR@/@LIBEXEC_PREFIX@/bees)
command -v "$bees_bin" &> /dev/null || ERRO "Missing 'bees' agent"
uuid_valid(){
if uuidparse -n -o VARIANT $1 | grep -i -q invalid; then
false
fi
}
help(){
echo "Usage: beesd [options] <btrfs_uuid>"
echo "- - -"
exec "$bees_bin" --help
}
for i in $("$bees_bin" --help 2>&1 | grep -E " --" | sed -e "s/^[^-]*-/-/" -e "s/,[^-]*--/ --/" -e "s/ [^-]*$//")
do
TMP_ARGS="$TMP_ARGS $i"
done
IFS=" " read -r -a SUPPORTED_ARGS <<< $TMP_ARGS
NOT_SUPPORTED_ARGS=()
ARGUMENTS=()
for arg in "${@}"; do
supp=false
for supp_arg in "${SUPPORTED_ARGS[@]}"; do
if [[ "$arg" == ${supp_arg}* ]]; then
supp=true
break
fi
done
if $supp; then
ARGUMENTS+=($arg)
else
NOT_SUPPORTED_ARGS+=($arg)
fi
done
for arg in "${ARGUMENTS[@]}"; do
case $arg in
-h) help;;
--help) help;;
esac
done
for arg in "${NOT_SUPPORTED_ARGS[@]}"; do
if uuid_valid $arg; then
[ ! -z "$UUID" ] && help
UUID=$arg
fi
done
[ -z "$UUID" ] && help
FILE_CONFIG="$(grep -E -l '^[^#]*UUID\s*=\s*"?'"$UUID" "$CONFIG_DIR"/*.conf | head -1)"
[ ! -f "$FILE_CONFIG" ] && ERRO "No config for $UUID"
INFO "Find $UUID in $FILE_CONFIG, use as conf"
source "$FILE_CONFIG"
## Pre checks
{
[ ! -d "$CONFIG_DIR" ] && ERRO "Missing: $CONFIG_DIR"
[ "$UID" == "0" ] || ERRO "Must be run as root"
}
WORK_DIR="${WORK_DIR:-/run/bees/}"
MNT_DIR="${MNT_DIR:-$WORK_DIR/mnt/$UUID}"
BEESHOME="${BEESHOME:-$MNT_DIR/.beeshome}"
BEESSTATUS="${BEESSTATUS:-$WORK_DIR/$UUID.status}"
DB_SIZE="${DB_SIZE:-$((8192*AL128K))}"
INFO "Check: Disk exists"
if [ ! -b "/dev/disk/by-uuid/$UUID" ]; then
ERRO "Missing disk: /dev/disk/by-uuid/$UUID"
fi
is_btrfs(){ [ "$(blkid -s TYPE -o value "$1")" == "btrfs" ]; }
INFO "Check: Disk with btrfs"
if ! is_btrfs "/dev/disk/by-uuid/$UUID"; then
ERRO "Disk not contain btrfs: /dev/disk/by-uuid/$UUID"
fi
INFO "WORK DIR: $WORK_DIR"
mkdir -p "$WORK_DIR" || exit 1
INFO "MOUNT DIR: $MNT_DIR"
mkdir -p "$MNT_DIR" || exit 1
mount --make-private -osubvolid=5,nodev,noexec /dev/disk/by-uuid/$UUID "$MNT_DIR" || exit 1
if [ ! -d "$BEESHOME" ]; then
INFO "Create subvol $BEESHOME for store bees data"
btrfs sub cre "$BEESHOME"
fi
# Check DB size
{
DB_PATH="$BEESHOME/beeshash.dat"
touch "$DB_PATH"
OLD_SIZE="$(du -b "$DB_PATH" | sed 's/\t/ /g' | cut -d' ' -f1)"
NEW_SIZE="$DB_SIZE"
if (( "$NEW_SIZE"%AL128K > 0 )); then
ERRO "DB_SIZE Must be multiple of 128K"
fi
if (( "$OLD_SIZE" != "$NEW_SIZE" )); then
INFO "Resize db: $OLD_SIZE -> $NEW_SIZE"
rm -f "$BEESHOME/beescrawl.dat"
truncate -s $NEW_SIZE $DB_PATH
fi
chmod 700 "$DB_PATH"
}
MNT_DIR="$(realpath $MNT_DIR)"
cd "$MNT_DIR"
exec "$bees_bin" "${ARGUMENTS[@]}" $OPTIONS "$MNT_DIR"

View File

@@ -1,60 +0,0 @@
[Unit]
Description=Bees (%i)
Documentation=https://github.com/Zygo/bees
After=sysinit.target
[Service]
Type=simple
ExecStart=@PREFIX@/sbin/beesd --no-timestamps %i
CPUAccounting=true
CPUSchedulingPolicy=batch
CPUWeight=12
IOSchedulingClass=idle
IOSchedulingPriority=7
IOWeight=10
KillMode=control-group
KillSignal=SIGTERM
MemoryAccounting=true
Nice=19
Restart=on-abnormal
RuntimeDirectoryMode=0700
RuntimeDirectory=bees
StartupCPUWeight=25
StartupIOWeight=25
# Hide other users' process in /proc/
ProtectProc=invisible
# Mount / as read-only
ProtectSystem=strict
# Forbidden access to /home, /root and /run/user
ProtectHome=true
# Mount tmpfs on /tmp/ and /var/tmp/.
# Cannot mount at /run/ or /var/run/ for they are used by systemd.
PrivateTmp=true
# Disable network access
PrivateNetwork=true
# Use private IPC namespace, utc namespace
PrivateIPC=true
ProtectHostname=true
# Disable write access to kernel variables throug /proc
ProtectKernelTunables=true
# Disable access to control groups
ProtectControlGroups=true
# Set capabilities of the new program
# The first three are required for accessing any file on the mounted filesystem.
# The last one is required for mounting the filesystem.
AmbientCapabilities=CAP_DAC_OVERRIDE CAP_DAC_READ_SEARCH CAP_FOWNER CAP_SYS_ADMIN
# With NoNewPrivileges, running sudo cannot gain any new privilege
NoNewPrivileges=true
[Install]
WantedBy=basic.target

3
src/.gitignore vendored
View File

@@ -1,3 +0,0 @@
*.new.c
bees-usage.c
bees-version.[ch]

View File

@@ -1,12 +1,28 @@
BEES = ../bin/bees
PROGRAMS = \
../bin/bees \
../bin/fanotify-watch \
../bin/fiemap \
../bin/fiewalk \
all: $(BEES)
all: $(PROGRAMS) depends.mk
include ../makeflags
-include ../localconf
LIBS = -lcrucible -lpthread
BEES_LDFLAGS = -L../lib $(LDFLAGS)
LDFLAGS = -L../lib -Wl,-rpath=$(shell realpath ../lib)
depends.mk: Makefile *.cc
for x in *.cc; do $(CXX) $(CXXFLAGS) -M "$$x"; done > depends.mk.new
mv -fv depends.mk.new depends.mk
-include depends.mk
%.o: %.cc %.h
$(CXX) $(CXXFLAGS) -o "$@" -c "$<"
../bin/%: %.o
@echo Implicit bin rule "$<" '->' "$@"
$(CXX) $(CXXFLAGS) -o "$@" "$<" $(LDFLAGS) $(LIBS)
BEES_OBJS = \
bees.o \
@@ -15,32 +31,10 @@ BEES_OBJS = \
bees-resolve.o \
bees-roots.o \
bees-thread.o \
bees-trace.o \
bees-types.o \
ALL_OBJS = $(BEES_OBJS) $(PROGRAM_OBJS)
bees-version.c: bees.h $(BEES_OBJS:.o=.cc) Makefile ../lib/libcrucible.a
echo "const char *BEES_VERSION = \"$(BEES_VERSION)\";" > bees-version.c.new
if ! [ -e "$@" ] || ! cmp -s "$@.new" "$@"; then mv -fv $@.new $@; fi
bees-usage.c: bees-usage.txt Makefile
(echo 'const char *BEES_USAGE = '; sed -r 's/^(.*)$$/"\1\\n"/' < bees-usage.txt; echo ';') > bees-usage.new.c
mv -f bees-usage.new.c bees-usage.c
%.dep: %.cc Makefile
$(CXX) $(BEES_CXXFLAGS) -M -MF $@ -MT $(<:.cc=.o) $<
include $(ALL_OBJS:%.o=%.dep)
%.o: %.c ../makeflags
$(CC) $(BEES_CFLAGS) -o $@ -c $<
%.o: %.cc ../makeflags
$(CXX) $(BEES_CXXFLAGS) -o $@ -c $<
$(BEES): $(BEES_OBJS) bees-version.o bees-usage.o ../lib/libcrucible.a
$(CXX) $(BEES_CXXFLAGS) $(BEES_LDFLAGS) -o $@ $^ $(LIBS)
../bin/bees: $(BEES_OBJS)
$(CXX) $(CXXFLAGS) -o "$@" $(BEES_OBJS) $(LDFLAGS) $(LIBS)
clean:
rm -fv *.o bees-version.c
-rm -fv *.o

File diff suppressed because it is too large Load Diff

View File

@@ -1,21 +1,21 @@
#include "bees.h"
#include "crucible/city.h"
#include "crucible/crc64.h"
#include "crucible/string.h"
#include "crucible/uname.h"
#include <algorithm>
#include <random>
#include <sys/mman.h>
using namespace crucible;
using namespace std;
BeesHash::BeesHash(const uint8_t *ptr, size_t len) :
// m_hash(CityHash64(reinterpret_cast<const char *>(ptr), len))
m_hash(Digest::CRC::crc64(ptr, len))
static inline
bool
using_any_madvise()
{
return true;
}
ostream &
@@ -31,18 +31,16 @@ operator<<(ostream &os, const BeesHashTable::Cell &bhte)
<< BeesAddress(bhte.e_addr) << " }";
}
#if 0
static
void
dump_bucket_locked(BeesHashTable::Cell *p, BeesHashTable::Cell *q)
dump_bucket(BeesHashTable::Cell *p, BeesHashTable::Cell *q)
{
// Must be called while holding m_bucket_mutex
for (auto i = p; i < q; ++i) {
BEESLOG("Entry " << i - p << " " << *i);
}
}
#endif
static const bool VERIFY_CLEARS_BUGS = false;
const bool VERIFY_CLEARS_BUGS = false;
bool
verify_cell_range(BeesHashTable::Cell *p, BeesHashTable::Cell *q, bool clear_bugs = VERIFY_CLEARS_BUGS)
@@ -53,7 +51,7 @@ verify_cell_range(BeesHashTable::Cell *p, BeesHashTable::Cell *q, bool clear_bug
for (BeesHashTable::Cell *cell = p; cell < q; ++cell) {
if (cell->e_addr && cell->e_addr < 0x1000) {
BEESCOUNT(bug_hash_magic_addr);
BEESLOGDEBUG("Bad hash table address hash " << to_hex(cell->e_hash) << " addr " << to_hex(cell->e_addr));
BEESINFO("Bad hash table address hash " << to_hex(cell->e_hash) << " addr " << to_hex(cell->e_addr));
if (clear_bugs) {
cell->e_addr = 0;
cell->e_hash = 0;
@@ -62,8 +60,8 @@ verify_cell_range(BeesHashTable::Cell *p, BeesHashTable::Cell *q, bool clear_bug
}
if (cell->e_addr && !seen_it.insert(*cell).second) {
BEESCOUNT(bug_hash_duplicate_cell);
// BEESLOGDEBUG("Duplicate hash table entry:\nthis = " << *cell << "\nold = " << *seen_it.find(*cell));
BEESLOGDEBUG("Duplicate hash table entry: " << *cell);
// BEESLOG("Duplicate hash table entry:\nthis = " << *cell << "\nold = " << *seen_it.find(*cell));
BEESINFO("Duplicate hash table entry: " << *cell);
if (clear_bugs) {
cell->e_addr = 0;
cell->e_hash = 0;
@@ -100,132 +98,69 @@ BeesHashTable::get_extent_range(HashType hash)
return make_pair(bp, ep);
}
bool
BeesHashTable::flush_dirty_extent(uint64_t extent_index)
void
BeesHashTable::flush_dirty_extents()
{
BEESNOTE("flushing extent #" << extent_index << " of " << m_extents << " extents");
if (using_shared_map()) return;
auto lock = lock_extent_by_index(extent_index);
bool wrote_extent = false;
catch_all([&]() {
uint8_t *const dirty_extent = m_extent_ptr[extent_index].p_byte;
uint8_t *const dirty_extent_end = m_extent_ptr[extent_index + 1].p_byte;
const size_t dirty_extent_offset = dirty_extent - m_byte_ptr;
THROW_CHECK1(out_of_range, dirty_extent, dirty_extent >= m_byte_ptr);
THROW_CHECK1(out_of_range, dirty_extent_end, dirty_extent_end <= m_byte_ptr_end);
THROW_CHECK2(out_of_range, dirty_extent_end, dirty_extent, dirty_extent_end - dirty_extent == BLOCK_SIZE_HASHTAB_EXTENT);
BEESTOOLONG("pwrite(fd " << m_fd << " '" << name_fd(m_fd)<< "', length " << to_hex(dirty_extent_end - dirty_extent) << ", offset " << to_hex(dirty_extent - m_byte_ptr) << ")");
// Copy the extent because we might be stuck writing for a while
ByteVector extent_copy(dirty_extent, dirty_extent_end);
// Release the lock
lock.unlock();
// Write the extent (or not)
pwrite_or_die(m_fd, extent_copy, dirty_extent_offset);
BEESCOUNT(hash_extent_out);
// Nope, this causes a _dramatic_ loss of performance.
// const size_t dirty_extent_size = dirty_extent_end - dirty_extent;
// bees_unreadahead(m_fd, dirty_extent_offset, dirty_extent_size);
// Mark extent clean if write was successful
lock.lock();
m_extent_metadata.at(extent_index).m_dirty = false;
wrote_extent = true;
});
return wrote_extent;
}
size_t
BeesHashTable::flush_dirty_extents(bool slowly)
{
THROW_CHECK1(runtime_error, m_buckets, m_buckets > 0);
uint64_t wrote_extents = 0;
for (size_t extent_index = 0; extent_index < m_extents; ++extent_index) {
// Skip the clean ones
auto lock = lock_extent_by_index(extent_index);
if (!m_extent_metadata.at(extent_index).m_dirty) {
continue;
}
lock.unlock();
if (flush_dirty_extent(extent_index)) {
++wrote_extents;
if (slowly) {
if (m_stop_requested) {
slowly = false;
continue;
}
BEESNOTE("flush rate limited after extent #" << extent_index << " of " << m_extents << " extents");
chrono::duration<double> sleep_time(m_flush_rate_limit.sleep_time(BLOCK_SIZE_HASHTAB_EXTENT));
unique_lock<mutex> lock(m_stop_mutex);
m_stop_condvar.wait_for(lock, sleep_time);
}
}
unique_lock<mutex> lock(m_extent_mutex);
auto dirty_extent_copy = m_buckets_dirty;
m_buckets_dirty.clear();
if (dirty_extent_copy.empty()) {
BEESNOTE("idle");
m_condvar.wait(lock);
return; // please call later, i.e. immediately
}
lock.unlock();
size_t extent_counter = 0;
for (auto extent_number : dirty_extent_copy) {
++extent_counter;
BEESNOTE("flush extent #" << extent_number << " (" << extent_counter << " of " << dirty_extent_copy.size() << ")");
catch_all([&]() {
uint8_t *dirty_extent = m_extent_ptr[extent_number].p_byte;
uint8_t *dirty_extent_end = m_extent_ptr[extent_number + 1].p_byte;
THROW_CHECK1(out_of_range, dirty_extent, dirty_extent >= m_byte_ptr);
THROW_CHECK1(out_of_range, dirty_extent_end, dirty_extent_end <= m_byte_ptr_end);
if (using_shared_map()) {
BEESTOOLONG("flush extent " << extent_number);
copy(dirty_extent, dirty_extent_end, dirty_extent);
} else {
BEESTOOLONG("pwrite(fd " << m_fd << " '" << name_fd(m_fd)<< "', length " << to_hex(dirty_extent_end - dirty_extent) << ", offset " << to_hex(dirty_extent - m_byte_ptr) << ")");
// Page locks slow us down more than copying the data does
vector<uint8_t> extent_copy(dirty_extent, dirty_extent_end);
pwrite_or_die(m_fd, extent_copy, dirty_extent - m_byte_ptr);
BEESCOUNT(hash_extent_out);
}
});
BEESNOTE("flush rate limited at extent #" << extent_number << " (" << extent_counter << " of " << dirty_extent_copy.size() << ")");
m_flush_rate_limit.sleep_for(BLOCK_SIZE_HASHTAB_EXTENT);
}
BEESLOGINFO("Flushed " << wrote_extents << " of " << m_extents << " hash table extents");
return wrote_extents;
}
void
BeesHashTable::set_extent_dirty_locked(uint64_t extent_index)
BeesHashTable::set_extent_dirty(HashType hash)
{
// Must already be locked
m_extent_metadata.at(extent_index).m_dirty = true;
// Signal writeback thread
unique_lock<mutex> dirty_lock(m_dirty_mutex);
m_dirty = true;
m_dirty_condvar.notify_one();
if (using_shared_map()) return;
THROW_CHECK1(runtime_error, m_buckets, m_buckets > 0);
auto pr = get_extent_range(hash);
uint64_t extent_number = reinterpret_cast<Extent *>(pr.first) - m_extent_ptr;
THROW_CHECK1(runtime_error, extent_number, extent_number < m_extents);
unique_lock<mutex> lock(m_extent_mutex);
m_buckets_dirty.insert(extent_number);
m_condvar.notify_one();
}
void
BeesHashTable::writeback_loop()
{
while (!m_stop_requested) {
auto wrote_extents = flush_dirty_extents(true);
BEESNOTE("idle after writing " << wrote_extents << " of " << m_extents << " extents");
unique_lock<mutex> lock(m_dirty_mutex);
if (m_stop_requested) {
break;
}
if (m_dirty) {
m_dirty = false;
} else {
m_dirty_condvar.wait(lock);
if (!using_shared_map()) {
while (1) {
flush_dirty_extents();
}
}
// The normal loop exits at the end of one iteration when stop requested,
// but stop request will be in the middle of the loop, and some extents
// will still be dirty. Run the flush loop again to get those.
BEESNOTE("flushing hash table, round 2");
BEESLOGDEBUG("Flushing hash table");
flush_dirty_extents(false);
// If there were any Tasks still running, they may have updated
// some hash table pages during the second flush. These updates
// will be lost. The Tasks will be repeated on the next run because
// they were not completed prior to the stop request, and the
// Crawl progress was already flushed out before the Hash table
// started writing, so nothing is really lost here.
catch_all([&]() {
// trigger writeback on our way out
#if 0
// seems to trigger huge latency spikes
BEESTOOLONG("unreadahead hash table size " <<
pretty(m_size)); bees_unreadahead(m_fd, 0, m_size);
#endif
});
BEESLOGDEBUG("Exited hash table writeback_loop");
}
static
@@ -242,9 +177,14 @@ percent(size_t num, size_t den)
void
BeesHashTable::prefetch_loop()
{
Uname uname;
bool not_locked = true;
while (!m_stop_requested) {
// Always do the mlock, whether shared or not
THROW_CHECK1(runtime_error, m_size, m_size > 0);
catch_all([&]() {
BEESNOTE("mlock " << pretty(m_size));
DIE_IF_NON_ZERO(mlock(m_byte_ptr, m_size));
});
while (1) {
size_t width = 64;
vector<size_t> occupancy(width, 0);
size_t occupied_count = 0;
@@ -254,15 +194,14 @@ BeesHashTable::prefetch_loop()
size_t toxic_count = 0;
size_t unaligned_eof_count = 0;
m_prefetch_running = true;
for (uint64_t ext = 0; ext < m_extents && !m_stop_requested; ++ext) {
BEESNOTE("prefetching hash table extent #" << ext << " of " << m_extents);
for (uint64_t ext = 0; ext < m_extents; ++ext) {
BEESNOTE("prefetching hash table extent " << ext << " of " << m_extent_ptr_end - m_extent_ptr);
catch_all([&]() {
fetch_missing_extent_by_index(ext);
fetch_missing_extent(ext * c_buckets_per_extent);
BEESNOTE("analyzing hash table extent #" << ext << " of " << m_extents);
BEESNOTE("analyzing hash table extent " << ext << " of " << m_extent_ptr_end - m_extent_ptr);
bool duplicate_bugs_found = false;
auto lock = lock_extent_by_index(ext);
unique_lock<mutex> lock(m_bucket_mutex);
for (Bucket *bucket = m_extent_ptr[ext].p_buckets; bucket < m_extent_ptr[ext + 1].p_buckets; ++bucket) {
if (verify_cell_range(bucket[0].p_cells, bucket[1].p_cells)) {
duplicate_bugs_found = true;
@@ -291,12 +230,12 @@ BeesHashTable::prefetch_loop()
// Count these instead of calculating the number so we get better stats in case of exceptions
occupied_count += this_bucket_occupied_count;
}
lock.unlock();
if (duplicate_bugs_found) {
set_extent_dirty_locked(ext);
set_extent_dirty(ext);
}
});
}
m_prefetch_running = false;
BEESNOTE("calculating hash table statistics");
@@ -329,19 +268,20 @@ BeesHashTable::prefetch_loop()
out << "\n";
}
size_t uncompressed_count = occupied_count - compressed_offset_count;
size_t uncompressed_count = occupied_count - compressed_count;
size_t legacy_count = compressed_count - compressed_offset_count;
ostringstream graph_blob;
graph_blob << "Now: " << format_time(time(NULL)) << "\n";
graph_blob << "Uptime: " << m_ctx->total_timer().age() << " seconds\n";
graph_blob << "Version: " << BEES_VERSION << "\n";
graph_blob << "Kernel: " << uname.sysname << " " << uname.release << " " << uname.machine << " " << uname.version << "\n";
graph_blob
<< "\nHash table page occupancy histogram (" << occupied_count << "/" << total_count << " cells occupied, " << (occupied_count * 100 / total_count) << "%)\n"
graph_blob
<< "\nHash table page occupancy histogram (" << occupied_count << "/" << total_count << " cells occupied, " << (occupied_count * 100 / total_count) << "%)\n"
<< out.str() << "0% | 25% | 50% | 75% | 100% page fill\n"
<< "compressed " << compressed_count << " (" << percent(compressed_count, occupied_count) << ")\n"
<< "compressed " << compressed_count << " (" << percent(compressed_count, occupied_count) << ")"
<< " new-style " << compressed_offset_count << " (" << percent(compressed_offset_count, occupied_count) << ")"
<< " old-style " << legacy_count << " (" << percent(legacy_count, occupied_count) << ")\n"
<< "uncompressed " << uncompressed_count << " (" << percent(uncompressed_count, occupied_count) << ")"
<< " unaligned_eof " << unaligned_eof_count << " (" << percent(unaligned_eof_count, occupied_count) << ")"
<< " toxic " << toxic_count << " (" << percent(toxic_count, occupied_count) << ")";
@@ -356,148 +296,91 @@ BeesHashTable::prefetch_loop()
auto avg_rates = thisStats / m_ctx->total_timer().age();
graph_blob << "\t" << avg_rates << "\n";
graph_blob << m_ctx->get_progress();
BEESLOGINFO(graph_blob.str());
BEESLOG(graph_blob.str());
catch_all([&]() {
m_stats_file.write(graph_blob.str());
});
if (not_locked && !m_stop_requested) {
// Always do the mlock, whether shared or not
THROW_CHECK1(runtime_error, m_size, m_size > 0);
BEESLOGINFO("mlock(" << pretty(m_size) << ")...");
Timer lock_time;
catch_all([&]() {
BEESNOTE("mlock " << pretty(m_size));
DIE_IF_NON_ZERO(mlock(m_byte_ptr, m_size));
});
BEESLOGINFO("mlock(" << pretty(m_size) << ") done in " << lock_time << " sec");
not_locked = false;
}
BEESNOTE("idle " << BEES_HASH_TABLE_ANALYZE_INTERVAL << "s");
unique_lock<mutex> lock(m_stop_mutex);
if (m_stop_requested) {
BEESLOGDEBUG("Stop requested in hash table prefetch");
return;
}
m_stop_condvar.wait_for(lock, chrono::duration<double>(BEES_HASH_TABLE_ANALYZE_INTERVAL));
nanosleep(BEES_HASH_TABLE_ANALYZE_INTERVAL);
}
}
size_t
BeesHashTable::hash_to_extent_index(HashType hash)
{
auto pr = get_extent_range(hash);
uint64_t extent_index = reinterpret_cast<const Extent *>(pr.first) - m_extent_ptr;
THROW_CHECK2(runtime_error, extent_index, m_extents, extent_index < m_extents);
return extent_index;
}
BeesHashTable::ExtentMetaData::ExtentMetaData() :
m_mutex_ptr(make_shared<mutex>())
{
}
unique_lock<mutex>
BeesHashTable::lock_extent_by_index(uint64_t extent_index)
{
THROW_CHECK2(out_of_range, extent_index, m_extents, extent_index < m_extents);
return unique_lock<mutex>(*m_extent_metadata.at(extent_index).m_mutex_ptr);
}
unique_lock<mutex>
BeesHashTable::lock_extent_by_hash(HashType hash)
void
BeesHashTable::fetch_missing_extent(HashType hash)
{
BEESTOOLONG("fetch_missing_extent for hash " << to_hex(hash));
return lock_extent_by_index(hash_to_extent_index(hash));
}
if (using_shared_map()) return;
THROW_CHECK1(runtime_error, m_buckets, m_buckets > 0);
auto pr = get_extent_range(hash);
uint64_t extent_number = reinterpret_cast<Extent *>(pr.first) - m_extent_ptr;
THROW_CHECK1(runtime_error, extent_number, extent_number < m_extents);
void
BeesHashTable::fetch_missing_extent_by_index(uint64_t extent_index)
{
BEESNOTE("checking hash extent #" << extent_index << " of " << m_extents << " extents");
auto lock = lock_extent_by_index(extent_index);
if (!m_extent_metadata.at(extent_index).m_missing) {
unique_lock<mutex> lock(m_extent_mutex);
if (!m_buckets_missing.count(extent_number)) {
return;
}
size_t missing_buckets = m_buckets_missing.size();
lock.unlock();
BEESNOTE("fetch waiting for hash extent #" << extent_number << ", " << missing_buckets << " left to fetch");
// Acquire blocking lock on this extent only
LockSet<uint64_t>::Lock extent_lock(m_extent_lock_set, extent_number);
// Check missing again because someone else might have fetched this
// extent for us while we didn't hold any locks
lock.lock();
if (!m_buckets_missing.count(extent_number)) {
BEESCOUNT(hash_extent_in_twice);
return;
}
lock.unlock();
// OK we have to read this extent
BEESNOTE("fetching hash extent #" << extent_index << " of " << m_extents << " extents");
BEESTRACE("Fetching hash extent #" << extent_index << " of " << m_extents << " extents");
BEESTOOLONG("Fetching hash extent #" << extent_index << " of " << m_extents << " extents");
BEESNOTE("fetching hash extent #" << extent_number << ", " << missing_buckets << " left to fetch");
uint8_t *const dirty_extent = m_extent_ptr[extent_index].p_byte;
uint8_t *const dirty_extent_end = m_extent_ptr[extent_index + 1].p_byte;
const size_t dirty_extent_size = dirty_extent_end - dirty_extent;
const size_t dirty_extent_offset = dirty_extent - m_byte_ptr;
BEESTRACE("Fetching missing hash extent " << extent_number);
uint8_t *dirty_extent = m_extent_ptr[extent_number].p_byte;
uint8_t *dirty_extent_end = m_extent_ptr[extent_number + 1].p_byte;
// If the read fails don't retry, just go with whatever data we have
m_extent_metadata.at(extent_index).m_missing = false;
catch_all([&]() {
{
BEESTOOLONG("pread(fd " << m_fd << " '" << name_fd(m_fd)<< "', length " << to_hex(dirty_extent_end - dirty_extent) << ", offset " << to_hex(dirty_extent - m_byte_ptr) << ")");
pread_or_die(m_fd, dirty_extent, dirty_extent_size, dirty_extent_offset);
// Only count extents successfully read
BEESCOUNT(hash_extent_in);
// Won't need that again
bees_unreadahead(m_fd, dirty_extent_offset, dirty_extent_size);
// If we are in prefetch, give the kernel a hint about the next extent
if (m_prefetch_running) {
// Use the kernel readahead here, because it might work for this use case
readahead(m_fd, dirty_extent_offset + dirty_extent_size, dirty_extent_size);
}
});
Cell *cell = m_extent_ptr[extent_index ].p_buckets[0].p_cells;
Cell *cell_end = m_extent_ptr[extent_index + 1].p_buckets[0].p_cells;
size_t toxic_cleared_count = 0;
set<BeesHashTable::Cell> seen_it(cell, cell_end);
while (cell < cell_end) {
if (cell->e_addr & BeesAddress::c_toxic_mask) {
++toxic_cleared_count;
cell->e_addr &= ~BeesAddress::c_toxic_mask;
// Clearing the toxic bit might mean we now have a duplicate.
// This could be due to a race between two
// inserts, one finds the extent toxic while the
// other does not. That's arguably a bug elsewhere,
// but we should rewrite the whole extent lookup/insert
// loop, not spend time fixing code that will be
// thrown out later anyway.
// If there is a cell that is identical to this one
// except for the toxic bit, then we don't need this one.
if (seen_it.count(*cell)) {
cell->e_addr = 0;
cell->e_hash = 0;
}
}
++cell;
}
if (toxic_cleared_count) {
BEESLOGDEBUG("Cleared " << toxic_cleared_count << " hashes while fetching hash table extent " << extent_index);
pread_or_die(m_fd, dirty_extent, dirty_extent_end - dirty_extent, dirty_extent - m_byte_ptr);
}
BEESCOUNT(hash_extent_in);
// We don't block when fetching an extent but we do slow down the
// prefetch thread.
m_prefetch_rate_limit.borrow(BLOCK_SIZE_HASHTAB_EXTENT);
lock.lock();
m_buckets_missing.erase(extent_number);
}
void
BeesHashTable::fetch_missing_extent_by_hash(HashType hash)
bool
BeesHashTable::is_toxic_hash(BeesHashTable::HashType hash) const
{
uint64_t extent_index = hash_to_extent_index(hash);
BEESNOTE("waiting to fetch hash extent #" << extent_index << " of " << m_extents << " extents");
fetch_missing_extent_by_index(extent_index);
return m_toxic_hashes.find(hash) != m_toxic_hashes.end();
}
vector<BeesHashTable::Cell>
BeesHashTable::find_cell(HashType hash)
{
fetch_missing_extent_by_hash(hash);
// This saves a lot of time prefilling the hash table, and there's no risk of eviction
if (is_toxic_hash(hash)) {
BEESCOUNT(hash_toxic);
BeesAddress toxic_addr(0x1000);
toxic_addr.set_toxic();
Cell toxic_cell(hash, toxic_addr);
vector<Cell> rv;
rv.push_back(toxic_cell);
return rv;
}
fetch_missing_extent(hash);
BEESTOOLONG("find_cell hash " << BeesHash(hash));
vector<Cell> rv;
auto lock = lock_extent_by_hash(hash);
unique_lock<mutex> lock(m_bucket_mutex);
auto er = get_cell_range(hash);
// FIXME: Weed out zero addresses in the table due to earlier bugs
copy_if(er.first, er.second, back_inserter(rv), [=](const Cell &ip) { return ip.e_hash == hash && ip.e_addr >= 0x1000; });
@@ -505,45 +388,46 @@ BeesHashTable::find_cell(HashType hash)
return rv;
}
/// Remove a hash from the table, leaving an empty space on the list
/// where the hash used to be. Used when an invalid address is found
/// because lookups on invalid addresses really hurt.
// Move an entry to the end of the list. Used after an attempt to resolve
// an address in the hash table fails. Probably more correctly called
// push_back_hash_addr, except it never inserts. Shared hash tables
// never erase anything, since there is no way to tell if an entry is
// out of date or just belonging to the wrong filesystem.
void
BeesHashTable::erase_hash_addr(HashType hash, AddrType addr)
{
fetch_missing_extent_by_hash(hash);
// if (m_shared) return;
fetch_missing_extent(hash);
BEESTOOLONG("erase hash " << to_hex(hash) << " addr " << addr);
auto lock = lock_extent_by_hash(hash);
unique_lock<mutex> lock(m_bucket_mutex);
auto er = get_cell_range(hash);
Cell mv(hash, addr);
Cell *ip = find(er.first, er.second, mv);
bool found = (ip < er.second);
if (found) {
// Lookups on invalid addresses really hurt us. Kill it with fire!
*ip = Cell(0, 0);
set_extent_dirty_locked(hash_to_extent_index(hash));
set_extent_dirty(hash);
BEESCOUNT(hash_erase);
#if 0
if (verify_cell_range(er.first, er.second)) {
BEESLOGDEBUG("while erasing hash " << hash << " addr " << addr);
BEESINFO("while erasing hash " << hash << " addr " << addr);
}
#endif
} else {
BEESCOUNT(hash_erase_miss);
}
}
/// Insert a hash entry at the head of the list. If entry is already
/// present in list, move it to the front of the list without dropping
/// any entries, and return true. If entry is not present in list,
/// insert it at the front of the list, possibly dropping the last entry
/// in the list, and return false. Used to move duplicate hash blocks
/// to the front of the list.
// If entry is already present in list, move it to the front of the
// list without dropping any entries, and return true. If entry is not
// present in list, insert it at the front of the list, possibly dropping
// the last entry in the list, and return false. Used to move duplicate
// hash blocks to the front of the list.
bool
BeesHashTable::push_front_hash_addr(HashType hash, AddrType addr)
{
fetch_missing_extent_by_hash(hash);
fetch_missing_extent(hash);
BEESTOOLONG("push_front_hash_addr hash " << BeesHash(hash) <<" addr " << BeesAddress(addr));
auto lock = lock_extent_by_hash(hash);
unique_lock<mutex> lock(m_bucket_mutex);
auto er = get_cell_range(hash);
Cell mv(hash, addr);
Cell *ip = find(er.first, er.second, mv);
@@ -561,7 +445,7 @@ BeesHashTable::push_front_hash_addr(HashType hash, AddrType addr)
auto dp = ip;
--sp;
// If we are deleting the last entry then don't copy it
if (dp == er.second) {
if (ip == er.second) {
--sp;
--dp;
BEESCOUNT(hash_evict);
@@ -573,44 +457,39 @@ BeesHashTable::push_front_hash_addr(HashType hash, AddrType addr)
// There is now a space at the front, insert there if different
if (er.first[0] != mv) {
er.first[0] = mv;
set_extent_dirty_locked(hash_to_extent_index(hash));
set_extent_dirty(hash);
BEESCOUNT(hash_front);
} else {
BEESCOUNT(hash_front_already);
}
#if 0
if (verify_cell_range(er.first, er.second)) {
BEESLOGDEBUG("while push_fronting hash " << hash << " addr " << addr);
BEESINFO("while push_fronting hash " << hash << " addr " << addr);
}
#endif
return found;
}
thread_local uniform_int_distribution<size_t> BeesHashTable::tl_distribution(0, c_cells_per_bucket - 1);
/// Insert a hash entry at some unspecified point in the list.
/// If entry is already present in list, returns true and does not
/// modify list. If entry is not present in list, returns false and
/// inserts at a random position in the list, possibly evicting the entry
/// at the end of the list. Used to insert new unique (not-yet-duplicate)
/// blocks in random order.
// If entry is already present in list, returns true and does not
// modify list. If entry is not present in list, returns false and
// inserts at a random position in the list, possibly evicting the entry
// at the end of the list. Used to insert new unique (not-yet-duplicate)
// blocks in random order.
bool
BeesHashTable::push_random_hash_addr(HashType hash, AddrType addr)
{
fetch_missing_extent_by_hash(hash);
fetch_missing_extent(hash);
BEESTOOLONG("push_random_hash_addr hash " << BeesHash(hash) << " addr " << BeesAddress(addr));
auto lock = lock_extent_by_hash(hash);
unique_lock<mutex> lock(m_bucket_mutex);
auto er = get_cell_range(hash);
Cell mv(hash, addr);
Cell *ip = find(er.first, er.second, mv);
bool found = (ip < er.second);
const auto pos = tl_distribution(bees_generator);
thread_local default_random_engine generator;
thread_local uniform_int_distribution<int> distribution(0, c_cells_per_bucket - 1);
auto pos = distribution(generator);
int case_cond = 0;
#if 0
vector<Cell> saved(er.first, er.second);
#endif
if (found) {
// If hash already exists after pos, swap with pos
@@ -656,25 +535,20 @@ BeesHashTable::push_random_hash_addr(HashType hash, AddrType addr)
}
// Evict something and insert at pos
// move_backward(er.first + pos, er.second - 1, er.second);
ip = er.second - 1;
while (ip > er.first + pos) {
auto dp = ip;
*dp = *--ip;
}
move_backward(er.first + pos, er.second - 1, er.second);
er.first[pos] = mv;
BEESCOUNT(hash_evict);
case_cond = 5;
ret_dirty:
BEESCOUNT(hash_insert);
set_extent_dirty_locked(hash_to_extent_index(hash));
set_extent_dirty(hash);
ret:
#if 0
if (verify_cell_range(er.first, er.second, false)) {
BEESLOG("while push_randoming (case " << case_cond << ") pos " << pos
<< " ip " << (ip - er.first) << " " << mv);
// dump_bucket_locked(saved.data(), saved.data() + saved.size());
// dump_bucket_locked(er.first, er.second);
// dump_bucket(saved.data(), saved.data() + saved.size());
// dump_bucket(er.first, er.second);
}
#else
(void)case_cond;
@@ -689,9 +563,9 @@ BeesHashTable::try_mmap_flags(int flags)
THROW_CHECK1(out_of_range, m_size, m_size > 0);
Timer map_time;
catch_all([&]() {
BEESLOGINFO("mapping hash table size " << m_size << " with flags " << mmap_flags_ntoa(flags));
BEESLOG("mapping hash table size " << m_size << " with flags " << mmap_flags_ntoa(flags));
void *ptr = mmap_or_die(nullptr, m_size, PROT_READ | PROT_WRITE, flags, flags & MAP_ANONYMOUS ? -1 : int(m_fd), 0);
BEESLOGINFO("mmap done in " << map_time << " sec");
BEESLOG("mmap done in " << map_time << " sec");
m_cell_ptr = static_cast<Cell *>(ptr);
void *ptr_end = static_cast<uint8_t *>(ptr) + m_size;
m_cell_ptr_end = static_cast<Cell *>(ptr_end);
@@ -700,39 +574,12 @@ BeesHashTable::try_mmap_flags(int flags)
}
void
BeesHashTable::open_file()
BeesHashTable::set_shared(bool shared)
{
// OK open hash table
BEESNOTE("opening hash table '" << m_filename << "' target size " << m_size << " (" << pretty(m_size) << ")");
// Try to open existing hash table
Fd new_fd = openat(m_ctx->home_fd(), m_filename.c_str(), FLAGS_OPEN_FILE_RW, 0700);
// If that doesn't work, try to make a new one
if (!new_fd) {
string tmp_filename = m_filename + ".tmp";
BEESNOTE("creating new hash table '" << tmp_filename << "'");
BEESLOGINFO("Creating new hash table '" << tmp_filename << "'");
unlinkat(m_ctx->home_fd(), tmp_filename.c_str(), 0);
new_fd = openat_or_die(m_ctx->home_fd(), tmp_filename, FLAGS_CREATE_FILE, 0700);
BEESNOTE("truncating new hash table '" << tmp_filename << "' size " << m_size << " (" << pretty(m_size) << ")");
BEESLOGINFO("Truncating new hash table '" << tmp_filename << "' size " << m_size << " (" << pretty(m_size) << ")");
ftruncate_or_die(new_fd, m_size);
BEESNOTE("truncating new hash table '" << tmp_filename << "' -> '" << m_filename << "'");
BEESLOGINFO("Truncating new hash table '" << tmp_filename << "' -> '" << m_filename << "'");
renameat_or_die(m_ctx->home_fd(), tmp_filename, m_ctx->home_fd(), m_filename);
}
Stat st(new_fd);
off_t new_size = st.st_size;
THROW_CHECK1(invalid_argument, new_size, new_size > 0);
THROW_CHECK1(invalid_argument, new_size, (new_size % BLOCK_SIZE_HASHTAB_EXTENT) == 0);
m_size = new_size;
m_fd = new_fd;
m_shared = shared;
}
BeesHashTable::BeesHashTable(shared_ptr<BeesContext> ctx, string filename, off_t size) :
BeesHashTable::BeesHashTable(shared_ptr<BeesContext> ctx, string filename) :
m_ctx(ctx),
m_size(0),
m_void_ptr(nullptr),
@@ -740,69 +587,66 @@ BeesHashTable::BeesHashTable(shared_ptr<BeesContext> ctx, string filename, off_t
m_buckets(0),
m_cells(0),
m_writeback_thread("hash_writeback"),
m_prefetch_thread("hash_prefetch"),
m_prefetch_thread("hash_prefetch " + m_ctx->root_path()),
m_flush_rate_limit(BEES_FLUSH_RATE),
m_prefetch_rate_limit(BEES_FLUSH_RATE),
m_stats_file(m_ctx->home_fd(), "beesstats.txt")
{
// Sanity checks to protect the implementation from its weaknesses
BEESNOTE("opening hash table " << filename);
m_fd = openat_or_die(m_ctx->home_fd(), filename, FLAGS_OPEN_FILE_RW, 0700);
Stat st(m_fd);
m_size = st.st_size;
BEESTRACE("hash table size " << m_size);
BEESTRACE("hash table bucket size " << BLOCK_SIZE_HASHTAB_BUCKET);
BEESTRACE("hash table extent size " << BLOCK_SIZE_HASHTAB_EXTENT);
THROW_CHECK2(invalid_argument, BLOCK_SIZE_HASHTAB_BUCKET, BLOCK_SIZE_HASHTAB_EXTENT, (BLOCK_SIZE_HASHTAB_EXTENT % BLOCK_SIZE_HASHTAB_BUCKET) == 0);
// Does the union work?
THROW_CHECK2(runtime_error, m_void_ptr, m_cell_ptr, m_void_ptr == m_cell_ptr);
THROW_CHECK2(runtime_error, m_void_ptr, m_byte_ptr, m_void_ptr == m_byte_ptr);
THROW_CHECK2(runtime_error, m_void_ptr, m_bucket_ptr, m_void_ptr == m_bucket_ptr);
THROW_CHECK2(runtime_error, m_void_ptr, m_extent_ptr, m_void_ptr == m_extent_ptr);
// There's more than one union
THROW_CHECK2(runtime_error, sizeof(Bucket), BLOCK_SIZE_HASHTAB_BUCKET, BLOCK_SIZE_HASHTAB_BUCKET == sizeof(Bucket));
THROW_CHECK2(runtime_error, sizeof(Bucket::p_byte), BLOCK_SIZE_HASHTAB_BUCKET, BLOCK_SIZE_HASHTAB_BUCKET == sizeof(Bucket::p_byte));
THROW_CHECK2(runtime_error, sizeof(Extent), BLOCK_SIZE_HASHTAB_EXTENT, BLOCK_SIZE_HASHTAB_EXTENT == sizeof(Extent));
THROW_CHECK2(runtime_error, sizeof(Extent::p_byte), BLOCK_SIZE_HASHTAB_EXTENT, BLOCK_SIZE_HASHTAB_EXTENT == sizeof(Extent::p_byte));
m_filename = filename;
m_size = size;
open_file();
// Now we know size we can compute stuff
BEESTRACE("hash table size " << m_size);
BEESTRACE("hash table bucket size " << BLOCK_SIZE_HASHTAB_BUCKET);
BEESTRACE("hash table extent size " << BLOCK_SIZE_HASHTAB_EXTENT);
BEESLOGINFO("opened hash table filename '" << filename << "' length " << m_size);
BEESLOG("opened hash table filename '" << filename << "' length " << m_size);
m_buckets = m_size / BLOCK_SIZE_HASHTAB_BUCKET;
m_cells = m_buckets * c_cells_per_bucket;
m_extents = (m_size + BLOCK_SIZE_HASHTAB_EXTENT - 1) / BLOCK_SIZE_HASHTAB_EXTENT;
BEESLOGINFO("\tcells " << m_cells << ", buckets " << m_buckets << ", extents " << m_extents);
BEESLOG("\tcells " << m_cells << ", buckets " << m_buckets << ", extents " << m_extents);
BEESLOGINFO("\tflush rate limit " << BEES_FLUSH_RATE);
BEESLOG("\tflush rate limit " << BEES_FLUSH_RATE);
// Try to mmap that much memory
try_mmap_flags(MAP_PRIVATE | MAP_ANONYMOUS);
if (using_shared_map()) {
try_mmap_flags(MAP_SHARED);
} else {
try_mmap_flags(MAP_PRIVATE | MAP_ANONYMOUS);
}
if (!m_cell_ptr) {
THROW_ERRNO("unable to mmap " << filename);
THROW_ERROR(runtime_error, "unable to mmap " << filename);
}
// Do unions work the way we think (and rely on)?
THROW_CHECK2(runtime_error, m_void_ptr, m_cell_ptr, m_void_ptr == m_cell_ptr);
THROW_CHECK2(runtime_error, m_void_ptr, m_byte_ptr, m_void_ptr == m_byte_ptr);
THROW_CHECK2(runtime_error, m_void_ptr, m_bucket_ptr, m_void_ptr == m_bucket_ptr);
THROW_CHECK2(runtime_error, m_void_ptr, m_extent_ptr, m_void_ptr == m_extent_ptr);
// Give all the madvise hints that the kernel understands
const struct madv_flag {
const char *name;
int value;
} madv_flags[] = {
{ .name = "MADV_HUGEPAGE", .value = MADV_HUGEPAGE },
{ .name = "MADV_DONTFORK", .value = MADV_DONTFORK },
{ .name = "MADV_DONTDUMP", .value = MADV_DONTDUMP },
{ .name = "", .value = 0 },
};
for (auto fp = madv_flags; fp->value; ++fp) {
BEESTOOLONG("madvise(" << fp->name << ")");
if (madvise(m_byte_ptr, m_size, fp->value)) {
BEESLOGNOTICE("madvise(..., " << fp->name << "): " << strerror(errno) << " (ignored)");
if (!using_shared_map()) {
// madvise fails if MAP_SHARED
if (using_any_madvise()) {
// DONTFORK because we sometimes do fork,
// but the child doesn't touch any of the many, many pages
BEESTOOLONG("madvise(MADV_HUGEPAGE | MADV_DONTFORK)");
DIE_IF_NON_ZERO(madvise(m_byte_ptr, m_size, MADV_HUGEPAGE | MADV_DONTFORK));
}
for (uint64_t i = 0; i < m_size / sizeof(Extent); ++i) {
m_buckets_missing.insert(i);
}
}
m_extent_metadata.resize(m_extents);
m_writeback_thread.exec([&]() {
writeback_loop();
});
@@ -811,69 +655,28 @@ BeesHashTable::BeesHashTable(shared_ptr<BeesContext> ctx, string filename, off_t
prefetch_loop();
});
// Blacklist might fail if the hash table is not stored on a btrfs,
// or if it's on a _different_ btrfs
// Blacklist might fail if the hash table is not stored on a btrfs
catch_all([&]() {
// Root is definitely a btrfs
BtrfsIoctlFsInfoArgs root_info;
root_info.do_ioctl(m_ctx->root_fd());
// Hash might not be a btrfs
BtrfsIoctlFsInfoArgs hash_info;
// If btrfs fs_info ioctl fails, it must be a different fs
if (!hash_info.do_ioctl_nothrow(m_fd)) return;
// If Hash is a btrfs, Root must be the same one
if (root_info.fsid() != hash_info.fsid()) return;
// Hash is on the same one, blacklist it
m_ctx->blacklist_insert(BeesFileId(m_fd));
m_ctx->blacklist_add(BeesFileId(m_fd));
});
// Skip zero because we already weed that out before it gets near a hash function
for (unsigned i = 1; i < 256; ++i) {
vector<uint8_t> v(BLOCK_SIZE_SUMS, i);
HashType hash = Digest::CRC::crc64(v.data(), v.size());
m_toxic_hashes.insert(hash);
}
}
BeesHashTable::~BeesHashTable()
{
BEESLOGDEBUG("Destroy BeesHashTable");
if (m_cell_ptr && m_size) {
// Dirty extents should have been flushed before now,
// e.g. in stop(). If that didn't happen, don't fall
// into the same trap (and maybe throw an exception) here.
// flush_dirty_extents(false);
flush_dirty_extents();
catch_all([&]() {
// drop the memory mapping
BEESTOOLONG("unmap handle table size " << pretty(m_size));
DIE_IF_NON_ZERO(munmap(m_cell_ptr, m_size));
m_cell_ptr = nullptr;
m_size = 0;
});
m_cell_ptr = nullptr;
m_size = 0;
}
BEESLOGDEBUG("BeesHashTable destroyed");
}
void
BeesHashTable::stop_request()
{
BEESNOTE("stopping BeesHashTable threads");
BEESLOGDEBUG("Stopping BeesHashTable threads");
unique_lock<mutex> lock(m_stop_mutex);
m_stop_requested = true;
m_stop_condvar.notify_all();
lock.unlock();
// Wake up hash writeback too
unique_lock<mutex> dirty_lock(m_dirty_mutex);
m_dirty_condvar.notify_all();
dirty_lock.unlock();
}
void
BeesHashTable::stop_wait()
{
BEESNOTE("waiting for hash_prefetch thread");
BEESLOGDEBUG("Waiting for hash_prefetch thread");
m_prefetch_thread.join();
BEESNOTE("waiting for hash_writeback thread");
BEESLOGDEBUG("Waiting for hash_writeback thread");
m_writeback_thread.join();
BEESLOGDEBUG("BeesHashTable stopped");
}

View File

@@ -98,79 +98,90 @@ BeesResolver::adjust_offset(const BeesFileRange &haystack, const BeesBlockData &
return BeesBlockData();
}
off_t haystack_offset = haystack.begin();
off_t lower_offset = haystack.begin();
off_t upper_offset = haystack.end();
bool is_compressed_offset = false;
bool is_exact = false;
bool is_legacy = false;
if (m_addr.is_compressed()) {
BtrfsExtentWalker ew(haystack.fd(), haystack.begin(), m_ctx->root_fd());
BEESTRACE("haystack extent data " << ew);
BEESTRACE("haystack extent data " << ew);
Extent e = ew.current();
THROW_CHECK1(runtime_error, m_addr, m_addr.has_compressed_offset());
off_t coff = m_addr.get_compressed_offset();
if (e.offset() > coff) {
// this extent begins after the target block
BEESCOUNT(adjust_offset_low);
return BeesBlockData();
if (m_addr.has_compressed_offset()) {
off_t coff = m_addr.get_compressed_offset();
if (e.offset() > coff) {
// this extent begins after the target block
BEESCOUNT(adjust_offset_low);
return BeesBlockData();
}
coff -= e.offset();
if (e.size() <= coff) {
// this extent ends before the target block
BEESCOUNT(adjust_offset_high);
return BeesBlockData();
}
lower_offset = e.begin() + coff;
upper_offset = lower_offset + BLOCK_SIZE_CLONE;
BEESCOUNT(adjust_offset_hit);
is_compressed_offset = true;
} else {
lower_offset = e.begin();
upper_offset = e.end();
BEESCOUNT(adjust_legacy);
is_legacy = true;
}
coff -= e.offset();
if (e.size() <= coff) {
// this extent ends before the target block
BEESCOUNT(adjust_offset_high);
return BeesBlockData();
}
haystack_offset = e.begin() + coff;
BEESCOUNT(adjust_offset_hit);
is_compressed_offset = true;
} else {
BEESCOUNT(adjust_exact);
is_exact = true;
}
BEESTRACE("Checking haystack " << haystack << " offset " << to_hex(haystack_offset));
BEESTRACE("Checking haystack " << haystack << " offsets " << to_hex(lower_offset) << ".." << to_hex(upper_offset));
// Check all the blocks in the list
THROW_CHECK1(out_of_range, haystack_offset, (haystack_offset & BLOCK_MASK_CLONE) == 0);
for (off_t haystack_offset = lower_offset; haystack_offset < upper_offset; haystack_offset += BLOCK_SIZE_CLONE) {
THROW_CHECK1(out_of_range, haystack_offset, (haystack_offset & BLOCK_MASK_CLONE) == 0);
// Straw cannot extend beyond end of haystack
if (haystack_offset + needle.size() > haystack_size) {
BEESCOUNT(adjust_needle_too_long);
return BeesBlockData();
}
// Straw cannot extend beyond end of haystack
if (haystack_offset + needle.size() > haystack_size) {
BEESCOUNT(adjust_needle_too_long);
break;
}
// Read the haystack
BEESTRACE("straw " << name_fd(haystack.fd()) << ", offset " << to_hex(haystack_offset) << ", length " << needle.size());
BeesBlockData straw(haystack.fd(), haystack_offset, needle.size());
// Read the haystack
BEESTRACE("straw " << name_fd(haystack.fd()) << ", offset " << to_hex(haystack_offset) << ", length " << needle.size());
BeesBlockData straw(haystack.fd(), haystack_offset, needle.size());
BEESTRACE("straw = " << straw);
BEESTRACE("straw = " << straw);
// Stop if we find a match
if (straw.is_data_equal(needle)) {
BEESCOUNT(adjust_hit);
m_found_data = true;
// Stop if we find a match
if (straw.is_data_equal(needle)) {
BEESCOUNT(adjust_hit);
m_found_data = true;
m_found_hash = true;
if (is_compressed_offset) BEESCOUNT(adjust_compressed_offset_correct);
if (is_legacy) BEESCOUNT(adjust_legacy_correct);
if (is_exact) BEESCOUNT(adjust_exact_correct);
return straw;
}
if (straw.hash() != needle.hash()) {
// Not the same hash or data, try next block
BEESCOUNT(adjust_miss);
continue;
}
// Found the hash but not the data. Yay!
m_found_hash = true;
if (is_compressed_offset) BEESCOUNT(adjust_compressed_offset_correct);
if (is_exact) BEESCOUNT(adjust_exact_correct);
return straw;
BEESLOG("HASH COLLISION\n"
<< "\tneedle " << needle << "\n"
<< "\tstraw " << straw);
BEESCOUNT(hash_collision);
}
if (straw.hash() != needle.hash()) {
// Not the same hash or data, try next block
BEESCOUNT(adjust_miss);
return BeesBlockData();
}
// Found the hash but not the data. Yay!
m_found_hash = true;
#if 0
BEESLOGINFO("HASH COLLISION\n"
<< "\tneedle " << needle << "\n"
<< "\tstraw " << straw);
#endif
BEESCOUNT(hash_collision);
// Ran out of offsets to try
BEESCOUNT(adjust_no_match);
if (is_compressed_offset) BEESCOUNT(adjust_compressed_offset_wrong);
if (is_legacy) BEESCOUNT(adjust_legacy_wrong);
if (is_exact) BEESCOUNT(adjust_exact_wrong);
m_wrong_data = true;
return BeesBlockData();
@@ -185,8 +196,8 @@ BeesResolver::chase_extent_ref(const BtrfsInodeOffsetRoot &bior, BeesBlockData &
Fd file_fd = m_ctx->roots()->open_root_ino(bior.m_root, bior.m_inum);
if (!file_fd) {
// Deleted snapshots generate craptons of these
// BEESLOGDEBUG("No FD in chase_extent_ref " << bior);
// Delete snapshots generate craptons of these
// BEESINFO("No FD in chase_extent_ref " << bior);
BEESCOUNT(chase_no_fd);
return BeesFileRange();
}
@@ -200,7 +211,7 @@ BeesResolver::chase_extent_ref(const BtrfsInodeOffsetRoot &bior, BeesBlockData &
// ...or are we?
if (file_addr.is_magic()) {
BEESLOGDEBUG("file_addr is magic: file_addr = " << file_addr << " bior = " << bior << " needle_bbd = " << needle_bbd);
BEESINFO("file_addr is magic: file_addr = " << file_addr << " bior = " << bior << " needle_bbd = " << needle_bbd);
BEESCOUNT(chase_wrong_magic);
return BeesFileRange();
}
@@ -209,7 +220,7 @@ BeesResolver::chase_extent_ref(const BtrfsInodeOffsetRoot &bior, BeesBlockData &
// Did we get the physical block we asked for? The magic bits have to match too,
// but the compressed offset bits do not.
if (file_addr.get_physical_or_zero() != m_addr.get_physical_or_zero()) {
// BEESLOGDEBUG("found addr " << file_addr << " at " << name_fd(file_fd) << " offset " << to_hex(bior.m_offset) << " but looking for " << m_addr);
// BEESINFO("found addr " << file_addr << " at " << name_fd(file_fd) << " offset " << to_hex(bior.m_offset) << " but looking for " << m_addr);
// FIEMAP/resolve are working, but the data is old.
BEESCOUNT(chase_wrong_addr);
return BeesFileRange();
@@ -229,12 +240,10 @@ BeesResolver::chase_extent_ref(const BtrfsInodeOffsetRoot &bior, BeesBlockData &
// Search near the resolved address for a matching data block.
// ...even if it's not compressed, we should do this sanity
// check before considering the block as a duplicate candidate.
// FIXME: this is mostly obsolete now and we shouldn't do it here.
// Don't bother fixing it because it will all go away with (extent, offset) reads.
auto new_bbd = adjust_offset(haystack_bbd, needle_bbd);
if (new_bbd.empty()) {
// matching offset search failed
BEESCOUNT(chase_no_data);
BEESCOUNT(chase_wrong_data);
return BeesFileRange();
}
if (new_bbd.begin() == haystack_bbd.begin()) {
@@ -359,8 +368,7 @@ BeesResolver::for_each_extent_ref(BeesBlockData bbd, function<bool(const BeesFil
}
// Look at the old data
// FIXME: propagate exceptions for now. Proper fix requires a rewrite.
// catch_all([&]() {
catch_all([&]() {
BEESTRACE("chase_extent_ref ino " << ino_off_root << " bbd " << bbd);
auto new_range = chase_extent_ref(ino_off_root, bbd);
// XXX: should we catch visitor's exceptions here?
@@ -370,12 +378,9 @@ BeesResolver::for_each_extent_ref(BeesBlockData bbd, function<bool(const BeesFil
// We have reliable block addresses now, so we guarantee we can hit the desired block.
// Failure in chase_extent_ref means we are done, and don't need to look up all the
// other references.
// Or...not? If we have a compressed extent, some refs will not match
// if there is are two references to the same extent with a reference
// to a different extent between them.
// stop_now = true;
stop_now = true;
}
// });
});
if (stop_now) {
break;
@@ -384,29 +389,26 @@ BeesResolver::for_each_extent_ref(BeesBlockData bbd, function<bool(const BeesFil
return stop_now;
}
BeesRangePair
BeesResolver::replace_dst(const BeesFileRange &dst_bfr_in)
BeesFileRange
BeesResolver::replace_dst(const BeesFileRange &dst_bfr)
{
BEESTRACE("replace_dst dst_bfr " << dst_bfr_in);
BEESTRACE("replace_dst dst_bfr " << dst_bfr);
BEESCOUNT(replacedst_try);
// Open dst, reuse it for all src
BEESNOTE("Opening dst bfr " << dst_bfr_in);
BEESTRACE("Opening dst bfr " << dst_bfr_in);
auto dst_bfr = dst_bfr_in;
BEESNOTE("Opening dst bfr " << dst_bfr);
BEESTRACE("Opening dst bfr " << dst_bfr);
dst_bfr.fd(m_ctx);
BeesFileRange overlap_bfr;
BEESTRACE("overlap_bfr " << overlap_bfr);
BeesBlockData bbd(dst_bfr);
BeesRangePair rv = { BeesFileRange(), BeesFileRange() };
for_each_extent_ref(bbd, [&](const BeesFileRange &src_bfr_in) -> bool {
for_each_extent_ref(bbd, [&](const BeesFileRange &src_bfr) -> bool {
// Open src
BEESNOTE("Opening src bfr " << src_bfr_in);
BEESTRACE("Opening src bfr " << src_bfr_in);
auto src_bfr = src_bfr_in;
BEESNOTE("Opening src bfr " << src_bfr);
BEESTRACE("Opening src bfr " << src_bfr);
src_bfr.fd(m_ctx);
if (dst_bfr.overlaps(src_bfr)) {
@@ -419,9 +421,7 @@ BeesResolver::replace_dst(const BeesFileRange &dst_bfr_in)
BeesBlockData src_bbd(src_bfr.fd(), src_bfr.begin(), min(BLOCK_SIZE_SUMS, src_bfr.size()));
if (bbd.addr().get_physical_or_zero() == src_bbd.addr().get_physical_or_zero()) {
BEESCOUNT(replacedst_same);
// stop looping here, all the other srcs will probably fail this test too
BeesTracer::set_silent();
throw runtime_error("FIXME: too many duplicate candidates, bailing out here");
return false; // i.e. continue
}
// Make pair(src, dst)
@@ -437,12 +437,21 @@ BeesResolver::replace_dst(const BeesFileRange &dst_bfr_in)
BEESCOUNT(replacedst_grown);
}
rv = brp;
m_found_dup = true;
return true;
// Dedup
BEESNOTE("dedup " << brp);
if (m_ctx->dedup(brp)) {
BEESCOUNT(replacedst_dedup_hit);
m_found_dup = true;
overlap_bfr = brp.second;
// FIXME: find best range first, then dedup that
return true; // i.e. break
} else {
BEESCOUNT(replacedst_dedup_miss);
return false; // i.e. continue
}
});
// BEESLOG("overlap_bfr after " << overlap_bfr);
return rv;
return overlap_bfr.copy_closed();
}
BeesFileRange
@@ -468,6 +477,11 @@ BeesResolver::find_all_matches(BeesBlockData &bbd)
bool
BeesResolver::operator<(const BeesResolver &that) const
{
// Lowest count, highest address
return tie(that.m_bior_count, m_addr) < tie(m_bior_count, that.m_addr);
if (that.m_bior_count < m_bior_count) {
return true;
} else if (m_bior_count < that.m_bior_count) {
return false;
}
return m_addr < that.m_addr;
}

File diff suppressed because it is too large Load Diff

View File

@@ -13,16 +13,19 @@ void
BeesThread::exec(function<void()> func)
{
m_timer.reset();
BEESLOGDEBUG("BeesThread exec " << m_name);
BEESLOG("BeesThread exec " << m_name);
m_thread_ptr = make_shared<thread>([=]() {
BEESLOG("Starting thread " << m_name);
BeesNote::set_name(m_name);
BEESLOGDEBUG("Starting thread " << m_name);
BEESNOTE("thread function");
Timer thread_time;
catch_all([&]() {
DIE_IF_MINUS_ERRNO(pthread_setname_np(pthread_self(), m_name.c_str()));
});
catch_all([&]() {
func();
});
BEESLOGDEBUG("Exiting thread " << m_name << ", " << thread_time << " sec");
BEESLOG("Exiting thread " << m_name << ", " << thread_time << " sec");
});
}
@@ -30,7 +33,7 @@ BeesThread::BeesThread(string name, function<void()> func) :
m_name(name)
{
THROW_CHECK1(invalid_argument, name, !name.empty());
BEESLOGDEBUG("BeesThread construct " << m_name);
BEESLOG("BeesThread construct " << m_name);
exec(func);
}
@@ -38,20 +41,20 @@ void
BeesThread::join()
{
if (!m_thread_ptr) {
BEESLOGDEBUG("Thread " << m_name << " no thread ptr");
BEESLOG("Thread " << m_name << " no thread ptr");
return;
}
BEESLOGDEBUG("BeesThread::join " << m_name);
BEESLOG("BeesThread::join " << m_name);
if (m_thread_ptr->joinable()) {
BEESLOGDEBUG("Joining thread " << m_name);
BEESLOG("Joining thread " << m_name);
Timer thread_time;
m_thread_ptr->join();
BEESLOGDEBUG("Waited for " << m_name << ", " << thread_time << " sec");
BEESLOG("Waited for " << m_name << ", " << thread_time << " sec");
} else if (!m_name.empty()) {
BEESLOGDEBUG("BeesThread " << m_name << " not joinable");
BEESLOG("BeesThread " << m_name << " not joinable");
} else {
BEESLOGDEBUG("BeesThread else " << m_name);
BEESLOG("BeesThread else " << m_name);
}
}
@@ -64,20 +67,25 @@ BeesThread::set_name(const string &name)
BeesThread::~BeesThread()
{
if (!m_thread_ptr) {
BEESLOGDEBUG("Thread " << m_name << " no thread ptr");
BEESLOG("Thread " << m_name << " no thread ptr");
return;
}
BEESLOGDEBUG("BeesThread destructor " << m_name);
BEESLOG("BeesThread destructor " << m_name);
if (m_thread_ptr->joinable()) {
BEESLOGDEBUG("Waiting for thread " << m_name);
BEESLOG("Cancelling thread " << m_name);
int rv = pthread_cancel(m_thread_ptr->native_handle());
if (rv) {
BEESLOG("pthread_cancel returned " << strerror(-rv));
}
BEESLOG("Waiting for thread " << m_name);
Timer thread_time;
m_thread_ptr->join();
BEESLOGDEBUG("Waited for " << m_name << ", " << thread_time << " sec");
BEESLOG("Waited for " << m_name << ", " << thread_time << " sec");
} else if (!m_name.empty()) {
BEESLOGDEBUG("Thread " << m_name << " not joinable");
BEESLOG("Thread " << m_name << " not joinable");
} else {
BEESLOGDEBUG("Thread destroy else " << m_name);
BEESLOG("Thread destroy else " << m_name);
}
}

View File

@@ -1,155 +0,0 @@
#include "bees.h"
// tracing ----------------------------------------
int bees_log_level = 8;
thread_local BeesTracer *BeesTracer::tl_next_tracer = nullptr;
thread_local bool BeesTracer::tl_first = true;
thread_local bool BeesTracer::tl_silent = false;
bool
exception_check()
{
#if __cplusplus >= 201703
return uncaught_exceptions();
#else
return uncaught_exception();
#endif
}
BeesTracer::~BeesTracer()
{
if (!tl_silent && exception_check()) {
if (tl_first) {
BEESLOG(BEES_TRACE_LEVEL, "TRACE: --- BEGIN TRACE --- exception ---");
tl_first = false;
}
try {
m_func();
} catch (exception &e) {
BEESLOG(BEES_TRACE_LEVEL, "TRACE: Nested exception: " << e.what());
} catch (...) {
BEESLOG(BEES_TRACE_LEVEL, "TRACE: Nested exception ...");
}
if (!m_next_tracer) {
BEESLOG(BEES_TRACE_LEVEL, "TRACE: --- END TRACE --- exception ---");
}
}
tl_next_tracer = m_next_tracer;
if (!m_next_tracer) {
tl_silent = false;
tl_first = true;
}
}
BeesTracer::BeesTracer(const function<void()> &f, bool silent) :
m_func(f)
{
m_next_tracer = tl_next_tracer;
tl_next_tracer = this;
tl_silent = silent;
}
void
BeesTracer::trace_now()
{
BeesTracer *tp = tl_next_tracer;
BEESLOG(BEES_TRACE_LEVEL, "TRACE: --- BEGIN TRACE ---");
while (tp) {
tp->m_func();
tp = tp->m_next_tracer;
}
BEESLOG(BEES_TRACE_LEVEL, "TRACE: --- END TRACE ---");
}
bool
BeesTracer::get_silent()
{
return tl_silent;
}
void
BeesTracer::set_silent()
{
tl_silent = true;
}
thread_local BeesNote *BeesNote::tl_next = nullptr;
mutex BeesNote::s_mutex;
map<pid_t, BeesNote*> BeesNote::s_status;
thread_local string BeesNote::tl_name;
BeesNote::~BeesNote()
{
tl_next = m_prev;
unique_lock<mutex> lock(s_mutex);
if (tl_next) {
s_status[gettid()] = tl_next;
} else {
s_status.erase(gettid());
}
}
BeesNote::BeesNote(function<void(ostream &os)> f) :
m_func(f)
{
m_name = get_name();
m_prev = tl_next;
tl_next = this;
unique_lock<mutex> lock(s_mutex);
s_status[gettid()] = tl_next;
}
void
BeesNote::set_name(const string &name)
{
tl_name = name;
pthread_setname(name);
}
string
BeesNote::get_name()
{
// Use explicit name if given
if (!tl_name.empty()) {
return tl_name;
}
// Try a Task name. If there is one, return it, but do not
// remember it. Each output message may be a different Task.
// The current task is thread_local so we don't need to worry
// about it being destroyed under us.
auto current_task = Task::current_task();
if (current_task) {
return current_task.title();
}
// OK try the pthread name next.
// thread_getname_np returns process name
// ...by default? ...for the main thread?
// ...except during exception handling?
// ...randomly?
return pthread_getname();
}
BeesNote::ThreadStatusMap
BeesNote::get_status()
{
unique_lock<mutex> lock(s_mutex);
ThreadStatusMap rv;
for (auto t : s_status) {
ostringstream oss;
if (!t.second->m_name.empty()) {
oss << t.second->m_name << ": ";
}
if (t.second->m_timer.age() > BEES_TOO_LONG) {
oss << "[" << t.second->m_timer << "s] ";
}
t.second->m_func(oss);
rv[t.first] = oss.str();
}
return rv;
}

View File

@@ -1,5 +1,6 @@
#include "bees.h"
#include "crucible/crc64.h"
#include "crucible/limits.h"
#include "crucible/ntoa.h"
#include "crucible/string.h"
@@ -70,18 +71,7 @@ operator<<(ostream &os, const BeesFileRange &bfr)
if (bfr.end() == numeric_limits<off_t>::max()) {
os << "- [" << to_hex(bfr.begin()) << "..eof]";
} else {
os << pretty(bfr.size()) << " ";
if (bfr.begin() != 0) {
os << "[" << to_hex(bfr.begin());
} else {
os << "(";
}
os << ".." << to_hex(bfr.end());
if (!!bfr.m_fd && bfr.end() >= bfr.file_size()) {
os << ")";
} else {
os << "]";
}
os << pretty(bfr.size()) << " [" << to_hex(bfr.begin()) << ".." << to_hex(bfr.end()) << "]";
}
if (bfr.m_fid) {
os << " fid = " << bfr.m_fid;
@@ -102,6 +92,8 @@ operator<<(ostream &os, const BeesRangePair &brp)
<< "\ndst = " << brp.second.fd() << " " << name_fd(brp.second.fd());
}
mutex BeesFileRange::s_mutex;
bool
BeesFileRange::operator<(const BeesFileRange &that) const
{
@@ -153,14 +145,14 @@ off_t
BeesFileRange::file_size() const
{
if (m_file_size <= 0) {
// Use method fd() not member m_fd() so we hold lock
Stat st(fd());
m_file_size = st.st_size;
// These checks could trigger on valid input, but that would mean we have
// lost a race (e.g. a file was truncated while we were building a
// matching range pair with it). In such cases we should probably stop
// whatever we were doing and backtrack to some higher level anyway.
// Well, OK, but we call this function from exception handlers...
THROW_CHECK1(invalid_argument, m_file_size, m_file_size >= 0);
THROW_CHECK1(invalid_argument, m_file_size, m_file_size > 0);
// THROW_CHECK2(invalid_argument, m_file_size, m_end, m_end <= m_file_size || m_end == numeric_limits<off_t>::max());
}
return m_file_size;
@@ -183,42 +175,34 @@ BeesFileRange::grow_begin(off_t delta)
return m_begin;
}
off_t
BeesFileRange::shrink_begin(off_t delta)
{
THROW_CHECK1(invalid_argument, delta, delta > 0);
THROW_CHECK3(invalid_argument, delta, m_begin, m_end, delta + m_begin < m_end);
m_begin += delta;
return m_begin;
}
off_t
BeesFileRange::shrink_end(off_t delta)
{
THROW_CHECK1(invalid_argument, delta, delta > 0);
THROW_CHECK2(invalid_argument, delta, m_end, m_end >= delta);
m_end -= delta;
return m_end;
}
BeesFileRange::BeesFileRange(const BeesBlockData &bbd) :
m_fd(bbd.fd()),
m_begin(bbd.begin()),
m_end(bbd.end())
m_end(bbd.end()),
m_file_size(-1)
{
}
BeesFileRange::BeesFileRange(Fd fd, off_t begin, off_t end) :
m_fd(fd),
m_begin(begin),
m_end(end)
m_end(end),
m_file_size(-1)
{
}
BeesFileRange::BeesFileRange(const BeesFileId &fid, off_t begin, off_t end) :
m_fid(fid),
m_begin(begin),
m_end(end)
m_end(end),
m_file_size(-1)
{
}
BeesFileRange::BeesFileRange() :
m_begin(0),
m_end(0),
m_file_size(-1)
{
}
@@ -256,6 +240,42 @@ BeesFileRange::overlaps(const BeesFileRange &that) const
return false;
}
bool
BeesFileRange::coalesce(const BeesFileRange &that)
{
// Let's define coalesce-with-null as identity,
// and coalesce-null-with-null as coalesced
if (!*this) {
operator=(that);
return true;
}
if (!that) {
return true;
}
// Can't coalesce different files
if (!is_same_file(that)) return false;
pair<uint64_t, uint64_t> a(m_begin, m_end);
pair<uint64_t, uint64_t> b(that.m_begin, that.m_end);
// range a starts lower than or equal b
if (b.first < a.first) {
swap(a, b);
}
// if b starts within a, they overlap
// (and the intersecting region is b.first..min(a.second, b.second))
// (and the union region is a.first..max(a.second, b.second))
if (b.first >= a.first && b.first < a.second) {
m_begin = a.first;
m_end = max(a.second, b.second);
return true;
}
return false;
}
BeesFileRange::operator BeesBlockData() const
{
BEESTRACE("operator BeesBlockData " << *this);
@@ -265,18 +285,22 @@ BeesFileRange::operator BeesBlockData() const
Fd
BeesFileRange::fd() const
{
unique_lock<mutex> lock(s_mutex);
return m_fd;
}
Fd
BeesFileRange::fd(const shared_ptr<BeesContext> &ctx)
BeesFileRange::fd(const shared_ptr<BeesContext> &ctx) const
{
unique_lock<mutex> lock(s_mutex);
// If we don't have a fid we can't do much here
if (m_fid) {
if (!m_fd) {
// If we don't have a fd, open by fid
if (m_fid && ctx) {
lock.unlock();
Fd new_fd = ctx->roots()->open_root_ino(m_fid);
lock.lock();
m_fd = new_fd;
}
} else {
@@ -350,7 +374,6 @@ BeesRangePair::grow(shared_ptr<BeesContext> ctx, bool constrained)
BEESTOOLONG("grow constrained = " << constrained << " *this = " << *this);
BEESTRACE("grow constrained = " << constrained << " *this = " << *this);
bool rv = false;
Timer grow_backward_timer;
THROW_CHECK1(invalid_argument, first.begin(), (first.begin() & BLOCK_MASK_CLONE) == 0);
THROW_CHECK1(invalid_argument, second.begin(), (second.begin() & BLOCK_MASK_CLONE) == 0);
@@ -367,8 +390,8 @@ BeesRangePair::grow(shared_ptr<BeesContext> ctx, bool constrained)
BEESTRACE("e_second " << e_second);
// Preread entire extent
bees_readahead_pair(second.fd(), e_second.begin(), e_second.size(),
first.fd(), e_second.begin() + first.begin() - second.begin(), e_second.size());
posix_fadvise(second.fd(), e_second.begin(), e_second.size(), POSIX_FADV_WILLNEED);
posix_fadvise(first.fd(), e_second.begin() + first.begin() - second.begin(), e_second.size(), POSIX_FADV_WILLNEED);
auto hash_table = ctx->hash_table();
@@ -387,7 +410,7 @@ BeesRangePair::grow(shared_ptr<BeesContext> ctx, bool constrained)
BEESCOUNT(pairbackward_hole);
break;
}
bees_readahead(second.fd(), e_second.begin(), e_second.size());
posix_fadvise(second.fd(), e_second.begin(), e_second.size(), POSIX_FADV_WILLNEED);
#else
// This tends to repeatedly process extents that were recently processed.
// We tend to catch duplicate blocks early since we scan them forwards.
@@ -406,6 +429,17 @@ BeesRangePair::grow(shared_ptr<BeesContext> ctx, bool constrained)
break;
}
// Source extent cannot be toxic
BeesAddress first_addr(first.fd(), new_first.begin());
if (!first_addr.is_magic()) {
auto first_resolved = ctx->resolve_addr(first_addr);
if (first_resolved.is_toxic()) {
BEESLOG("WORKAROUND: not growing matching pair backward because src addr is toxic:\n" << *this);
BEESCOUNT(pairbackward_toxic_addr);
break;
}
}
// Extend second range. If we hit BOF we can go no further.
BeesFileRange new_second = second;
BEESTRACE("new_second = " << new_second);
@@ -441,7 +475,6 @@ BeesRangePair::grow(shared_ptr<BeesContext> ctx, bool constrained)
}
// Source block cannot be zero in a non-compressed non-magic extent
BeesAddress first_addr(first.fd(), new_first.begin());
if (first_bbd.is_data_zero() && !first_addr.is_magic() && !first_addr.is_compressed()) {
BEESCOUNT(pairbackward_zero);
break;
@@ -457,7 +490,7 @@ BeesRangePair::grow(shared_ptr<BeesContext> ctx, bool constrained)
}
}
if (found_toxic) {
BEESLOGDEBUG("WORKAROUND: found toxic hash in " << first_bbd << " while extending backward:\n" << *this);
BEESLOG("WORKAROUND: found toxic hash in " << first_bbd << " while extending backward:\n" << *this);
BEESCOUNT(pairbackward_toxic_hash);
break;
}
@@ -469,11 +502,9 @@ BeesRangePair::grow(shared_ptr<BeesContext> ctx, bool constrained)
BEESCOUNT(pairbackward_hit);
}
BEESCOUNT(pairbackward_stop);
BEESCOUNTADD(pairbackward_ms, grow_backward_timer.age() * 1000);
// Look forward
BEESTRACE("grow_forward " << *this);
Timer grow_forward_timer;
while (first.size() < BLOCK_SIZE_MAX_EXTENT) {
if (second.end() >= e_second.end()) {
if (constrained) {
@@ -486,7 +517,7 @@ BeesRangePair::grow(shared_ptr<BeesContext> ctx, bool constrained)
BEESCOUNT(pairforward_hole);
break;
}
bees_readahead(second.fd(), e_second.begin(), e_second.size());
posix_fadvise(second.fd(), e_second.begin(), e_second.size(), POSIX_FADV_WILLNEED);
}
BEESCOUNT(pairforward_try);
@@ -499,6 +530,17 @@ BeesRangePair::grow(shared_ptr<BeesContext> ctx, bool constrained)
break;
}
// Source extent cannot be toxic
BeesAddress first_addr(first.fd(), new_first.begin());
if (!first_addr.is_magic()) {
auto first_resolved = ctx->resolve_addr(first_addr);
if (first_resolved.is_toxic()) {
BEESLOG("WORKAROUND: not growing matching pair forward because src is toxic:\n" << *this);
BEESCOUNT(pairforward_toxic);
break;
}
}
// Extend second range. If we hit EOF we can go no further.
BeesFileRange new_second = second;
BEESTRACE("new_second = " << new_second);
@@ -542,7 +584,6 @@ BeesRangePair::grow(shared_ptr<BeesContext> ctx, bool constrained)
}
// Source block cannot be zero in a non-compressed non-magic extent
BeesAddress first_addr(first.fd(), new_first.begin());
if (first_bbd.is_data_zero() && !first_addr.is_magic() && !first_addr.is_compressed()) {
BEESCOUNT(pairforward_zero);
break;
@@ -558,7 +599,7 @@ BeesRangePair::grow(shared_ptr<BeesContext> ctx, bool constrained)
}
}
if (found_toxic) {
BEESLOGDEBUG("WORKAROUND: found toxic hash in " << first_bbd << " while extending forward:\n" << *this);
BEESLOG("WORKAROUND: found toxic hash in " << first_bbd << " while extending forward:\n" << *this);
BEESCOUNT(pairforward_toxic_hash);
break;
}
@@ -572,12 +613,11 @@ BeesRangePair::grow(shared_ptr<BeesContext> ctx, bool constrained)
}
if (first.overlaps(second)) {
BEESLOGDEBUG("after grow, first " << first << "\n\toverlaps " << second);
BEESLOGTRACE("after grow, first " << first << "\n\toverlaps " << second);
BEESCOUNT(bug_grow_pair_overlaps);
}
BEESCOUNT(pairforward_stop);
BEESCOUNTADD(pairforward_ms, grow_forward_timer.age() * 1000);
return rv;
}
@@ -587,22 +627,6 @@ BeesRangePair::copy_closed() const
return BeesRangePair(first.copy_closed(), second.copy_closed());
}
void
BeesRangePair::shrink_begin(off_t const delta)
{
first.shrink_begin(delta);
second.shrink_begin(delta);
THROW_CHECK2(runtime_error, first.size(), second.size(), first.size() == second.size());
}
void
BeesRangePair::shrink_end(off_t const delta)
{
first.shrink_end(delta);
second.shrink_end(delta);
THROW_CHECK2(runtime_error, first.size(), second.size(), first.size() == second.size());
}
ostream &
operator<<(ostream &os, const BeesAddress &ba)
{
@@ -674,7 +698,7 @@ BeesAddress::magic_check(uint64_t flags)
static const unsigned recognized_flags = compressed_flags | delalloc_flags | ignore_flags | unusable_flags;
if (flags & ~recognized_flags) {
BEESLOGNOTICE("Unrecognized flags in " << fiemap_extent_flags_ntoa(flags));
BEESLOGTRACE("Unrecognized flags in " << fiemap_extent_flags_ntoa(flags));
m_addr = UNUSABLE;
// maybe we throw here?
BEESCOUNT(addr_unrecognized);
@@ -854,9 +878,6 @@ operator<<(ostream &os, const BeesBlockData &bbd)
os << ", hash = " << bbd.m_hash;
}
if (!bbd.m_data.empty()) {
// Turn this on to debug BeesBlockData, but leave it off otherwise.
// It's a massive data leak that is only interesting to developers.
#if 0
os << ", data[" << bbd.m_data.size() << "] = '";
size_t max_print = 12;
@@ -873,9 +894,6 @@ operator<<(ostream &os, const BeesBlockData &bbd)
}
}
os << "...'";
#else
os << ", data[" << bbd.m_data.size() << "]";
#endif
}
return os << " }";
}
@@ -918,13 +936,12 @@ BeesBlockData::data() const
{
if (m_data.empty()) {
THROW_CHECK1(invalid_argument, size(), size() > 0);
BEESNOTE("Reading BeesBlockData " << *this);
BEESTOOLONG("Reading BeesBlockData " << *this);
Timer read_timer;
Blob rv(size());
Blob rv(m_length);
pread_or_die(m_fd, rv, m_offset);
THROW_CHECK2(runtime_error, rv.size(), size(), ranged_cast<off_t>(rv.size()) == size());
THROW_CHECK2(runtime_error, rv.size(), m_length, ranged_cast<off_t>(rv.size()) == m_length);
m_data = rv;
BEESCOUNT(block_read);
BEESCOUNTADD(block_bytes, rv.size());
@@ -938,10 +955,14 @@ BeesHash
BeesBlockData::hash() const
{
if (!m_hash_done) {
// We can only dedupe unaligned EOF blocks against other unaligned EOF blocks,
// We can only dedup unaligned EOF blocks against other unaligned EOF blocks,
// so we do NOT round up to a full sum block size.
const Blob &blob = data();
m_hash = BeesHash(blob.data(), blob.size());
// TODO: It turns out that file formats with 4K block
// alignment and embedded CRC64 do exist, and every block
// of such files has the same hash. Could use a subset
// of SHA1 here instead.
m_hash = Digest::CRC::crc64(blob.data(), blob.size());
m_hash_done = true;
BEESCOUNT(block_hash);
}
@@ -953,8 +974,9 @@ bool
BeesBlockData::is_data_zero() const
{
// The CRC64 of zero is zero, so skip some work if we already know the CRC
// ...but that doesn't work for any other hash function, and it
// saves us next to nothing.
if (m_hash_done && m_hash != 0) {
return false;
}
// OK read block (maybe) and check every byte
for (auto c : data()) {

View File

@@ -1,36 +0,0 @@
Usage: %s [options] fs-root-path
Performs best-effort extent-same deduplication on btrfs.
fs-root-path MUST be the root of a btrfs filesystem tree (subvol id 5).
Other directories will be rejected.
Options:
-h, --help Show this help
Load management options:
-c, --thread-count Worker thread count (default CPU count * factor)
-C, --thread-factor Worker thread factor (default 1)
-G, --thread-min Minimum worker thread count (default 0)
-g, --loadavg-target Target load average for worker threads (default none)
--throttle-factor Idle time between operations (default 1.0)
Filesystem tree traversal options:
-m, --scan-mode Scanning mode (0..4, default 4)
Workarounds:
-a, --workaround-btrfs-send Workaround for btrfs send
(ignore RO snapshots)
Logging options:
-t, --timestamps Show timestamps in log output (default)
-T, --no-timestamps Omit timestamps in log output
-p, --absolute-paths Show absolute paths (default)
-P, --strip-paths Strip $CWD from beginning of all paths in the log
-v, --verbose Set maximum log level (0..8, default 8)
Optional environment variables:
BEESHOME Path to hash table and configuration files
(default is .beeshome/ in the root of the filesystem).
BEESSTATUS File to write status to (tmpfs recommended, e.g. /run).
No status is written if this variable is unset.

File diff suppressed because it is too large Load Diff

View File

@@ -1,7 +1,7 @@
#ifndef BEES_H
#define BEES_H
#include "crucible/btrfs-tree.h"
#include "crucible/bool.h"
#include "crucible/cache.h"
#include "crucible/chatter.h"
#include "crucible/error.h"
@@ -9,21 +9,18 @@
#include "crucible/fd.h"
#include "crucible/fs.h"
#include "crucible/lockset.h"
#include "crucible/multilock.h"
#include "crucible/pool.h"
#include "crucible/progress.h"
#include "crucible/time.h"
#include "crucible/task.h"
#include "crucible/timequeue.h"
#include "crucible/workqueue.h"
#include <array>
#include <functional>
#include <list>
#include <mutex>
#include <string>
#include <random>
#include <thread>
#include <endian.h>
#include <syslog.h>
using namespace crucible;
using namespace std;
@@ -31,7 +28,7 @@ using namespace std;
// Block size for clone alignment (FIXME: should read this from /sys/fs/btrfs/<FS-UUID>/clone_alignment)
const off_t BLOCK_SIZE_CLONE = 4096;
// Block size for dedupe checksums (arbitrary, but must be a multiple of clone alignment)
// Block size for dedup checksums (arbitrary, but must be a multiple of clone alignment)
const off_t BLOCK_SIZE_SUMS = 4096;
// Block size for memory allocations and file mappings (FIXME: should be CPU page size)
@@ -43,6 +40,13 @@ const off_t BLOCK_SIZE_MAX_EXTENT_SAME = 4096 * 4096;
// Maximum length of a compressed extent in bytes
const off_t BLOCK_SIZE_MAX_COMPRESSED_EXTENT = 128 * 1024;
// Try to combine smaller extents into larger ones
const off_t BLOCK_SIZE_MIN_EXTENT_DEFRAG = BLOCK_SIZE_MAX_COMPRESSED_EXTENT;
// Avoid splitting extents that are already too small
const off_t BLOCK_SIZE_MIN_EXTENT_SPLIT = BLOCK_SIZE_MAX_COMPRESSED_EXTENT;
// const off_t BLOCK_SIZE_MIN_EXTENT_SPLIT = 1024LL * 1024 * 1024 * 1024;
// Maximum length of any extent in bytes
// except we've seen 1.03G extents...
// ...FIEMAP is slow and full of lies
@@ -51,62 +55,59 @@ const off_t BLOCK_SIZE_MAX_EXTENT = 128 * 1024 * 1024;
// Masks, so we don't have to write "(BLOCK_SIZE_CLONE - 1)" everywhere
const off_t BLOCK_MASK_CLONE = BLOCK_SIZE_CLONE - 1;
const off_t BLOCK_MASK_SUMS = BLOCK_SIZE_SUMS - 1;
const off_t BLOCK_MASK_MMAP = BLOCK_SIZE_MMAP - 1;
const off_t BLOCK_MASK_MAX_COMPRESSED_EXTENT = BLOCK_SIZE_MAX_COMPRESSED_EXTENT * 2 - 1;
// Maximum temporary file size (maximum extent size for temporary copy)
// Maximum temporary file size
const off_t BLOCK_SIZE_MAX_TEMP_FILE = 1024 * 1024 * 1024;
// Bucket size for hash table (size of one hash bucket)
const off_t BLOCK_SIZE_HASHTAB_BUCKET = BLOCK_SIZE_MMAP;
// Extent size for hash table (since the nocow file attribute does not seem to be working today)
const off_t BLOCK_SIZE_HASHTAB_EXTENT = BLOCK_SIZE_MAX_COMPRESSED_EXTENT;
const off_t BLOCK_SIZE_HASHTAB_EXTENT = 16 * 1024 * 1024;
// Bytes per second we want to flush from hash table
// Optimistic sustained write rate for SD cards
const double BEES_FLUSH_RATE = 128 * 1024;
// Bytes per second we want to flush (8GB every two hours)
const double BEES_FLUSH_RATE = 8.0 * 1024 * 1024 * 1024 / 7200.0;
// Interval between writing crawl state to disk
// Interval between writing non-hash-table things to disk (15 minutes)
const int BEES_WRITEBACK_INTERVAL = 900;
// Statistics reports while scanning
const int BEES_STATS_INTERVAL = 3600;
// Progress shows instantaneous rates and thread status
const int BEES_PROGRESS_INTERVAL = BEES_STATS_INTERVAL;
const int BEES_PROGRESS_INTERVAL = 3600;
// Status is output every freakin second. Use a ramdisk.
const int BEES_STATUS_INTERVAL = 1;
// Number of file FDs to cache when not in active use
const size_t BEES_FILE_FD_CACHE_SIZE = 524288;
// Number of root FDs to cache when not in active use
const size_t BEES_ROOT_FD_CACHE_SIZE = 65536;
// Number of FDs to open (rlimit)
const size_t BEES_OPEN_FILE_LIMIT = BEES_FILE_FD_CACHE_SIZE + BEES_ROOT_FD_CACHE_SIZE + 100;
// Worker thread factor (multiplied by detected number of CPU cores)
const double BEES_DEFAULT_THREAD_FACTOR = 1.0;
// Log warnings when an operation takes too long
const double BEES_TOO_LONG = 5.0;
const double BEES_TOO_LONG = 2.5;
// Avoid any extent where LOGICAL_INO takes this much kernel CPU time
const double BEES_TOXIC_SYS_DURATION = 5.0;
// Avoid any extent where LOGICAL_INO takes this long
const double BEES_TOXIC_DURATION = 9.9;
// Maximum number of refs to a single extent before we have other problems
// If we have more than 10K refs to an extent, adding another will save 0.01% space
const size_t BEES_MAX_EXTENT_REF_COUNT = 9999; // (16 * 1024 * 1024 / 24);
// How long we should wait for new btrfs transactions
const double BEES_COMMIT_INTERVAL = 900;
// How long between hash table histograms
const double BEES_HASH_TABLE_ANALYZE_INTERVAL = BEES_STATS_INTERVAL;
const double BEES_HASH_TABLE_ANALYZE_INTERVAL = 3600;
// Wait at least this long for a new transid
const double BEES_TRANSID_POLL_INTERVAL = 30.0;
// Rate limiting of informational messages
const double BEES_INFO_RATE = 10.0;
const double BEES_INFO_BURST = 1.0;
// Workaround for silly dedupe / ineffective readahead behavior
const size_t BEES_READAHEAD_SIZE = 1024 * 1024;
// After we have this many events queued, wait
const size_t BEES_MAX_QUEUE_SIZE = 1024;
// Read this many items at a time in SEARCHv2
const size_t BEES_MAX_CRAWL_SIZE = 4096;
// If an extent has this many refs, pretend it does not exist
// to avoid a crippling btrfs performance bug
// The actual limit in LOGICAL_INO seems to be 2730, but let's leave a little headroom
const size_t BEES_MAX_EXTENT_REF_COUNT = 2560;
// Flags
const int FLAGS_OPEN_COMMON = O_NOFOLLOW | O_NONBLOCK | O_CLOEXEC | O_NOATIME | O_LARGEFILE | O_NOCTTY;
@@ -121,26 +122,19 @@ const int FLAGS_OPEN_FANOTIFY = O_RDWR | O_NOATIME | O_CLOEXEC | O_LARGEFILE;
// macros ----------------------------------------
#define BEESLOG(lv,x) do { if (lv < bees_log_level) { Chatter __chatter(lv, BeesNote::get_name()); __chatter << x; } } while (0)
#define BEESLOG(x) do { Chatter c(BeesNote::get_name()); c << x; } while (0)
#define BEESLOGTRACE(x) do { BEESLOG(x); BeesTracer::trace_now(); } while (0)
#define BEES_TRACE_LEVEL LOG_DEBUG
#define BEESTRACE(x) BeesTracer SRSLY_WTF_C(beesTracer_, __LINE__) ([&]() { BEESLOG(BEES_TRACE_LEVEL, "TRACE: " << x << " at " << __FILE__ << ":" << __LINE__); })
#define BEESTRACE(x) BeesTracer SRSLY_WTF_C(beesTracer_, __LINE__) ([&]() { BEESLOG(x); })
#define BEESTOOLONG(x) BeesTooLong SRSLY_WTF_C(beesTooLong_, __LINE__) ([&](ostream &_btl_os) { _btl_os << x; })
#define BEESNOTE(x) BeesNote SRSLY_WTF_C(beesNote_, __LINE__) ([&](ostream &_btl_os) { _btl_os << x; })
#define BEESLOGERR(x) BEESLOG(LOG_ERR, x)
#define BEESLOGWARN(x) BEESLOG(LOG_WARNING, x)
#define BEESLOGNOTICE(x) BEESLOG(LOG_NOTICE, x)
#define BEESLOGINFO(x) BEESLOG(LOG_INFO, x)
#define BEESLOGDEBUG(x) BEESLOG(LOG_DEBUG, x)
#define BEESLOGONCE(__x) do { \
static bool already_logged = false; \
if (!already_logged) { \
already_logged = true; \
BEESLOGNOTICE(__x); \
} \
} while (false)
#define BEESINFO(x) do { \
if (bees_info_rate_limit.is_ready()) { \
bees_info_rate_limit.borrow(1); \
Chatter c(BeesNote::get_name()); \
c << x; \
} \
} while (0)
#define BEESCOUNT(stat) do { \
BeesStats::s_global.add_count(#stat); \
@@ -160,16 +154,16 @@ class BeesStatTmpl {
map<string, T> m_stats_map;
mutable mutex m_mutex;
T& at(string idx);
public:
BeesStatTmpl() = default;
BeesStatTmpl(const BeesStatTmpl &that);
BeesStatTmpl &operator=(const BeesStatTmpl &that);
void add_count(string idx, size_t amount = 1);
T& at(string idx);
T at(string idx) const;
friend ostream& operator<< <>(ostream &os, const BeesStatTmpl<T> &bs);
friend struct BeesStats;
friend class BeesStats;
};
using BeesRates = BeesStatTmpl<double>;
@@ -188,16 +182,12 @@ class BeesBlockData;
class BeesTracer {
function<void()> m_func;
BeesTracer *m_next_tracer = 0;
thread_local static BeesTracer *tl_next_tracer;
thread_local static bool tl_silent;
thread_local static bool tl_first;
thread_local static BeesTracer *s_next_tracer;
public:
BeesTracer(const function<void()> &f, bool silent = false);
BeesTracer(function<void()> f);
~BeesTracer();
static void trace_now();
static bool get_silent();
static void set_silent();
};
class BeesNote {
@@ -209,8 +199,8 @@ class BeesNote {
static mutex s_mutex;
static map<pid_t, BeesNote*> s_status;
thread_local static BeesNote *tl_next;
thread_local static string tl_name;
thread_local static BeesNote *s_next;
thread_local static string s_name;
public:
BeesNote(function<void(ostream &)> f);
@@ -260,14 +250,15 @@ ostream& operator<<(ostream &os, const BeesFileId &bfi);
class BeesFileRange {
protected:
Fd m_fd;
static mutex s_mutex;
mutable Fd m_fd;
mutable BeesFileId m_fid;
off_t m_begin = 0, m_end = 0;
mutable off_t m_file_size = -1;
off_t m_begin, m_end;
mutable off_t m_file_size;
public:
BeesFileRange() = default;
BeesFileRange();
BeesFileRange(Fd fd, off_t begin, off_t end);
BeesFileRange(const BeesFileId &fid, off_t begin, off_t end);
BeesFileRange(const BeesBlockData &bbd);
@@ -282,36 +273,35 @@ public:
bool is_same_file(const BeesFileRange &that) const;
bool overlaps(const BeesFileRange &that) const;
// If file ranges overlap, extends this to include that.
// Coalesce with empty bfr = non-empty bfr
bool coalesce(const BeesFileRange &that);
// Remove that from this, creating 0, 1, or 2 new objects
pair<BeesFileRange, BeesFileRange> subtract(const BeesFileRange &that) const;
off_t begin() const { return m_begin; }
off_t end() const { return m_end; }
off_t size() const;
/// @{ Lazy accessors
// Lazy accessors
off_t file_size() const;
BeesFileId fid() const;
/// @}
/// Get the fd if there is one
// Get the fd if there is one
Fd fd() const;
/// Get the fd, opening it if necessary
Fd fd(const shared_ptr<BeesContext> &ctx);
// Get the fd, opening it if necessary
Fd fd(const shared_ptr<BeesContext> &ctx) const;
/// Copy the BeesFileId but not the Fd
BeesFileRange copy_closed() const;
/// Is it defined?
// Is it defined?
operator bool() const { return !!m_fd || m_fid; }
/// @{ Make range larger
// Make range larger
off_t grow_end(off_t delta);
off_t grow_begin(off_t delta);
/// @}
/// @{ Make range smaller
off_t shrink_end(off_t delta);
off_t shrink_begin(off_t delta);
/// @}
friend ostream & operator<<(ostream &os, const BeesFileRange &bfr);
};
@@ -326,6 +316,7 @@ public:
// Blocks with no physical address (not yet allocated, hole, or "other").
// PREALLOC blocks have a physical address so they're not magic enough to be handled here.
// Compressed blocks have a physical address but it's two-dimensional.
enum MagicValue {
ZERO, // BeesAddress uninitialized
DELALLOC, // delayed allocation
@@ -337,7 +328,6 @@ public:
BeesAddress(Type addr = ZERO) : m_addr(addr) {}
BeesAddress(MagicValue addr) : m_addr(addr) {}
BeesAddress& operator=(const BeesAddress &that) = default;
BeesAddress(const BeesAddress &that) = default;
operator Type() const { return m_addr; }
bool operator==(const BeesAddress &that) const;
bool operator==(const MagicValue that) const { return *this == BeesAddress(that); }
@@ -381,11 +371,9 @@ class BeesStringFile {
size_t m_limit;
public:
BeesStringFile(Fd dir_fd, string name, size_t limit = 16 * 1024 * 1024);
BeesStringFile(Fd dir_fd, string name, size_t limit = 1024 * 1024);
string read();
void write(string contents);
void name(const string &new_name);
string name() const;
};
class BeesHashTable {
@@ -398,7 +386,6 @@ public:
HashType e_hash;
AddrType e_addr;
Cell(const Cell &) = default;
Cell &operator=(const Cell &) = default;
Cell(HashType hash, AddrType addr) : e_hash(hash), e_addr(addr) { }
bool operator==(const Cell &e) const { return tie(e_hash, e_addr) == tie(e.e_hash, e.e_addr); }
bool operator!=(const Cell &e) const { return tie(e_hash, e_addr) != tie(e.e_hash, e.e_addr); }
@@ -420,17 +407,15 @@ public:
uint8_t p_byte[BLOCK_SIZE_HASHTAB_EXTENT];
} __attribute__((packed));
BeesHashTable(shared_ptr<BeesContext> ctx, string filename, off_t size = BLOCK_SIZE_HASHTAB_EXTENT);
BeesHashTable(shared_ptr<BeesContext> ctx, string filename);
~BeesHashTable();
void stop_request();
void stop_wait();
vector<Cell> find_cell(HashType hash);
bool push_random_hash_addr(HashType hash, AddrType addr);
void erase_hash_addr(HashType hash, AddrType addr);
bool push_front_hash_addr(HashType hash, AddrType addr);
bool flush_dirty_extent(uint64_t extent_index);
void set_shared(bool shared);
private:
string m_filename;
@@ -453,52 +438,36 @@ private:
uint64_t m_buckets;
uint64_t m_extents;
uint64_t m_cells;
set<uint64_t> m_buckets_dirty;
set<uint64_t> m_buckets_missing;
BeesThread m_writeback_thread;
BeesThread m_prefetch_thread;
RateLimiter m_flush_rate_limit;
RateLimiter m_prefetch_rate_limit;
mutex m_extent_mutex;
mutex m_bucket_mutex;
condition_variable m_condvar;
set<HashType> m_toxic_hashes;
BeesStringFile m_stats_file;
// Prefetch readahead hint
bool m_prefetch_running = false;
LockSet<uint64_t> m_extent_lock_set;
// Mutex/condvar for the writeback thread
mutex m_dirty_mutex;
condition_variable m_dirty_condvar;
bool m_dirty = false;
DefaultBool m_shared;
// Mutex/condvar to stop
mutex m_stop_mutex;
condition_variable m_stop_condvar;
bool m_stop_requested = false;
// Per-extent structures
struct ExtentMetaData {
shared_ptr<mutex> m_mutex_ptr; // Access serializer
bool m_dirty = false; // Needs to be written back to disk
bool m_missing = true; // Needs to be read from disk
ExtentMetaData();
};
vector<ExtentMetaData> m_extent_metadata;
void open_file();
void writeback_loop();
void prefetch_loop();
void try_mmap_flags(int flags);
pair<Cell *, Cell *> get_cell_range(HashType hash);
pair<uint8_t *, uint8_t *> get_extent_range(HashType hash);
void fetch_missing_extent_by_hash(HashType hash);
void fetch_missing_extent_by_index(uint64_t extent_index);
void set_extent_dirty_locked(uint64_t extent_index);
size_t flush_dirty_extents(bool slowly);
void fetch_missing_extent(HashType hash);
void set_extent_dirty(HashType hash);
void flush_dirty_extents();
bool is_toxic_hash(HashType h) const;
size_t hash_to_extent_index(HashType ht);
unique_lock<mutex> lock_extent_by_hash(HashType ht);
unique_lock<mutex> lock_extent_by_index(uint64_t extent_index);
bool using_shared_map() const { return false; }
BeesHashTable(const BeesHashTable &) = delete;
BeesHashTable &operator=(const BeesHashTable &) = delete;
static thread_local uniform_int_distribution<size_t> tl_distribution;
};
ostream &operator<<(ostream &os, const BeesHashTable::Cell &bhte);
@@ -518,115 +487,63 @@ class BeesCrawl {
shared_ptr<BeesContext> m_ctx;
mutex m_mutex;
BtrfsTreeItem m_next_extent_data;
bool m_deferred = false;
bool m_finished = false;
set<BeesFileRange> m_extents;
DefaultBool m_deferred;
mutex m_state_mutex;
ProgressTracker<BeesCrawlState> m_state;
BtrfsTreeObjectFetcher m_btof;
BeesCrawlState m_state;
bool fetch_extents();
void fetch_extents_harder();
bool restart_crawl_unlocked();
BeesFileRange bti_to_bfr(const BtrfsTreeItem &bti) const;
bool next_transid();
public:
BeesCrawl(shared_ptr<BeesContext> ctx, BeesCrawlState initial_state);
BeesFileRange peek_front();
BeesFileRange pop_front();
ProgressTracker<BeesCrawlState>::ProgressHolder hold_state(const BeesCrawlState &bcs);
BeesCrawlState get_state_begin();
BeesCrawlState get_state_end() const;
BeesCrawlState get_state();
void set_state(const BeesCrawlState &bcs);
void deferred(bool def_setting);
bool deferred() const;
bool finished() const;
bool restart_crawl();
};
class BeesScanMode;
class BeesRoots : public enable_shared_from_this<BeesRoots> {
class BeesRoots {
shared_ptr<BeesContext> m_ctx;
BeesStringFile m_crawl_state_file;
using CrawlMap = map<uint64_t, shared_ptr<BeesCrawl>>;
CrawlMap m_root_crawl_map;
BeesCrawlState m_crawl_current;
map<uint64_t, shared_ptr<BeesCrawl>> m_root_crawl_map;
mutex m_mutex;
uint64_t m_crawl_dirty = 0;
uint64_t m_crawl_clean = 0;
condition_variable m_condvar;
DefaultBool m_crawl_dirty;
Timer m_crawl_timer;
BeesThread m_crawl_thread;
BeesThread m_writeback_thread;
RateEstimator m_transid_re;
bool m_workaround_btrfs_send = false;
shared_ptr<BeesScanMode> m_scanner;
mutex m_tmpfiles_mutex;
map<BeesFileId, Fd> m_tmpfiles;
mutex m_stop_mutex;
condition_variable m_stop_condvar;
bool m_stop_requested = false;
CrawlMap insert_new_crawl();
void insert_new_crawl();
void insert_root(const BeesCrawlState &bcs);
Fd open_root_nocache(uint64_t root);
Fd open_root_ino_nocache(uint64_t root, uint64_t ino);
uint64_t transid_max_nocache();
uint64_t transid_min();
uint64_t transid_max();
void state_load();
ostream &state_to_stream(ostream &os);
void state_save();
void crawl_roots();
string crawl_state_filename() const;
BeesCrawlState crawl_state_get(uint64_t root);
void crawl_state_set_dirty();
void crawl_state_erase(const BeesCrawlState &bcs);
void crawl_thread();
void writeback_thread();
uint64_t next_root(uint64_t root = 0);
void current_state_set(const BeesCrawlState &bcs);
bool crawl_batch(shared_ptr<BeesCrawl> crawl);
void clear_caches();
shared_ptr<BeesCrawl> insert_root(const BeesCrawlState &bcs);
bool up_to_date(const BeesCrawlState &bcs);
friend class BeesCrawl;
friend class BeesFdCache;
friend class BeesScanMode;
friend class BeesScanModeSubvol;
friend class BeesScanModeExtent;
friend class BeesCrawl;
public:
BeesRoots(shared_ptr<BeesContext> ctx);
void start();
void stop_request();
void stop_wait();
void insert_tmpfile(Fd fd);
void erase_tmpfile(Fd fd);
Fd open_root(uint64_t root);
Fd open_root_ino(uint64_t root, uint64_t ino);
Fd open_root_ino(const BeesFileId &bfi) { return open_root_ino(bfi.root(), bfi.ino()); }
bool is_root_ro(uint64_t root);
enum ScanMode {
SCAN_MODE_LOCKSTEP,
SCAN_MODE_INDEPENDENT,
SCAN_MODE_SEQUENTIAL,
SCAN_MODE_RECENT,
SCAN_MODE_EXTENT,
SCAN_MODE_COUNT, // must be last
};
void set_scan_mode(ScanMode new_mode);
void set_workaround_btrfs_send(bool do_avoid);
uint64_t transid_min();
uint64_t transid_max();
void wait_for_transid(const uint64_t count);
};
struct BeesHash {
@@ -636,16 +553,15 @@ struct BeesHash {
BeesHash(Type that) : m_hash(that) { }
operator Type() const { return m_hash; }
BeesHash& operator=(const Type that) { m_hash = that; return *this; }
BeesHash(const uint8_t *ptr, size_t len);
private:
Type m_hash;
};
ostream & operator<<(ostream &os, const BeesHash &bh);
class BeesBlockData {
using Blob = ByteVector;
using Blob = vector<char>;
mutable Fd m_fd;
off_t m_offset;
@@ -653,7 +569,7 @@ class BeesBlockData {
mutable BeesAddress m_addr;
mutable Blob m_data;
mutable BeesHash m_hash;
mutable bool m_hash_done = false;
mutable DefaultBool m_hash_done;
public:
// Constructor with the immutable fields
@@ -686,125 +602,135 @@ class BeesRangePair : public pair<BeesFileRange, BeesFileRange> {
public:
BeesRangePair(const BeesFileRange &src, const BeesFileRange &dst);
bool grow(shared_ptr<BeesContext> ctx, bool constrained);
void shrink_begin(const off_t delta);
void shrink_end(const off_t delta);
BeesRangePair copy_closed() const;
bool operator<(const BeesRangePair &that) const;
friend ostream & operator<<(ostream &os, const BeesRangePair &brp);
};
class BeesWorkQueueBase {
string m_name;
protected:
static mutex s_mutex;
static set<BeesWorkQueueBase *> s_all_workers;
public:
virtual ~BeesWorkQueueBase();
BeesWorkQueueBase(const string &name);
string name() const;
void name(const string &new_name);
virtual size_t active_size() const = 0;
virtual list<string> peek_active(size_t count) const = 0;
static void for_each_work_queue(function<void(BeesWorkQueueBase *)> f);
};
template <class Task>
class BeesWorkQueue : public BeesWorkQueueBase {
WorkQueue<Task> m_active_queue;
public:
BeesWorkQueue(const string &name);
~BeesWorkQueue();
void push_active(const Task &task, size_t limit);
void push_active(const Task &task);
size_t active_size() const override;
list<string> peek_active(size_t count) const override;
Task pop();
};
class BeesTempFile {
shared_ptr<BeesContext> m_ctx;
shared_ptr<BeesRoots> m_roots;
Fd m_fd;
off_t m_end_offset;
void create();
void realign();
void resize(off_t new_end_offset);
public:
~BeesTempFile();
BeesTempFile(shared_ptr<BeesContext> ctx);
BeesFileRange make_hole(off_t count);
BeesFileRange make_copy(const BeesFileRange &src);
void reset();
};
class BeesFdCache {
shared_ptr<BeesContext> m_ctx;
LRUCache<Fd, uint64_t> m_root_cache;
LRUCache<Fd, uint64_t, uint64_t> m_file_cache;
Timer m_root_cache_timer;
Timer m_file_cache_timer;
LRUCache<Fd, shared_ptr<BeesContext>, uint64_t> m_root_cache;
LRUCache<Fd, shared_ptr<BeesContext>, uint64_t, uint64_t> m_file_cache;
Timer m_root_cache_timer;
public:
BeesFdCache(shared_ptr<BeesContext> ctx);
Fd open_root(uint64_t root);
Fd open_root_ino(uint64_t root, uint64_t ino);
void clear();
BeesFdCache();
Fd open_root(shared_ptr<BeesContext> ctx, uint64_t root);
Fd open_root_ino(shared_ptr<BeesContext> ctx, uint64_t root, uint64_t ino);
void insert_root_ino(shared_ptr<BeesContext> ctx, Fd fd);
};
struct BeesResolveAddrResult {
BeesResolveAddrResult();
vector<BtrfsInodeOffsetRoot> m_biors;
bool m_is_toxic = false;
DefaultBool m_is_toxic;
bool is_toxic() const { return m_is_toxic; }
};
class BeesContext : public enable_shared_from_this<BeesContext> {
shared_ptr<BeesContext> m_parent_ctx;
Fd m_home_fd;
shared_ptr<BeesFdCache> m_fd_cache;
shared_ptr<BeesHashTable> m_hash_table;
shared_ptr<BeesRoots> m_roots;
Pool<BeesTempFile> m_tmpfile_pool;
Pool<BtrfsIoctlLogicalInoArgs> m_logical_ino_pool;
map<thread::id, shared_ptr<BeesTempFile>> m_tmpfiles;
LRUCache<BeesResolveAddrResult, BeesAddress> m_resolve_cache;
string m_root_path;
Fd m_root_fd;
string m_root_uuid;
mutable mutex m_blacklist_mutex;
set<BeesFileId> m_blacklist;
string m_uuid;
Timer m_total_timer;
NamedPtr<Exclusion, uint64_t> m_extent_locks;
NamedPtr<Exclusion, uint64_t> m_inode_locks;
mutable mutex m_stop_mutex;
condition_variable m_stop_condvar;
bool m_stop_requested = false;
bool m_stop_status = false;
shared_ptr<BeesThread> m_progress_thread;
shared_ptr<BeesThread> m_status_thread;
mutex m_progress_mtx;
string m_progress_str;
void set_root_fd(Fd fd);
BeesResolveAddrResult resolve_addr_uncached(BeesAddress addr);
void scan_one_extent(const BeesFileRange &bfr, const Extent &e);
BeesFileRange scan_one_extent(const BeesFileRange &bfr, const Extent &e);
void rewrite_file_range(const BeesFileRange &bfr);
public:
BeesContext(shared_ptr<BeesContext> parent_ctx = nullptr);
void set_root_path(string path);
Fd root_fd() const { return m_root_fd; }
Fd home_fd();
Fd home_fd() const { return m_home_fd; }
string root_path() const { return m_root_path; }
string root_uuid() const { return m_root_uuid; }
bool scan_forward(const BeesFileRange &bfr);
BeesFileRange scan_forward(const BeesFileRange &bfr);
shared_ptr<BtrfsIoctlLogicalInoArgs> logical_ino(uint64_t bytenr, bool all_refs);
bool is_root_ro(uint64_t root);
BeesRangePair dup_extent(const BeesFileRange &src, const shared_ptr<BeesTempFile> &tmpfile);
BeesRangePair dup_extent(const BeesFileRange &src);
bool dedup(const BeesRangePair &brp);
void blacklist_insert(const BeesFileId &fid);
void blacklist_erase(const BeesFileId &fid);
void blacklist_add(const BeesFileId &fid);
bool is_blacklisted(const BeesFileId &fid) const;
shared_ptr<Exclusion> get_inode_mutex(uint64_t inode);
BeesResolveAddrResult resolve_addr(BeesAddress addr);
void invalidate_addr(BeesAddress addr);
void resolve_cache_clear();
void dump_status();
void show_progress();
void set_progress(const string &str);
string get_progress();
void start();
void stop();
bool stop_requested() const;
shared_ptr<BeesFdCache> fd_cache();
shared_ptr<BeesHashTable> hash_table();
@@ -812,6 +738,9 @@ public:
shared_ptr<BeesTempFile> tmpfile();
const Timer &total_timer() const { return m_total_timer; }
// TODO: move the rest of the FD cache methods here
void insert_root_ino(Fd fd);
};
class BeesResolver {
@@ -819,25 +748,25 @@ class BeesResolver {
BeesAddress m_addr;
vector<BtrfsInodeOffsetRoot> m_biors;
set<BeesFileRange> m_ranges;
size_t m_bior_count;
unsigned m_bior_count;
// We found matching data, so we can dedupe
bool m_found_data = false;
// We found matching data, so we can dedup
DefaultBool m_found_data;
// We found matching data, so we *did* dedupe
bool m_found_dup = false;
// We found matching data, so we *did* dedup
DefaultBool m_found_dup;
// We found matching hash, so the hash table is still correct
bool m_found_hash = false;
DefaultBool m_found_hash;
// We found matching physical address, so the hash table isn't totally wrong
bool m_found_addr = false;
DefaultBool m_found_addr;
// We found matching physical address, but data did not match
bool m_wrong_data = false;
DefaultBool m_wrong_data;
// The whole thing is a placebo to avoid crippling btrfs performance bugs
bool m_is_toxic = false;
DefaultBool m_is_toxic;
BeesFileRange chase_extent_ref(const BtrfsInodeOffsetRoot &bior, BeesBlockData &needle_bbd);
BeesBlockData adjust_offset(const BeesFileRange &haystack, const BeesBlockData &needle);
@@ -863,7 +792,7 @@ public:
BeesFileRange find_one_match(BeesHash hash);
void replace_src(const BeesFileRange &src_bfr);
BeesRangePair replace_dst(const BeesFileRange &dst_bfr);
BeesFileRange replace_dst(const BeesFileRange &dst_bfr);
bool found_addr() const { return m_found_addr; }
bool found_data() const { return m_found_data; }
@@ -891,16 +820,9 @@ public:
};
// And now, a giant pile of extern declarations
extern int bees_log_level;
extern const char *BEES_USAGE;
extern const char *BEES_VERSION;
extern thread_local default_random_engine bees_generator;
string pretty(double d);
void bees_readahead(int fd, off_t offset, size_t size);
void bees_readahead_pair(int fd, off_t offset, size_t size, int fd2, off_t offset2, size_t size2);
void bees_unreadahead(int fd, off_t offset, size_t size);
void bees_throttle(double time_used, const char *context);
extern RateLimiter bees_info_rate_limit;
void bees_sync(int fd);
string format_time(time_t t);
bool exception_check();
#endif

91
src/fanotify-watch.cc Normal file
View File

@@ -0,0 +1,91 @@
#include <crucible/error.h>
#include <crucible/fd.h>
#include <crucible/ntoa.h>
#include <iostream>
#include <iomanip>
#include <sstream>
#include <string>
#include <unistd.h>
#include <sys/fanotify.h>
using namespace crucible;
using namespace std;
// Print a short usage summary for this tool to stderr.
static void
usage(const char *name)
{
	cerr << "Usage: " << name << " directory" << endl
	     << "Reports fanotify events from directory" << endl;
}
// Read buffer for a single fanotify event: currently just the
// fixed-size kernel event header.
struct fan_read_block {
struct fanotify_event_metadata fem;
// more here in the future. Maybe.
};
// Render a fanotify event mask as a human-readable string of
// FAN_* flag names (e.g. "FAN_OPEN|FAN_CLOSE_WRITE") using the
// crucible bits_ntoa helper and the table below.
static inline
string
fan_flag_ntoa(uint64_t ui)
{
static const bits_ntoa_table flag_names[] = {
NTOA_TABLE_ENTRY_BITS(FAN_ACCESS),
NTOA_TABLE_ENTRY_BITS(FAN_OPEN),
NTOA_TABLE_ENTRY_BITS(FAN_MODIFY),
// FAN_CLOSE is the union of the two CLOSE_* bits; listed first so
// the combined name is preferred when both bits are set.
NTOA_TABLE_ENTRY_BITS(FAN_CLOSE),
NTOA_TABLE_ENTRY_BITS(FAN_CLOSE_WRITE),
NTOA_TABLE_ENTRY_BITS(FAN_CLOSE_NOWRITE),
NTOA_TABLE_ENTRY_BITS(FAN_Q_OVERFLOW),
NTOA_TABLE_ENTRY_BITS(FAN_ACCESS_PERM),
NTOA_TABLE_ENTRY_BITS(FAN_OPEN_PERM),
NTOA_TABLE_ENTRY_END()
};
return bits_ntoa(ui, flag_names);
}
// Entry point: put a fanotify mount mark on each directory argument,
// then loop forever printing one line per event: the decoded event
// flags, the reporting pid, and the path of the file involved.
int
main(int argc, char **argv)
{
	// Require at least one directory argument.  The previous test was
	// `argc < 1`, which can never be true (argc is always >= 1), so the
	// tool silently did nothing when run without arguments.
	if (argc < 2) {
		usage(argv[0]);
		exit(EXIT_FAILURE);
	}

	Fd fd;
	DIE_IF_MINUS_ONE(fd = fanotify_init(FAN_CLASS_NOTIF, O_RDONLY | O_LARGEFILE | O_CLOEXEC | O_NOATIME));

	// Mark the whole mount containing each argument for open/close events.
	for (char **argvp = argv + 1; *argvp; ++argvp) {
		cerr << "fanotify_mark(" << *argvp << ")..." << flush;
		// NOTE(review): FAN_NOFD (-1) is passed as the dirfd argument;
		// per fanotify_mark(2) a cwd-relative path needs AT_FDCWD here —
		// confirm this tool is only ever given absolute paths.
		DIE_IF_MINUS_ONE(fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_MOUNT, FAN_CLOSE_WRITE | FAN_CLOSE_NOWRITE | FAN_OPEN, FAN_NOFD, *argvp));
		cerr << endl;
	}

	while (1) {
		// Read exactly one fixed-size event header per iteration.
		struct fan_read_block frb;
		read_or_die(fd, frb);
#if 0
		// Verbose per-field dump, disabled.
		cout << "event_len\t= " << frb.fem.event_len << endl;
		cout << "vers\t= " << static_cast<int>(frb.fem.vers) << endl;
		cout << "reserved\t= " << static_cast<int>(frb.fem.reserved) << endl;
		cout << "metadata_len\t= " << frb.fem.metadata_len << endl;
		cout << "mask\t= " << hex << frb.fem.mask << dec << "\t" << fan_flag_ntoa(frb.fem.mask) << endl;
		cout << "fd\t= " << frb.fem.fd << endl;
		cout << "pid\t= " << frb.fem.pid << endl;
#endif
		cout << "flags " << fan_flag_ntoa(frb.fem.mask) << " pid " << frb.fem.pid << ' ' << flush;
		// Wrap the kernel-provided fd so it is closed when event_fd
		// goes out of scope, then resolve it to a path via /proc.
		Fd event_fd(frb.fem.fd);
		ostringstream oss;
		oss << "/proc/self/fd/" << event_fd;
		cout << "file " << readlink_or_die(oss.str()) << endl;
		// cout << endl;
	}
	return EXIT_SUCCESS;
}

52
src/fiemap.cc Normal file
View File

@@ -0,0 +1,52 @@
#include "crucible/fd.h"
#include "crucible/fs.h"
#include "crucible/error.h"
#include "crucible/string.h"
#include <iostream>
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>
using namespace crucible;
using namespace std;
// Entry point: print the FIEMAP extent map of a file, with holes
// marked, resuming the ioctl in batches of up to 100 extents until
// the requested range is covered.
// Usage: fiemap <file> [start] [length] [flags]  (numbers accept 0x prefix)
int
main(int argc, char **argv)
{
catch_all([&]() {
THROW_CHECK1(invalid_argument, argc, argc > 1);
string filename = argv[1];
cout << "File: " << filename << endl;
Fd fd = open_or_die(filename, O_RDONLY);
Fiemap fm;
// Fetch at most 100 extents per ioctl; the do-loop below resumes.
fm.m_max_count = 100;
// Optional overrides for start offset, length, and FIEMAP flags.
if (argc > 2) { fm.fm_start = stoull(argv[2], nullptr, 0); }
if (argc > 3) { fm.fm_length = stoull(argv[3], nullptr, 0); }
if (argc > 4) { fm.fm_flags = stoull(argv[4], nullptr, 0); }
// Clamp so start + length cannot exceed FIEMAP_MAX_OFFSET.
fm.fm_length = min(fm.fm_length, FIEMAP_MAX_OFFSET - fm.fm_start);
uint64_t stop_at = fm.fm_start + fm.fm_length;
// End of the last extent printed so far; a gap before the next
// extent's fe_logical is reported as a hole.
uint64_t last_byte = fm.fm_start;
do {
fm.do_ioctl(fd);
// cerr << fm;
// If this batch returns no extents, last_logical stays at
// FIEMAP_MAX_OFFSET, which terminates the loop below.
uint64_t last_logical = FIEMAP_MAX_OFFSET;
for (auto &extent : fm.m_extents) {
if (extent.fe_logical > last_byte) {
cout << "Log " << to_hex(last_byte) << ".." << to_hex(extent.fe_logical) << " Hole" << endl;
}
cout << "Log " << to_hex(extent.fe_logical) << ".." << to_hex(extent.fe_logical + extent.fe_length)
<< " Phy " << to_hex(extent.fe_physical) << ".." << to_hex(extent.fe_physical + extent.fe_length)
<< " Flags " << fiemap_extent_flags_ntoa(extent.fe_flags) << endl;
last_logical = extent.fe_logical + extent.fe_length;
last_byte = last_logical;
}
// Resume the next ioctl just past the last extent seen.
fm.fm_start = last_logical;
} while (fm.fm_start < stop_at);
});
exit(EXIT_SUCCESS);
}

40
src/fiewalk.cc Normal file
View File

@@ -0,0 +1,40 @@
#include "crucible/extentwalker.h"
#include "crucible/error.h"
#include "crucible/string.h"
#include <iostream>
#include <fcntl.h>
#include <unistd.h>
using namespace crucible;
using namespace std;
int
main(int argc, char **argv)
{
catch_all([&]() {
THROW_CHECK1(invalid_argument, argc, argc > 1);
string filename = argv[1];
cout << "File: " << filename << endl;
Fd fd = open_or_die(filename, O_RDONLY);
BtrfsExtentWalker ew(fd);
off_t pos = 0;
if (argc > 2) { pos = stoull(argv[2], nullptr, 0); }
ew.seek(pos);
do {
// cout << "\n\n>>>" << ew.current() << "<<<\n\n" << endl;
cout << ew.current() << endl;
} while (ew.next());
#if 0
cout << "\n\n\nAnd now, backwards...\n\n\n" << endl;
do {
cout << "\n\n>>>" << ew.current() << "<<<\n\n" << endl;
} while (ew.prev());
cout << "\n\n\nDone!\n\n\n" << endl;
#endif
});
exit(EXIT_SUCCESS);
}

Some files were not shown because too many files have changed in this diff Show More