1
0
mirror of https://github.com/Zygo/bees.git synced 2025-08-03 14:23:29 +02:00

4 Commits
v0.8 ... v0.7.y

Author SHA1 Message Date
KhalilSantana
27857406f5 Fixes a bad grep pattern caused by dffd6e0
Fixes #233
2022-10-13 16:32:48 -04:00
Khalil Santana
b44ed287dd Get rid of errors by using grep -E
"egrep: warning: egrep is obsolescent; using grep -E"
2022-10-05 22:36:33 -03:00
Ayla Ounce
20c469245c Fix beesd script arg parsing to respect PREFIX
Without this, if you install to a different PREFIX such as /usr/local
it will fail to recognize any arguments and if you use the systemd unit,
that makes --no-timestamps the first NOT_SUPPORTED_ARG which will get
passed to uuidparse, which doesn't recognize it and errors.
2022-10-05 22:36:33 -03:00
Javi Vilarroig
77cf2d794e Minimal changes in beesd script to make it functional in my system 2022-10-05 22:36:33 -03:00
37 changed files with 550 additions and 755 deletions

3
.gitignore vendored
View File

@@ -1,8 +1,7 @@
*.[ao]
*.bak
*.dep
*.new
*.tmp
*.dep
*.so*
Doxyfile
README.html

View File

@@ -61,7 +61,7 @@ install_bees: src $(RUN_INSTALL_TESTS)
install_scripts: ## Install scripts
install_scripts: scripts
install -Dm755 scripts/beesd $(DESTDIR)$(PREFIX)/sbin/beesd
install -Dm644 scripts/beesd.conf.sample $(DESTDIR)$(ETC_PREFIX)/bees/beesd.conf.sample
install -Dm644 scripts/beesd.conf.sample $(DESTDIR)/$(ETC_PREFIX)/bees/beesd.conf.sample
ifneq ($(SYSTEMD_SYSTEM_UNIT_DIR),)
install -Dm644 scripts/beesd@.service $(DESTDIR)$(SYSTEMD_SYSTEM_UNIT_DIR)/beesd@.service
endif

View File

@@ -17,7 +17,7 @@ Strengths
* Space-efficient hash table and matching algorithms - can use as little as 1 GB hash table per 10 TB unique data (0.1GB/TB)
* Daemon incrementally dedupes new data using btrfs tree search
* Works with btrfs compression - dedupe any combination of compressed and uncompressed files
* **NEW** [Works around `btrfs send` problems with dedupe and incremental parent snapshots](docs/options.md)
* **NEW** [Works around `btrfs send` problems with dedupe and incremental parent snapshots](docs/options.md)
* Works around btrfs filesystem structure to free more disk space
* Persistent hash table for rapid restart after shutdown
* Whole-filesystem dedupe - including snapshots
@@ -70,6 +70,6 @@ You can also use Github:
Copyright & License
-------------------
Copyright 2015-2022 Zygo Blaxell <bees@furryterror.org>.
Copyright 2015-2018 Zygo Blaxell <bees@furryterror.org>.
GPL (version 3 or later).

View File

@@ -9,7 +9,7 @@ This issue is fixed in kernel 5.4.14 and later.
**Recommended kernel versions for bees are 4.19, 5.4, 5.10, 5.11, or 5.12,
with recent LTS and -stable updates.** The latest released kernel as
of this writing is 5.18.18.
of this writing is 5.12.3.
4.14, 4.9, and 4.4 LTS kernels with recent updates are OK with
some issues. Older kernels will be slower (a little slower or a lot
@@ -31,7 +31,7 @@ In some future bees release, this API version may become mandatory.
Kernel Bug Tracking Table
-------------------------
These bugs are particularly popular among bees users, though not all are specifically relevant to bees:
These bugs are particularly popular among bees users:
| First bad kernel | Last bad kernel | Issue Description | Fixed Kernel Versions | Fix Commit
| :---: | :---: | --- | :---: | ---
@@ -61,11 +61,7 @@ These bugs are particularly popular among bees users, though not all are specifi
| 5.4 | 5.11 | spurious tree checker failures on extent ref hash | 5.11.5, 5.12 and later | 1119a72e223f btrfs: tree-checker: do not error out if extent ref hash doesn't match
| - | 5.11 | tree mod log issue #5 | 4.4.263, 4.9.263, 4.14.227, 4.19.183, 5.4.108, 5.10.26, 5.11.9, 5.12 and later | dbcc7d57bffc btrfs: fix race when cloning extent buffer during rewind of an old root
| - | 5.12 | tree mod log issue #6 | 4.14.233, 4.19.191, 5.4.118, 5.10.36, 5.11.20, 5.12.3, 5.13 and later | f9690f426b21 btrfs: fix race when picking most recent mod log operation for an old root
| 4.15 | 5.16 | spurious warnings from `fs/fs-writeback.c` when `flushoncommit` is enabled | 5.15.27, 5.16.13, 5.17 and later | a0f0cf8341e3 btrfs: get rid of warning on transaction commit when using flushoncommit
| - | 5.17 | crash during device removal can make filesystem unmountable | 5.15.54, 5.16.20, 5.17.3, 5.18 and later | bbac58698a55 btrfs: remove device item and update super block in the same transaction
| - | 5.18 | wrong superblock num_devices makes filesystem unmountable | 4.14.283, 4.19.247, 5.4.198, 5.10.121, 5.15.46, 5.17.14, 5.18.3, 5.19 and later | d201238ccd2f btrfs: repair super block num_devices automatically
| 5.18 | 5.19 | parent transid verify failed during log tree replay after a crash during a rename operation | 5.18.18, 5.19.2, 6.0 and later | 723df2bcc9e1 btrfs: join running log transaction when logging new name
| 5.4 | - | kernel hang when multiple threads are running `LOGICAL_INO` and dedupe ioctl | - | workaround: reduce bees thread count to 1 with `-c1`
| 4.15 | - | spurious warnings from `fs/fs-writeback.c` when `flushoncommit` is enabled | - | workaround: comment out the `WARN_ON`
"Last bad kernel" refers to that version's last stable update from
kernel.org. Distro kernels may backport additional fixes. Consult
@@ -81,7 +77,7 @@ A "-" for "first bad kernel" indicates the bug has been present since
the relevant feature first appeared in btrfs.
A "-" for "last bad kernel" indicates the bug has not yet been fixed as
of 5.18.18.
of 5.8.14.
In cases where issues are fixed by commits spread out over multiple
kernel versions, "fixed kernel version" refers to the version that
@@ -91,11 +87,6 @@ contains all components of the fix.
Workarounds for known kernel bugs
---------------------------------
* **Hangs with high worker thread counts**: On kernels newer than
5.4, multiple threads running `LOGICAL_INO` and dedupe ioctls
at the same time can lead to a kernel hang. The workaround is
to reduce the thread count to 1 with `-c1`.
* **Tree mod log issues**: bees will detect that a btrfs balance is
running, and pause bees activity until the balance is done. This avoids
running both the `LOGICAL_INO` ioctl and btrfs balance at the same time,
@@ -137,7 +128,7 @@ Workarounds for known kernel bugs
Unfixed kernel bugs
-------------------
As of 5.18.18:
As of 5.12.3:
* **The kernel does not permit `btrfs send` and dedupe to run at the
same time**. Recent kernels no longer crash, but now refuse one
@@ -160,3 +151,22 @@ As of 5.18.18:
still saves some IO.
`btrfs receive` is not affected by this issue.
* **Spurious warnings in `fs/fs-writeback.c`** on kernel 4.15 and later
when filesystem is mounted with `flushoncommit`. These
seem to be harmless (there are other locks which prevent
concurrent umount of the filesystem), but the underlying
problems that trigger the `WARN_ON` are [not trivial to
fix](https://www.spinics.net/lists/linux-btrfs/msg87752.html).
The warnings can be especially voluminous when bees is running.
Workarounds:
1. mount with `-o noflushoncommit`
2. patch kernel to remove warning in `fs/fs-writeback.c`.
Note that using kernels 4.14 and earlier is *not* a viable workaround
for this issue, because kernels 4.14 and earlier will eventually
deadlock when a filesystem is mounted with `-o flushoncommit` (a single
commit fixes one bug and introduces the other).

View File

@@ -67,12 +67,11 @@ The `adjust` event group consists of operations related to translating stored vi
* `adjust_exact`: A block address from the hash table corresponding to an uncompressed data block was processed to find its `(root, inode, offset)` references.
* `adjust_exact_correct`: A block address corresponding to an uncompressed block was retrieved from the hash table and resolved to a physical block containing data that matches another block bees has already read.
* `adjust_exact_wrong`: A block address corresponding to an uncompressed block was retrieved from the hash table and resolved to a physical block containing data that matches the hash but not the data from another block bees has already read (i.e. there was a hash collision).
* `adjust_hit`: A block address was retrieved from the hash table and resolved to a physical block in an uncompressed extent containing data that matches the data from another block bees has already read (i.e. a duplicate match was found).
* `adjust_hit`: A block address was retrieved from the hash table and resolved to a physical block containing data that matches the data from another block bees has already read (i.e. a duplicate match was found).
* `adjust_miss`: A block address was retrieved from the hash table and resolved to a physical block containing a hash that does not match the hash from another block bees has already read (i.e. the hash table contained a stale entry and the data it referred to has since been overwritten in the filesystem).
* `adjust_needle_too_long`: A block address was retrieved from the hash table, but when the corresponding extent item was retrieved, its offset or length were out of range to be a match (i.e. the hash table contained a stale entry and the data it referred to has since been overwritten in the filesystem).
* `adjust_no_match`: A hash collision occurred (i.e. a block on disk was located with the same hash as the hash table entry but different data). Effectively an alias for `hash_collision` as it is not possible to have one event without the other.
* `adjust_offset_high`: The `LOGICAL_INO` ioctl gave an extent item that does not overlap with the desired block because the extent item ends before the desired block in the extent data.
* `adjust_offset_hit`: A block address was retrieved from the hash table and resolved to a physical block in a compressed extent containing data that matches the data from another block bees has already read (i.e. a duplicate match was found).
* `adjust_offset_low`: The `LOGICAL_INO` ioctl gave an extent item that does not overlap with the desired block because the extent item begins after the desired block in the extent data.
* `adjust_try`: A block address and extent item candidate were passed to `BeesResolver::adjust_offset` for processing.
@@ -300,7 +299,6 @@ The `resolve` event group consists of operations related to translating a btrfs
* `resolve_large`: The `LOGICAL_INO` ioctl returned more than 2730 results (the limit of the v1 ioctl).
* `resolve_ms`: Total time spent in the `LOGICAL_INO` ioctl (i.e. wallclock time, not kernel CPU time).
* `resolve_ok`: The `LOGICAL_INO` ioctl returned success.
* `resolve_overflow`: The `LOGICAL_INO` ioctl returned more than 655050 extents (the limit of the v2 ioctl).
* `resolve_toxic`: The `LOGICAL_INO` ioctl took more than 0.1 seconds of kernel CPU time.
root
@@ -335,7 +333,6 @@ The `scan` event group consists of operations related to scanning incoming data.
* `scan_eof`: Scan past EOF was attempted.
* `scan_erase_redundant`: Blocks in the hash table were removed because they were removed from the filesystem by dedupe.
* `scan_extent`: An extent was scanned (`scan_one_extent`).
* `scan_extent_tiny`: An extent below 128K that was not the beginning or end of a file was scanned. No action is currently taken for these--they are merely counted.
* `scan_forward`: A logical byte range was scanned (`scan_forward`).
* `scan_found`: An entry was found in the hash table matching a scanned block from the filesystem.
* `scan_hash_hit`: A block was found on the filesystem corresponding to a block found in the hash table.

View File

@@ -45,7 +45,7 @@ bees will loop billions of times considering all possibilities. This is
a waste of time, so an exception is currently used to break out of such
loops early. The exception text in this case is:
`FIXME: too many duplicate candidates, bailing out here`
`FIXME: bailing out here, need to fix this further up the call stack`
Terminating bees with SIGTERM

View File

@@ -17,7 +17,7 @@ Strengths
* Space-efficient hash table and matching algorithms - can use as little as 1 GB hash table per 10 TB unique data (0.1GB/TB)
* Daemon incrementally dedupes new data using btrfs tree search
* Works with btrfs compression - dedupe any combination of compressed and uncompressed files
* **NEW** [Works around `btrfs send` problems with dedupe and incremental parent snapshots](options.md)
* **NEW** [Works around `btrfs send` problems with dedupe and incremental parent snapshots](options.md)
* Works around btrfs filesystem structure to free more disk space
* Persistent hash table for rapid restart after shutdown
* Whole-filesystem dedupe - including snapshots
@@ -70,6 +70,6 @@ You can also use Github:
Copyright & License
-------------------
Copyright 2015-2022 Zygo Blaxell <bees@furryterror.org>.
Copyright 2015-2018 Zygo Blaxell <bees@furryterror.org>.
GPL (version 3 or later).

View File

@@ -216,28 +216,7 @@ enum btrfs_compression_type {
#define BTRFS_FS_INFO_FLAG_CSUM_INFO (1 << 0)
#endif
#ifndef BTRFS_FS_INFO_FLAG_GENERATION
/* Request information about filesystem generation */
#define BTRFS_FS_INFO_FLAG_GENERATION (1 << 1)
#endif
#ifndef BTRFS_FS_INFO_FLAG_METADATA_UUID
/* Request information about filesystem metadata UUID */
#define BTRFS_FS_INFO_FLAG_METADATA_UUID (1 << 2)
#endif
// BTRFS_CSUM_TYPE_CRC32 was a #define from 2008 to 2019.
// After that, it's an enum with the other 3 types.
// So if we do _not_ have CRC32 defined, it means we have the other 3;
// if we _do_ have CRC32 defined, it means we need the other 3.
// This seems likely to break some day.
#ifdef BTRFS_CSUM_TYPE_CRC32
#define BTRFS_CSUM_TYPE_XXHASH 1
#define BTRFS_CSUM_TYPE_SHA256 2
#define BTRFS_CSUM_TYPE_BLAKE2 3
#endif
struct btrfs_ioctl_fs_info_args_v3 {
struct btrfs_ioctl_fs_info_args_v2 {
__u64 max_id; /* out */
__u64 num_devices; /* out */
__u8 fsid[BTRFS_FSID_SIZE]; /* out */
@@ -248,9 +227,7 @@ struct btrfs_ioctl_fs_info_args_v3 {
__u16 csum_type; /* out */
__u16 csum_size; /* out */
__u64 flags; /* in/out */
__u64 generation; /* out */
__u8 metadata_uuid[BTRFS_FSID_SIZE]; /* out */
__u8 reserved[944]; /* pad to 1k */
__u8 reserved[968]; /* pad to 1k */
};
#endif // CRUCIBLE_BTRFS_H

View File

@@ -1,71 +0,0 @@
#ifndef _CRUCIBLE_BYTEVECTOR_H_
#define _CRUCIBLE_BYTEVECTOR_H_
#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <memory>
#include <stdexcept>
namespace crucible {
	using namespace std;

	// new[] is a little slower than malloc
	// shared_ptr is about 2x slower than unique_ptr
	// vector<uint8_t> is ~160x slower
	// so we won't bother with unique_ptr because we can't do shared copies with it

	// A byte buffer held through a shared_ptr<uint8_t> plus a length.
	// Copies of a ByteVector share the same underlying storage.
	class ByteVector {
	public:
		using Pointer = shared_ptr<uint8_t>;
		using value_type = Pointer::element_type;
		using iterator = value_type*;

		// Empty vector: null pointer, zero size.
		ByteVector() = default;
		// Buffer of `size` bytes.
		ByteVector(size_t size);
		// Sub-range of `that` starting at `start`, `length` bytes long.
		ByteVector(const ByteVector &that, size_t start, size_t length);
		// Copy of [begin, end) in a buffer of at least min_size bytes.
		ByteVector(iterator begin, iterator end, size_t min_size = 0);

		ByteVector at(size_t start, size_t length) const;

		value_type& at(size_t) const;
		iterator begin() const;
		void clear();
		value_type* data() const;
		bool empty() const;
		iterator end() const;
		value_type& operator[](size_t) const;
		size_t size() const;
		bool operator==(const ByteVector &that) const;

		// this version of erase only works at the beginning or end of the buffer, else throws exception
		void erase(iterator first);
		void erase(iterator first, iterator last);

		// An important use case is ioctls that have a fixed-size header struct
		// followed by a buffer for further arguments.  These templates avoid
		// doing reinterpret_casts every time.
		template <class T> ByteVector(const T& object, size_t min_size);
		template <class T> T* get() const;

	private:
		Pointer m_ptr;
		size_t m_size = 0;
	};

	// Allocate max(min_size, sizeof(T)) bytes and copy `object` into the
	// front of the buffer.  Bytes after sizeof(T) are left as malloc
	// returned them (i.e. not initialized here).
	// Throws runtime_error if the allocation fails.
	template <class T>
	ByteVector::ByteVector(const T& object, size_t min_size)
	{
		const auto size = max(min_size, sizeof(T));
		m_ptr = Pointer(static_cast<value_type*>(malloc(size)), free);
		// malloc can return null; memcpy into a null destination is undefined
		if (!m_ptr) {
			throw runtime_error("ByteVector: malloc failed");
		}
		memcpy(m_ptr.get(), &object, sizeof(T));
		m_size = size;
	}

	// Reinterpret the start of the buffer as a T.  No size or alignment
	// check is made here; the caller must know a T fits.
	template <class T>
	T*
	ByteVector::get() const
	{
		return reinterpret_cast<T*>(data());
	}
}
#endif // _CRUCIBLE_BYTEVECTOR_H_

View File

@@ -28,7 +28,7 @@ namespace crucible {
};
template<> struct le_to_cpu_helper<uint16_t> {
uint16_t operator()(const uint16_t v) { return le16toh(v); }
uint16_t operator()(const uint16_t v) { return le64toh(v); }
};
template<> struct le_to_cpu_helper<uint8_t> {

View File

@@ -126,13 +126,6 @@ namespace crucible {
} \
} while(0)
#define THROW_CHECK4(type, value1, value2, value3, value4, expr) do { \
if (!(expr)) { \
THROW_ERROR(type, #value1 << " = " << (value1) << ", " #value2 << " = " << (value2) << ", " #value3 << " = " << (value3) << ", " #value4 << " = " << (value4) \
<< " failed constraint check (" << #expr << ")"); \
} \
} while(0)
#define THROW_CHECK_BIN_OP(type, value1, op, value2) do { \
if (!((value1) op (value2))) { \
THROW_ERROR(type, "failed constraint check " << #value1 << " (" << (value1) << ") " << #op << " " << #value2 << " (" << (value2) << ")"); \

View File

@@ -42,6 +42,9 @@ namespace crucible {
uint64_t bytenr() const;
bool operator==(const Extent &that) const;
bool operator!=(const Extent &that) const { return !(*this == that); }
Extent() = default;
Extent(const Extent &e) = default;
};
class ExtentWalker {

View File

@@ -1,7 +1,6 @@
#ifndef CRUCIBLE_FD_H
#define CRUCIBLE_FD_H
#include "crucible/bytevector.h"
#include "crucible/namedptr.h"
#include <cstring>
@@ -126,14 +125,11 @@ namespace crucible {
// Specialization for strings which reads/writes the string content, not the struct string
template<> void write_or_die<string>(int fd, const string& str);
template<> void pread_or_die<string>(int fd, string& str, off_t offset);
template<> void pread_or_die<vector<char>>(int fd, vector<char>& str, off_t offset);
template<> void pread_or_die<vector<uint8_t>>(int fd, vector<uint8_t>& str, off_t offset);
template<> void pwrite_or_die<string>(int fd, const string& str, off_t offset);
template<> void pread_or_die<ByteVector>(int fd, ByteVector& str, off_t offset);
template<> void pwrite_or_die<ByteVector>(int fd, const ByteVector& str, off_t offset);
// Deprecated
template<> void pread_or_die<vector<uint8_t>>(int fd, vector<uint8_t>& str, off_t offset) = delete;
template<> void pwrite_or_die<vector<uint8_t>>(int fd, const vector<uint8_t>& str, off_t offset) = delete;
template<> void pread_or_die<vector<char>>(int fd, vector<char>& str, off_t offset) = delete;
template<> void pwrite_or_die<vector<char>>(int fd, const vector<char>& str, off_t offset) = delete;
template<> void pwrite_or_die<vector<char>>(int fd, const vector<char>& str, off_t offset);
template<> void pwrite_or_die<vector<uint8_t>>(int fd, const vector<uint8_t>& str, off_t offset);
// A different approach to reading a simple string
string read_string(int fd, size_t size);

View File

@@ -1,9 +1,9 @@
#ifndef CRUCIBLE_FS_H
#define CRUCIBLE_FS_H
#include "crucible/bytevector.h"
#include "crucible/endian.h"
#include "crucible/error.h"
#include "crucible/spanner.h"
// Terribly Linux-specific FS-wrangling functions
@@ -31,14 +31,12 @@ namespace crucible {
BtrfsExtentInfo(int dst_fd, off_t dst_offset);
};
struct BtrfsExtentSame {
struct BtrfsExtentSame : public btrfs_ioctl_same_args {
virtual ~BtrfsExtentSame();
BtrfsExtentSame(int src_fd, off_t src_offset, off_t src_length);
void add(int fd, off_t offset);
virtual void do_ioctl();
uint64_t m_logical_offset = 0;
uint64_t m_length = 0;
int m_fd;
vector<BtrfsExtentInfo> m_info;
};
@@ -55,17 +53,17 @@ namespace crucible {
ostream & operator<<(ostream &os, const BtrfsInodeOffsetRoot &p);
struct BtrfsDataContainer {
struct BtrfsDataContainer : public btrfs_data_container {
BtrfsDataContainer(size_t size = 64 * 1024);
void *prepare(size_t size);
size_t get_size() const;
decltype(btrfs_data_container::bytes_left) get_bytes_left() const;
decltype(btrfs_data_container::bytes_missing) get_bytes_missing() const;
decltype(btrfs_data_container::elem_cnt) get_elem_cnt() const;
decltype(btrfs_data_container::elem_missed) get_elem_missed() const;
decltype(bytes_left) get_bytes_left() const;
decltype(bytes_missing) get_bytes_missing() const;
decltype(elem_cnt) get_elem_cnt() const;
decltype(elem_missed) get_elem_missed() const;
ByteVector m_data;
vector<uint8_t> m_data;
};
struct BtrfsIoctlLogicalInoArgs : public btrfs_ioctl_logical_ino_args {
@@ -143,26 +141,16 @@ namespace crucible {
off_t end() const;
};
struct Fiemap {
// because fiemap.h insists on giving FIEMAP_MAX_OFFSET
// a different type from the struct fiemap members
static const uint64_t s_fiemap_max_offset = FIEMAP_MAX_OFFSET;
struct Fiemap : public fiemap {
// Get entire file
Fiemap(uint64_t start = 0, uint64_t length = s_fiemap_max_offset);
Fiemap(uint64_t start = 0, uint64_t length = FIEMAP_MAX_OFFSET);
void do_ioctl(int fd);
vector<FiemapExtent> m_extents;
decltype(fiemap::fm_extent_count) m_min_count = (4096 - sizeof(fiemap)) / sizeof(fiemap_extent);
decltype(fiemap::fm_extent_count) m_max_count = 16 * 1024 * 1024 / sizeof(fiemap_extent);
uint64_t m_start;
uint64_t m_length;
// FIEMAP is slow and full of lies.
// This makes FIEMAP even slower, but reduces the lies a little.
decltype(fiemap::fm_flags) m_flags = FIEMAP_FLAG_SYNC;
friend ostream &operator<<(ostream &, const Fiemap &);
uint64_t m_min_count = (4096 - sizeof(fiemap)) / sizeof(fiemap_extent);
uint64_t m_max_count = 16 * 1024 * 1024 / sizeof(fiemap_extent);
};
ostream & operator<<(ostream &os, const fiemap_extent *info);
@@ -178,8 +166,8 @@ namespace crucible {
struct BtrfsIoctlSearchHeader : public btrfs_ioctl_search_header {
BtrfsIoctlSearchHeader();
ByteVector m_data;
size_t set_data(const ByteVector &v, size_t offset);
Spanner<const uint8_t> m_data;
size_t set_data(const vector<uint8_t> &v, size_t offset);
bool operator<(const BtrfsIoctlSearchHeader &that) const;
};
@@ -193,18 +181,17 @@ namespace crucible {
ostream & operator<<(ostream &os, const BtrfsIoctlSearchHeader &hdr);
struct BtrfsIoctlSearchKey : public btrfs_ioctl_search_key {
BtrfsIoctlSearchKey(size_t buf_size = 1024);
bool do_ioctl_nothrow(int fd);
void do_ioctl(int fd);
BtrfsIoctlSearchKey(size_t buf_size = 4096);
virtual bool do_ioctl_nothrow(int fd);
virtual void do_ioctl(int fd);
// Copy objectid/type/offset so we move forward
void next_min(const BtrfsIoctlSearchHeader& ref);
// move forward to next object of a single type
void next_min(const BtrfsIoctlSearchHeader& ref, const uint8_t type);
size_t m_buf_size;
vector<uint8_t> m_ioctl_arg;
set<BtrfsIoctlSearchHeader> m_result;
};
ostream & operator<<(ostream &os, const btrfs_ioctl_search_key &key);
@@ -248,12 +235,11 @@ namespace crucible {
template<class V> ostream &hexdump(ostream &os, const V &v);
struct BtrfsIoctlFsInfoArgs : public btrfs_ioctl_fs_info_args_v3 {
struct BtrfsIoctlFsInfoArgs : public btrfs_ioctl_fs_info_args_v2 {
BtrfsIoctlFsInfoArgs();
void do_ioctl(int fd);
uint16_t csum_type() const;
uint16_t csum_size() const;
uint64_t generation() const;
};
ostream & operator<<(ostream &os, const BtrfsIoctlFsInfoArgs &a);

View File

@@ -82,7 +82,7 @@ namespace crucible {
// "our" map entry if it exists and is expired. The other
// thread would have done the same for us if the race had
// a different winner.
const auto found = m_map_rep->m_map.find(m_ret_key);
auto found = m_map_rep->m_map.find(m_ret_key);
if (found != m_map_rep->m_map.end() && found->second.expired()) {
m_map_rep->m_map.erase(found);
}
@@ -93,10 +93,10 @@ namespace crucible {
NamedPtr<Return, Arguments...>::lookup_item(const Key &k)
{
// Must be called with lock held
const auto found = m_map_rep->m_map.find(k);
auto found = m_map_rep->m_map.find(k);
if (found != m_map_rep->m_map.end()) {
// Get the strong pointer back
const auto rv = found->second.lock();
auto rv = found->second.lock();
if (rv) {
// Have strong pointer. Return value that shares map entry.
return shared_ptr<Return>(rv, rv->m_ret_ptr.get());
@@ -116,36 +116,34 @@ namespace crucible {
Key k(args...);
// Is it already in the map?
unique_lock<mutex> lock_lookup(m_map_rep->m_mutex);
unique_lock<mutex> lock(m_map_rep->m_mutex);
auto rv = lookup_item(k);
if (rv) {
return rv;
}
// Release map lock and acquire key lock
lock_lookup.unlock();
const auto key_lock = m_lockset.make_lock(k);
lock.unlock();
auto key_lock = m_lockset.make_lock(k);
// Did item appear in map while we were waiting for key?
lock_lookup.lock();
lock.lock();
rv = lookup_item(k);
if (rv) {
return rv;
}
// We now hold key and index locks, but item not in map (or expired).
// Release map lock so other threads can use the map
lock_lookup.unlock();
// Call the function and create a new Value outside of the map
const auto new_value_ptr = make_shared<Value>(fn(args...), k, m_map_rep);
// Release map lock
lock.unlock();
// Call the function and create a new Value
auto new_value_ptr = make_shared<Value>(fn(args...), k, m_map_rep);
// Function must return a non-null pointer
THROW_CHECK0(runtime_error, new_value_ptr->m_ret_ptr);
// Reacquire index lock for map insertion. We still hold the key lock.
// Use a different lock object to make exceptions unlock in the right order
unique_lock<mutex> lock_insert(m_map_rep->m_mutex);
// Reacquire index lock for map insertion
lock.lock();
// Insert return value in map or overwrite existing
// empty or expired weak_ptr value.
@@ -160,13 +158,14 @@ namespace crucible {
// to find and fix.
assert(new_item_ref.expired());
// Update the map slot we are sure is empty
// Update the empty map slot
new_item_ref = new_value_ptr;
// Drop lock so we don't deadlock in constructor exceptions
lock.unlock();
// Return shared_ptr to Return using strong pointer's reference counter
return shared_ptr<Return>(new_value_ptr, new_value_ptr->m_ret_ptr.get());
// Release map lock, then key lock
}
template <class Return, class... Arguments>
@@ -189,7 +188,7 @@ namespace crucible {
NamedPtr<Return, Arguments...>::insert(const Ptr &r, Arguments... args)
{
THROW_CHECK0(invalid_argument, r);
return insert_item([&](Arguments...) { return r; }, args...);
return insert_item([&](Arguments...) -> Ptr { return r; }, args...);
}
}

View File

@@ -20,8 +20,8 @@ namespace crucible {
using ProgressHolder = shared_ptr<ProgressHolderState>;
ProgressTracker(const value_type &v);
value_type begin() const;
value_type end() const;
value_type begin();
value_type end();
ProgressHolder hold(const value_type &v);
@@ -51,7 +51,7 @@ namespace crucible {
template <class T>
typename ProgressTracker<T>::value_type
ProgressTracker<T>::begin() const
ProgressTracker<T>::begin()
{
unique_lock<mutex> lock(m_state->m_mutex);
return m_state->m_begin;
@@ -59,7 +59,7 @@ namespace crucible {
template <class T>
typename ProgressTracker<T>::value_type
ProgressTracker<T>::end() const
ProgressTracker<T>::end()
{
unique_lock<mutex> lock(m_state->m_mutex);
return m_state->m_end;

167
include/crucible/spanner.h Normal file
View File

@@ -0,0 +1,167 @@
#ifndef CRUCIBLE_SPANNER_H
#define CRUCIBLE_SPANNER_H
#include "crucible/error.h"
#include <memory>
namespace crucible {

	using namespace std;

	// C++20 is already using the name "span" for something similar.

	// A (head pointer, element count) pair describing a contiguous run
	// of T.  The range can only be shrunk (from either end) or cleared;
	// it never grows.
	//
	//   T    - element type
	//   Head - type used to hold the start of the range (default T*)
	//   Iter - iterator type handed out by begin()/end() (default Head)
	//
	// NOTE(review): the templated container constructor is not
	// constrained, so it also matches a non-const Spanner lvalue and is
	// preferred over the copy constructor in that case.
	template <class T, class Head = T*, class Iter = Head>
	class Spanner {
	public:
		using iterator = Iter;
		using head_pointer = Head;
		using value_type = T;

		// View over container.data() .. container.data() + container.size()
		template <class Container>
		Spanner(Container& container);

		// View over [begin, end)
		Spanner(head_pointer begin, iterator end);
		// View over `size` elements starting at `begin`
		Spanner(size_t size, head_pointer begin);
		// Empty view
		Spanner() = default;
		Spanner &operator=(const Spanner &that) = default;
		iterator begin() const;
		iterator end() const;
		value_type *data() const;
		// Bounds-checked element access
		value_type &at(size_t n) const;
		size_t size() const;
		bool empty() const;
		void clear();
		// Same as at(n), i.e. also bounds-checked
		value_type &operator[](size_t n) const;
		// Shrink the view; only a prefix or a suffix may be removed
		iterator erase(iterator first, iterator last);
		iterator erase(iterator first);
	private:
		// Value-initialized so that a default-constructed Spanner is
		// a well-defined empty range rather than indeterminate.
		head_pointer m_begin = head_pointer();
		size_t m_size = 0;
	};

	// Build a raw-pointer Spanner over a container's elements.
	template <class Container, class Head = typename Container::value_type *, class Iter = Head>
	Spanner<typename Container::value_type, Head, Iter> make_spanner(Container &container)
	{
		return Spanner<typename Container::value_type, Head, Iter>(container);
	}

	// This template is an attempt to turn a shared_ptr to a container
	// into a range view that can be cheaply passed around.
	// It probably doesn't quite work in the general case.
	template <class Container, class Head = shared_ptr<typename Container::value_type>, class Iter = typename Container::value_type *>
	Spanner<typename Container::value_type, Head, Iter> make_spanner(shared_ptr<Container> &cont_ptr)
	{
		// Aliasing constructor: shares ownership with cont_ptr but
		// points at the container's first element.
		shared_ptr<typename Container::value_type> head(cont_ptr, cont_ptr->data());
		size_t const size = cont_ptr->size();
		return Spanner<typename Container::value_type, Head, Iter>(size, head);
	}

	template <class T, class Head, class Iter>
	template <class Container>
	Spanner<T, Head, Iter>::Spanner(Container &container) :
		m_begin(container.data()),
		m_size(container.size())
	{
	}

	template <class T, class Head, class Iter>
	Spanner<T, Head, Iter>::Spanner(head_pointer begin, iterator end) :
		m_begin(begin),
		m_size(end - begin)
	{
	}

	template <class T, class Head, class Iter>
	Spanner<T, Head, Iter>::Spanner(size_t size, head_pointer begin) :
		m_begin(begin),
		m_size(size)
	{
	}

	// Remove [first, last) from the view.  The range must touch one end
	// of the view: either first is the current start (drop a prefix) or
	// last is the current end (drop a suffix).  Anything else is
	// reported as invalid_argument.  Returns the position following the
	// removed elements.
	template <class T, class Head, class Iter>
	typename Spanner<T, Head, Iter>::iterator
	Spanner<T, Head, Iter>::erase(iterator first, iterator last)
	{
		auto end = m_begin + m_size;
		if (first == m_begin) {
			THROW_CHECK0(invalid_argument, first <= last);
			THROW_CHECK0(invalid_argument, last <= end);
			// Dropping a prefix moves the start forward, so the
			// length must shrink by the same amount or the view
			// would run past its original end.
			m_size -= static_cast<size_t>(last - first);
			m_begin = last;
			return last;
		}
		if (last == end) {
			THROW_CHECK0(invalid_argument, m_begin <= first);
			THROW_CHECK0(invalid_argument, first <= last);
			m_size = first - m_begin;
			return first;
		}
		THROW_ERROR(invalid_argument, "first != begin() and last != end()");
	}

	// Remove the single element at `first` (must be the first or last element).
	template <class T, class Head, class Iter>
	typename Spanner<T, Head, Iter>::iterator
	Spanner<T, Head, Iter>::erase(iterator first)
	{
		return erase(first, first + 1);
	}

	template <class T, class Head, class Iter>
	typename Spanner<T, Head, Iter>::value_type &
	Spanner<T, Head, Iter>::operator[](size_t n) const
	{
		return at(n);
	}

	// Reset to the empty range.
	template <class T, class Head, class Iter>
	void
	Spanner<T, Head, Iter>::clear()
	{
		m_begin = head_pointer();
		m_size = 0;
	}

	template <class T, class Head, class Iter>
	bool
	Spanner<T, Head, Iter>::empty() const
	{
		return m_size == 0;
	}

	template <class T, class Head, class Iter>
	size_t
	Spanner<T, Head, Iter>::size() const
	{
		return m_size;
	}

	// Raw pointer to the first element.
	// NOTE(review): this dereferences m_begin, so calling it (or
	// begin()/end(), which use it) on a cleared or default-constructed
	// Spanner forms &*nullptr when the head is a raw pointer.
	template <class T, class Head, class Iter>
	typename Spanner<T, Head, Iter>::value_type *
	Spanner<T, Head, Iter>::data() const
	{
		return &(*m_begin);
	}

	template <class T, class Head, class Iter>
	typename Spanner<T, Head, Iter>::iterator
	Spanner<T, Head, Iter>::begin() const
	{
		return data();
	}

	template <class T, class Head, class Iter>
	typename Spanner<T, Head, Iter>::iterator
	Spanner<T, Head, Iter>::end() const
	{
		return data() + m_size;
	}

	// Element n of the view; n >= size() is reported as out_of_range.
	template <class T, class Head, class Iter>
	typename Spanner<T, Head, Iter>::value_type &
	Spanner<T, Head, Iter>::at(size_t n) const
	{
		THROW_CHECK2(out_of_range, n, size(), n < size());
		return *(data() + n);
	}

}
#endif // CRUCIBLE_SPANNER_H

View File

@@ -11,6 +11,23 @@
namespace crucible {
using namespace std;
// Overwrite every byte of *that with zero.  Base is the type whose
// storage is cleared (usually a C struct used as a base class), so only
// sizeof(Base) bytes are touched.
template <class Base>
void
memset_zero(Base *that)
{
	void *const target = that;
	memset(target, 0, sizeof(*that));
}
// Return the object representation of *that as a byte vector.
// Base is the type being copied (usually a C struct used as a base
// class); exactly sizeof(Base) bytes are copied.
template <class Base>
vector<uint8_t>
vector_copy_struct(Base *that)
{
	const Base *const object = that;
	const auto *const first_byte = reinterpret_cast<const uint8_t *>(object);
	vector<uint8_t> bytes(first_byte, first_byte + sizeof(Base));
	return bytes;
}
// int->hex conversion with sprintf
string to_hex(uint64_t i);

View File

@@ -1,14 +0,0 @@
#ifndef CRUCIBLE_UNAME_H
#define CRUCIBLE_UNAME_H
#include <sys/utsname.h>
namespace crucible {

	using namespace std;

	// struct utsname with a default constructor added.  The constructor
	// is only declared here; presumably it fills in the fields via
	// uname(2) -- confirm against its definition.
	struct Uname : utsname {
		Uname();
	};

}
#endif

View File

@@ -4,7 +4,6 @@ default: libcrucible.a
%.a: Makefile
CRUCIBLE_OBJS = \
bytevector.o \
chatter.o \
city.o \
cleanup.o \
@@ -19,7 +18,6 @@ CRUCIBLE_OBJS = \
string.o \
task.o \
time.o \
uname.o \
include ../makeflags
-include ../localconf

View File

@@ -1,147 +0,0 @@
#include "crucible/bytevector.h"
#include "crucible/error.h"
namespace crucible {
using namespace std;
// Iterator to the first byte: the raw pointer currently held by m_ptr.
ByteVector::iterator
ByteVector::begin() const
{
return m_ptr.get();
}
// Iterator one past the last byte: m_ptr's raw pointer advanced by m_size.
ByteVector::iterator
ByteVector::end() const
{
return m_ptr.get() + m_size;
}
// Number of bytes in the vector, as recorded in m_size.
size_t
ByteVector::size() const
{
return m_size;
}
// True when no buffer is attached (m_ptr tests false) or when the recorded
// size is zero.
bool
ByteVector::empty() const
{
return !m_ptr || !m_size;
}
// Detach from the buffer (m_ptr.reset()) and set the recorded size to zero.
void
ByteVector::clear()
{
m_ptr.reset();
m_size = 0;
}
// Unchecked element access.  Despite its name, the `size` parameter is the
// index of the byte to return.  Neither the buffer pointer nor the index is
// validated here; at(size_t) is the variant that checks both.
ByteVector::value_type&
ByteVector::operator[](size_t size) const
{
return m_ptr.get()[size];
}
// Construct a ByteVector covering `length` bytes that begin `start` bytes
// into `that`.  No bytes are copied: the new object's pointer is built from
// that.m_ptr and refers to the same storage.
// Raises out_of_range (through THROW_CHECK) when `that` has no buffer or when
// the range [start, start + length) does not fit inside that.m_size bytes.
ByteVector::ByteVector(const ByteVector &that, size_t start, size_t length)
{
	THROW_CHECK0(out_of_range, that.m_ptr);
	THROW_CHECK2(out_of_range, start, that.m_size, start <= that.m_size);
	// Compare length against the bytes remaining after start.  The earlier
	// form, start + length <= that.m_size + length, merely repeated the
	// start <= m_size test and let a slice extend beyond the parent buffer;
	// this form also cannot be defeated by size_t wraparound because
	// start <= that.m_size has already been established.
	THROW_CHECK2(out_of_range, length, that.m_size - start, length <= that.m_size - start);
	// Two-argument Pointer construction from an existing Pointer plus a raw
	// address (presumably the shared_ptr aliasing constructor -- confirm
	// against the Pointer typedef in bytevector.h)
	m_ptr = Pointer(that.m_ptr, that.m_ptr.get() + start);
	m_size = length;
}
// Return a ByteVector for the sub-range (start, length) of this object by
// invoking the (that, start, length) constructor; any range checking is the
// constructor's.
ByteVector
ByteVector::at(size_t start, size_t length) const
{
return ByteVector(*this, start, length);
}
// Checked element access.  As in operator[], the `size` parameter is the
// index of the byte to return.  THROW_CHECK (out_of_range) rejects a missing
// buffer and any index that is not below m_size before the byte is returned.
ByteVector::value_type&
ByteVector::at(size_t size) const
{
THROW_CHECK0(out_of_range, m_ptr);
THROW_CHECK2(out_of_range, size, m_size, size < m_size);
return m_ptr.get()[size];
}
// Allocate `size` bytes for a ByteVector buffer.
// Regular builds use malloc and leave the bytes uninitialized; builds with
// BEES_VALGRIND defined use calloc so the buffer starts zero-filled.
static
void *
bv_allocate(size_t size)
{
#ifndef BEES_VALGRIND
	return malloc(size);
#else
	// XXX: zero-filling is done only to keep valgrind quiet
	return calloc(1, size);
#endif
}
// Construct a ByteVector owning a freshly allocated buffer of `size` bytes.
// The buffer comes from bv_allocate and is handed to Pointer with free as
// its deleter.
ByteVector::ByteVector(size_t size)
{
	void *const raw = bv_allocate(size);
	m_ptr = Pointer(static_cast<value_type*>(raw), free);
	// THROW_CHECK's template cannot produce bad_alloc, so a null buffer is
	// reported as runtime_error instead
	THROW_CHECK0(runtime_error, m_ptr);
	m_size = size;
}
// Construct a ByteVector holding a copy of the bytes in [begin, end).
// The buffer is at least min_size bytes long: its size is the larger of
// (end - begin) and min_size, and m_size records that full size.  Only the
// first (end - begin) bytes are written by the copy.
ByteVector::ByteVector(iterator begin, iterator end, size_t min_size)
{
	const size_t copy_len = end - begin;
	const size_t capacity = max(copy_len, min_size);
	void *const raw = bv_allocate(capacity);
	m_ptr = Pointer(static_cast<value_type*>(raw), free);
	THROW_CHECK0(runtime_error, m_ptr);
	m_size = capacity;
	memcpy(m_ptr.get(), begin, copy_len);
}
// Equality comparison.
// Two objects without a buffer are equal; an object without a buffer never
// equals one that has a buffer.  Otherwise the sizes must match and the
// contents must be identical (trivially so when both refer to the same
// address).
bool
ByteVector::operator==(const ByteVector &that) const
{
	// At least one side has no buffer: equal only if neither side has one
	if (!m_ptr || !that.m_ptr) {
		return !m_ptr && !that.m_ptr;
	}
	// Both sides have a buffer: require equal length, then either the very
	// same storage or byte-for-byte identical contents
	return m_size == that.m_size
		&& (m_ptr.get() == that.m_ptr.get()
			|| !memcmp(m_ptr.get(), that.m_ptr.get(), m_size));
}
void
ByteVector::erase(iterator begin, iterator end)
{
const size_t size = end - begin;
if (!size) return;
THROW_CHECK0(out_of_range, m_ptr);
const iterator my_begin = m_ptr.get();
const iterator my_end = my_begin + m_size;
THROW_CHECK4(out_of_range, my_begin, begin, my_end, end, my_begin == begin || my_end == end);
if (begin == my_begin) {
if (end == my_end) {
m_size = 0;
m_ptr.reset();
return;
}
m_ptr = Pointer(m_ptr, end);
}
m_size -= size;
}
// Erase the single byte at `begin` by delegating to the range overload with
// [begin, begin + 1).
void
ByteVector::erase(iterator begin)
{
erase(begin, begin + 1);
}
// Raw pointer to the first byte; the same pointer value begin() returns.
ByteVector::value_type*
ByteVector::data() const
{
return m_ptr.get();
}
}

View File

@@ -496,7 +496,7 @@ namespace crucible {
BtrfsExtentWalker::Vec
BtrfsExtentWalker::get_extent_map(off_t pos)
{
BtrfsIoctlSearchKey sk;
BtrfsIoctlSearchKey sk(65536);
if (!m_root_fd) {
m_root_fd = m_fd;
}
@@ -640,7 +640,9 @@ namespace crucible {
ExtentWalker::get_extent_map(off_t pos)
{
EWLOG("get_extent_map(" << to_hex(pos) << ")");
Fiemap fm(ranged_cast<uint64_t>(pos), ranged_cast<uint64_t>(numeric_limits<off_t>::max() - pos));
Fiemap fm;
fm.fm_start = ranged_cast<uint64_t>(pos);
fm.fm_length = ranged_cast<uint64_t>(numeric_limits<off_t>::max() - pos);
fm.m_max_count = fm.m_min_count = sc_extent_fetch_max;
fm.do_ioctl(m_fd);
Vec rv;

View File

@@ -362,7 +362,7 @@ namespace crucible {
}
int rv = ::pwrite(fd, buf, size, offset);
if (rv != static_cast<int>(size)) {
THROW_ERROR(runtime_error, "pwrite: only " << rv << " of " << size << " bytes written at fd " << name_fd(fd) << " offset " << offset);
THROW_ERROR(runtime_error, "pwrite: only " << rv << " of " << size << " bytes written at offset " << offset);
}
}
@@ -442,7 +442,7 @@ namespace crucible {
THROW_ERRNO("pread: " << size << " bytes");
}
if (rv != static_cast<int>(size)) {
THROW_ERROR(runtime_error, "pread: " << size << " bytes at fd " << name_fd(fd) << " offset " << offset << " returned " << rv);
THROW_ERROR(runtime_error, "pread: " << size << " bytes at offset " << offset << " returned " << rv);
}
break;
}
@@ -458,14 +458,28 @@ namespace crucible {
template<>
void
pread_or_die<ByteVector>(int fd, ByteVector &text, off_t offset)
pread_or_die<vector<char>>(int fd, vector<char> &text, off_t offset)
{
return pread_or_die(fd, text.data(), text.size(), offset);
}
template<>
void
pwrite_or_die<ByteVector>(int fd, const ByteVector &text, off_t offset)
pread_or_die<vector<uint8_t>>(int fd, vector<uint8_t> &text, off_t offset)
{
return pread_or_die(fd, text.data(), text.size(), offset);
}
// Specialization for vector<uint8_t>: forwards the vector's data pointer and
// element count to the (fd, pointer, size, offset) form of pwrite_or_die.
template<>
void
pwrite_or_die<vector<uint8_t>>(int fd, const vector<uint8_t> &text, off_t offset)
{
return pwrite_or_die(fd, text.data(), text.size(), offset);
}
// Specialization for vector<char>: forwards the vector's data pointer and
// element count to the (fd, pointer, size, offset) form of pwrite_or_die.
template<>
void
pwrite_or_die<vector<char>>(int fd, const vector<char> &text, off_t offset)
{
return pwrite_or_die(fd, text.data(), text.size(), offset);
}
@@ -477,9 +491,9 @@ namespace crucible {
return pwrite_or_die(fd, text.data(), text.size(), offset);
}
Stat::Stat() :
stat( (stat) { } )
Stat::Stat()
{
memset_zero<stat>(this);
}
Stat &
@@ -498,15 +512,15 @@ namespace crucible {
return *this;
}
Stat::Stat(int fd) :
stat( (stat) { } )
Stat::Stat(int fd)
{
memset_zero<stat>(this);
fstat(fd);
}
Stat::Stat(const string &filename) :
stat( (stat) { } )
Stat::Stat(const string &filename)
{
memset_zero<stat>(this);
lstat(filename);
}

296
lib/fs.cc
View File

@@ -32,23 +32,19 @@ namespace crucible {
#endif
}
BtrfsExtentInfo::BtrfsExtentInfo(int dst_fd, off_t dst_offset) :
btrfs_ioctl_same_extent_info( (btrfs_ioctl_same_extent_info) { } )
BtrfsExtentInfo::BtrfsExtentInfo(int dst_fd, off_t dst_offset)
{
assert(fd == 0);
assert(logical_offset == 0);
assert(bytes_deduped == 0);
assert(status == 0);
assert(reserved == 0);
memset_zero<btrfs_ioctl_same_extent_info>(this);
fd = dst_fd;
logical_offset = dst_offset;
}
BtrfsExtentSame::BtrfsExtentSame(int src_fd, off_t src_offset, off_t src_length) :
m_logical_offset(src_offset),
m_length(src_length),
m_fd(src_fd)
{
memset_zero<btrfs_ioctl_same_args>(this);
logical_offset = src_offset;
length = src_length;
}
BtrfsExtentSame::~BtrfsExtentSame()
@@ -115,8 +111,11 @@ namespace crucible {
os << " '" << fd_name << "'";
});
}
os << ", .logical_offset = " << to_hex(bes.m_logical_offset);
os << ", .length = " << to_hex(bes.m_length);
os << ", .logical_offset = " << to_hex(bes.logical_offset);
os << ", .length = " << to_hex(bes.length);
os << ", .dest_count = " << bes.dest_count;
os << ", .reserved1 = " << bes.reserved1;
os << ", .reserved2 = " << bes.reserved2;
os << ", .info[] = {";
for (size_t i = 0; i < bes.m_info.size(); ++i) {
os << " [" << i << "] = " << &(bes.m_info[i]) << ",";
@@ -127,25 +126,22 @@ namespace crucible {
void
btrfs_clone_range(int src_fd, off_t src_offset, off_t src_length, int dst_fd, off_t dst_offset)
{
btrfs_ioctl_clone_range_args args ( (btrfs_ioctl_clone_range_args) {
.src_fd = src_fd,
.src_offset = ranged_cast<uint64_t, off_t>(src_offset),
.src_length = ranged_cast<uint64_t, off_t>(src_length),
.dest_offset = ranged_cast<uint64_t, off_t>(dst_offset),
} );
struct btrfs_ioctl_clone_range_args args;
memset_zero(&args);
args.src_fd = src_fd;
args.src_offset = src_offset;
args.src_length = src_length;
args.dest_offset = dst_offset;
DIE_IF_MINUS_ONE(ioctl(dst_fd, BTRFS_IOC_CLONE_RANGE, &args));
}
void
BtrfsExtentSame::do_ioctl()
{
const size_t buf_size = sizeof(btrfs_ioctl_same_args) + m_info.size() * sizeof(btrfs_ioctl_same_extent_info);
ByteVector ioctl_arg( (btrfs_ioctl_same_args) {
.logical_offset = m_logical_offset,
.length = m_length,
.dest_count = ranged_cast<decltype(btrfs_ioctl_same_args::dest_count)>(m_info.size()),
}, buf_size);
btrfs_ioctl_same_args *const ioctl_ptr = ioctl_arg.get<btrfs_ioctl_same_args>();
dest_count = m_info.size();
vector<uint8_t> ioctl_arg = vector_copy_struct<btrfs_ioctl_same_args>(this);
ioctl_arg.resize(sizeof(btrfs_ioctl_same_args) + dest_count * sizeof(btrfs_ioctl_same_extent_info), 0);
btrfs_ioctl_same_args *ioctl_ptr = reinterpret_cast<btrfs_ioctl_same_args *>(ioctl_arg.data());
size_t count = 0;
for (auto i = m_info.cbegin(); i != m_info.cend(); ++i) {
ioctl_ptr->info[count] = static_cast<const btrfs_ioctl_same_extent_info &>(m_info[count]);
@@ -198,15 +194,18 @@ namespace crucible {
void *
BtrfsDataContainer::prepare(size_t container_size)
{
if (m_data.size() < container_size) {
m_data.resize(container_size);
}
btrfs_data_container *p = reinterpret_cast<btrfs_data_container *>(m_data.data());
const size_t min_size = offsetof(btrfs_data_container, val);
if (container_size < min_size) {
THROW_ERROR(out_of_range, "container size " << container_size << " smaller than minimum " << min_size);
}
if (m_data.size() < container_size) {
m_data = ByteVector(container_size);
}
const auto p = m_data.get<btrfs_data_container>();
*p = (btrfs_data_container) { };
p->bytes_left = 0;
p->bytes_missing = 0;
p->elem_cnt = 0;
p->elem_missed = 0;
return p;
}
@@ -219,29 +218,25 @@ namespace crucible {
decltype(btrfs_data_container::bytes_left)
BtrfsDataContainer::get_bytes_left() const
{
const auto p = m_data.get<btrfs_data_container>();
return p->bytes_left;
return bytes_left;
}
decltype(btrfs_data_container::bytes_missing)
BtrfsDataContainer::get_bytes_missing() const
{
const auto p = m_data.get<btrfs_data_container>();
return p->bytes_missing;
return bytes_missing;
}
decltype(btrfs_data_container::elem_cnt)
BtrfsDataContainer::get_elem_cnt() const
{
const auto p = m_data.get<btrfs_data_container>();
return p->elem_cnt;
return elem_cnt;
}
decltype(btrfs_data_container::elem_missed)
BtrfsDataContainer::get_elem_missed() const
{
const auto p = m_data.get<btrfs_data_container>();
return p->elem_missed;
return elem_missed;
}
ostream &
@@ -262,13 +257,10 @@ namespace crucible {
}
BtrfsIoctlLogicalInoArgs::BtrfsIoctlLogicalInoArgs(uint64_t new_logical, size_t new_size) :
btrfs_ioctl_logical_ino_args( (btrfs_ioctl_logical_ino_args) { } ),
m_container_size(new_size),
m_container(new_size)
{
assert(logical == 0);
assert(size == 0);
assert(flags == 0);
memset_zero<btrfs_ioctl_logical_ino_args>(this);
logical = new_logical;
}
@@ -336,7 +328,7 @@ namespace crucible {
bool
BtrfsIoctlLogicalInoArgs::do_ioctl_nothrow(int fd)
{
btrfs_ioctl_logical_ino_args *const p = static_cast<btrfs_ioctl_logical_ino_args *>(this);
btrfs_ioctl_logical_ino_args *p = static_cast<btrfs_ioctl_logical_ino_args *>(this);
inodes = reinterpret_cast<uint64_t>(m_container.prepare(m_container_size));
size = m_container.get_size();
@@ -375,8 +367,8 @@ namespace crucible {
bili_version = BTRFS_IOC_LOGICAL_INO_V2;
}
btrfs_data_container *const bdc = reinterpret_cast<btrfs_data_container *>(p->inodes);
BtrfsInodeOffsetRoot *const input_iter = reinterpret_cast<BtrfsInodeOffsetRoot *>(bdc->val);
btrfs_data_container *bdc = reinterpret_cast<btrfs_data_container *>(p->inodes);
BtrfsInodeOffsetRoot *input_iter = reinterpret_cast<BtrfsInodeOffsetRoot *>(bdc->val);
// elem_cnt counts uint64_t, but BtrfsInodeOffsetRoot is 3x uint64_t
THROW_CHECK1(runtime_error, bdc->elem_cnt, bdc->elem_cnt % 3 == 0);
@@ -404,10 +396,9 @@ namespace crucible {
}
BtrfsIoctlInoPathArgs::BtrfsIoctlInoPathArgs(uint64_t inode, size_t new_size) :
btrfs_ioctl_ino_path_args( (btrfs_ioctl_ino_path_args) { } ),
m_container_size(new_size)
{
assert(inum == 0);
memset_zero<btrfs_ioctl_ino_path_args>(this);
inum = inode;
}
@@ -425,14 +416,14 @@ namespace crucible {
return false;
}
btrfs_data_container *const bdc = reinterpret_cast<btrfs_data_container *>(p->fspath);
btrfs_data_container *bdc = reinterpret_cast<btrfs_data_container *>(p->fspath);
m_paths.reserve(bdc->elem_cnt);
const uint64_t *up = reinterpret_cast<const uint64_t *>(bdc->val);
const char *const cp = reinterpret_cast<const char *>(bdc->val);
const char *cp = reinterpret_cast<const char *>(bdc->val);
for (auto count = bdc->elem_cnt; count > 0; --count) {
const char *const path = cp + *up++;
const char *path = cp + *up++;
if (static_cast<size_t>(path - cp) > container.get_size()) {
THROW_ERROR(out_of_range, "offset " << (path - cp) << " > size " << container.get_size() << " in " << __PRETTY_FUNCTION__);
}
@@ -467,10 +458,9 @@ namespace crucible {
return os;
}
BtrfsIoctlInoLookupArgs::BtrfsIoctlInoLookupArgs(uint64_t new_objectid) :
btrfs_ioctl_ino_lookup_args( (btrfs_ioctl_ino_lookup_args) { } )
BtrfsIoctlInoLookupArgs::BtrfsIoctlInoLookupArgs(uint64_t new_objectid)
{
assert(objectid == 0);
memset_zero<btrfs_ioctl_ino_lookup_args>(this);
objectid = new_objectid;
}
@@ -488,9 +478,9 @@ namespace crucible {
}
}
BtrfsIoctlDefragRangeArgs::BtrfsIoctlDefragRangeArgs() :
btrfs_ioctl_defrag_range_args( (btrfs_ioctl_defrag_range_args) { } )
BtrfsIoctlDefragRangeArgs::BtrfsIoctlDefragRangeArgs()
{
memset_zero<btrfs_ioctl_defrag_range_args>(this);
}
bool
@@ -547,9 +537,9 @@ namespace crucible {
return os;
}
FiemapExtent::FiemapExtent() :
fiemap_extent( (fiemap_extent) { } )
FiemapExtent::FiemapExtent()
{
memset_zero<fiemap_extent>(this);
}
FiemapExtent::FiemapExtent(const fiemap_extent &that)
@@ -656,10 +646,13 @@ namespace crucible {
operator<<(ostream &os, const Fiemap &args)
{
os << "Fiemap {";
os << " .m_start = " << to_hex(args.m_start) << ".." << to_hex(args.m_start + args.m_length);
os << ", .m_length = " << to_hex(args.m_length);
os << ", .m_flags = " << fiemap_flags_ntoa(args.m_flags);
os << ", .fm_extents[" << args.m_extents.size() << "] = {";
os << " .fm_start = " << to_hex(args.fm_start) << ".." << to_hex(args.fm_start + args.fm_length);
os << ", .fm_length = " << to_hex(args.fm_length);
if (args.fm_flags) os << ", .fm_flags = " << fiemap_flags_ntoa(args.fm_flags);
os << ", .fm_mapped_extents = " << args.fm_mapped_extents;
os << ", .fm_extent_count = " << args.fm_extent_count;
if (args.fm_reserved) os << ", .fm_reserved = " << args.fm_reserved;
os << ", .fm_extents[] = {";
size_t count = 0;
for (auto i = args.m_extents.cbegin(); i != args.m_extents.cend(); ++i) {
os << "\n\t[" << count++ << "] = " << &(*i) << ",";
@@ -667,35 +660,41 @@ namespace crucible {
return os << "\n}";
}
Fiemap::Fiemap(uint64_t start, uint64_t length) :
m_start(start),
m_length(length)
Fiemap::Fiemap(uint64_t start, uint64_t length)
{
memset_zero<fiemap>(this);
fm_start = start;
fm_length = length;
// FIEMAP is slow and full of lies.
// This makes FIEMAP even slower, but reduces the lies a little.
fm_flags = FIEMAP_FLAG_SYNC;
}
void
Fiemap::do_ioctl(int fd)
{
THROW_CHECK1(out_of_range, m_min_count, m_min_count <= m_max_count);
THROW_CHECK1(out_of_range, m_min_count, m_min_count > 0);
const auto extent_count = m_min_count;
ByteVector ioctl_arg(sizeof(fiemap) + extent_count * sizeof(fiemap_extent));
auto extent_count = m_min_count;
vector<uint8_t> ioctl_arg = vector_copy_struct<fiemap>(this);
fiemap *const ioctl_ptr = ioctl_arg.get<fiemap>();
ioctl_arg.resize(sizeof(fiemap) + extent_count * sizeof(fiemap_extent), 0);
auto start = m_start;
const auto end = m_start + m_length;
fiemap *ioctl_ptr = reinterpret_cast<fiemap *>(ioctl_arg.data());
auto start = fm_start;
auto end = fm_start + fm_length;
auto orig_start = fm_start;
auto orig_length = fm_length;
vector<FiemapExtent> extents;
while (start < end && extents.size() < m_max_count) {
*ioctl_ptr = (fiemap) {
.fm_start = start,
.fm_length = end - start,
.fm_flags = m_flags,
.fm_extent_count = extent_count,
};
ioctl_ptr->fm_start = start;
ioctl_ptr->fm_length = end - start;
ioctl_ptr->fm_extent_count = extent_count;
ioctl_ptr->fm_mapped_extents = 0;
// cerr << "Before (fd = " << fd << ") : " << ioctl_ptr << endl;
DIE_IF_MINUS_ONE(ioctl(fd, FS_IOC_FIEMAP, ioctl_ptr));
@@ -721,89 +720,74 @@ namespace crucible {
}
}
fiemap *this_ptr = static_cast<fiemap *>(this);
*this_ptr = *ioctl_ptr;
fm_start = orig_start;
fm_length = orig_length;
fm_extent_count = extents.size();
m_extents = extents;
}
BtrfsIoctlSearchKey::BtrfsIoctlSearchKey(size_t buf_size) :
btrfs_ioctl_search_key( (btrfs_ioctl_search_key) {
.max_objectid = numeric_limits<decltype(max_objectid)>::max(),
.max_offset = numeric_limits<decltype(max_offset)>::max(),
.max_transid = numeric_limits<decltype(max_transid)>::max(),
.max_type = numeric_limits<decltype(max_type)>::max(),
.nr_items = 1,
}),
m_buf_size(buf_size)
{
memset_zero<btrfs_ioctl_search_key>(this);
max_objectid = numeric_limits<decltype(max_objectid)>::max();
max_offset = numeric_limits<decltype(max_offset)>::max();
max_transid = numeric_limits<decltype(max_transid)>::max();
max_type = numeric_limits<decltype(max_type)>::max();
nr_items = numeric_limits<decltype(nr_items)>::max();
}
BtrfsIoctlSearchHeader::BtrfsIoctlSearchHeader() :
btrfs_ioctl_search_header( (btrfs_ioctl_search_header) { } )
BtrfsIoctlSearchHeader::BtrfsIoctlSearchHeader()
{
memset_zero<btrfs_ioctl_search_header>(this);
}
size_t
BtrfsIoctlSearchHeader::set_data(const ByteVector &v, size_t offset)
BtrfsIoctlSearchHeader::set_data(const vector<uint8_t> &v, size_t offset)
{
THROW_CHECK2(invalid_argument, offset, v.size(), offset + sizeof(btrfs_ioctl_search_header) <= v.size());
memcpy(static_cast<btrfs_ioctl_search_header *>(this), &v[offset], sizeof(btrfs_ioctl_search_header));
offset += sizeof(btrfs_ioctl_search_header);
THROW_CHECK2(invalid_argument, offset + len, v.size(), offset + len <= v.size());
m_data = ByteVector(v, offset, len);
m_data = Spanner<const uint8_t>(&v[offset], &v[offset + len]);
return offset + len;
}
bool
BtrfsIoctlSearchKey::do_ioctl_nothrow(int fd)
{
// It would be really nice if the kernel tells us whether our
// buffer overflowed or how big the overflowing object
// was; instead, we have to guess.
// Normally we like to be paranoid and fill empty bytes with zero,
// but these buffers can be huge. 80% of a 4GHz CPU huge.
// Keep the ioctl buffer from one run to the next to save on malloc costs
size_t target_buf_size = sizeof(btrfs_ioctl_search_args_v2) + m_buf_size;
m_ioctl_arg = vector_copy_struct<btrfs_ioctl_search_key>(this);
m_ioctl_arg.resize(target_buf_size);
m_result.clear();
// Make sure there is space for at least the search key and one (empty) header
size_t buf_size = max(m_buf_size, sizeof(btrfs_ioctl_search_args_v2) + sizeof(btrfs_ioctl_search_header));
ByteVector ioctl_arg;
btrfs_ioctl_search_args_v2 *ioctl_ptr;
do {
// ioctl buffer size does not include search key header or buffer size
ioctl_arg = ByteVector(buf_size + sizeof(btrfs_ioctl_search_args_v2));
ioctl_ptr = ioctl_arg.get<btrfs_ioctl_search_args_v2>();
ioctl_ptr->key = static_cast<const btrfs_ioctl_search_key&>(*this);
ioctl_ptr->buf_size = buf_size;
btrfs_ioctl_search_args_v2 *ioctl_ptr = reinterpret_cast<btrfs_ioctl_search_args_v2 *>(m_ioctl_arg.data());
ioctl_ptr->buf_size = m_buf_size;
// Don't bother supporting V1. Kernels that old have other problems.
int rv = ioctl(fd, BTRFS_IOC_TREE_SEARCH_V2, ioctl_arg.data());
if (rv != 0 && errno != EOVERFLOW) {
int rv = ioctl(fd, BTRFS_IOC_TREE_SEARCH_V2, ioctl_ptr);
if (rv != 0) {
return false;
}
if (rv == 0 && nr_items <= ioctl_ptr->key.nr_items) {
// got all the items we wanted, thanks
m_buf_size = max(m_buf_size, buf_size);
break;
}
// Didn't get all the items we wanted. Increase the buf size and try again.
// These sizes are very common on default-formatted btrfs, so use these
// instead of naive doubling.
if (buf_size < 4096) {
buf_size = 4096;
} else if (buf_size < 16384) {
buf_size = 16384;
} else if (buf_size < 65536) {
buf_size = 65536;
} else {
buf_size *= 2;
}
// don't automatically raise the buf size higher than 64K, the largest possible btrfs item
} while (buf_size < 65536);
// ioctl changes nr_items, this has to be copied back
static_cast<btrfs_ioctl_search_key&>(*this) = ioctl_ptr->key;
size_t offset = pointer_distance(ioctl_ptr->buf, ioctl_ptr);
for (decltype(nr_items) i = 0; i < nr_items; ++i) {
BtrfsIoctlSearchHeader item;
offset = item.set_data(ioctl_arg, offset);
offset = item.set_data(m_ioctl_arg, offset);
m_result.insert(item);
}
return true;
}
@@ -811,7 +795,7 @@ namespace crucible {
BtrfsIoctlSearchKey::do_ioctl(int fd)
{
if (!do_ioctl_nothrow(fd)) {
THROW_ERRNO("BTRFS_IOC_TREE_SEARCH_V2: " << name_fd(fd) << ": " << *this);
THROW_ERRNO("BTRFS_IOC_TREE_SEARCH_V2: " << name_fd(fd));
}
}
@@ -822,46 +806,8 @@ namespace crucible {
min_type = ref.type;
min_offset = ref.offset + 1;
if (min_offset < ref.offset) {
// We wrapped, try the next type
++min_type;
assert(min_offset == 0);
if (min_type < ref.type) {
assert(min_type == 0);
// We wrapped, try the next objectid
++min_objectid;
// no advancement possible at end
THROW_CHECK1(runtime_error, min_type, min_type == 0);
}
}
}
// Move the lower bound of the search (min_objectid, min_type, min_offset)
// past the item described by `ref`, keeping min_type equal to `type`.
// Three cases, depending on how ref.type compares with `type`:
//   ref.type < type:  stay in ref's objectid, start of `type` (offset 0)
//   ref.type > type:  go to the next objectid, start of `type` (offset 0)
//   ref.type == type: same objectid and type, offset one past ref.offset;
//                     if that increment wraps to 0, go to the next objectid
// The THROW_CHECK2 calls (out_of_range) guard the objectid increments against
// wrapping, i.e. against advancing past the last possible objectid.
void
BtrfsIoctlSearchKey::next_min(const BtrfsIoctlSearchHeader &ref, const uint8_t type)
{
if (ref.type < type) {
// forward to type in same object with zero offset
min_objectid = ref.objectid;
min_type = type;
min_offset = 0;
} else if (ref.type > type) {
// skip directly to start of next objectid with target type
min_objectid = ref.objectid + 1;
// no advancement possible at end
THROW_CHECK2(out_of_range, min_objectid, ref.objectid, min_objectid > ref.objectid);
min_type = type;
min_offset = 0;
} else {
// advance within this type
min_objectid = ref.objectid;
min_type = ref.type;
min_offset = ref.offset + 1;
if (min_offset < ref.offset) {
// We wrapped, try the next objectid, same type
++min_objectid;
THROW_CHECK2(out_of_range, min_objectid, ref.objectid, min_objectid > ref.objectid);
min_type = type;
assert(min_offset == 0);
}
}
}
@@ -869,7 +815,7 @@ namespace crucible {
ostream &
hexdump(ostream &os, const V &v)
{
os << "V { size = " << v.size() << ", data:\n";
os << "vector<uint8_t> { size = " << v.size() << ", data:\n";
for (size_t i = 0; i < v.size(); i += 8) {
string hex, ascii;
for (size_t j = i; j < i + 8; ++j) {
@@ -1083,9 +1029,9 @@ namespace crucible {
return rv;
}
Statvfs::Statvfs() :
statvfs( (statvfs) { } )
Statvfs::Statvfs()
{
memset_zero<statvfs>(this);
}
Statvfs::Statvfs(int fd) :
@@ -1136,20 +1082,16 @@ namespace crucible {
return os << " }";
};
BtrfsIoctlFsInfoArgs::BtrfsIoctlFsInfoArgs() :
btrfs_ioctl_fs_info_args_v3( (btrfs_ioctl_fs_info_args_v3) {
.flags = 0
| BTRFS_FS_INFO_FLAG_CSUM_INFO
| BTRFS_FS_INFO_FLAG_GENERATION
,
})
BtrfsIoctlFsInfoArgs::BtrfsIoctlFsInfoArgs()
{
memset_zero<btrfs_ioctl_fs_info_args_v2>(this);
flags = BTRFS_FS_INFO_FLAG_CSUM_INFO;
}
void
BtrfsIoctlFsInfoArgs::do_ioctl(int fd)
{
btrfs_ioctl_fs_info_args_v3 *p = static_cast<btrfs_ioctl_fs_info_args_v3 *>(this);
btrfs_ioctl_fs_info_args_v2 *p = static_cast<btrfs_ioctl_fs_info_args_v2 *>(this);
if (ioctl(fd, BTRFS_IOC_FS_INFO, p)) {
THROW_ERRNO("BTRFS_IOC_FS_INFO: fd " << fd);
}
@@ -1158,19 +1100,13 @@ namespace crucible {
uint16_t
BtrfsIoctlFsInfoArgs::csum_type() const
{
return this->btrfs_ioctl_fs_info_args_v3::csum_type;
return this->btrfs_ioctl_fs_info_args_v2::csum_type;
}
uint16_t
BtrfsIoctlFsInfoArgs::csum_size() const
{
return this->btrfs_ioctl_fs_info_args_v3::csum_size;
}
uint64_t
BtrfsIoctlFsInfoArgs::generation() const
{
return this->btrfs_ioctl_fs_info_args_v3::generation;
return this->btrfs_ioctl_fs_info_args_v2::csum_size;
}
};

View File

@@ -89,7 +89,6 @@ namespace crucible {
TaskState &operator=(const TaskState &) = delete;
TaskState(const TaskState &) = delete;
TaskState(TaskState &&) = delete;
public:
~TaskState();
@@ -200,11 +199,7 @@ namespace crucible {
tlcc->m_local_queue.splice(tlcc->m_local_queue.begin(), queue);
} else {
// We are not executing under a TaskConsumer.
// If there is only one task, then just insert it at the front of the queue.
if (queue.size() == 1) {
TaskMasterState::push_front(queue);
} else {
// If there are multiple tasks, create a new task to wrap our post-exec queue,
// Create a new task to wrap our post-exec queue,
// then push it to the front of the global queue using normal locking methods.
TaskStatePtr rescue_task(make_shared<TaskState>("rescue_task", [](){}));
swap(rescue_task->m_post_exec_queue, queue);
@@ -212,8 +207,6 @@ namespace crucible {
TaskMasterState::push_front(tq_one);
}
}
assert(queue.empty());
}
TaskState::~TaskState()
{
@@ -293,23 +286,23 @@ namespace crucible {
--m_run_count;
m_is_running = true;
}
TaskStatePtr this_task = shared_from_this();
swap(this_task, tl_current_task);
lock.unlock();
char buf[24] = { 0 };
DIE_IF_MINUS_ERRNO(pthread_getname_np(pthread_self(), buf, sizeof(buf)));
DIE_IF_MINUS_ERRNO(pthread_setname_np(pthread_self(), m_title.c_str()));
TaskStatePtr this_task = shared_from_this();
swap(this_task, tl_current_task);
catch_all([&]() {
m_exec_fn();
});
swap(this_task, tl_current_task);
pthread_setname_np(pthread_self(), buf);
lock.lock();
swap(this_task, tl_current_task);
m_is_running = false;
// Splice task post_exec queue at front of local queue
@@ -756,7 +749,6 @@ namespace crucible {
// There is no longer a current consumer, but hold our own shared
// state so it's still there in the destructor
swap(this_consumer, tl_current_consumer);
assert(!tl_current_consumer);
// Release lock to rescue queue (may attempt to queue a new task at TaskMaster).
// rescue_queue normally sends tasks to the local queue of the current TaskConsumer thread,

View File

@@ -1,11 +0,0 @@
#include "crucible/error.h"
#include "crucible/uname.h"
namespace crucible {
using namespace std;
// Fill the inherited utsname fields by calling uname() on this object's
// utsname base.  The return value is passed to DIE_IF_NON_ZERO (presumably
// this raises an error on a non-zero result -- confirm in crucible/error.h).
Uname::Uname()
{
DIE_IF_NON_ZERO(uname(static_cast<utsname*>(this)));
}
}

View File

@@ -10,4 +10,4 @@ CCFLAGS = -Wall -Wextra -Werror -O3
CCFLAGS += -I../include -D_FILE_OFFSET_BITS=64
BEES_CFLAGS = $(CCFLAGS) -std=c99 $(CFLAGS)
BEES_CXXFLAGS = $(CCFLAGS) -std=c++11 -Wold-style-cast -Wno-missing-field-initializers $(CXXFLAGS)
BEES_CXXFLAGS = $(CCFLAGS) -std=c++11 -Wold-style-cast $(CXXFLAGS)

View File

@@ -17,7 +17,6 @@ KillSignal=SIGTERM
MemoryAccounting=true
Nice=19
Restart=on-abnormal
RuntimeDirectory=bees
StartupCPUWeight=25
StartupIOWeight=25

View File

@@ -187,20 +187,20 @@ BeesContext::is_root_ro(uint64_t root)
}
bool
BeesContext::dedup(const BeesRangePair &brp_in)
BeesContext::dedup(const BeesRangePair &brp)
{
// TOOLONG and NOTE can retroactively fill in the filename details, but LOG can't
BEESNOTE("dedup " << brp_in);
BEESNOTE("dedup " << brp);
if (is_root_ro(brp_in.second.fid().root())) {
// BEESLOGDEBUG("WORKAROUND: dst root " << (brp_in.second.fid().root()) << " is read-only);
brp.second.fd(shared_from_this());
if (is_root_ro(brp.second.fid().root())) {
// BEESLOGDEBUG("WORKAROUND: dst root is read-only in " << name_fd(brp.second.fd()));
BEESCOUNT(dedup_workaround_btrfs_send);
return false;
}
auto brp = brp_in;
brp.first.fd(shared_from_this());
brp.second.fd(shared_from_this());
BEESTOOLONG("dedup " << brp);
@@ -209,8 +209,6 @@ BeesContext::dedup(const BeesRangePair &brp_in)
BEESLOGINFO("dedup: src " << pretty(brp.first.size()) << " [" << to_hex(brp.first.begin()) << ".." << to_hex(brp.first.end()) << "] {" << first_addr << "} " << name_fd(brp.first.fd()) << "\n"
<< " dst " << pretty(brp.second.size()) << " [" << to_hex(brp.second.begin()) << ".." << to_hex(brp.second.end()) << "] {" << second_addr << "} " << name_fd(brp.second.fd()));
BEESNOTE("dedup: src " << pretty(brp.first.size()) << " [" << to_hex(brp.first.begin()) << ".." << to_hex(brp.first.end()) << "] {" << first_addr << "} " << name_fd(brp.first.fd()) << "\n"
<< " dst " << pretty(brp.second.size()) << " [" << to_hex(brp.second.begin()) << ".." << to_hex(brp.second.end()) << "] {" << second_addr << "} " << name_fd(brp.second.fd()));
if (first_addr.get_physical_or_zero() == second_addr.get_physical_or_zero()) {
BEESLOGTRACE("equal physical addresses in dedup");
@@ -294,15 +292,6 @@ BeesContext::scan_one_extent(const BeesFileRange &bfr, const Extent &e)
BEESTRACE("scan extent " << e);
BEESCOUNT(scan_extent);
// EXPERIMENT: Don't bother with tiny extents unless they are the entire file.
// We'll take a tiny extent at BOF or EOF but not in between.
if (e.begin() && e.size() < 128 * 1024 && e.end() != Stat(bfr.fd()).st_size) {
BEESCOUNT(scan_extent_tiny);
// This doesn't work properly with the current architecture,
// so we don't do an early return here.
// return bfr;
}
// We keep moving this method around
auto m_ctx = shared_from_this();
@@ -719,28 +708,27 @@ BeesContext::scan_one_extent(const BeesFileRange &bfr, const Extent &e)
BEESLOGINFO("scan: " << pretty(e.size()) << " " << to_hex(e.begin()) << " [" << bar << "] " << to_hex(e.end()) << ' ' << name_fd(bfr.fd()));
}
// Costs 10% on benchmarks
// bees_unreadahead(bfr.fd(), bfr.begin(), bfr.size());
return bfr;
}
BeesFileRange
BeesContext::scan_forward(const BeesFileRange &bfr_in)
BeesContext::scan_forward(const BeesFileRange &bfr)
{
BEESTRACE("scan_forward " << bfr_in);
// What are we doing here?
BEESTRACE("scan_forward " << bfr);
BEESCOUNT(scan_forward);
Timer scan_timer;
// Silently filter out blacklisted files
if (is_blacklisted(bfr_in.fid())) {
if (is_blacklisted(bfr.fid())) {
BEESCOUNT(scan_blacklisted);
return bfr_in;
return bfr;
}
BEESNOTE("scan open " << bfr);
// Reconstitute FD
BEESNOTE("scan open " << bfr_in);
auto bfr = bfr_in;
bfr.fd(shared_from_this());
BEESNOTE("scan extent " << bfr);
@@ -808,7 +796,8 @@ BeesContext::wait_for_balance()
Timer balance_timer;
BEESNOTE("WORKAROUND: waiting for balance to stop");
while (true) {
btrfs_ioctl_balance_args args {};
btrfs_ioctl_balance_args args;
memset_zero<btrfs_ioctl_balance_args>(&args);
const int ret = ioctl(root_fd(), BTRFS_IOC_BALANCE_PROGRESS, &args);
if (ret < 0) {
// Either can't get balance status or not running, exit either way
@@ -846,6 +835,24 @@ BeesContext::resolve_addr_uncached(BeesAddress addr)
// transaction latency, competing threads, and freeze/SIGSTOP
// pausing the bees process.
// There can be only one of these running at a time, or some lingering
// backref bug will kill the whole system. Also it looks like there
// are so many locks held while LOGICAL_INO runs that there is no
// point in trying to run two of them on the same filesystem.
// ...but it works most of the time, and the performance hit from
// not running resolve in multiple threads is significant.
// But "most of the time" really just means "between forced reboots",
// and with recent improvements in kernel uptime, this is now in the
// top 3 crash causes.
static mutex s_resolve_mutex;
unique_lock<mutex> lock(s_resolve_mutex, defer_lock);
if (BEES_SERIALIZE_RESOLVE) {
BEESNOTE("waiting to resolve addr " << addr);
lock.lock();
}
// Is there a bug where resolve and balance cause a crash (BUG_ON at fs/btrfs/ctree.c:1227)?
// Apparently yes, and more than one.
// Wait for the balance to finish before we run LOGICAL_INO
wait_for_balance();
@@ -873,15 +880,15 @@ BeesContext::resolve_addr_uncached(BeesAddress addr)
struct rusage usage_after;
DIE_IF_MINUS_ONE(getrusage(RUSAGE_THREAD, &usage_after));
const double sys_usage_delta =
double sys_usage_delta =
(usage_after.ru_stime.tv_sec + usage_after.ru_stime.tv_usec / 1000000.0) -
(usage_before.ru_stime.tv_sec + usage_before.ru_stime.tv_usec / 1000000.0);
const double user_usage_delta =
double user_usage_delta =
(usage_after.ru_utime.tv_sec + usage_after.ru_utime.tv_usec / 1000000.0) -
(usage_before.ru_utime.tv_sec + usage_before.ru_utime.tv_usec / 1000000.0);
const auto rt_age = resolve_timer.age();
auto rt_age = resolve_timer.age();
BeesResolveAddrResult rv;
@@ -905,13 +912,12 @@ BeesContext::resolve_addr_uncached(BeesAddress addr)
// Count how many times this happens so we can figure out how
// important this case is
static const size_t max_logical_ino_v1_refs = 2730; // (65536 - header_len) / (sizeof(uint64_t) * 3)
static size_t most_refs_ever = max_logical_ino_v1_refs;
static size_t most_refs_ever = 2730;
if (rv_count > most_refs_ever) {
BEESLOGINFO("addr " << addr << " refs " << rv_count << " beats previous record " << most_refs_ever);
most_refs_ever = rv_count;
}
if (rv_count > max_logical_ino_v1_refs) {
if (rv_count > 2730) {
BEESCOUNT(resolve_large);
}
@@ -1054,13 +1060,9 @@ BeesContext::stop()
BEESLOGDEBUG("Waiting for progress thread");
m_progress_thread->join();
// Write status once with this message...
BEESNOTE("stopping status thread at " << stop_timer << " sec");
lock.lock();
m_stop_condvar.notify_all();
lock.unlock();
// then wake the thread up one more time to exit the while loop
// XXX: nobody can see this BEESNOTE because we are killing the
// thread that publishes it
BEESNOTE("waiting for status thread");
BEESLOGDEBUG("Waiting for status thread");
lock.lock();
m_stop_status = true;

View File

@@ -3,9 +3,9 @@
#include "crucible/city.h"
#include "crucible/crc64.h"
#include "crucible/string.h"
#include "crucible/uname.h"
#include <algorithm>
#include <random>
#include <sys/mman.h>
@@ -123,7 +123,7 @@ BeesHashTable::flush_dirty_extent(uint64_t extent_index)
THROW_CHECK2(out_of_range, dirty_extent_end, dirty_extent, dirty_extent_end - dirty_extent == BLOCK_SIZE_HASHTAB_EXTENT);
BEESTOOLONG("pwrite(fd " << m_fd << " '" << name_fd(m_fd)<< "', length " << to_hex(dirty_extent_end - dirty_extent) << ", offset " << to_hex(dirty_extent - m_byte_ptr) << ")");
// Copy the extent because we might be stuck writing for a while
ByteVector extent_copy(dirty_extent, dirty_extent_end);
vector<uint8_t> extent_copy(dirty_extent, dirty_extent_end);
// Mark extent non-dirty while we still hold the lock
m_extent_metadata.at(extent_index).m_dirty = false;
@@ -206,11 +206,8 @@ BeesHashTable::writeback_loop()
}
catch_all([&]() {
// trigger writeback on our way out
#if 0
// seems to trigger huge latency spikes
BEESTOOLONG("unreadahead hash table size " << pretty(m_size));
bees_unreadahead(m_fd, 0, m_size);
#endif
});
BEESLOGDEBUG("Exited hash table writeback_loop");
}
@@ -229,7 +226,6 @@ percent(size_t num, size_t den)
void
BeesHashTable::prefetch_loop()
{
Uname uname;
bool not_locked = true;
while (!m_stop_requested) {
size_t width = 64;
@@ -323,7 +319,6 @@ BeesHashTable::prefetch_loop()
graph_blob << "Now: " << format_time(time(NULL)) << "\n";
graph_blob << "Uptime: " << m_ctx->total_timer().age() << " seconds\n";
graph_blob << "Version: " << BEES_VERSION << "\n";
graph_blob << "Kernel: " << uname.sysname << " " << uname.release << " " << uname.machine << " " << uname.version << "\n";
graph_blob
<< "\nHash table page occupancy histogram (" << occupied_count << "/" << total_count << " cells occupied, " << (occupied_count * 100 / total_count) << "%)\n"
@@ -543,8 +538,6 @@ BeesHashTable::push_front_hash_addr(HashType hash, AddrType addr)
return found;
}
thread_local uniform_int_distribution<size_t> BeesHashTable::tl_distribution(0, c_cells_per_bucket - 1);
/// Insert a hash entry at some unspecified point in the list.
/// If entry is already present in list, returns true and does not
/// modify list. If entry is not present in list, returns false and
@@ -562,7 +555,9 @@ BeesHashTable::push_random_hash_addr(HashType hash, AddrType addr)
Cell *ip = find(er.first, er.second, mv);
bool found = (ip < er.second);
const auto pos = tl_distribution(bees_generator);
thread_local default_random_engine generator;
thread_local uniform_int_distribution<int> distribution(0, c_cells_per_bucket - 1);
auto pos = distribution(generator);
int case_cond = 0;
#if 0

View File

@@ -385,15 +385,14 @@ BeesResolver::for_each_extent_ref(BeesBlockData bbd, function<bool(const BeesFil
}
BeesFileRange
BeesResolver::replace_dst(const BeesFileRange &dst_bfr_in)
BeesResolver::replace_dst(const BeesFileRange &dst_bfr)
{
BEESTRACE("replace_dst dst_bfr " << dst_bfr_in);
BEESTRACE("replace_dst dst_bfr " << dst_bfr);
BEESCOUNT(replacedst_try);
// Open dst, reuse it for all src
BEESNOTE("Opening dst bfr " << dst_bfr_in);
BEESTRACE("Opening dst bfr " << dst_bfr_in);
auto dst_bfr = dst_bfr_in;
BEESNOTE("Opening dst bfr " << dst_bfr);
BEESTRACE("Opening dst bfr " << dst_bfr);
dst_bfr.fd(m_ctx);
BeesFileRange overlap_bfr;
@@ -401,11 +400,10 @@ BeesResolver::replace_dst(const BeesFileRange &dst_bfr_in)
BeesBlockData bbd(dst_bfr);
for_each_extent_ref(bbd, [&](const BeesFileRange &src_bfr_in) -> bool {
for_each_extent_ref(bbd, [&](const BeesFileRange &src_bfr) -> bool {
// Open src
BEESNOTE("Opening src bfr " << src_bfr_in);
BEESTRACE("Opening src bfr " << src_bfr_in);
auto src_bfr = src_bfr_in;
BEESNOTE("Opening src bfr " << src_bfr);
BEESTRACE("Opening src bfr " << src_bfr);
src_bfr.fd(m_ctx);
if (dst_bfr.overlaps(src_bfr)) {
@@ -420,7 +418,7 @@ BeesResolver::replace_dst(const BeesFileRange &dst_bfr_in)
BEESCOUNT(replacedst_same);
// stop looping here, all the other srcs will probably fail this test too
BeesTracer::set_silent();
throw runtime_error("FIXME: too many duplicate candidates, bailing out here");
throw runtime_error("FIXME: bailing out here, need to fix this further up the call stack");
}
// Make pair(src, dst)

View File

@@ -171,24 +171,16 @@ BeesRoots::crawl_state_erase(const BeesCrawlState &bcs)
uint64_t
BeesRoots::transid_min()
{
uint64_t rv = numeric_limits<uint64_t>::max();
uint64_t last_root = 0;
BEESNOTE("Calculating transid_min (" << rv << " so far, last_root " << last_root << ")");
BEESNOTE("Calculating transid_min");
unique_lock<mutex> lock(m_mutex);
if (m_root_crawl_map.empty()) {
return 0;
}
uint64_t rv = numeric_limits<uint64_t>::max();
const uint64_t max_rv = rv;
for (auto i : m_root_crawl_map) {
// Do not count subvols that are isolated by btrfs send workaround.
// They will not advance until the workaround is removed or they are set read-write.
catch_all([&](){
if (!is_root_ro(i.first)) {
rv = min(rv, i.second->get_state_end().m_min_transid);
}
});
last_root = i.first;
}
// If we get through this loop without setting rv, we'll create broken crawlers due to integer overflow.
THROW_CHECK2(runtime_error, rv, max_rv, max_rv > rv);
return rv;
@@ -209,7 +201,7 @@ BeesRoots::transid_max_nocache()
sk.min_objectid = sk.max_objectid = BTRFS_EXTENT_TREE_OBJECTID;
while (true) {
sk.nr_items = 4;
sk.nr_items = 1024;
BEESTRACE("transid_max search sk " << sk);
sk.do_ioctl(m_ctx->root_fd());
@@ -220,7 +212,7 @@ BeesRoots::transid_max_nocache()
// We are just looking for the highest transid on the filesystem.
// We don't care which object it comes from.
for (auto i : sk.m_result) {
sk.next_min(i, BTRFS_ROOT_ITEM_KEY);
sk.next_min(i);
if (i.transid > rv) {
rv = i.transid;
}
@@ -229,8 +221,6 @@ BeesRoots::transid_max_nocache()
// transid must be greater than zero, or we did something very wrong
THROW_CHECK1(runtime_error, rv, rv > 0);
// transid must be less than max, or we did something very wrong
THROW_CHECK1(runtime_error, rv, rv < numeric_limits<uint64_t>::max());
return rv;
}
@@ -634,6 +624,7 @@ BeesRoots::open_root_nocache(uint64_t rootid)
BEESTRACE("sk " << sk);
while (sk.min_objectid <= rootid) {
sk.nr_items = 1024;
sk.do_ioctl(m_ctx->root_fd());
if (sk.m_result.empty()) {
@@ -641,16 +632,16 @@ BeesRoots::open_root_nocache(uint64_t rootid)
}
for (auto i : sk.m_result) {
sk.next_min(i, BTRFS_ROOT_BACKREF_KEY);
sk.next_min(i);
if (i.type == BTRFS_ROOT_BACKREF_KEY && i.objectid == rootid) {
const auto dirid = btrfs_get_member(&btrfs_root_ref::dirid, i.m_data);
const auto name_len = btrfs_get_member(&btrfs_root_ref::name_len, i.m_data);
const auto name_start = sizeof(struct btrfs_root_ref);
const auto name_end = name_len + name_start;
auto dirid = btrfs_get_member(&btrfs_root_ref::dirid, i.m_data);
auto name_len = btrfs_get_member(&btrfs_root_ref::name_len, i.m_data);
auto name_start = sizeof(struct btrfs_root_ref);
auto name_end = name_len + name_start;
THROW_CHECK2(runtime_error, i.m_data.size(), name_end, i.m_data.size() >= name_end);
const string name(i.m_data.data() + name_start, i.m_data.data() + name_end);
string name(i.m_data.data() + name_start, i.m_data.data() + name_end);
const auto parent_rootid = i.offset;
auto parent_rootid = i.offset;
// BEESLOG("parent_rootid " << parent_rootid << " dirid " << dirid << " name " << name);
BEESTRACE("parent_rootid " << parent_rootid << " dirid " << dirid << " name " << name);
BEESCOUNT(root_parent_open_try);
@@ -770,6 +761,7 @@ BeesRoots::next_root(uint64_t root)
sk.min_objectid = root + 1;
while (true) {
sk.nr_items = 1024;
sk.do_ioctl(m_ctx->root_fd());
if (sk.m_result.empty()) {
@@ -777,7 +769,7 @@ BeesRoots::next_root(uint64_t root)
}
for (auto i : sk.m_result) {
sk.next_min(i, BTRFS_ROOT_BACKREF_KEY);
sk.next_min(i);
if (i.type == BTRFS_ROOT_BACKREF_KEY) {
// BEESLOGDEBUG("Found root " << i.objectid << " parent " << i.offset << " transid " << i.transid);
return i.objectid;
@@ -955,8 +947,8 @@ BeesCrawl::BeesCrawl(shared_ptr<BeesContext> ctx, BeesCrawlState initial_state)
bool
BeesCrawl::next_transid()
{
const auto roots = m_ctx->roots();
const auto next_transid = roots->transid_max();
auto roots = m_ctx->roots();
auto next_transid = roots->transid_max();
auto crawl_state = get_state_end();
// If we are already at transid_max then we are still finished
@@ -966,7 +958,7 @@ BeesCrawl::next_transid()
m_deferred = true;
} else {
// Log performance stats from the old crawl
const auto current_time = time(NULL);
auto current_time = time(NULL);
// Start new crawl
crawl_state.m_min_transid = crawl_state.m_max_transid;
@@ -1001,11 +993,25 @@ BeesCrawl::fetch_extents()
return next_transid();
}
// Check for btrfs send workaround: don't scan RO roots at all, pretend
// they are just empty. We can't free any space there, and we
// don't have the necessary analysis logic to be able to use
// them as dedupe src extents (yet).
//
// This will keep the max_transid up to date so if the root
// is ever switched back to read-write, it won't trigger big
// expensive in-kernel searches for ancient transids.
if (m_ctx->is_root_ro(old_state.m_root)) {
BEESLOGDEBUG("WORKAROUND: skipping scan of RO root " << old_state.m_root);
BEESCOUNT(root_workaround_btrfs_send);
return next_transid();
}
BEESNOTE("crawling " << get_state_end());
Timer crawl_timer;
BtrfsIoctlSearchKey sk;
BtrfsIoctlSearchKey sk(BEES_MAX_CRAWL_BYTES);
sk.tree_id = old_state.m_root;
sk.min_objectid = old_state.m_objectid;
sk.min_type = sk.max_type = BTRFS_EXTENT_DATA_KEY;
@@ -1013,7 +1019,7 @@ BeesCrawl::fetch_extents()
sk.min_transid = old_state.m_min_transid;
// Don't set max_transid to m_max_transid here. See below.
sk.max_transid = numeric_limits<uint64_t>::max();
sk.nr_items = 4;
sk.nr_items = BEES_MAX_CRAWL_ITEMS;
// Lock in the old state
set_state(old_state);
@@ -1041,43 +1047,6 @@ BeesCrawl::fetch_extents()
return next_transid();
}
// Check for btrfs send workaround: don't scan RO roots at all, pretend
// they are just empty. We can't free any space there, and we
// don't have the necessary analysis logic to be able to use
// them as dedupe src extents (yet).
bool ro_root = true;
catch_all([&](){
ro_root = m_ctx->is_root_ro(old_state.m_root);
});
if (ro_root) {
BEESLOGDEBUG("WORKAROUND: skipping scan of RO root " << old_state.m_root);
BEESCOUNT(root_workaround_btrfs_send);
// We would call next_transid() here, but we want to do a few things differently.
// We immediately defer further crawling on this subvol.
// We track max_transid if the subvol scan has never started.
// We postpone the started timestamp since we haven't started.
auto crawl_state = get_state_end();
if (crawl_state.m_objectid == 0) {
// This will keep the max_transid up to date so if the root
// is ever switched back to read-write, it won't trigger big
// expensive in-kernel searches for ancient transids.
// If the root is made RO while crawling is in progress, we will
// have the big expensive in-kernel searches (same as if we have
// been not running for a long time).
// Don't allow transid_max to ever move backwards.
const auto roots = m_ctx->roots();
const auto next_transid = roots->transid_max();
const auto current_time = time(NULL);
crawl_state.m_max_transid = max(next_transid, crawl_state.m_max_transid);
// Move the start time forward too, since we have not started crawling yet.
crawl_state.m_started = current_time;
set_state(crawl_state);
}
// Mark this root deferred so we won't see it until the next transid cycle
m_deferred = true;
return false;
}
// BEESLOGINFO("Crawling " << sk.m_result.size() << " results from " << get_state_end());
auto results_left = sk.m_result.size();
BEESNOTE("crawling " << results_left << " results from " << get_state_end());
@@ -1089,7 +1058,7 @@ BeesCrawl::fetch_extents()
size_t count_high = 0;
BeesFileRange last_bfr;
for (auto i : sk.m_result) {
sk.next_min(i, BTRFS_EXTENT_DATA_KEY);
sk.next_min(i);
--results_left;
BEESCOUNT(crawl_items);

View File

@@ -287,7 +287,7 @@ BeesFileRange::fd() const
}
Fd
BeesFileRange::fd(const shared_ptr<BeesContext> &ctx)
BeesFileRange::fd(const shared_ptr<BeesContext> &ctx) const
{
// If we don't have a fid we can't do much here
if (m_fid) {

View File

@@ -231,23 +231,17 @@ bees_readahead(int const fd, off_t offset, size_t size)
Timer readahead_timer;
BEESNOTE("readahead " << name_fd(fd) << " offset " << to_hex(offset) << " len " << pretty(size));
BEESTOOLONG("readahead " << name_fd(fd) << " offset " << to_hex(offset) << " len " << pretty(size));
#if 1
// In the kernel, readahead() is identical to posix_fadvise(..., POSIX_FADV_WILLNEED)
DIE_IF_NON_ZERO(readahead(fd, offset, size));
#else
#if 0
// Make sure this data is in page cache by brute force
// This isn't necessary and it might even be slower,
// but the btrfs kernel code does readahead with lower ioprio
// and might discard the readahead request entirely,
// so it's maybe, *maybe*, worth doing both.
// This isn't necessary and it might even be slower
BEESNOTE("emulating readahead " << name_fd(fd) << " offset " << to_hex(offset) << " len " << pretty(size));
while (size) {
// don't care about multithreaded writes to this buffer--it is garbage anyway
static uint8_t dummy[BEES_READAHEAD_SIZE];
size_t this_read_size = min(size, sizeof(dummy));
// Ignore errors and short reads. It turns out our size
// parameter isn't all that accurate, so we can't use
// the pread_or_die template.
// Ignore errors and short reads.
// It turns out our size parameter isn't all that accurate.
(void)!pread(fd, dummy, this_read_size, offset);
BEESCOUNT(readahead_count);
BEESCOUNTADD(readahead_bytes, this_read_size);
@@ -268,13 +262,6 @@ bees_unreadahead(int const fd, off_t offset, size_t size)
BEESCOUNTADD(readahead_unread_ms, unreadahead_timer.age() * 1000);
}
// Per-thread std::random_device, used only to produce a seed value below.
thread_local random_device bees_random_device;
// Distribution covering the full result range of default_random_engine,
// so the seed drawn from bees_random_device can be any value the engine accepts.
thread_local uniform_int_distribution<default_random_engine::result_type> bees_random_seed_dist(
numeric_limits<default_random_engine::result_type>::min(),
numeric_limits<default_random_engine::result_type>::max()
);
// Per-thread pseudo-random engine, constructed with one seed drawn from
// bees_random_device through bees_random_seed_dist.
thread_local default_random_engine bees_generator(bees_random_seed_dist(bees_random_device));
BeesStringFile::BeesStringFile(Fd dir_fd, string name, size_t limit) :
m_dir_fd(dir_fd),
m_name(name),

View File

@@ -13,15 +13,15 @@
#include "crucible/time.h"
#include "crucible/task.h"
#include <atomic>
#include <functional>
#include <list>
#include <mutex>
#include <string>
#include <random>
#include <thread>
#include <endian.h>
#include <syslog.h>
#include <endian.h>
using namespace crucible;
using namespace std;
@@ -101,6 +101,12 @@ const double BEES_HASH_TABLE_ANALYZE_INTERVAL = BEES_STATS_INTERVAL;
// Stop growing the work queue after we have this many tasks queued
const size_t BEES_MAX_QUEUE_SIZE = 128;
// Read this many items at a time in SEARCHv2
// (BeesCrawl::fetch_extents assigns this to the search key's nr_items)
const size_t BEES_MAX_CRAWL_ITEMS = 8;
// Read this many bytes at a time in SEARCHv2 (one maximum-sized metadata page)
// (passed to the BtrfsIoctlSearchKey constructor in BeesCrawl::fetch_extents;
// presumably the result buffer size -- confirm against BtrfsIoctlSearchKey)
const size_t BEES_MAX_CRAWL_BYTES = 64 * 1024;
// Insert this many items before switching to a new subvol
const size_t BEES_MAX_CRAWL_BATCH = 128;
@@ -110,6 +116,9 @@ const size_t BEES_TRANSID_FACTOR = 10;
// Wait this long for a balance to stop
const double BEES_BALANCE_POLL_INTERVAL = 60.0;
// Workaround for backref bugs
// When true, the address-resolve code locks a function-static mutex
// (s_resolve_mutex) before proceeding, serializing threads that resolve addresses.
const bool BEES_SERIALIZE_RESOLVE = false;
// Workaround for tree mod log bugs
const bool BEES_SERIALIZE_BALANCE = false;
@@ -260,7 +269,7 @@ ostream& operator<<(ostream &os, const BeesFileId &bfi);
class BeesFileRange {
protected:
Fd m_fd;
mutable Fd m_fd;
mutable BeesFileId m_fid;
off_t m_begin = 0, m_end = 0;
mutable off_t m_file_size = -1;
@@ -301,7 +310,7 @@ public:
Fd fd() const;
// Get the fd, opening it if necessary
Fd fd(const shared_ptr<BeesContext> &ctx);
Fd fd(const shared_ptr<BeesContext> &ctx) const;
BeesFileRange copy_closed() const;
@@ -336,7 +345,6 @@ public:
BeesAddress(Type addr = ZERO) : m_addr(addr) {}
BeesAddress(MagicValue addr) : m_addr(addr) {}
BeesAddress& operator=(const BeesAddress &that) = default;
BeesAddress(const BeesAddress &that) = default;
operator Type() const { return m_addr; }
bool operator==(const BeesAddress &that) const;
bool operator==(const MagicValue that) const { return *this == BeesAddress(that); }
@@ -397,7 +405,6 @@ public:
HashType e_hash;
AddrType e_addr;
Cell(const Cell &) = default;
Cell &operator=(const Cell &) = default;
Cell(HashType hash, AddrType addr) : e_hash(hash), e_addr(addr) { }
bool operator==(const Cell &e) const { return tie(e_hash, e_addr) == tie(e.e_hash, e.e_addr); }
bool operator!=(const Cell &e) const { return tie(e_hash, e_addr) != tie(e.e_hash, e.e_addr); }
@@ -461,7 +468,7 @@ private:
// Mutex/condvar for the writeback thread
mutex m_dirty_mutex;
condition_variable m_dirty_condvar;
bool m_dirty = false;
bool m_dirty;
// Mutex/condvar to stop
mutex m_stop_mutex;
@@ -495,8 +502,6 @@ private:
BeesHashTable(const BeesHashTable &) = delete;
BeesHashTable &operator=(const BeesHashTable &) = delete;
static thread_local uniform_int_distribution<size_t> tl_distribution;
};
ostream &operator<<(ostream &os, const BeesHashTable::Cell &bhte);
@@ -634,7 +639,7 @@ private:
ostream & operator<<(ostream &os, const BeesHash &bh);
class BeesBlockData {
using Blob = ByteVector;
using Blob = vector<uint8_t>;
mutable Fd m_fd;
off_t m_offset;
@@ -807,7 +812,7 @@ class BeesResolver {
BeesAddress m_addr;
vector<BtrfsInodeOffsetRoot> m_biors;
set<BeesFileRange> m_ranges;
size_t m_bior_count;
unsigned m_bior_count;
// We found matching data, so we can dedupe
bool m_found_data = false;
@@ -882,7 +887,6 @@ public:
extern int bees_log_level;
extern const char *BEES_USAGE;
extern const char *BEES_VERSION;
extern thread_local default_random_engine bees_generator;
string pretty(double d);
void bees_sync(int fd);
void bees_readahead(int fd, off_t offset, size_t size);

View File

@@ -22,21 +22,19 @@ main(int argc, char **argv)
cout << "File: " << filename << endl;
Fd fd = open_or_die(filename, O_RDONLY);
uint64_t start = 0;
uint64_t length = Fiemap::s_fiemap_max_offset;
if (argc > 2) { start = stoull(argv[2], nullptr, 0); }
if (argc > 3) { length = stoull(argv[3], nullptr, 0); }
length = min(length, Fiemap::s_fiemap_max_offset - start);
Fiemap fm(start, length);
fm.m_flags &= ~(FIEMAP_FLAG_SYNC);
Fiemap fm;
fm.fm_flags &= ~(FIEMAP_FLAG_SYNC);
fm.m_max_count = 100;
if (argc > 4) { fm.m_flags = stoull(argv[4], nullptr, 0); }
uint64_t stop_at = start + length;
uint64_t last_byte = start;
if (argc > 2) { fm.fm_start = stoull(argv[2], nullptr, 0); }
if (argc > 3) { fm.fm_length = stoull(argv[3], nullptr, 0); }
if (argc > 4) { fm.fm_flags = stoull(argv[4], nullptr, 0); }
fm.fm_length = min(fm.fm_length, FIEMAP_MAX_OFFSET - fm.fm_start);
uint64_t stop_at = fm.fm_start + fm.fm_length;
uint64_t last_byte = fm.fm_start;
do {
fm.do_ioctl(fd);
// cerr << fm;
uint64_t last_logical = Fiemap::s_fiemap_max_offset;
uint64_t last_logical = FIEMAP_MAX_OFFSET;
for (auto &extent : fm.m_extents) {
if (extent.fe_logical > last_byte) {
cout << "Log " << to_hex(last_byte) << ".." << to_hex(extent.fe_logical) << " Hole" << endl;
@@ -47,8 +45,8 @@ main(int argc, char **argv)
last_logical = extent.fe_logical + extent.fe_length;
last_byte = last_logical;
}
fm.m_start = last_logical;
} while (fm.m_start < stop_at);
fm.fm_start = last_logical;
} while (fm.fm_start < stop_at);
});
exit(EXIT_SUCCESS);
}