mirror of
https://github.com/Zygo/bees.git
synced 2025-08-02 13:53:28 +02:00
Compare commits
6 Commits
fanotify-w
...
v0.2
Author | SHA1 | Date | |
---|---|---|---|
|
dd21e6f848 | ||
|
06e111c229 | ||
|
38bb70f5d0 | ||
|
a57404442c | ||
|
1e621cf4e7 | ||
|
1303fb9da8 |
63
README.md
63
README.md
@@ -1,30 +1,52 @@
|
||||
BEES
|
||||
====
|
||||
|
||||
Best-Effort Extent-Same, a btrfs deduplication daemon.
|
||||
Best-Effort Extent-Same, a btrfs dedup agent.
|
||||
|
||||
About Bees
|
||||
----------
|
||||
|
||||
Bees is a daemon designed to run continuously on live file servers.
|
||||
Bees scans and deduplicates whole filesystems in a single pass instead
|
||||
of separate scan and dedup phases. RAM usage does _not_ depend on
|
||||
unique data size or the number of input files. Hash tables and scan
|
||||
progress are stored persistently so the daemon can resume after a reboot.
|
||||
Bees uses the Linux kernel's `dedupe_file_range` feature to ensure data
|
||||
is handled safely even if other applications concurrently modify it.
|
||||
Bees is a block-oriented userspace dedup agent designed to avoid
|
||||
scalability problems on large filesystems.
|
||||
|
||||
Bees is intentionally btrfs-specific for performance and capability.
|
||||
Bees uses the btrfs `SEARCH_V2` ioctl to scan for new data without the
|
||||
overhead of repeatedly walking filesystem trees with the POSIX API.
|
||||
Bees uses `LOGICAL_INO` and `INO_PATHS` to leverage btrfs's existing
|
||||
metadata instead of building its own redundant data structures.
|
||||
Bees can cope with Btrfs filesystem compression. Bees can reassemble
|
||||
Btrfs extents to deduplicate extents that contain a mix of duplicate
|
||||
and unique data blocks.
|
||||
Bees is designed to degrade gracefully when underprovisioned with RAM.
|
||||
Bees does not use more RAM or storage as filesystem data size increases.
|
||||
The dedup hash table size is fixed at creation time and does not change.
|
||||
The effective dedup block size is dynamic and adjusts automatically to
|
||||
fit the hash table into the configured RAM limit. Hash table overflow
|
||||
is deliberately not implemented, which eliminates the IO overhead it would incur.
|
||||
Hash table entries are only 16 bytes per dedup block to keep the average
|
||||
dedup block size small.
|
||||
|
||||
Bees includes a number of workarounds for Btrfs kernel bugs to (try to)
|
||||
avoid ruining your day. You're welcome.
|
||||
Bees does not require alignment between dedup blocks or extent boundaries
|
||||
(i.e. it can handle any multiple-of-4K offset between dup block pairs).
|
||||
Bees rearranges blocks into shared and unique extents if required to
|
||||
work within current btrfs kernel dedup limitations.
|
||||
|
||||
Bees can dedup any combination of compressed and uncompressed extents.
|
||||
|
||||
Bees operates in a single pass which removes duplicate extents immediately
|
||||
during scan. There are no separate scanning and dedup phases.
|
||||
|
||||
Bees uses only data-safe btrfs kernel operations, so it can dedup live
|
||||
data (e.g. build servers, sqlite databases, VM disk images). It does
|
||||
not modify file attributes or timestamps.
|
||||
|
||||
Bees does not store any information about filesystem structure, so it is
|
||||
not affected by the number or size of files (except to the extent that
|
||||
these cause performance problems for btrfs in general). It retrieves such
|
||||
information on demand through btrfs SEARCH_V2 and LOGICAL_INO ioctls.
|
||||
This eliminates the storage required to maintain the equivalents of
|
||||
these functions in userspace. It's also why bees has no XFS support.
|
||||
|
||||
Bees is a daemon designed to run continuously and maintain its state
|
||||
across crashes and reboots. Bees uses checkpoints for persistence to
|
||||
eliminate the IO overhead of a transactional data store. On restart,
|
||||
bees will dedup any data that was added to the filesystem since the
|
||||
last checkpoint.
|
||||
|
||||
Bees is used to dedup filesystems ranging in size from 16GB to 35TB, with
|
||||
hash tables ranging in size from 128MB to 11GB.
|
||||
|
||||
How Bees Works
|
||||
--------------
|
||||
@@ -270,9 +292,10 @@ Not really a bug, but a gotcha nonetheless:
|
||||
Requirements
|
||||
------------
|
||||
|
||||
* C++11 compiler (tested with GCC 4.9)
|
||||
* C++11 compiler (tested with GCC 4.9 and 6.2.0)
|
||||
|
||||
Sorry. I really like closures.
|
||||
Sorry. I really like closures and shared_ptr, so support
|
||||
for earlier compiler versions is unlikely.
|
||||
|
||||
* btrfs-progs (tested with 4.1..4.7)
|
||||
|
||||
|
@@ -8,6 +8,7 @@
|
||||
#include <map>
|
||||
#include <mutex>
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
|
||||
namespace crucible {
|
||||
using namespace std;
|
||||
|
@@ -86,16 +86,6 @@ namespace crucible {
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct ChatterTraits<ostream &> {
|
||||
Chatter &
|
||||
operator()(Chatter &c, ostream & arg)
|
||||
{
|
||||
c.get_os() << arg;
|
||||
return c;
|
||||
}
|
||||
};
|
||||
|
||||
class ChatterBox {
|
||||
string m_file;
|
||||
int m_line;
|
||||
|
@@ -120,6 +120,9 @@ namespace crucible {
|
||||
template<> void pread_or_die<string>(int fd, string& str, off_t offset);
|
||||
template<> void pread_or_die<vector<char>>(int fd, vector<char>& str, off_t offset);
|
||||
template<> void pread_or_die<vector<uint8_t>>(int fd, vector<uint8_t>& str, off_t offset);
|
||||
template<> void pwrite_or_die<string>(int fd, const string& str, off_t offset);
|
||||
template<> void pwrite_or_die<vector<char>>(int fd, const vector<char>& str, off_t offset);
|
||||
template<> void pwrite_or_die<vector<uint8_t>>(int fd, const vector<uint8_t>& str, off_t offset);
|
||||
|
||||
// A different approach to reading a simple string
|
||||
string read_string(int fd, size_t size);
|
||||
|
@@ -171,7 +171,7 @@ namespace crucible {
|
||||
ostream & operator<<(ostream &os, const BtrfsIoctlSearchKey &key);
|
||||
|
||||
string btrfs_search_type_ntoa(unsigned type);
|
||||
string btrfs_search_objectid_ntoa(unsigned objectid);
|
||||
string btrfs_search_objectid_ntoa(uint64_t objectid);
|
||||
|
||||
uint64_t btrfs_get_root_id(int fd);
|
||||
uint64_t btrfs_get_root_transid(int fd);
|
||||
|
@@ -7,12 +7,12 @@ namespace crucible {
|
||||
using namespace std;
|
||||
|
||||
struct bits_ntoa_table {
|
||||
unsigned long n;
|
||||
unsigned long mask;
|
||||
unsigned long long n;
|
||||
unsigned long long mask;
|
||||
const char *a;
|
||||
};
|
||||
|
||||
string bits_ntoa(unsigned long n, const bits_ntoa_table *a);
|
||||
string bits_ntoa(unsigned long long n, const bits_ntoa_table *a);
|
||||
|
||||
};
|
||||
|
||||
|
@@ -23,7 +23,7 @@ namespace crucible {
|
||||
private:
|
||||
struct Item {
|
||||
Timestamp m_time;
|
||||
unsigned m_id;
|
||||
unsigned long m_id;
|
||||
Task m_task;
|
||||
|
||||
bool operator<(const Item &that) const {
|
||||
|
@@ -15,7 +15,7 @@
|
||||
namespace crucible {
|
||||
using namespace std;
|
||||
|
||||
static auto_ptr<set<string>> chatter_names;
|
||||
static shared_ptr<set<string>> chatter_names;
|
||||
static const char *SPACETAB = " \t";
|
||||
|
||||
static
|
||||
|
@@ -72,14 +72,10 @@ namespace crucible {
|
||||
catch_all([&]() {
|
||||
parent_fd->close();
|
||||
import_fd_fn(child_fd);
|
||||
// system("ls -l /proc/$$/fd/ >&2");
|
||||
|
||||
rv = f();
|
||||
});
|
||||
_exit(rv);
|
||||
cerr << "PID " << getpid() << " TID " << gettid() << "STILL ALIVE" << endl;
|
||||
system("ls -l /proc/$$/task/ >&2");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
|
||||
|
21
lib/fd.cc
21
lib/fd.cc
@@ -426,6 +426,27 @@ namespace crucible {
|
||||
return pread_or_die(fd, text.data(), text.size(), offset);
|
||||
}
|
||||
|
||||
template<>
|
||||
void
|
||||
pwrite_or_die<vector<uint8_t>>(int fd, const vector<uint8_t> &text, off_t offset)
|
||||
{
|
||||
return pwrite_or_die(fd, text.data(), text.size(), offset);
|
||||
}
|
||||
|
||||
template<>
|
||||
void
|
||||
pwrite_or_die<vector<char>>(int fd, const vector<char> &text, off_t offset)
|
||||
{
|
||||
return pwrite_or_die(fd, text.data(), text.size(), offset);
|
||||
}
|
||||
|
||||
template<>
|
||||
void
|
||||
pwrite_or_die<string>(int fd, const string &text, off_t offset)
|
||||
{
|
||||
return pwrite_or_die(fd, text.data(), text.size(), offset);
|
||||
}
|
||||
|
||||
Stat::Stat()
|
||||
{
|
||||
memset_zero<stat>(this);
|
||||
|
@@ -834,7 +834,7 @@ namespace crucible {
|
||||
}
|
||||
|
||||
string
|
||||
btrfs_search_objectid_ntoa(unsigned objectid)
|
||||
btrfs_search_objectid_ntoa(uint64_t objectid)
|
||||
{
|
||||
static const bits_ntoa_table table[] = {
|
||||
NTOA_TABLE_ENTRY_ENUM(BTRFS_ROOT_TREE_OBJECTID),
|
||||
|
@@ -7,7 +7,7 @@
|
||||
namespace crucible {
|
||||
using namespace std;
|
||||
|
||||
string bits_ntoa(unsigned long n, const bits_ntoa_table *table)
|
||||
string bits_ntoa(unsigned long long n, const bits_ntoa_table *table)
|
||||
{
|
||||
string out;
|
||||
while (n && table->a) {
|
||||
|
@@ -1,6 +1,5 @@
|
||||
PROGRAMS = \
|
||||
../bin/bees \
|
||||
../bin/fanotify-watch \
|
||||
../bin/fiemap \
|
||||
../bin/fiewalk \
|
||||
|
||||
|
@@ -5,6 +5,7 @@
|
||||
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
using namespace crucible;
|
||||
using namespace std;
|
||||
|
@@ -50,9 +50,18 @@ string
|
||||
BeesRoots::crawl_state_filename() const
|
||||
{
|
||||
string rv;
|
||||
|
||||
// Legacy filename included UUID
|
||||
rv += "beescrawl.";
|
||||
rv += m_ctx->root_uuid();
|
||||
rv += ".dat";
|
||||
|
||||
struct stat buf;
|
||||
if (fstatat(m_ctx->home_fd(), rv.c_str(), &buf, AT_SYMLINK_NOFOLLOW)) {
|
||||
// Use new filename
|
||||
rv = "beescrawl.dat";
|
||||
}
|
||||
|
||||
return rv;
|
||||
}
|
||||
|
||||
@@ -101,6 +110,12 @@ BeesRoots::state_save()
|
||||
|
||||
m_crawl_state_file.write(ofs.str());
|
||||
|
||||
// Renaming things is hard after release
|
||||
if (m_crawl_state_file.name() != "beescrawl.dat") {
|
||||
renameat(m_ctx->home_fd(), m_crawl_state_file.name().c_str(), m_ctx->home_fd(), "beescrawl.dat");
|
||||
m_crawl_state_file.name("beescrawl.dat");
|
||||
}
|
||||
|
||||
BEESNOTE("relocking crawl state");
|
||||
lock.lock();
|
||||
// Not really correct but probably close enough
|
||||
|
12
src/bees.cc
12
src/bees.cc
@@ -351,6 +351,18 @@ BeesStringFile::BeesStringFile(Fd dir_fd, string name, size_t limit) :
|
||||
BEESLOG("BeesStringFile " << name_fd(m_dir_fd) << "/" << m_name << " max size " << pretty(m_limit));
|
||||
}
|
||||
|
||||
void
|
||||
BeesStringFile::name(const string &new_name)
|
||||
{
|
||||
m_name = new_name;
|
||||
}
|
||||
|
||||
string
|
||||
BeesStringFile::name() const
|
||||
{
|
||||
return m_name;
|
||||
}
|
||||
|
||||
string
|
||||
BeesStringFile::read()
|
||||
{
|
||||
|
@@ -374,6 +374,8 @@ public:
|
||||
BeesStringFile(Fd dir_fd, string name, size_t limit = 1024 * 1024);
|
||||
string read();
|
||||
void write(string contents);
|
||||
void name(const string &new_name);
|
||||
string name() const;
|
||||
};
|
||||
|
||||
class BeesHashTable {
|
||||
|
@@ -1,91 +0,0 @@
|
||||
#include <crucible/error.h>
|
||||
#include <crucible/fd.h>
|
||||
#include <crucible/ntoa.h>
|
||||
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
|
||||
#include <unistd.h>
|
||||
#include <sys/fanotify.h>
|
||||
|
||||
using namespace crucible;
|
||||
using namespace std;
|
||||
|
||||
static
|
||||
void
|
||||
usage(const char *name)
|
||||
{
|
||||
cerr << "Usage: " << name << " directory" << endl;
|
||||
cerr << "Reports fanotify events from directory" << endl;
|
||||
}
|
||||
|
||||
struct fan_read_block {
|
||||
struct fanotify_event_metadata fem;
|
||||
// more here in the future. Maybe.
|
||||
};
|
||||
|
||||
static inline
|
||||
string
|
||||
fan_flag_ntoa(uint64_t ui)
|
||||
{
|
||||
static const bits_ntoa_table flag_names[] = {
|
||||
NTOA_TABLE_ENTRY_BITS(FAN_ACCESS),
|
||||
NTOA_TABLE_ENTRY_BITS(FAN_OPEN),
|
||||
NTOA_TABLE_ENTRY_BITS(FAN_MODIFY),
|
||||
NTOA_TABLE_ENTRY_BITS(FAN_CLOSE),
|
||||
NTOA_TABLE_ENTRY_BITS(FAN_CLOSE_WRITE),
|
||||
NTOA_TABLE_ENTRY_BITS(FAN_CLOSE_NOWRITE),
|
||||
NTOA_TABLE_ENTRY_BITS(FAN_Q_OVERFLOW),
|
||||
NTOA_TABLE_ENTRY_BITS(FAN_ACCESS_PERM),
|
||||
NTOA_TABLE_ENTRY_BITS(FAN_OPEN_PERM),
|
||||
NTOA_TABLE_ENTRY_END()
|
||||
};
|
||||
return bits_ntoa(ui, flag_names);
|
||||
}
|
||||
|
||||
int
|
||||
main(int argc, char **argv)
|
||||
{
|
||||
if (argc < 1) {
|
||||
usage(argv[0]);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
Fd fd;
|
||||
|
||||
DIE_IF_MINUS_ONE(fd = fanotify_init(FAN_CLASS_NOTIF, O_RDONLY | O_LARGEFILE | O_CLOEXEC | O_NOATIME));
|
||||
|
||||
for (char **argvp = argv + 1; *argvp; ++argvp) {
|
||||
cerr << "fanotify_mark(" << *argvp << ")..." << flush;
|
||||
DIE_IF_MINUS_ONE(fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_MOUNT, FAN_CLOSE_WRITE | FAN_CLOSE_NOWRITE | FAN_OPEN, FAN_NOFD, *argvp));
|
||||
cerr << endl;
|
||||
}
|
||||
|
||||
while (1) {
|
||||
struct fan_read_block frb;
|
||||
read_or_die(fd, frb);
|
||||
|
||||
#if 0
|
||||
cout << "event_len\t= " << frb.fem.event_len << endl;
|
||||
cout << "vers\t= " << static_cast<int>(frb.fem.vers) << endl;
|
||||
cout << "reserved\t= " << static_cast<int>(frb.fem.reserved) << endl;
|
||||
cout << "metadata_len\t= " << frb.fem.metadata_len << endl;
|
||||
cout << "mask\t= " << hex << frb.fem.mask << dec << "\t" << fan_flag_ntoa(frb.fem.mask) << endl;
|
||||
cout << "fd\t= " << frb.fem.fd << endl;
|
||||
cout << "pid\t= " << frb.fem.pid << endl;
|
||||
#endif
|
||||
|
||||
cout << "flags " << fan_flag_ntoa(frb.fem.mask) << " pid " << frb.fem.pid << ' ' << flush;
|
||||
|
||||
Fd event_fd(frb.fem.fd);
|
||||
ostringstream oss;
|
||||
oss << "/proc/self/fd/" << event_fd;
|
||||
cout << "file " << readlink_or_die(oss.str()) << endl;
|
||||
|
||||
// cout << endl;
|
||||
}
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
@@ -126,7 +126,13 @@ test_cast_0x80000000_to_things()
|
||||
{
|
||||
auto sv = 0x80000000LL;
|
||||
auto uv = 0x80000000ULL;
|
||||
SHOULD_PASS(ranged_cast<off_t>(sv), sv);
|
||||
if (sizeof(off_t) == 4) {
|
||||
SHOULD_FAIL(ranged_cast<off_t>(sv));
|
||||
} else if (sizeof(off_t) == 8) {
|
||||
SHOULD_PASS(ranged_cast<off_t>(sv), sv);
|
||||
} else {
|
||||
assert(!"unhandled case, please add code for off_t here");
|
||||
}
|
||||
SHOULD_PASS(ranged_cast<uint64_t>(uv), uv);
|
||||
SHOULD_PASS(ranged_cast<uint32_t>(uv), uv);
|
||||
SHOULD_FAIL(ranged_cast<uint16_t>(uv));
|
||||
@@ -141,7 +147,13 @@ test_cast_0x80000000_to_things()
|
||||
SHOULD_FAIL(ranged_cast<unsigned short>(uv));
|
||||
SHOULD_FAIL(ranged_cast<unsigned char>(uv));
|
||||
SHOULD_PASS(ranged_cast<signed long long>(sv), sv);
|
||||
SHOULD_PASS(ranged_cast<signed long>(sv), sv);
|
||||
if (sizeof(long) == 4) {
|
||||
SHOULD_FAIL(ranged_cast<signed long>(sv));
|
||||
} else if (sizeof(long) == 8) {
|
||||
SHOULD_PASS(ranged_cast<signed long>(sv), sv);
|
||||
} else {
|
||||
assert(!"unhandled case, please add code for long here");
|
||||
}
|
||||
SHOULD_FAIL(ranged_cast<signed short>(sv));
|
||||
SHOULD_FAIL(ranged_cast<signed char>(sv));
|
||||
if (sizeof(int) == 4) {
|
||||
@@ -149,7 +161,7 @@ test_cast_0x80000000_to_things()
|
||||
} else if (sizeof(int) == 8) {
|
||||
SHOULD_PASS(ranged_cast<signed int>(sv), sv);
|
||||
} else {
|
||||
assert(!"unhandled case, please add code here");
|
||||
assert(!"unhandled case, please add code for int here");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -159,7 +171,13 @@ test_cast_0xffffffff_to_things()
|
||||
{
|
||||
auto sv = 0xffffffffLL;
|
||||
auto uv = 0xffffffffULL;
|
||||
SHOULD_PASS(ranged_cast<off_t>(sv), sv);
|
||||
if (sizeof(off_t) == 4) {
|
||||
SHOULD_FAIL(ranged_cast<off_t>(sv));
|
||||
} else if (sizeof(off_t) == 8) {
|
||||
SHOULD_PASS(ranged_cast<off_t>(sv), sv);
|
||||
} else {
|
||||
assert(!"unhandled case, please add code for off_t here");
|
||||
}
|
||||
SHOULD_PASS(ranged_cast<uint64_t>(uv), uv);
|
||||
SHOULD_PASS(ranged_cast<uint32_t>(uv), uv);
|
||||
SHOULD_FAIL(ranged_cast<uint16_t>(uv));
|
||||
@@ -174,7 +192,13 @@ test_cast_0xffffffff_to_things()
|
||||
SHOULD_FAIL(ranged_cast<unsigned short>(uv));
|
||||
SHOULD_FAIL(ranged_cast<unsigned char>(uv));
|
||||
SHOULD_PASS(ranged_cast<signed long long>(sv), sv);
|
||||
SHOULD_PASS(ranged_cast<signed long>(sv), sv);
|
||||
if (sizeof(long) == 4) {
|
||||
SHOULD_FAIL(ranged_cast<signed long>(sv));
|
||||
} else if (sizeof(long) == 8) {
|
||||
SHOULD_PASS(ranged_cast<signed long>(sv), sv);
|
||||
} else {
|
||||
assert(!"unhandled case, please add code for long here");
|
||||
}
|
||||
SHOULD_FAIL(ranged_cast<signed short>(sv));
|
||||
SHOULD_FAIL(ranged_cast<signed char>(sv));
|
||||
if (sizeof(int) == 4) {
|
||||
@@ -182,7 +206,7 @@ test_cast_0xffffffff_to_things()
|
||||
} else if (sizeof(int) == 8) {
|
||||
SHOULD_PASS(ranged_cast<signed int>(sv), sv);
|
||||
} else {
|
||||
assert(!"unhandled case, please add code here");
|
||||
assert(!"unhandled case, please add code for int here");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -192,7 +216,13 @@ test_cast_0xfffffffff_to_things()
|
||||
{
|
||||
auto sv = 0xfffffffffLL;
|
||||
auto uv = 0xfffffffffULL;
|
||||
SHOULD_PASS(ranged_cast<off_t>(sv), sv);
|
||||
if (sizeof(off_t) == 4) {
|
||||
SHOULD_FAIL(ranged_cast<off_t>(sv));
|
||||
} else if (sizeof(off_t) == 8) {
|
||||
SHOULD_PASS(ranged_cast<off_t>(sv), sv);
|
||||
} else {
|
||||
assert(!"unhandled case, please add code for off_t here");
|
||||
}
|
||||
SHOULD_PASS(ranged_cast<uint64_t>(uv), uv);
|
||||
SHOULD_FAIL(ranged_cast<uint32_t>(uv));
|
||||
SHOULD_FAIL(ranged_cast<uint16_t>(uv));
|
||||
|
Reference in New Issue
Block a user