mirror of https://github.com/Zygo/bees.git

bees: remove local cruft, throw at github

Author: Zygo Blaxell
Date:   2016-11-15 23:32:44 -05:00
Commit: cca0ee26a8

66 changed files with 12785 additions and 0 deletions

src/Makefile (new file, 39 lines)

@@ -0,0 +1,39 @@
PROGRAMS = \
	../bin/bees \
	../bin/fiemap \
	../bin/fiewalk \

all: $(PROGRAMS) depends.mk

include ../makeflags

LIBS = -lcrucible -lpthread
LDFLAGS = -L../lib -Wl,-rpath=$(shell realpath ../lib)
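# Header dependencies are tracked automatically: depends.mk is rebuilt
# from the compiler's -M output whenever the Makefile or any .cc file changes.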
depends.mk: Makefile *.cc
	for x in *.cc; do $(CXX) $(CXXFLAGS) -M "$$x"; done > depends.mk.new
	mv -fv depends.mk.new depends.mk

-include depends.mk

%.o: %.cc %.h
	$(CXX) $(CXXFLAGS) -o "$@" -c "$<"

../bin/%: %.o
	@echo Implicit bin rule "$<" '->' "$@"
	$(CXX) $(CXXFLAGS) -o "$@" "$<" $(LDFLAGS) $(LIBS)

BEES_OBJS = \
	bees.o \
	bees-context.o \
	bees-hash.o \
	bees-resolve.o \
	bees-roots.o \
	bees-thread.o \
	bees-types.o \

../bin/bees: $(BEES_OBJS)
	$(CXX) $(CXXFLAGS) -o "$@" $(BEES_OBJS) $(LDFLAGS) $(LIBS)

clean:
	-rm -fv *.o

src/bees-context.cc (new file, 1009 lines)

File diff suppressed because it is too large.

src/bees-hash.cc (new file, 682 lines)

@@ -0,0 +1,682 @@
#include "bees.h"
#include "crucible/crc64.h"
#include "crucible/string.h"
#include <algorithm>
#include <random>
#include <sys/mman.h>
using namespace crucible;
using namespace std;
static inline
bool
using_any_madvise()
{
return true;
}
ostream &
operator<<(ostream &os, const BeesHash &bh)
{
return os << to_hex(BeesHash::Type(bh));
}
ostream &
operator<<(ostream &os, const BeesHashTable::Cell &bhte)
{
return os << "BeesHashTable::Cell { hash = " << BeesHash(bhte.e_hash) << ", addr = "
<< BeesAddress(bhte.e_addr) << " }";
}
void
dump_bucket(BeesHashTable::Cell *p, BeesHashTable::Cell *q)
{
// Must be called while holding m_bucket_mutex
for (auto i = p; i < q; ++i) {
BEESLOG("Entry " << i - p << " " << *i);
}
}
const bool VERIFY_CLEARS_BUGS = false;
bool
verify_cell_range(BeesHashTable::Cell *p, BeesHashTable::Cell *q, bool clear_bugs = VERIFY_CLEARS_BUGS)
{
// Must be called while holding m_bucket_mutex
bool bugs_found = false;
set<BeesHashTable::Cell> seen_it;
for (BeesHashTable::Cell *cell = p; cell < q; ++cell) {
if (cell->e_addr && cell->e_addr < 0x1000) {
BEESCOUNT(bug_hash_magic_addr);
BEESINFO("Bad hash table address hash " << to_hex(cell->e_hash) << " addr " << to_hex(cell->e_addr));
if (clear_bugs) {
cell->e_addr = 0;
cell->e_hash = 0;
}
bugs_found = true;
}
if (cell->e_addr && !seen_it.insert(*cell).second) {
BEESCOUNT(bug_hash_duplicate_cell);
// BEESLOG("Duplicate hash table entry:\nthis = " << *cell << "\nold = " << *seen_it.find(*cell));
BEESINFO("Duplicate hash table entry: " << *cell);
if (clear_bugs) {
cell->e_addr = 0;
cell->e_hash = 0;
}
bugs_found = true;
}
}
return bugs_found;
}
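// Hash table layout: a flat array of Cells grouped into fixed-size
// Buckets, which are in turn grouped into fixed-size Extents (the unit
// of disk I/O and writeback).  A hash selects one bucket;
// get_cell_range() returns the [begin, end) Cell pointers for that
// bucket, and get_extent_range() returns the byte range of the extent
// that contains it.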
pair<BeesHashTable::Cell *, BeesHashTable::Cell *>
BeesHashTable::get_cell_range(HashType hash)
{
THROW_CHECK1(runtime_error, m_buckets, m_buckets > 0);
THROW_CHECK1(runtime_error, m_bucket_ptr, m_bucket_ptr != nullptr);
Bucket *pp = &m_bucket_ptr[hash % m_buckets];
Cell *bp = pp[0].p_cells;
Cell *ep = pp[1].p_cells;
THROW_CHECK2(out_of_range, m_cell_ptr, bp, bp >= m_cell_ptr);
THROW_CHECK2(out_of_range, m_cell_ptr_end, ep, ep <= m_cell_ptr_end);
return make_pair(bp, ep);
}
pair<uint8_t *, uint8_t *>
BeesHashTable::get_extent_range(HashType hash)
{
THROW_CHECK1(runtime_error, m_buckets, m_buckets > 0);
THROW_CHECK1(runtime_error, m_bucket_ptr, m_bucket_ptr != nullptr);
Extent *iop = &m_extent_ptr[ (hash % m_buckets) / c_buckets_per_extent ];
uint8_t *bp = iop[0].p_byte;
uint8_t *ep = iop[1].p_byte;
THROW_CHECK2(out_of_range, m_byte_ptr, bp, bp >= m_byte_ptr);
THROW_CHECK2(out_of_range, m_byte_ptr_end, ep, ep <= m_byte_ptr_end);
return make_pair(bp, ep);
}
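// Write back hash table extents modified since the last flush.  The
// dirty-extent set is swapped out under m_extent_mutex, then each
// extent is copied to a private buffer and written with pwrite() so the
// mapped pages stay unlocked during the write.  Output is paced by
// m_flush_rate_limit.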
void
BeesHashTable::flush_dirty_extents()
{
if (using_shared_map()) return;
THROW_CHECK1(runtime_error, m_buckets, m_buckets > 0);
unique_lock<mutex> lock(m_extent_mutex);
auto dirty_extent_copy = m_buckets_dirty;
m_buckets_dirty.clear();
if (dirty_extent_copy.empty()) {
BEESNOTE("idle");
m_condvar.wait(lock);
return; // please call later, i.e. immediately
}
lock.unlock();
size_t extent_counter = 0;
for (auto extent_number : dirty_extent_copy) {
++extent_counter;
BEESNOTE("flush extent #" << extent_number << " (" << extent_counter << " of " << dirty_extent_copy.size() << ")");
catch_all([&]() {
uint8_t *dirty_extent = m_extent_ptr[extent_number].p_byte;
uint8_t *dirty_extent_end = m_extent_ptr[extent_number + 1].p_byte;
THROW_CHECK1(out_of_range, dirty_extent, dirty_extent >= m_byte_ptr);
THROW_CHECK1(out_of_range, dirty_extent_end, dirty_extent_end <= m_byte_ptr_end);
if (using_shared_map()) {
BEESTOOLONG("flush extent " << extent_number);
copy(dirty_extent, dirty_extent_end, dirty_extent);
} else {
BEESTOOLONG("pwrite(fd " << m_fd << " '" << name_fd(m_fd)<< "', length " << to_hex(dirty_extent_end - dirty_extent) << ", offset " << to_hex(dirty_extent - m_byte_ptr) << ")");
// Page locks slow us down more than copying the data does
vector<uint8_t> extent_copy(dirty_extent, dirty_extent_end);
pwrite_or_die(m_fd, extent_copy, dirty_extent - m_byte_ptr);
BEESCOUNT(hash_extent_out);
}
});
BEESNOTE("flush rate limited at extent #" << extent_number << " (" << extent_counter << " of " << dirty_extent_copy.size() << ")");
m_flush_rate_limit.sleep_for(BLOCK_SIZE_HASHTAB_EXTENT);
}
}
void
BeesHashTable::set_extent_dirty(HashType hash)
{
if (using_shared_map()) return;
THROW_CHECK1(runtime_error, m_buckets, m_buckets > 0);
auto pr = get_extent_range(hash);
uint64_t extent_number = reinterpret_cast<Extent *>(pr.first) - m_extent_ptr;
THROW_CHECK1(runtime_error, extent_number, extent_number < m_extents);
unique_lock<mutex> lock(m_extent_mutex);
m_buckets_dirty.insert(extent_number);
m_condvar.notify_one();
}
void
BeesHashTable::writeback_loop()
{
if (!using_shared_map()) {
while (1) {
flush_dirty_extents();
}
}
}
static
string
percent(size_t num, size_t den)
{
if (den) {
return astringprintf("%u%%", num * 100 / den);
} else {
return "--%";
}
}
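// Background analysis loop: mlock the whole table, then repeatedly read
// every extent into memory, verify the cells, and log an occupancy
// histogram plus global counters, sleeping
// BEES_HASH_TABLE_ANALYZE_INTERVAL seconds between passes.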
void
BeesHashTable::prefetch_loop()
{
// Always do the mlock, whether shared or not
THROW_CHECK1(runtime_error, m_size, m_size > 0);
catch_all([&]() {
BEESNOTE("mlock " << pretty(m_size));
DIE_IF_NON_ZERO(mlock(m_byte_ptr, m_size));
});
while (1) {
size_t width = 64;
vector<size_t> occupancy(width, 0);
size_t occupied_count = 0;
size_t total_count = 0;
size_t compressed_count = 0;
size_t compressed_offset_count = 0;
size_t toxic_count = 0;
size_t unaligned_eof_count = 0;
for (uint64_t ext = 0; ext < m_extents; ++ext) {
BEESNOTE("prefetching hash table extent " << ext << " of " << m_extent_ptr_end - m_extent_ptr);
catch_all([&]() {
fetch_missing_extent(ext * c_buckets_per_extent);
BEESNOTE("analyzing hash table extent " << ext << " of " << m_extent_ptr_end - m_extent_ptr);
bool duplicate_bugs_found = false;
unique_lock<mutex> lock(m_bucket_mutex);
for (Bucket *bucket = m_extent_ptr[ext].p_buckets; bucket < m_extent_ptr[ext + 1].p_buckets; ++bucket) {
if (verify_cell_range(bucket[0].p_cells, bucket[1].p_cells)) {
duplicate_bugs_found = true;
}
size_t this_bucket_occupied_count = 0;
for (Cell *cell = bucket[0].p_cells; cell < bucket[1].p_cells; ++cell) {
if (cell->e_addr) {
++this_bucket_occupied_count;
BeesAddress a(cell->e_addr);
if (a.is_compressed()) {
++compressed_count;
if (a.has_compressed_offset()) {
++compressed_offset_count;
}
}
if (a.is_toxic()) {
++toxic_count;
}
if (a.is_unaligned_eof()) {
++unaligned_eof_count;
}
}
++total_count;
}
++occupancy.at(this_bucket_occupied_count * width / (1 + c_cells_per_bucket) );
// Count these instead of calculating the number so we get better stats in case of exceptions
occupied_count += this_bucket_occupied_count;
}
lock.unlock();
if (duplicate_bugs_found) {
set_extent_dirty(ext);
}
});
}
BEESNOTE("calculating hash table statistics");
vector<string> histogram;
vector<size_t> thresholds;
size_t threshold = 1;
bool threshold_exceeded = false;
do {
threshold_exceeded = false;
histogram.push_back(string(width, ' '));
thresholds.push_back(threshold);
for (size_t x = 0; x < width; ++x) {
if (occupancy.at(x) >= threshold) {
histogram.back().at(x) = '#';
threshold_exceeded = true;
}
}
threshold *= 2;
} while (threshold_exceeded);
ostringstream out;
size_t count = histogram.size();
bool first_line = true;
for (auto it = histogram.rbegin(); it != histogram.rend(); ++it) {
out << *it << " " << thresholds.at(--count);
if (first_line) {
first_line = false;
out << " pages";
}
out << "\n";
}
size_t uncompressed_count = occupied_count - compressed_count;
size_t legacy_count = compressed_count - compressed_offset_count;
ostringstream graph_blob;
graph_blob << "Now: " << format_time(time(NULL)) << "\n";
graph_blob << "Uptime: " << m_ctx->total_timer().age() << " seconds\n";
graph_blob
<< "\nHash table page occupancy histogram (" << occupied_count << "/" << total_count << " cells occupied, " << (occupied_count * 100 / total_count) << "%)\n"
<< out.str() << "0% | 25% | 50% | 75% | 100% page fill\n"
<< "compressed " << compressed_count << " (" << percent(compressed_count, occupied_count) << ")"
<< " new-style " << compressed_offset_count << " (" << percent(compressed_offset_count, occupied_count) << ")"
<< " old-style " << legacy_count << " (" << percent(legacy_count, occupied_count) << ")\n"
<< "uncompressed " << uncompressed_count << " (" << percent(uncompressed_count, occupied_count) << ")"
<< " unaligned_eof " << unaligned_eof_count << " (" << percent(unaligned_eof_count, occupied_count) << ")"
<< " toxic " << toxic_count << " (" << percent(toxic_count, occupied_count) << ")";
graph_blob << "\n\n";
graph_blob << "TOTAL:\n";
auto thisStats = BeesStats::s_global;
graph_blob << "\t" << thisStats << "\n";
graph_blob << "\nRATES:\n";
auto avg_rates = thisStats / m_ctx->total_timer().age();
graph_blob << "\t" << avg_rates << "\n";
BEESLOG(graph_blob.str());
catch_all([&]() {
m_stats_file.write(graph_blob.str());
});
BEESNOTE("idle " << BEES_HASH_TABLE_ANALYZE_INTERVAL << "s");
nanosleep(BEES_HASH_TABLE_ANALYZE_INTERVAL);
}
}
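// Read a hash table extent from the backing file the first time it is
// needed.  m_buckets_missing is checked before and after taking the
// per-extent lock, so two threads racing to fetch the same extent only
// read it once (the loser counts hash_extent_in_twice).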
void
BeesHashTable::fetch_missing_extent(HashType hash)
{
BEESTOOLONG("fetch_missing_extent for hash " << to_hex(hash));
if (using_shared_map()) return;
THROW_CHECK1(runtime_error, m_buckets, m_buckets > 0);
auto pr = get_extent_range(hash);
uint64_t extent_number = reinterpret_cast<Extent *>(pr.first) - m_extent_ptr;
THROW_CHECK1(runtime_error, extent_number, extent_number < m_extents);
unique_lock<mutex> lock(m_extent_mutex);
if (!m_buckets_missing.count(extent_number)) {
return;
}
size_t missing_buckets = m_buckets_missing.size();
lock.unlock();
BEESNOTE("fetch waiting for hash extent #" << extent_number << ", " << missing_buckets << " left to fetch");
// Acquire blocking lock on this extent only
LockSet<uint64_t>::Lock extent_lock(m_extent_lock_set, extent_number);
// Check missing again because someone else might have fetched this
// extent for us while we didn't hold any locks
lock.lock();
if (!m_buckets_missing.count(extent_number)) {
BEESCOUNT(hash_extent_in_twice);
return;
}
lock.unlock();
// OK we have to read this extent
BEESNOTE("fetching hash extent #" << extent_number << ", " << missing_buckets << " left to fetch");
BEESTRACE("Fetching missing hash extent " << extent_number);
uint8_t *dirty_extent = m_extent_ptr[extent_number].p_byte;
uint8_t *dirty_extent_end = m_extent_ptr[extent_number + 1].p_byte;
{
BEESTOOLONG("pread(fd " << m_fd << " '" << name_fd(m_fd)<< "', length " << to_hex(dirty_extent_end - dirty_extent) << ", offset " << to_hex(dirty_extent - m_byte_ptr) << ")");
pread_or_die(m_fd, dirty_extent, dirty_extent_end - dirty_extent, dirty_extent - m_byte_ptr);
}
BEESCOUNT(hash_extent_in);
// We don't block when fetching an extent but we do slow down the
// prefetch thread.
m_prefetch_rate_limit.borrow(BLOCK_SIZE_HASHTAB_EXTENT);
lock.lock();
m_buckets_missing.erase(extent_number);
}
bool
BeesHashTable::is_toxic_hash(BeesHashTable::HashType hash) const
{
return m_toxic_hashes.find(hash) != m_toxic_hashes.end();
}
vector<BeesHashTable::Cell>
BeesHashTable::find_cell(HashType hash)
{
// This saves a lot of time prefilling the hash table, and there's no risk of eviction
if (is_toxic_hash(hash)) {
BEESCOUNT(hash_toxic);
BeesAddress toxic_addr(0x1000);
toxic_addr.set_toxic();
Cell toxic_cell(hash, toxic_addr);
vector<Cell> rv;
rv.push_back(toxic_cell);
return rv;
}
fetch_missing_extent(hash);
BEESTOOLONG("find_cell hash " << BeesHash(hash));
vector<Cell> rv;
unique_lock<mutex> lock(m_bucket_mutex);
auto er = get_cell_range(hash);
// FIXME: Weed out zero addresses in the table due to earlier bugs
copy_if(er.first, er.second, back_inserter(rv), [=](const Cell &ip) { return ip.e_hash == hash && ip.e_addr >= 0x1000; });
BEESCOUNT(hash_lookup);
return rv;
}
// Erase an entry from the hash table.  Used after an attempt to resolve
// an address in the hash table fails.  Shared hash tables never erase
// anything, since there is no way to tell if an entry is out of date or
// just belongs to the wrong filesystem.
void
BeesHashTable::erase_hash_addr(HashType hash, AddrType addr)
{
// if (m_shared) return;
fetch_missing_extent(hash);
BEESTOOLONG("erase hash " << to_hex(hash) << " addr " << addr);
unique_lock<mutex> lock(m_bucket_mutex);
auto er = get_cell_range(hash);
Cell mv(hash, addr);
Cell *ip = find(er.first, er.second, mv);
bool found = (ip < er.second);
if (found) {
// Lookups on invalid addresses really hurt us. Kill it with fire!
*ip = Cell(0, 0);
set_extent_dirty(hash);
BEESCOUNT(hash_erase);
#if 0
if (verify_cell_range(er.first, er.second)) {
BEESINFO("while erasing hash " << hash << " addr " << addr);
}
#endif
}
}
// If entry is already present in list, move it to the front of the
// list without dropping any entries, and return true. If entry is not
// present in list, insert it at the front of the list, possibly dropping
// the last entry in the list, and return false. Used to move duplicate
// hash blocks to the front of the list.
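// Worked example with a hypothetical 4-cell bucket:
//   [B, C, A, D]  push_front(A)  ->  [A, B, C, D]  (returns true)
//   [B, C, D, E]  push_front(A)  ->  [A, B, C, D]  (E evicted, returns false)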
bool
BeesHashTable::push_front_hash_addr(HashType hash, AddrType addr)
{
fetch_missing_extent(hash);
BEESTOOLONG("push_front_hash_addr hash " << BeesHash(hash) <<" addr " << BeesAddress(addr));
unique_lock<mutex> lock(m_bucket_mutex);
auto er = get_cell_range(hash);
Cell mv(hash, addr);
Cell *ip = find(er.first, er.second, mv);
bool found = (ip < er.second);
if (!found) {
// If no match found, get rid of an empty space instead
// If no empty spaces, ip will point to end
ip = find(er.first, er.second, Cell(0, 0));
}
if (ip > er.first) {
// Delete matching entry, first empty entry,
// or last entry whether empty or not
// move_backward(er.first, ip - 1, ip);
auto sp = ip;
auto dp = ip;
--sp;
// If we are deleting the last entry then don't copy it
if (ip == er.second) {
--sp;
--dp;
BEESCOUNT(hash_evict);
}
while (dp > er.first) {
*dp-- = *sp--;
}
}
// There is now a space at the front, insert there if different
if (er.first[0] != mv) {
er.first[0] = mv;
set_extent_dirty(hash);
BEESCOUNT(hash_front);
}
#if 0
if (verify_cell_range(er.first, er.second)) {
BEESINFO("while push_fronting hash " << hash << " addr " << addr);
}
#endif
return found;
}
// If the entry is already present in the list, return true.  The entry
// may be moved up to the randomly chosen position, but nothing is
// dropped.  If the entry is not present, return false and insert it at
// a random position in the list, possibly evicting the entry at the end
// of the list.  Used to insert new unique (not-yet-duplicate) blocks in
// random order.
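// Insertion strategy: pick a random position pos within the bucket.  If
// the entry already exists at or before pos, leave it alone; if it
// exists after pos, move it to pos, shifting the intervening entries
// back by one.  Otherwise use the first empty cell at or after pos,
// then the nearest empty cell before pos, and finally (bucket full)
// evict the last entry and insert at pos.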
bool
BeesHashTable::push_random_hash_addr(HashType hash, AddrType addr)
{
fetch_missing_extent(hash);
BEESTOOLONG("push_random_hash_addr hash " << BeesHash(hash) << " addr " << BeesAddress(addr));
unique_lock<mutex> lock(m_bucket_mutex);
auto er = get_cell_range(hash);
Cell mv(hash, addr);
Cell *ip = find(er.first, er.second, mv);
bool found = (ip < er.second);
thread_local default_random_engine generator;
thread_local uniform_int_distribution<int> distribution(0, c_cells_per_bucket - 1);
auto pos = distribution(generator);
int case_cond = 0;
vector<Cell> saved(er.first, er.second);
if (found) {
// If hash already exists after pos, swap with pos
if (ip > er.first + pos) {
// move_backward(er.first + pos, ip - 1, ip);
auto sp = ip;
auto dp = ip;
--sp;
while (dp > er.first + pos) {
*dp-- = *sp--;
}
*dp = mv;
BEESCOUNT(hash_bump);
case_cond = 1;
goto ret_dirty;
}
// Hash already exists before (or at) pos, leave it there
BEESCOUNT(hash_already);
case_cond = 2;
goto ret;
}
// Find an empty space to back of pos
for (ip = er.first + pos; ip < er.second; ++ip) {
if (*ip == Cell(0, 0)) {
*ip = mv;
case_cond = 3;
goto ret_dirty;
}
}
// Find an empty space to front of pos
// if there is anything to front of pos
if (pos > 0) {
for (ip = er.first + pos - 1; ip >= er.first; --ip) {
if (*ip == Cell(0, 0)) {
*ip = mv;
case_cond = 4;
goto ret_dirty;
}
}
}
// Evict something and insert at pos
move_backward(er.first + pos, er.second - 1, er.second);
er.first[pos] = mv;
BEESCOUNT(hash_evict);
case_cond = 5;
ret_dirty:
BEESCOUNT(hash_insert);
set_extent_dirty(hash);
ret:
#if 0
if (verify_cell_range(er.first, er.second, false)) {
BEESLOG("while push_randoming (case " << case_cond << ") pos " << pos
<< " ip " << (ip - er.first) << " " << mv);
// dump_bucket(saved.data(), saved.data() + saved.size());
// dump_bucket(er.first, er.second);
}
#else
(void)case_cond;
#endif
return found;
}
void
BeesHashTable::try_mmap_flags(int flags)
{
if (!m_cell_ptr) {
THROW_CHECK1(out_of_range, m_size, m_size > 0);
Timer map_time;
catch_all([&]() {
BEESLOG("mapping hash table size " << m_size << " with flags " << mmap_flags_ntoa(flags));
void *ptr = mmap_or_die(nullptr, m_size, PROT_READ | PROT_WRITE, flags, flags & MAP_ANONYMOUS ? -1 : int(m_fd), 0);
BEESLOG("mmap done in " << map_time << " sec");
m_cell_ptr = static_cast<Cell *>(ptr);
void *ptr_end = static_cast<uint8_t *>(ptr) + m_size;
m_cell_ptr_end = static_cast<Cell *>(ptr_end);
});
}
}
void
BeesHashTable::set_shared(bool shared)
{
m_shared = shared;
}
BeesHashTable::BeesHashTable(shared_ptr<BeesContext> ctx, string filename) :
m_ctx(ctx),
m_size(0),
m_void_ptr(nullptr),
m_void_ptr_end(nullptr),
m_buckets(0),
m_cells(0),
m_writeback_thread("hash_writeback"),
m_prefetch_thread("hash_prefetch " + m_ctx->root_path()),
m_flush_rate_limit(BEES_FLUSH_RATE),
m_prefetch_rate_limit(BEES_FLUSH_RATE),
m_stats_file(m_ctx->home_fd(), "beesstats.txt")
{
BEESNOTE("opening hash table " << filename);
m_fd = openat_or_die(m_ctx->home_fd(), filename, FLAGS_OPEN_FILE_RW, 0700);
Stat st(m_fd);
m_size = st.st_size;
BEESTRACE("hash table size " << m_size);
BEESTRACE("hash table bucket size " << BLOCK_SIZE_HASHTAB_BUCKET);
BEESTRACE("hash table extent size " << BLOCK_SIZE_HASHTAB_EXTENT);
THROW_CHECK2(invalid_argument, BLOCK_SIZE_HASHTAB_BUCKET, BLOCK_SIZE_HASHTAB_EXTENT, (BLOCK_SIZE_HASHTAB_EXTENT % BLOCK_SIZE_HASHTAB_BUCKET) == 0);
// Does the union work?
THROW_CHECK2(runtime_error, m_void_ptr, m_cell_ptr, m_void_ptr == m_cell_ptr);
THROW_CHECK2(runtime_error, m_void_ptr, m_byte_ptr, m_void_ptr == m_byte_ptr);
THROW_CHECK2(runtime_error, m_void_ptr, m_bucket_ptr, m_void_ptr == m_bucket_ptr);
THROW_CHECK2(runtime_error, m_void_ptr, m_extent_ptr, m_void_ptr == m_extent_ptr);
// There's more than one union
THROW_CHECK2(runtime_error, sizeof(Bucket), BLOCK_SIZE_HASHTAB_BUCKET, BLOCK_SIZE_HASHTAB_BUCKET == sizeof(Bucket));
THROW_CHECK2(runtime_error, sizeof(Bucket::p_byte), BLOCK_SIZE_HASHTAB_BUCKET, BLOCK_SIZE_HASHTAB_BUCKET == sizeof(Bucket::p_byte));
THROW_CHECK2(runtime_error, sizeof(Extent), BLOCK_SIZE_HASHTAB_EXTENT, BLOCK_SIZE_HASHTAB_EXTENT == sizeof(Extent));
THROW_CHECK2(runtime_error, sizeof(Extent::p_byte), BLOCK_SIZE_HASHTAB_EXTENT, BLOCK_SIZE_HASHTAB_EXTENT == sizeof(Extent::p_byte));
BEESLOG("opened hash table filename '" << filename << "' length " << m_size);
m_buckets = m_size / BLOCK_SIZE_HASHTAB_BUCKET;
m_cells = m_buckets * c_cells_per_bucket;
m_extents = (m_size + BLOCK_SIZE_HASHTAB_EXTENT - 1) / BLOCK_SIZE_HASHTAB_EXTENT;
BEESLOG("\tcells " << m_cells << ", buckets " << m_buckets << ", extents " << m_extents);
BEESLOG("\tflush rate limit " << BEES_FLUSH_RATE);
if (using_shared_map()) {
try_mmap_flags(MAP_SHARED);
} else {
try_mmap_flags(MAP_PRIVATE | MAP_ANONYMOUS);
}
if (!m_cell_ptr) {
THROW_ERROR(runtime_error, "unable to mmap " << filename);
}
if (!using_shared_map()) {
// madvise fails if MAP_SHARED
if (using_any_madvise()) {
// DONTFORK because we sometimes do fork,
// but the child doesn't touch any of the many, many pages
BEESTOOLONG("madvise(MADV_HUGEPAGE | MADV_DONTFORK)");
DIE_IF_NON_ZERO(madvise(m_byte_ptr, m_size, MADV_HUGEPAGE | MADV_DONTFORK));
}
for (uint64_t i = 0; i < m_size / sizeof(Extent); ++i) {
m_buckets_missing.insert(i);
}
}
m_writeback_thread.exec([&]() {
writeback_loop();
});
m_prefetch_thread.exec([&]() {
prefetch_loop();
});
// Blacklist might fail if the hash table is not stored on a btrfs
catch_all([&]() {
m_ctx->blacklist_add(BeesFileId(m_fd));
});
// Skip zero because we already weed that out before it gets near a hash function
for (unsigned i = 1; i < 256; ++i) {
vector<uint8_t> v(BLOCK_SIZE_SUMS, i);
HashType hash = Digest::CRC::crc64(v.data(), v.size());
m_toxic_hashes.insert(hash);
}
}
BeesHashTable::~BeesHashTable()
{
if (m_cell_ptr && m_size) {
flush_dirty_extents();
catch_all([&]() {
DIE_IF_NON_ZERO(munmap(m_cell_ptr, m_size));
m_cell_ptr = nullptr;
m_size = 0;
});
}
}

src/bees-resolve.cc (new file, 487 lines)

@@ -0,0 +1,487 @@
#include "bees.h"
#include "crucible/limits.h"
#include "crucible/string.h"
using namespace crucible;
using namespace std;
BeesAddress
BeesResolver::addr(BeesAddress new_addr)
{
THROW_CHECK1(invalid_argument, new_addr, !new_addr.is_magic());
m_found_data = false;
m_found_dup = false;
m_found_hash = false;
m_wrong_data = false;
m_biors.clear();
m_ranges.clear();
m_addr = new_addr;
m_bior_count = 0;
auto rv = m_ctx->resolve_addr(m_addr);
m_biors = rv.m_biors;
m_is_toxic = rv.m_is_toxic;
m_bior_count = m_biors.size();
return m_addr;
}
BeesResolver::BeesResolver(shared_ptr<BeesContext> ctx, BeesAddress new_addr) :
m_ctx(ctx),
m_bior_count(0)
{
addr(new_addr);
}
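// Given a haystack file range (from LOGICAL_INO) and a needle block,
// locate the offset within the haystack that actually contains the
// needle's data.  This is needed because compressed extents without a
// saved offset, and legacy addresses, only identify the containing
// extent rather than the exact block, and unaligned-EOF needles can
// only match at the end of the file.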
BeesBlockData
BeesResolver::adjust_offset(const BeesFileRange &haystack, const BeesBlockData &needle)
{
BEESTRACE("Searching for needle " << needle << "\n\tin haystack " << haystack);
BEESCOUNT(adjust_try);
// Constraint checks
THROW_CHECK1(invalid_argument, needle.begin(), (needle.begin() & BLOCK_MASK_CLONE) == 0);
THROW_CHECK1(invalid_argument, haystack.begin(), (haystack.begin() & BLOCK_MASK_CLONE) == 0);
// Need to know the precise dimensions of the haystack and needle
off_t haystack_size = haystack.file_size();
// If the needle is not a full block then it can only match at EOF
off_t needle_len = needle.size();
bool is_unaligned_eof = needle_len & BLOCK_MASK_CLONE;
BEESTRACE("is_unaligned_eof = " << is_unaligned_eof << ", needle_len = " << to_hex(needle_len) << ", haystack_size = " << to_hex(haystack_size));
// Unaligned EOF can only match at EOF, so only check there
if (is_unaligned_eof) {
BEESTRACE("Construct needle_bfr from " << needle);
BeesFileRange needle_bfr(needle);
// Census
if (haystack_size & BLOCK_MASK_CLONE) {
BEESCOUNT(adjust_eof_haystack);
}
if (needle_bfr.end() & BLOCK_MASK_CLONE) {
BEESCOUNT(adjust_eof_needle);
}
// Non-aligned part of the lengths must be the same
if ( (haystack_size & BLOCK_MASK_CLONE) != (needle_bfr.end() & BLOCK_MASK_CLONE) ) {
BEESCOUNT(adjust_eof_fail);
return BeesBlockData();
}
// Read the haystack block
BEESTRACE("Reading haystack (haystack_size = " << to_hex(haystack_size) << ")");
BeesBlockData straw(haystack.fd(), haystack_size & ~BLOCK_MASK_CLONE, haystack_size & BLOCK_MASK_CLONE);
// It either matches or it doesn't
BEESTRACE("Verifying haystack " << straw);
if (straw.is_data_equal(needle)) {
BEESCOUNT(adjust_eof_hit);
m_found_data = true;
m_found_hash = true;
return straw;
}
// Check for matching hash
BEESTRACE("Verifying haystack hash");
if (straw.hash() == needle.hash()) {
// OK at least the hash is still valid
m_found_hash = true;
}
BEESCOUNT(adjust_eof_miss);
// BEESLOG("adjust_eof_miss " << straw);
return BeesBlockData();
}
off_t lower_offset = haystack.begin();
off_t upper_offset = haystack.end();
bool is_compressed_offset = false;
bool is_exact = false;
bool is_legacy = false;
if (m_addr.is_compressed()) {
BtrfsExtentWalker ew(haystack.fd(), haystack.begin(), m_ctx->root_fd());
BEESTRACE("haystack extent data " << ew);
Extent e = ew.current();
if (m_addr.has_compressed_offset()) {
off_t coff = m_addr.get_compressed_offset();
if (e.offset() > coff) {
// this extent begins after the target block
BEESCOUNT(adjust_offset_low);
return BeesBlockData();
}
coff -= e.offset();
if (e.size() <= coff) {
// this extent ends before the target block
BEESCOUNT(adjust_offset_high);
return BeesBlockData();
}
lower_offset = e.begin() + coff;
upper_offset = lower_offset + BLOCK_SIZE_CLONE;
BEESCOUNT(adjust_offset_hit);
is_compressed_offset = true;
} else {
lower_offset = e.begin();
upper_offset = e.end();
BEESCOUNT(adjust_legacy);
is_legacy = true;
}
} else {
BEESCOUNT(adjust_exact);
is_exact = true;
}
BEESTRACE("Checking haystack " << haystack << " offsets " << to_hex(lower_offset) << ".." << to_hex(upper_offset));
// Check all the blocks in the list
for (off_t haystack_offset = lower_offset; haystack_offset < upper_offset; haystack_offset += BLOCK_SIZE_CLONE) {
THROW_CHECK1(out_of_range, haystack_offset, (haystack_offset & BLOCK_MASK_CLONE) == 0);
// Straw cannot extend beyond end of haystack
if (haystack_offset + needle.size() > haystack_size) {
BEESCOUNT(adjust_needle_too_long);
break;
}
// Read the haystack
BEESTRACE("straw " << name_fd(haystack.fd()) << ", offset " << to_hex(haystack_offset) << ", length " << needle.size());
BeesBlockData straw(haystack.fd(), haystack_offset, needle.size());
BEESTRACE("straw = " << straw);
// Stop if we find a match
if (straw.is_data_equal(needle)) {
BEESCOUNT(adjust_hit);
m_found_data = true;
m_found_hash = true;
if (is_compressed_offset) BEESCOUNT(adjust_compressed_offset_correct);
if (is_legacy) BEESCOUNT(adjust_legacy_correct);
if (is_exact) BEESCOUNT(adjust_exact_correct);
return straw;
}
if (straw.hash() != needle.hash()) {
// Not the same hash or data, try next block
BEESCOUNT(adjust_miss);
continue;
}
// Found the hash but not the data. Yay!
m_found_hash = true;
BEESLOG("HASH COLLISION\n"
<< "\tneedle " << needle << "\n"
<< "\tstraw " << straw);
BEESCOUNT(hash_collision);
}
// Ran out of offsets to try
BEESCOUNT(adjust_no_match);
if (is_compressed_offset) BEESCOUNT(adjust_compressed_offset_wrong);
if (is_legacy) BEESCOUNT(adjust_legacy_wrong);
if (is_exact) BEESCOUNT(adjust_exact_wrong);
m_wrong_data = true;
return BeesBlockData();
}
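// Follow one (root, inode, offset) reference returned by LOGICAL_INO:
// open the file, check that the block at that offset still resolves to
// the physical address we started from, then use adjust_offset() to
// confirm (and if necessary correct) the matching block.  Returns an
// empty range if the reference is stale or unreadable.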
BeesFileRange
BeesResolver::chase_extent_ref(const BtrfsInodeOffsetRoot &bior, BeesBlockData &needle_bbd)
{
BEESTRACE("chase_extent_ref bior " << bior << " needle_bbd " << needle_bbd);
BEESNOTE("chase_extent_ref bior " << bior << " needle_bbd " << needle_bbd);
BEESCOUNT(chase_try);
Fd file_fd = m_ctx->roots()->open_root_ino(bior.m_root, bior.m_inum);
if (!file_fd) {
// Deleting snapshots generates craptons of these
// BEESINFO("No FD in chase_extent_ref " << bior);
BEESCOUNT(chase_no_fd);
return BeesFileRange();
}
BEESNOTE("searching at offset " << to_hex(bior.m_offset) << " in file " << name_fd(file_fd) << "\n\tfor " << needle_bbd);
BEESTRACE("bior file " << name_fd(file_fd));
BEESTRACE("get file_addr " << bior);
BeesAddress file_addr(file_fd, bior.m_offset, m_ctx);
BEESTRACE("file_addr " << file_addr);
// ...or are we?
if (file_addr.is_magic()) {
BEESINFO("file_addr is magic: file_addr = " << file_addr << " bior = " << bior << " needle_bbd = " << needle_bbd);
BEESCOUNT(chase_wrong_magic);
return BeesFileRange();
}
THROW_CHECK1(invalid_argument, m_addr, !m_addr.is_magic());
// Did we get the physical block we asked for? The magic bits have to match too,
// but the compressed offset bits do not.
if (file_addr.get_physical_or_zero() != m_addr.get_physical_or_zero()) {
// BEESINFO("found addr " << file_addr << " at " << name_fd(file_fd) << " offset " << to_hex(bior.m_offset) << " but looking for " << m_addr);
// FIEMAP/resolve are working, but the data is old.
BEESCOUNT(chase_wrong_addr);
return BeesFileRange();
}
// Calculate end of range, which is a sum block or less
// It's a sum block because we have to compare content now
off_t file_size = Stat(file_fd).st_size;
off_t bior_offset = ranged_cast<off_t>(bior.m_offset);
off_t end_offset = min(file_size, bior_offset + needle_bbd.size());
BeesBlockData haystack_bbd(file_fd, bior_offset, end_offset - bior_offset);
BEESTRACE("matched haystack_bbd " << haystack_bbd << " file_addr " << file_addr);
// If the data was compressed and no offset was captured then
// we won't get an exact address from resolve.
// Search near the resolved address for a matching data block.
// ...even if it's not compressed, we should do this sanity
// check before considering the block as a duplicate candidate.
auto new_bbd = adjust_offset(haystack_bbd, needle_bbd);
if (new_bbd.empty()) {
// matching offset search failed
BEESCOUNT(chase_wrong_data);
return BeesFileRange();
}
if (new_bbd.begin() == haystack_bbd.begin()) {
BEESCOUNT(chase_uncorrected);
} else {
// corrected the bfr
BEESCOUNT(chase_corrected);
haystack_bbd = new_bbd;
}
// We have found at least one duplicate block, so resolve was a success
BEESCOUNT(chase_hit);
// Matching block
BEESTRACE("Constructing dst_bfr { " << BeesFileId(haystack_bbd.fd()) << ", " << to_hex(haystack_bbd.begin()) << ".." << to_hex(haystack_bbd.end()) << " }");
BeesFileRange dst_bfr(BeesFileId(haystack_bbd.fd()), haystack_bbd.begin(), haystack_bbd.end());
return dst_bfr;
}
void
BeesResolver::replace_src(const BeesFileRange &src_bfr)
{
BEESTRACE("replace_src src_bfr " << src_bfr);
THROW_CHECK0(runtime_error, !m_is_toxic);
BEESCOUNT(replacesrc_try);
// Open src, reuse it for all dst
auto i_bfr = src_bfr;
BEESNOTE("Opening src bfr " << i_bfr);
BEESTRACE("Opening src bfr " << i_bfr);
i_bfr.fd(m_ctx);
BeesBlockData bbd(i_bfr);
for_each_extent_ref(bbd, [&](const BeesFileRange &j) -> bool {
// Open dst
auto j_bfr = j;
BEESNOTE("Opening dst bfr " << j_bfr);
BEESTRACE("Opening dst bfr " << j_bfr);
j_bfr.fd(m_ctx);
if (i_bfr.overlaps(j_bfr)) {
BEESCOUNT(replacesrc_overlaps);
return false; // i.e. continue
}
// Make pair(src, dst)
BEESTRACE("creating brp (" << i_bfr << ", " << j_bfr << ")");
BeesRangePair brp(i_bfr, j_bfr);
BEESTRACE("Found matching range: " << brp);
// Extend range at beginning
BEESNOTE("Extending matching range: " << brp);
// No particular reason to be constrained?
if (brp.grow(m_ctx, true)) {
BEESCOUNT(replacesrc_grown);
}
// Dedup
BEESNOTE("dedup " << brp);
if (m_ctx->dedup(brp)) {
BEESCOUNT(replacesrc_dedup_hit);
m_found_dup = true;
} else {
BEESCOUNT(replacesrc_dedup_miss);
}
return false; // i.e. continue
});
}
void
BeesResolver::find_matches(bool just_one, BeesBlockData &bbd)
{
// Walk through the (ino, offset, root) tuples until we find a match.
BEESTRACE("finding all matches for " << bbd << " at " << m_addr << ": " << m_biors.size() << " found");
THROW_CHECK0(runtime_error, !m_is_toxic);
bool stop_now = false;
for (auto ino_off_root : m_biors) {
if (m_wrong_data) {
return;
}
BEESTRACE("ino_off_root " << ino_off_root);
BeesFileId this_fid(ino_off_root.m_root, ino_off_root.m_inum);
// Silently ignore blacklisted files, e.g. BeesTempFile files
if (m_ctx->is_blacklisted(this_fid)) {
continue;
}
// Look at the old data
catch_all([&]() {
BEESTRACE("chase_extent_ref ino " << ino_off_root << " bbd " << bbd);
auto new_range = chase_extent_ref(ino_off_root, bbd);
if (new_range) {
m_ranges.insert(new_range.copy_closed());
stop_now = true;
}
});
if (just_one && stop_now) {
break;
}
}
}
bool
BeesResolver::for_each_extent_ref(BeesBlockData bbd, function<bool(const BeesFileRange &bfr)> visitor)
{
// Walk through the (ino, offset, root) tuples until we are told to stop
BEESTRACE("for_each_extent_ref " << bbd << " at " << m_addr << ": " << m_biors.size() << " found");
THROW_CHECK0(runtime_error, !m_is_toxic);
bool stop_now = false;
for (auto ino_off_root : m_biors) {
BEESTRACE("ino_off_root " << ino_off_root);
BeesFileId this_fid(ino_off_root.m_root, ino_off_root.m_inum);
// Silently ignore blacklisted files, e.g. BeesTempFile files
if (m_ctx->is_blacklisted(this_fid)) {
continue;
}
// Look at the old data
catch_all([&]() {
BEESTRACE("chase_extent_ref ino " << ino_off_root << " bbd " << bbd);
auto new_range = chase_extent_ref(ino_off_root, bbd);
// XXX: should we catch visitor's exceptions here?
if (new_range) {
stop_now = visitor(new_range);
} else {
// We have reliable block addresses now, so we guarantee we can hit the desired block.
// Failure in chase_extent_ref means we are done, and don't need to look up all the
// other references.
stop_now = true;
}
});
if (stop_now) {
break;
}
}
return stop_now;
}
BeesFileRange
BeesResolver::replace_dst(const BeesFileRange &dst_bfr)
{
BEESTRACE("replace_dst dst_bfr " << dst_bfr);
BEESCOUNT(replacedst_try);
// Open dst, reuse it for all src
BEESNOTE("Opening dst bfr " << dst_bfr);
BEESTRACE("Opening dst bfr " << dst_bfr);
dst_bfr.fd(m_ctx);
BeesFileRange overlap_bfr;
BEESTRACE("overlap_bfr " << overlap_bfr);
BeesBlockData bbd(dst_bfr);
for_each_extent_ref(bbd, [&](const BeesFileRange &src_bfr) -> bool {
// Open src
BEESNOTE("Opening src bfr " << src_bfr);
BEESTRACE("Opening src bfr " << src_bfr);
src_bfr.fd(m_ctx);
if (dst_bfr.overlaps(src_bfr)) {
BEESCOUNT(replacedst_overlaps);
return false; // i.e. continue
}
// If dst is already occupying src, skip.
// FIXME: BeesContext::scan_one_extent should be weeding these out, but does not.
BeesBlockData src_bbd(src_bfr.fd(), src_bfr.begin(), min(BLOCK_SIZE_SUMS, src_bfr.size()));
if (bbd.addr().get_physical_or_zero() == src_bbd.addr().get_physical_or_zero()) {
BEESCOUNT(replacedst_same);
return false; // i.e. continue
}
// Make pair(src, dst)
BEESTRACE("creating brp (" << src_bfr << ", " << dst_bfr << ")");
BeesRangePair brp(src_bfr, dst_bfr);
BEESTRACE("Found matching range: " << brp);
// Extend range at beginning
BEESNOTE("Extending matching range: " << brp);
// 'false' Has nasty loops, and may not be faster.
// 'true' At best, keeps fragmentation constant...but can also make it worse
if (brp.grow(m_ctx, true)) {
BEESCOUNT(replacedst_grown);
}
// Dedup
BEESNOTE("dedup " << brp);
if (m_ctx->dedup(brp)) {
BEESCOUNT(replacedst_dedup_hit);
m_found_dup = true;
overlap_bfr = brp.second;
// FIXME: find best range first, then dedup that
return true; // i.e. break
} else {
BEESCOUNT(replacedst_dedup_miss);
return false; // i.e. continue
}
});
// BEESLOG("overlap_bfr after " << overlap_bfr);
return overlap_bfr.copy_closed();
}
BeesFileRange
BeesResolver::find_one_match(BeesBlockData &bbd)
{
THROW_CHECK0(runtime_error, !m_is_toxic);
find_matches(true, bbd);
if (m_ranges.empty()) {
return BeesFileRange();
} else {
return *m_ranges.begin();
}
}
set<BeesFileRange>
BeesResolver::find_all_matches(BeesBlockData &bbd)
{
THROW_CHECK0(runtime_error, !m_is_toxic);
find_matches(false, bbd);
return m_ranges;
}
bool
BeesResolver::operator<(const BeesResolver &that) const
{
if (that.m_bior_count < m_bior_count) {
return true;
} else if (m_bior_count < that.m_bior_count) {
return false;
}
return m_addr < that.m_addr;
}

src/bees-roots.cc (new file, 823 lines)

@@ -0,0 +1,823 @@
#include "bees.h"
#include "crucible/cache.h"
#include "crucible/string.h"
#include <fstream>
#include <tuple>
using namespace crucible;
using namespace std;
string
format_time(time_t t)
{
struct tm *tmp = localtime(&t);
char buf[1024];
strftime(buf, sizeof(buf), "%Y-%m-%d-%H-%M-%S", tmp);
return buf;
}
ostream &
operator<<(ostream &os, const BeesCrawlState &bcs)
{
time_t now = time(NULL);
auto age = now - bcs.m_started;
return os << "BeesCrawlState "
<< bcs.m_root << ":" << bcs.m_objectid << " offset " << to_hex(bcs.m_offset)
<< " transid " << bcs.m_min_transid << ".." << bcs.m_max_transid
<< " started " << format_time(bcs.m_started) << " (" << age << "s ago)";
}
BeesCrawlState::BeesCrawlState() :
m_root(0),
m_objectid(0),
m_offset(0),
m_min_transid(0),
m_max_transid(0),
m_started(time(NULL))
{
}
bool
BeesCrawlState::operator<(const BeesCrawlState &that) const
{
return tie(m_root, m_objectid, m_offset, m_min_transid, m_max_transid)
< tie(that.m_root, that.m_objectid, that.m_offset, that.m_min_transid, that.m_max_transid);
}
string
BeesRoots::crawl_state_filename() const
{
string rv;
rv += "beescrawl.";
rv += m_ctx->root_uuid();
rv += ".dat";
return rv;
}
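// Crawl state is persisted as one key/value text line per subvol, e.g.
// (hypothetical values):
//   root 257 objectid 261 offset 0 min_transid 1234 max_transid 5678 started 1479272400 start_ts 2016-11-16-00-00-00
// state_load() also accepts the older gen_current/gen_next keys as
// aliases for min_transid/max_transid.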
void
BeesRoots::state_save()
{
// Make sure we have a full complement of crawlers
insert_new_crawl();
BEESNOTE("saving crawl state");
BEESLOG("Saving crawl state");
BEESTOOLONG("Saving crawl state");
Timer save_time;
unique_lock<mutex> lock(m_mutex);
// We don't have ofstreamat or ofdstream in C++11, so we're building a string and writing it with raw syscalls.
ostringstream ofs;
if (!m_crawl_dirty) {
BEESLOG("Nothing to save");
return;
}
for (auto i : m_root_crawl_map) {
auto ibcs = i.second->get_state();
if (ibcs.m_max_transid) {
ofs << "root " << ibcs.m_root << " ";
ofs << "objectid " << ibcs.m_objectid << " ";
ofs << "offset " << ibcs.m_offset << " ";
ofs << "min_transid " << ibcs.m_min_transid << " ";
ofs << "max_transid " << ibcs.m_max_transid << " ";
ofs << "started " << ibcs.m_started << " ";
ofs << "start_ts " << format_time(ibcs.m_started) << "\n";
}
}
if (ofs.str().empty()) {
BEESLOG("Crawl state empty!");
m_crawl_dirty = false;
return;
}
lock.unlock();
m_crawl_state_file.write(ofs.str());
BEESNOTE("relocking crawl state");
lock.lock();
// Not really correct but probably close enough
m_crawl_dirty = false;
BEESLOG("Saved crawl state in " << save_time << "s");
}
BeesCrawlState
BeesRoots::crawl_state_get(uint64_t rootid)
{
unique_lock<mutex> lock(m_mutex);
auto rv = m_root_crawl_map.at(rootid)->get_state();
THROW_CHECK2(runtime_error, rv.m_root, rootid, rv.m_root == rootid);
return rv;
}
void
BeesRoots::crawl_state_set_dirty()
{
unique_lock<mutex> lock(m_mutex);
m_crawl_dirty = true;
}
void
BeesRoots::crawl_state_erase(const BeesCrawlState &bcs)
{
unique_lock<mutex> lock(m_mutex);
// Do not delete the last entry, it holds our max_transid
if (m_root_crawl_map.size() < 2) {
BEESCOUNT(crawl_no_empty);
return;
}
if (m_root_crawl_map.count(bcs.m_root)) {
m_root_crawl_map.erase(bcs.m_root);
m_crawl_dirty = true;
}
}
uint64_t
BeesRoots::transid_min()
{
BEESNOTE("Calculating transid_min");
unique_lock<mutex> lock(m_mutex);
if (m_root_crawl_map.empty()) {
return 0;
}
uint64_t rv = numeric_limits<uint64_t>::max();
for (auto i : m_root_crawl_map) {
rv = min(rv, i.second->get_state().m_min_transid);
}
return rv;
}
uint64_t
BeesRoots::transid_max()
{
BEESNOTE("Calculating transid_max");
uint64_t rv = 0;
uint64_t root = 0;
BEESTRACE("Calculating transid_max...");
do {
root = next_root(root);
if (root) {
catch_all([&]() {
auto transid = btrfs_get_root_transid(open_root(root));
rv = max(rv, transid);
// BEESLOG("\troot " << root << " transid " << transid << " max " << rv);
});
}
} while (root);
return rv;
}
void
BeesRoots::crawl_roots()
{
BEESNOTE("Crawling roots");
unique_lock<mutex> lock(m_mutex);
if (m_root_crawl_map.empty()) {
BEESNOTE("idle, crawl map is empty");
m_condvar.wait(lock);
// Don't count the time we were waiting as part of the crawl time
m_crawl_timer.reset();
}
// Work from a copy because BeesCrawl might change the world under us
auto crawl_map_copy = m_root_crawl_map;
lock.unlock();
BeesFileRange first_range;
shared_ptr<BeesCrawl> first_crawl;
for (auto i : crawl_map_copy) {
auto this_crawl = i.second;
auto this_range = this_crawl->peek_front();
if (this_range) {
auto tuple_this = make_tuple(this_range.fid().ino(), this_range.fid().root(), this_range.begin());
auto tuple_first = make_tuple(first_range.fid().ino(), first_range.fid().root(), first_range.begin());
if (!first_range || tuple_this < tuple_first) {
first_crawl = this_crawl;
first_range = this_range;
}
}
}
if (first_range) {
catch_all([&]() {
// BEESINFO("scan_forward " << first_range);
m_ctx->scan_forward(first_range);
});
BEESCOUNT(crawl_scan);
m_crawl_current = first_crawl->get_state();
auto first_range_popped = first_crawl->pop_front();
THROW_CHECK2(runtime_error, first_range, first_range_popped, first_range == first_range_popped);
return;
}
BEESLOG("Crawl ran out of data after " << m_crawl_timer.lap() << "s, waiting for more...");
BEESCOUNT(crawl_done);
BEESNOTE("idle, waiting for more data");
lock.lock();
m_condvar.wait(lock);
// Don't count the time we were waiting as part of the crawl time
m_crawl_timer.reset();
}
void
BeesRoots::crawl_thread()
{
BEESNOTE("crawling");
while (1) {
catch_all([&]() {
crawl_roots();
});
}
}
void
BeesRoots::writeback_thread()
{
while (1) {
BEESNOTE(m_crawl_current << (m_crawl_dirty ? " (dirty)" : ""));
catch_all([&]() {
BEESNOTE("saving crawler state");
state_save();
});
nanosleep(BEES_WRITEBACK_INTERVAL);
}
}
void
BeesRoots::insert_root(const BeesCrawlState &new_bcs)
{
unique_lock<mutex> lock(m_mutex);
if (!m_root_crawl_map.count(new_bcs.m_root)) {
auto new_bcp = make_shared<BeesCrawl>(m_ctx, new_bcs);
auto new_pair = make_pair(new_bcs.m_root, new_bcp);
m_root_crawl_map.insert(new_pair);
m_crawl_dirty = true;
}
}
void
BeesRoots::insert_new_crawl()
{
BEESNOTE("adding crawlers for new subvols and removing crawlers for removed subvols");
BeesCrawlState new_bcs;
// Avoid a wasted loop iteration by starting from root 5
new_bcs.m_root = BTRFS_FS_TREE_OBJECTID;
new_bcs.m_min_transid = transid_min();
new_bcs.m_max_transid = transid_max();
unique_lock<mutex> lock(m_mutex);
set<uint64_t> excess_roots;
for (auto i : m_root_crawl_map) {
excess_roots.insert(i.first);
}
lock.unlock();
while (new_bcs.m_root) {
excess_roots.erase(new_bcs.m_root);
insert_root(new_bcs);
BEESCOUNT(crawl_create);
new_bcs.m_root = next_root(new_bcs.m_root);
}
for (auto i : excess_roots) {
new_bcs.m_root = i;
crawl_state_erase(new_bcs);
}
// Wake up crawl_roots if sleeping
lock.lock();
m_condvar.notify_all();
}
void
BeesRoots::state_load()
{
BEESNOTE("loading crawl state");
BEESLOG("loading crawl state");
string crawl_data = m_crawl_state_file.read();
for (auto line : split("\n", crawl_data)) {
BEESLOG("Read line: " << line);
map<string, uint64_t> d;
auto words = split(" ", line);
for (auto it = words.begin(); it < words.end(); ++it) {
auto it1 = it;
++it;
THROW_CHECK1(out_of_range, words.size(), it < words.end());
string key = *it1;
uint64_t val = from_hex(*it);
BEESTRACE("key " << key << " val " << val);
auto result = d.insert(make_pair(key, val));
THROW_CHECK0(runtime_error, result.second);
}
BeesCrawlState loaded_state;
loaded_state.m_root = d.at("root");
loaded_state.m_objectid = d.at("objectid");
loaded_state.m_offset = d.at("offset");
loaded_state.m_min_transid = d.count("gen_current") ? d.at("gen_current") : d.at("min_transid");
loaded_state.m_max_transid = d.count("gen_next") ? d.at("gen_next") : d.at("max_transid");
if (d.count("started")) {
loaded_state.m_started = d.at("started");
}
BEESLOG("loaded_state " << loaded_state);
insert_root(loaded_state);
}
}
BeesRoots::BeesRoots(shared_ptr<BeesContext> ctx) :
m_ctx(ctx),
m_crawl_state_file(ctx->home_fd(), crawl_state_filename()),
m_crawl_thread("crawl " + ctx->root_path()),
m_writeback_thread("crawl_writeback " + ctx->root_path())
{
m_crawl_thread.exec([&]() {
catch_all([&]() {
state_load();
});
m_writeback_thread.exec([&]() {
writeback_thread();
});
crawl_thread();
});
}
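// Resolve a subvol id to an open directory by walking ROOT_BACKREF
// items up to the top-level subvol, opening each path component with
// openat(), and verifying the resulting root id and inode.  open_root()
// is the cached front end.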
Fd
BeesRoots::open_root_nocache(uint64_t rootid)
{
BEESTRACE("open_root_nocache " << rootid);
BEESNOTE("open_root_nocache " << rootid);
// Stop recursion at the root of the filesystem tree
if (rootid == BTRFS_FS_TREE_OBJECTID) {
return m_ctx->root_fd();
}
// Find backrefs for this rootid and follow up to root
BtrfsIoctlSearchKey sk;
sk.tree_id = BTRFS_ROOT_TREE_OBJECTID;
sk.min_objectid = sk.max_objectid = rootid;
sk.min_type = sk.max_type = BTRFS_ROOT_BACKREF_KEY;
BEESTRACE("sk " << sk);
while (sk.min_objectid <= rootid) {
sk.nr_items = 1024;
sk.do_ioctl(m_ctx->root_fd());
if (sk.m_result.empty()) {
break;
}
for (auto i : sk.m_result) {
sk.next_min(i);
if (i.type == BTRFS_ROOT_BACKREF_KEY && i.objectid == rootid) {
auto dirid = call_btrfs_get(btrfs_stack_root_ref_dirid, i.m_data);
auto name_len = call_btrfs_get(btrfs_stack_root_ref_name_len, i.m_data);
auto name_start = sizeof(struct btrfs_root_ref);
auto name_end = name_len + name_start;
THROW_CHECK2(runtime_error, i.m_data.size(), name_end, i.m_data.size() >= name_end);
string name(i.m_data.data() + name_start, i.m_data.data() + name_end);
auto parent_rootid = i.offset;
// BEESLOG("parent_rootid " << parent_rootid << " dirid " << dirid << " name " << name);
BEESTRACE("parent_rootid " << parent_rootid << " dirid " << dirid << " name " << name);
Fd parent_fd = open_root(parent_rootid);
if (!parent_fd) {
BEESLOGTRACE("no parent_fd");
continue;
}
if (dirid != BTRFS_FIRST_FREE_OBJECTID) {
BEESTRACE("dirid " << dirid << " root " << rootid << " INO_PATH");
BtrfsIoctlInoPathArgs ino(dirid);
if (!ino.do_ioctl_nothrow(parent_fd)) {
BEESINFO("dirid " << dirid << " inode path lookup failed in parent_fd " << name_fd(parent_fd));
continue;
}
if (ino.m_paths.empty()) {
BEESINFO("dirid " << dirid << " inode has no paths in parent_fd " << name_fd(parent_fd));
continue;
}
BEESTRACE("dirid " << dirid << " path " << ino.m_paths.at(0));
parent_fd = openat(parent_fd, ino.m_paths.at(0).c_str(), FLAGS_OPEN_DIR);
if (!parent_fd) {
BEESLOGTRACE("no parent_fd from dirid");
continue;
}
}
// BEESLOG("openat(" << name_fd(parent_fd) << ", " << name << ")");
BEESTRACE("openat(" << name_fd(parent_fd) << ", " << name << ")");
Fd rv = openat(parent_fd, name.c_str(), FLAGS_OPEN_DIR);
if (!rv) {
BEESLOGTRACE("open failed for name " << name);
continue;
}
BEESCOUNT(root_found);
// Verify correct root ID
auto new_root_id = btrfs_get_root_id(rv);
THROW_CHECK2(runtime_error, new_root_id, rootid, new_root_id == rootid);
Stat st(rv);
THROW_CHECK1(runtime_error, st.st_ino, st.st_ino == BTRFS_FIRST_FREE_OBJECTID);
BEESINFO("open_root_nocache " << rootid << ": " << name_fd(rv));
return rv;
}
}
}
BEESINFO("No path for rootid " << rootid);
BEESCOUNT(root_notfound);
return Fd();
}
Fd
BeesRoots::open_root(uint64_t rootid)
{
// Ignore some of the crap that comes out of LOGICAL_INO
if (rootid == BTRFS_ROOT_TREE_OBJECTID) {
return Fd();
}
return m_ctx->fd_cache()->open_root(m_ctx, rootid);
}
uint64_t
BeesRoots::next_root(uint64_t root)
{
BEESNOTE("Next root from " << root);
BEESTRACE("Next root from " << root);
// BTRFS_FS_TREE_OBJECTID has no backref keys so we can't find it that way
if (root < BTRFS_FS_TREE_OBJECTID) {
// BEESLOG("First root is BTRFS_FS_TREE_OBJECTID = " << BTRFS_FS_TREE_OBJECTID);
return BTRFS_FS_TREE_OBJECTID;
}
BtrfsIoctlSearchKey sk;
sk.tree_id = BTRFS_ROOT_TREE_OBJECTID;
sk.min_type = sk.max_type = BTRFS_ROOT_BACKREF_KEY;
sk.min_objectid = root + 1;
while (true) {
sk.nr_items = 1024;
sk.do_ioctl(m_ctx->root_fd());
if (sk.m_result.empty()) {
return 0;
}
for (auto i : sk.m_result) {
sk.next_min(i);
if (i.type == BTRFS_ROOT_BACKREF_KEY) {
// BEESLOG("Found root " << i.objectid << " parent " << i.offset);
return i.objectid;
}
}
}
}
Fd
BeesRoots::open_root_ino_nocache(uint64_t root, uint64_t ino)
{
BEESTRACE("opening root " << root << " ino " << ino);
Fd root_fd = open_root(root);
if (!root_fd) {
return root_fd;
}
BEESTOOLONG("open_root_ino(root " << root << ", ino " << ino << ")");
BEESTRACE("looking up ino " << ino);
BtrfsIoctlInoPathArgs ipa(ino);
if (!ipa.do_ioctl_nothrow(root_fd)) {
BEESINFO("Lookup root " << root << " ino " << ino << " failed: " << strerror(errno));
return Fd();
}
BEESTRACE("searching paths for root " << root << " ino " << ino);
Fd rv;
if (ipa.m_paths.empty()) {
BEESLOG("No paths for root " << root << " ino " << ino);
}
for (auto file_path : ipa.m_paths) {
BEESTRACE("Looking up root " << root << " ino " << ino << " in dir " << name_fd(root_fd) << " path " << file_path);
BEESCOUNT(open_file);
// Try to open file RW, fall back to RO
const char *fp_cstr = file_path.c_str();
rv = openat(root_fd, fp_cstr, FLAGS_OPEN_FILE);
if (!rv) {
BEESCOUNT(open_fail);
// errno == ENOENT is common during snapshot delete, ignore it
if (errno != ENOENT) {
BEESLOG("Could not open path '" << file_path << "' at root " << root << " " << name_fd(root_fd) << ": " << strerror(errno));
BEESNOTE("ipa" << ipa);
}
continue;
}
// Correct inode?
Stat file_stat(rv);
if (file_stat.st_ino != ino) {
BEESLOG("Opening " << name_fd(root_fd) << "/" << file_path << " found wrong inode " << file_stat.st_ino << " instead of " << ino);
rv = Fd();
BEESCOUNT(open_wrong_ino);
break;
}
// Correct root?
auto file_root = btrfs_get_root_id(rv);
if (file_root != root) {
BEESLOG("Opening " << name_fd(root_fd) << "/" << file_path << " found wrong root " << file_root << " instead of " << root);
rv = Fd();
BEESCOUNT(open_wrong_root);
break;
}
// Same filesystem?
Stat root_stat(root_fd);
if (root_stat.st_dev != file_stat.st_dev) {
BEESLOG("Opening root " << name_fd(root_fd) << " path " << file_path << " found path st_dev " << file_stat.st_dev << " but root st_dev is " << root_stat.st_dev);
rv = Fd();
BEESCOUNT(open_wrong_dev);
break;
}
BEESTRACE("mapped " << BeesFileId(root, ino));
BEESTRACE("\tto " << name_fd(rv));
BEESCOUNT(open_hit);
return rv;
}
// Odd, we didn't find a path.
return Fd();
}
Fd
BeesRoots::open_root_ino(uint64_t root, uint64_t ino)
{
return m_ctx->fd_cache()->open_root_ino(m_ctx, root, ino);
}
BeesCrawl::BeesCrawl(shared_ptr<BeesContext> ctx, BeesCrawlState initial_state) :
m_ctx(ctx),
m_state(initial_state)
{
}
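// Each BeesCrawl walks the EXTENT_DATA items of one subvol between
// m_min_transid and m_max_transid using TREE_SEARCH.  When the search
// runs dry, next_transid() advances the window (the old max becomes the
// new min) and restarts from the beginning of the subvol, deferring if
// the current pass started less than BEES_COMMIT_INTERVAL seconds ago.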
bool
BeesCrawl::next_transid()
{
// If this crawl is recently empty, quickly and _silently_ bail out
auto current_time = time(NULL);
auto crawl_state = get_state();
auto elapsed_time = current_time - crawl_state.m_started;
if (elapsed_time < BEES_COMMIT_INTERVAL) {
if (!m_deferred) {
BEESLOG("Deferring next transid in " << get_state());
}
m_deferred = true;
BEESCOUNT(crawl_defer);
return false;
}
// Log performance stats from the old crawl
BEESLOG("Next transid in " << get_state());
// Start new crawl
m_deferred = false;
auto roots = m_ctx->roots();
crawl_state.m_min_transid = crawl_state.m_max_transid;
crawl_state.m_max_transid = roots->transid_max();
crawl_state.m_objectid = 0;
crawl_state.m_offset = 0;
crawl_state.m_started = current_time;
BEESLOG("Restarting crawl " << get_state());
BEESCOUNT(crawl_restart);
set_state(crawl_state);
return true;
}
bool
BeesCrawl::fetch_extents()
{
THROW_CHECK1(runtime_error, m_extents.size(), m_extents.empty());
auto old_state = get_state();
if (m_deferred || old_state.m_max_transid <= old_state.m_min_transid) {
BEESTRACE("Nothing to crawl in " << get_state());
return next_transid();
}
BEESNOTE("crawling " << get_state());
BEESLOG("Crawling " << get_state());
Timer crawl_timer;
BtrfsIoctlSearchKey sk;
sk.tree_id = old_state.m_root;
sk.min_objectid = old_state.m_objectid;
sk.min_type = sk.max_type = BTRFS_EXTENT_DATA_KEY;
sk.min_offset = old_state.m_offset;
sk.min_transid = old_state.m_min_transid;
sk.max_transid = old_state.m_max_transid;
sk.nr_items = BEES_MAX_CRAWL_SIZE;
// Lock in the old state
set_state(old_state);
BEESTRACE("Searching crawl sk " << static_cast<btrfs_ioctl_search_key&>(sk));
bool ioctl_ok = false;
{
BEESNOTE("searching crawl sk " << static_cast<btrfs_ioctl_search_key&>(sk));
BEESTOOLONG("Searching crawl sk " << static_cast<btrfs_ioctl_search_key&>(sk));
ioctl_ok = sk.do_ioctl_nothrow(m_ctx->root_fd());
}
if (ioctl_ok) {
BEESCOUNT(crawl_search);
} else {
BEESLOG("Search ioctl failed: " << strerror(errno));
BEESCOUNT(crawl_fail);
}
if (!ioctl_ok || sk.m_result.empty()) {
BEESCOUNT(crawl_empty);
BEESLOG("Crawl empty " << get_state());
return next_transid();
}
BEESLOG("Crawling " << sk.m_result.size() << " results from " << get_state());
auto results_left = sk.m_result.size();
BEESNOTE("crawling " << results_left << " results from " << get_state());
size_t count_other = 0;
size_t count_inline = 0;
size_t count_unknown = 0;
size_t count_data = 0;
size_t count_low = 0;
size_t count_high = 0;
BeesFileRange last_bfr;
for (auto i : sk.m_result) {
sk.next_min(i);
--results_left;
BEESCOUNT(crawl_items);
BEESTRACE("i = " << i);
#if 1
// We need the "+ 1" and objectid rollover that next_min does.
auto new_state = get_state();
new_state.m_objectid = sk.min_objectid;
new_state.m_offset = sk.min_offset;
// Saving state here means we can skip a search result
// if we are interrupted. Not saving state here means we
// can fail to make forward progress in cases where there
// is a lot of metadata we can't process. Favor forward
// progress over losing search results.
set_state(new_state);
#endif
// Ignore things that aren't EXTENT_DATA_KEY
if (i.type != BTRFS_EXTENT_DATA_KEY) {
++count_other;
BEESCOUNT(crawl_nondata);
continue;
}
auto gen = call_btrfs_get(btrfs_stack_file_extent_generation, i.m_data);
if (gen < get_state().m_min_transid) {
BEESCOUNT(crawl_gen_low);
++count_low;
// We probably want (need?) to scan these anyway.
// continue;
}
if (gen > get_state().m_max_transid) {
BEESCOUNT(crawl_gen_high);
++count_high;
// This shouldn't ever happen
// continue;
}
auto type = call_btrfs_get(btrfs_stack_file_extent_type, i.m_data);
switch (type) {
default:
BEESINFO("Unhandled file extent type " << type << " in root " << get_state().m_root << " ino " << i.objectid << " offset " << to_hex(i.offset));
++count_unknown;
BEESCOUNT(crawl_unknown);
break;
case BTRFS_FILE_EXTENT_INLINE:
// Ignore these for now.
// BEESINFO("Ignored file extent type INLINE in root " << get_state().m_root << " ino " << i.objectid << " offset " << to_hex(i.offset));
++count_inline;
// TODO: replace with out-of-line dup extents
BEESCOUNT(crawl_inline);
break;
case BTRFS_FILE_EXTENT_PREALLOC:
BEESCOUNT(crawl_prealloc);
case BTRFS_FILE_EXTENT_REG: {
auto physical = call_btrfs_get(btrfs_stack_file_extent_disk_bytenr, i.m_data);
auto ram = call_btrfs_get(btrfs_stack_file_extent_ram_bytes, i.m_data);
auto len = call_btrfs_get(btrfs_stack_file_extent_num_bytes, i.m_data);
auto offset = call_btrfs_get(btrfs_stack_file_extent_offset, i.m_data);
BEESTRACE("Root " << get_state().m_root << " ino " << i.objectid << " physical " << to_hex(physical)
<< " logical " << to_hex(i.offset) << ".." << to_hex(i.offset + len)
<< " gen " << gen);
++count_data;
if (physical) {
THROW_CHECK1(runtime_error, ram, ram > 0);
THROW_CHECK1(runtime_error, len, len > 0);
THROW_CHECK2(runtime_error, offset, ram, offset < ram);
BeesFileId bfi(get_state().m_root, i.objectid);
if (m_ctx->is_blacklisted(bfi)) {
BEESCOUNT(crawl_blacklisted);
} else {
BeesFileRange bfr(bfi, i.offset, i.offset + len);
// BEESNOTE("pushing bfr " << bfr << " limit " << BEES_MAX_QUEUE_SIZE);
m_extents.insert(bfr);
BEESCOUNT(crawl_push);
}
} else {
BEESCOUNT(crawl_hole);
}
break;
}
}
}
BEESLOG("Crawled inline " << count_inline << " data " << count_data << " other " << count_other << " unknown " << count_unknown << " gen_low " << count_low << " gen_high " << count_high << " " << get_state() << " in " << crawl_timer << "s");
return true;
}
void
BeesCrawl::fetch_extents_harder()
{
BEESNOTE("fetch_extents_harder " << get_state() << " with " << m_extents.size() << " extents");
while (m_extents.empty()) {
bool progress_made = fetch_extents();
if (!progress_made) {
return;
}
}
}
BeesFileRange
BeesCrawl::peek_front()
{
unique_lock<mutex> lock(m_mutex);
fetch_extents_harder();
if (m_extents.empty()) {
return BeesFileRange();
}
return *m_extents.begin();
}
BeesFileRange
BeesCrawl::pop_front()
{
unique_lock<mutex> lock(m_mutex);
fetch_extents_harder();
if (m_extents.empty()) {
return BeesFileRange();
}
auto rv = *m_extents.begin();
m_extents.erase(m_extents.begin());
#if 0
auto state = get_state();
state.m_objectid = rv.fid().ino();
state.m_offset = rv.begin();
set_state(state);
#endif
return rv;
}
BeesCrawlState
BeesCrawl::get_state()
{
unique_lock<mutex> lock(m_state_mutex);
return m_state;
}
void
BeesCrawl::set_state(const BeesCrawlState &bcs)
{
unique_lock<mutex> lock(m_state_mutex);
m_state = bcs;
lock.unlock();
m_ctx->roots()->crawl_state_set_dirty();
}

src/bees-thread.cc (new file, 91 lines)

@@ -0,0 +1,91 @@
#include "bees.h"
using namespace crucible;
using namespace std;
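// BeesThread is a thin wrapper around std::thread: it names the thread
// (for status output and pthread_setname_np), logs start and exit, and
// on destruction cancels and joins any still-running thread.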
BeesThread::BeesThread(string name) :
m_name(name)
{
THROW_CHECK1(invalid_argument, name, !name.empty());
}
void
BeesThread::exec(function<void()> func)
{
m_timer.reset();
BEESLOG("BeesThread exec " << m_name);
m_thread_ptr = make_shared<thread>([=]() {
BEESLOG("Starting thread " << m_name);
BeesNote::set_name(m_name);
BEESNOTE("thread function");
Timer thread_time;
catch_all([&]() {
DIE_IF_MINUS_ERRNO(pthread_setname_np(pthread_self(), m_name.c_str()));
});
catch_all([&]() {
func();
});
BEESLOG("Exiting thread " << m_name << ", " << thread_time << " sec");
});
}
BeesThread::BeesThread(string name, function<void()> func) :
m_name(name)
{
THROW_CHECK1(invalid_argument, name, !name.empty());
BEESLOG("BeesThread construct " << m_name);
exec(func);
}
void
BeesThread::join()
{
if (!m_thread_ptr) {
BEESLOG("Thread " << m_name << " no thread ptr");
return;
}
BEESLOG("BeesThread::join " << m_name);
if (m_thread_ptr->joinable()) {
BEESLOG("Joining thread " << m_name);
Timer thread_time;
m_thread_ptr->join();
BEESLOG("Waited for " << m_name << ", " << thread_time << " sec");
} else if (!m_name.empty()) {
BEESLOG("BeesThread " << m_name << " not joinable");
} else {
BEESLOG("BeesThread else " << m_name);
}
}
void
BeesThread::set_name(const string &name)
{
m_name = name;
}
BeesThread::~BeesThread()
{
if (!m_thread_ptr) {
BEESLOG("Thread " << m_name << " no thread ptr");
return;
}
BEESLOG("BeesThread destructor " << m_name);
if (m_thread_ptr->joinable()) {
BEESLOG("Cancelling thread " << m_name);
int rv = pthread_cancel(m_thread_ptr->native_handle());
if (rv) {
BEESLOG("pthread_cancel returned " << strerror(-rv));
}
BEESLOG("Waiting for thread " << m_name);
Timer thread_time;
m_thread_ptr->join();
BEESLOG("Waited for " << m_name << ", " << thread_time << " sec");
} else if (!m_name.empty()) {
BEESLOG("Thread " << m_name << " not joinable");
} else {
BEESLOG("Thread destroy else " << m_name);
}
}

1006
src/bees-types.cc Normal file

File diff suppressed because it is too large

599
src/bees.cc Normal file
View File

@@ -0,0 +1,599 @@
#include "bees.h"
#include "crucible/interp.h"
#include "crucible/limits.h"
#include "crucible/process.h"
#include "crucible/string.h"
#include <cctype>
#include <cmath>
#include <iostream>
#include <memory>
// PRIx64
#include <inttypes.h>
#include <sched.h>
#include <sys/fanotify.h>
#include <linux/fs.h>
#include <sys/ioctl.h>
using namespace crucible;
using namespace std;
int
do_cmd_help(const ArgList &argv)
{
cerr << "Usage: " << argv[0] << " fs-root-path [fs-root-path-2...]\n"
"Performs best-effort extent-same deduplication on btrfs.\n"
"\n"
"fs-root-path MUST be the root of a btrfs filesystem tree (id 5).\n"
"Other directories will be rejected.\n"
"\n"
"Multiple filesystems can share a single hash table (BEESHOME)\n"
"but this only works well if the content of each filesystem\n"
"is distinct from all the others.\n"
"\n"
"Required environment variables:\n"
"\tBEESHOME\tPath to hash table and configuration files\n"
"\n"
"Optional environment variables:\n"
"\tBEESSTATUS\tFile to write status to (tmpfs recommended, e.g. /run)\n"
"\n"
<< endl;
return 0;
}
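// Illustrative invocation matching the usage text above (paths are hypothetical):
//   export BEESHOME=/var/lib/bees
//   export BEESSTATUS=/run/bees.status
//   ./bees /mnt/btrfs-root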
// tracing ----------------------------------------
RateLimiter bees_info_rate_limit(BEES_INFO_RATE, BEES_INFO_BURST);
thread_local BeesTracer *BeesTracer::s_next_tracer = nullptr;
BeesTracer::~BeesTracer()
{
if (uncaught_exception()) {
m_func();
if (!m_next_tracer) {
BEESLOG("--- END TRACE --- exception ---");
}
}
s_next_tracer = m_next_tracer;
}
BeesTracer::BeesTracer(function<void()> f) :
m_func(f)
{
m_next_tracer = s_next_tracer;
s_next_tracer = this;
}
void
BeesTracer::trace_now()
{
BeesTracer *tp = s_next_tracer;
BEESLOG("--- BEGIN TRACE ---");
while (tp) {
tp->m_func();
tp = tp->m_next_tracer;
}
BEESLOG("--- END TRACE ---");
}
thread_local BeesNote *BeesNote::s_next = nullptr;
mutex BeesNote::s_mutex;
map<pid_t, BeesNote*> BeesNote::s_status;
thread_local string BeesNote::s_name;
BeesNote::~BeesNote()
{
unique_lock<mutex> lock(s_mutex);
s_next = m_prev;
if (s_next) {
s_status[gettid()] = s_next;
} else {
s_status.erase(gettid());
}
}
BeesNote::BeesNote(function<void(ostream &os)> f) :
m_func(f)
{
unique_lock<mutex> lock(s_mutex);
m_name = s_name;
m_prev = s_next;
s_next = this;
s_status[gettid()] = s_next;
}
void
BeesNote::set_name(const string &name)
{
unique_lock<mutex> lock(s_mutex);
s_name = name;
}
string
BeesNote::get_name()
{
unique_lock<mutex> lock(s_mutex);
if (s_name.empty()) {
return "bees";
} else {
return s_name;
}
}
BeesNote::ThreadStatusMap
BeesNote::get_status()
{
unique_lock<mutex> lock(s_mutex);
ThreadStatusMap rv;
for (auto t : s_status) {
ostringstream oss;
if (!t.second->m_name.empty()) {
oss << t.second->m_name << ": ";
}
if (t.second->m_timer.age() > BEES_TOO_LONG) {
oss << "[" << t.second->m_timer << "s] ";
}
t.second->m_func(oss);
rv[t.first] = oss.str();
}
return rv;
}
// static inline helpers ----------------------------------------
static inline
bool
bees_addr_check(uint64_t v)
{
return !(v & (1ULL << 63));
}
static inline
bool
bees_addr_check(int64_t v)
{
return !(v & (1ULL << 63));
}
string
pretty(double d)
{
static const char * units[] = { "", "K", "M", "G", "T", "P", "E" };
static const char * *units_stop = units + sizeof(units) / sizeof(units[0]) - 1;
const char * *unit = units;
while (d >= 1024 && unit < units_stop) {
d /= 1024;
++unit;
}
ostringstream oss;
oss << (round(d * 1000.0) / 1000.0) << *unit;
return oss.str();
}
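// Examples of the rounding above:
//   pretty(512)     -> "512"
//   pretty(4096)    -> "4K"
//   pretty(1048576) -> "1M"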
// ostream operators ----------------------------------------
template <class T>
ostream &
operator<<(ostream &os, const BeesStatTmpl<T> &bs)
{
unique_lock<mutex> lock(bs.m_mutex);
bool first = true;
string last_tag;
for (auto i : bs.m_stats_map) {
if (i.second == 0) {
continue;
}
string tag = i.first.substr(0, i.first.find_first_of("_"));
if (!last_tag.empty() && tag != last_tag) {
os << "\n\t";
} else if (!first) {
os << " ";
}
last_tag = tag;
first = false;
os << i.first << "=" << i.second;
}
return os;
}
// other ----------------------------------------
template <class T>
T&
BeesStatTmpl<T>::at(string idx)
{
unique_lock<mutex> lock(m_mutex);
if (!m_stats_map.count(idx)) {
m_stats_map[idx] = 0;
}
return m_stats_map[idx];
}
template <class T>
T
BeesStatTmpl<T>::at(string idx) const
{
unique_lock<mutex> lock(m_mutex);
return m_stats_map.at(idx);
}
template <class T>
void
BeesStatTmpl<T>::add_count(string idx, size_t amount)
{
unique_lock<mutex> lock(m_mutex);
if (!m_stats_map.count(idx)) {
m_stats_map[idx] = 0;
}
m_stats_map.at(idx) += amount;
}
template <class T>
BeesStatTmpl<T>::BeesStatTmpl(const BeesStatTmpl &that)
{
if (&that == this) return;
unique_lock<mutex> lock(m_mutex);
unique_lock<mutex> lock2(that.m_mutex);
m_stats_map = that.m_stats_map;
}
template <class T>
BeesStatTmpl<T> &
BeesStatTmpl<T>::operator=(const BeesStatTmpl<T> &that)
{
if (&that == this) return *this;
unique_lock<mutex> lock(m_mutex);
unique_lock<mutex> lock2(that.m_mutex);
m_stats_map = that.m_stats_map;
return *this;
}
BeesStats BeesStats::s_global;
BeesStats
BeesStats::operator-(const BeesStats &that) const
{
if (&that == this) return BeesStats();
unique_lock<mutex> this_lock(m_mutex);
BeesStats this_copy;
this_copy.m_stats_map = m_stats_map;
unique_lock<mutex> that_lock(that.m_mutex);
BeesStats that_copy;
that_copy.m_stats_map = that.m_stats_map;
this_lock.unlock();
that_lock.unlock();
for (auto i : that_copy.m_stats_map) {
if (i.second != 0) {
this_copy.at(i.first) -= i.second;
}
}
return this_copy;
}
BeesRates
BeesStats::operator/(double d) const
{
BeesRates rv;
unique_lock<mutex> lock(m_mutex);
for (auto i : m_stats_map) {
rv.m_stats_map[i.first] = ceil(i.second / d * 1000) / 1000;
}
return rv;
}
BeesStats::operator bool() const
{
unique_lock<mutex> lock(m_mutex);
for (auto i : m_stats_map) {
if (i.second != 0) {
return true;
}
}
return false;
}
BeesTooLong::BeesTooLong(const string &s, double limit) :
m_limit(limit),
m_func([s](ostream &os) { os << s; })
{
}
BeesTooLong::BeesTooLong(const func_type &func, double limit) :
m_limit(limit),
m_func(func)
{
}
void
BeesTooLong::check() const
{
if (age() > m_limit) {
ostringstream oss;
m_func(oss);
BEESLOG("PERFORMANCE: " << *this << " sec: " << oss.str());
}
}
BeesTooLong::~BeesTooLong()
{
check();
}
BeesTooLong &
BeesTooLong::operator=(const func_type &f)
{
m_func = f;
return *this;
}
void
bees_sync(int fd)
{
Timer sync_timer;
BEESNOTE("syncing " << name_fd(fd));
BEESTOOLONG("syncing " << name_fd(fd));
DIE_IF_NON_ZERO(fsync(fd));
BEESCOUNT(sync_count);
BEESCOUNTADD(sync_ms, sync_timer.age() * 1000);
}
BeesStringFile::BeesStringFile(Fd dir_fd, string name, size_t limit) :
m_dir_fd(dir_fd),
m_name(name),
m_limit(limit)
{
BEESLOG("BeesStringFile " << name_fd(m_dir_fd) << "/" << m_name << " max size " << pretty(m_limit));
}
string
BeesStringFile::read()
{
BEESNOTE("opening " << m_name << " in " << name_fd(m_dir_fd));
Fd fd(openat(m_dir_fd, m_name.c_str(), FLAGS_OPEN_FILE));
if (!fd) {
return string();
}
BEESNOTE("sizing " << m_name << " in " << name_fd(m_dir_fd));
Stat st(fd);
THROW_CHECK1(out_of_range, st.st_size, st.st_size > 0);
THROW_CHECK1(out_of_range, st.st_size, st.st_size < ranged_cast<off_t>(m_limit));
BEESNOTE("reading " << m_name << " in " << name_fd(m_dir_fd));
return read_string(fd, st.st_size);
}
void
BeesStringFile::write(string contents)
{
THROW_CHECK2(out_of_range, contents.size(), m_limit, contents.size() < m_limit);
auto tmpname = m_name + ".tmp";
BEESNOTE("unlinking " << tmpname << " in " << name_fd(m_dir_fd));
unlinkat(m_dir_fd, tmpname.c_str(), 0);
// ignore error
BEESNOTE("closing " << tmpname << " in " << name_fd(m_dir_fd));
{
Fd ofd = openat_or_die(m_dir_fd, tmpname, FLAGS_CREATE_FILE, S_IRUSR | S_IWUSR);
BEESNOTE("writing " << tmpname << " in " << name_fd(m_dir_fd));
write_or_die(ofd, contents);
BEESNOTE("fsyncing " << tmpname << " in " << name_fd(m_dir_fd));
DIE_IF_NON_ZERO(fsync(ofd));
}
BEESNOTE("renaming " << tmpname << " to " << m_name << " in FD " << name_fd(m_dir_fd));
BEESTRACE("renaming " << tmpname << " to " << m_name << " in FD " << name_fd(m_dir_fd));
renameat_or_die(m_dir_fd, tmpname, m_dir_fd, m_name);
}
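// Note: write() follows the usual atomic-replace pattern (write NAME.tmp, fsync,
// rename over NAME), so readers see either the old or the new contents, never a
// partial file.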
void
BeesTempFile::create()
{
// BEESLOG("creating temporary file in " << m_ctx->root_path());
BEESNOTE("creating temporary file in " << m_ctx->root_path());
BEESTOOLONG("creating temporary file in " << m_ctx->root_path());
DIE_IF_MINUS_ONE(m_fd = openat(m_ctx->root_fd(), ".", FLAGS_OPEN_TMPFILE, S_IRUSR | S_IWUSR));
BEESCOUNT(tmp_create);
// Can't reopen this file, so don't allow any resolves there
// Resolves won't work there anyway. There are lots of tempfiles
// and they're short-lived, so this ends up being just a memory leak
// m_ctx->blacklist_add(BeesFileId(m_fd));
m_ctx->insert_root_ino(m_fd);
// Set compression attribute
int flags = 0;
BEESTRACE("Getting FS_COMPR_FL on m_fd " << name_fd(m_fd) << " flags " << to_hex(flags));
DIE_IF_MINUS_ONE(ioctl(m_fd, FS_IOC_GETFLAGS, &flags));
flags |= FS_COMPR_FL;
BEESTRACE("Setting FS_COMPR_FL on m_fd " << name_fd(m_fd) << " flags " << to_hex(flags));
DIE_IF_MINUS_ONE(ioctl(m_fd, FS_IOC_SETFLAGS, &flags));
// Always leave first block empty to avoid creating a file with an inline extent
m_end_offset = BLOCK_SIZE_CLONE;
}
void
BeesTempFile::resize(off_t offset)
{
BEESTOOLONG("Resizing temporary file to " << to_hex(offset));
BEESNOTE("Resizing temporary file " << name_fd(m_fd) << " to " << to_hex(offset));
BEESTRACE("Resizing temporary file " << name_fd(m_fd) << " to " << to_hex(offset));
// Ensure that file covers m_end_offset..offset
THROW_CHECK2(invalid_argument, m_end_offset, offset, m_end_offset < offset);
// Truncate
DIE_IF_NON_ZERO(ftruncate(m_fd, offset));
BEESCOUNT(tmp_resize);
// Success
m_end_offset = offset;
}
BeesTempFile::BeesTempFile(shared_ptr<BeesContext> ctx) :
m_ctx(ctx),
m_end_offset(0)
{
create();
}
void
BeesTempFile::realign()
{
if (m_end_offset > BLOCK_SIZE_MAX_TEMP_FILE) {
BEESLOG("temporary file size " << to_hex(m_end_offset) << " > max " << BLOCK_SIZE_MAX_TEMP_FILE);
BEESCOUNT(tmp_trunc);
return create();
}
if (m_end_offset & BLOCK_MASK_CLONE) {
// BEESTRACE("temporary file size " << to_hex(m_end_offset) << " not aligned");
BEESCOUNT(tmp_realign);
return create();
}
// OK as is
BEESCOUNT(tmp_aligned);
}
BeesFileRange
BeesTempFile::make_hole(off_t count)
{
THROW_CHECK1(invalid_argument, count, count > 0);
realign();
BEESTRACE("make hole at " << m_end_offset);
auto end = m_end_offset + count;
BeesFileRange rv(m_fd, m_end_offset, end);
resize(end);
BEESTRACE("created temporary hole " << rv);
BEESCOUNT(tmp_hole);
return rv;
}
BeesFileRange
BeesTempFile::make_copy(const BeesFileRange &src)
{
BEESLOG("copy: " << src);
BEESNOTE("Copying " << src);
BEESTRACE("Copying " << src);
THROW_CHECK1(invalid_argument, src, src.size() > 0);
// FIXME: don't know where these come from, but we can't handle them.
// Grab a trace for the log.
THROW_CHECK1(invalid_argument, src, src.size() < BLOCK_SIZE_MAX_TEMP_FILE);
realign();
auto begin = m_end_offset;
auto end = m_end_offset + src.size();
resize(end);
BeesFileRange rv(m_fd, begin, end);
BEESTRACE("copying to: " << rv);
BEESNOTE("copying " << src << " to " << rv);
auto src_p = src.begin();
auto dst_p = begin;
bool did_block_write = false;
while (dst_p < end) {
auto len = min(BLOCK_SIZE_CLONE, end - dst_p);
BeesBlockData bbd(src.fd(), src_p, len);
// Don't fill in holes
if (bbd.is_data_zero()) {
BEESCOUNT(tmp_block_zero);
} else {
BEESNOTE("copying " << src << " to " << rv << "\n"
"\tpwrite " << bbd << " to " << name_fd(m_fd) << " offset " << to_hex(dst_p) << " len " << len);
pwrite_or_die(m_fd, bbd.data().data(), len, dst_p);
did_block_write = true;
BEESCOUNT(tmp_block);
BEESCOUNTADD(tmp_bytes, len);
}
src_p += len;
dst_p += len;
}
// We seem to get lockups without this!
if (did_block_write) {
bees_sync(m_fd);
}
BEESCOUNT(tmp_copy);
return rv;
}
int
bees_main(ArgList args)
{
set_catch_explainer([&](string s) {
BEESLOG("\n\n*** EXCEPTION ***\n\t" << s << "\n***\n");
BEESCOUNT(exception_caught);
});
BEESNOTE("main");
BeesNote::set_name("main");
list<shared_ptr<BeesContext>> all_contexts;
shared_ptr<BeesContext> bc;
// Create a BeesContext for each filesystem root path on the command line
bool did_subscription = false;
for (string arg : args) {
catch_all([&]() {
bc = make_shared<BeesContext>(bc);
bc->set_root_path(arg);
did_subscription = true;
});
}
if (!did_subscription) {
BEESLOG("WARNING: no filesystems added");
}
BeesThread status_thread("status", [&]() {
bc->dump_status();
});
// Now we just wait forever
bc->show_progress();
// That is all.
return 0;
}
int
main(int argc, const char **argv)
{
if (argc < 2) {
do_cmd_help(argv);
return 2;
}
ArgList args(argv + 1);
int rv = 1;
catch_and_explain([&]() {
rv = bees_main(args);
});
return rv;
}
// instantiate templates for linkage ----------------------------------------
template class BeesStatTmpl<uint64_t>;
template ostream & operator<<(ostream &os, const BeesStatTmpl<uint64_t> &bs);
template class BeesStatTmpl<double>;
template ostream & operator<<(ostream &os, const BeesStatTmpl<double> &bs);

828
src/bees.h Normal file
View File

@@ -0,0 +1,828 @@
#ifndef BEES_H
#define BEES_H
#include "crucible/bool.h"
#include "crucible/cache.h"
#include "crucible/chatter.h"
#include "crucible/error.h"
#include "crucible/extentwalker.h"
#include "crucible/fd.h"
#include "crucible/fs.h"
#include "crucible/lockset.h"
#include "crucible/time.h"
#include "crucible/timequeue.h"
#include "crucible/workqueue.h"
#include <array>
#include <functional>
#include <list>
#include <mutex>
#include <string>
#include <thread>
#include <endian.h>
using namespace crucible;
using namespace std;
// Block size for clone alignment (FIXME: should read this from /sys/fs/btrfs/<FS-UUID>/clone_alignment)
const off_t BLOCK_SIZE_CLONE = 4096;
// Block size for dedup checksums (arbitrary, but must be a multiple of clone alignment)
const off_t BLOCK_SIZE_SUMS = 4096;
// Block size for memory allocations and file mappings (FIXME: should be CPU page size)
const off_t BLOCK_SIZE_MMAP = 4096;
// Maximum length parameter to extent-same ioctl (FIXME: hardcoded in kernel)
const off_t BLOCK_SIZE_MAX_EXTENT_SAME = 4096 * 4096;
// Maximum length of a compressed extent in bytes
const off_t BLOCK_SIZE_MAX_COMPRESSED_EXTENT = 128 * 1024;
// Try to combine smaller extents into larger ones
const off_t BLOCK_SIZE_MIN_EXTENT_DEFRAG = BLOCK_SIZE_MAX_COMPRESSED_EXTENT;
// Avoid splitting extents that are already too small
const off_t BLOCK_SIZE_MIN_EXTENT_SPLIT = BLOCK_SIZE_MAX_COMPRESSED_EXTENT;
// const off_t BLOCK_SIZE_MIN_EXTENT_SPLIT = 1024LL * 1024 * 1024 * 1024;
// Maximum length of any extent in bytes
// except we've seen 1.03G extents...
// ...FIEMAP is slow and full of lies
const off_t BLOCK_SIZE_MAX_EXTENT = 128 * 1024 * 1024;
// Masks, so we don't have to write "(BLOCK_SIZE_CLONE - 1)" everywhere
const off_t BLOCK_MASK_CLONE = BLOCK_SIZE_CLONE - 1;
const off_t BLOCK_MASK_SUMS = BLOCK_SIZE_SUMS - 1;
const off_t BLOCK_MASK_MMAP = BLOCK_SIZE_MMAP - 1;
const off_t BLOCK_MASK_MAX_COMPRESSED_EXTENT = BLOCK_SIZE_MAX_COMPRESSED_EXTENT * 2 - 1;
// Maximum temporary file size
const off_t BLOCK_SIZE_MAX_TEMP_FILE = 1024 * 1024 * 1024;
// Bucket size for hash table (size of one hash bucket)
const off_t BLOCK_SIZE_HASHTAB_BUCKET = BLOCK_SIZE_MMAP;
// Extent size for hash table (since the nocow file attribute does not seem to be working today)
const off_t BLOCK_SIZE_HASHTAB_EXTENT = 16 * 1024 * 1024;
// Bytes per second we want to flush (8GB every two hours)
const double BEES_FLUSH_RATE = 8.0 * 1024 * 1024 * 1024 / 7200.0;
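// (8 GiB over 7200 seconds is roughly 1.1 MiB/s)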
// Interval between writing non-hash-table things to disk (15 minutes)
const int BEES_WRITEBACK_INTERVAL = 900;
// Statistics reports while scanning
const int BEES_STATS_INTERVAL = 3600;
// Progress shows instantaneous rates and thread status
const int BEES_PROGRESS_INTERVAL = 3600;
// Status is output every freakin second. Use a ramdisk.
const int BEES_STATUS_INTERVAL = 1;
// Log warnings when an operation takes too long
const double BEES_TOO_LONG = 2.5;
// Avoid any extent where LOGICAL_INO takes this long
const double BEES_TOXIC_DURATION = 9.9;
// How long we should wait for new btrfs transactions
const double BEES_COMMIT_INTERVAL = 900;
// How long between hash table histograms
const double BEES_HASH_TABLE_ANALYZE_INTERVAL = 3600;
// Rate limiting of informational messages
const double BEES_INFO_RATE = 10.0;
const double BEES_INFO_BURST = 1.0;
// After we have this many events queued, wait
const size_t BEES_MAX_QUEUE_SIZE = 1024;
// Read this many items at a time in SEARCHv2
const size_t BEES_MAX_CRAWL_SIZE = 4096;
// If an extent has this many refs, pretend it does not exist
// to avoid a crippling btrfs performance bug
// The actual limit in LOGICAL_INO seems to be 2730, but let's leave a little headroom
const size_t BEES_MAX_EXTENT_REF_COUNT = 2560;
// Flags
const int FLAGS_OPEN_COMMON = O_NOFOLLOW | O_NONBLOCK | O_CLOEXEC | O_NOATIME | O_LARGEFILE | O_NOCTTY;
const int FLAGS_OPEN_DIR = FLAGS_OPEN_COMMON | O_RDONLY | O_DIRECTORY;
const int FLAGS_OPEN_FILE = FLAGS_OPEN_COMMON | O_RDONLY;
const int FLAGS_OPEN_FILE_RW = FLAGS_OPEN_COMMON | O_RDWR;
const int FLAGS_OPEN_TMPFILE = FLAGS_OPEN_FILE_RW | O_TMPFILE | O_TRUNC | O_EXCL;
const int FLAGS_CREATE_FILE = FLAGS_OPEN_COMMON | O_WRONLY | O_CREAT | O_EXCL;
// Fanotify allows O_APPEND, O_DSYNC, O_NOATIME, O_NONBLOCK, O_CLOEXEC, O_LARGEFILE
const int FLAGS_OPEN_FANOTIFY = O_RDWR | O_NOATIME | O_CLOEXEC | O_LARGEFILE;
// macros ----------------------------------------
#define BEESLOG(x) do { Chatter c(BeesNote::get_name()); c << x; } while (0)
#define BEESLOGTRACE(x) do { BEESLOG(x); BeesTracer::trace_now(); } while (0)
#define BEESTRACE(x) BeesTracer SRSLY_WTF_C(beesTracer_, __LINE__) ([&]() { BEESLOG(x); })
#define BEESTOOLONG(x) BeesTooLong SRSLY_WTF_C(beesTooLong_, __LINE__) ([&](ostream &_btl_os) { _btl_os << x; })
#define BEESNOTE(x) BeesNote SRSLY_WTF_C(beesNote_, __LINE__) ([&](ostream &_btl_os) { _btl_os << x; })
#define BEESINFO(x) do { \
if (bees_info_rate_limit.is_ready()) { \
bees_info_rate_limit.borrow(1); \
Chatter c(BeesNote::get_name()); \
c << x; \
} \
} while (0)
#define BEESCOUNT(stat) do { \
BeesStats::s_global.add_count(#stat); \
} while (0)
#define BEESCOUNTADD(stat, amount) do { \
BeesStats::s_global.add_count(#stat, (amount)); \
} while (0)
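// Illustrative use of the macros above (counter names are hypothetical):
//   BEESNOTE("scanning " << bfr);   // publish this thread's current status line
//   BEESTRACE("extent " << e);      // add a frame to the trace printed on exception
//   BEESCOUNT(scan_extent);         // increment a named global counter by one
//   BEESCOUNTADD(scan_bytes, len);  // increment a named global counter by an amount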
// ----------------------------------------
template <class T> class BeesStatTmpl;
template <class T> ostream& operator<<(ostream &os, const BeesStatTmpl<T> &bs);
template <class T>
class BeesStatTmpl {
map<string, T> m_stats_map;
mutable mutex m_mutex;
public:
BeesStatTmpl() = default;
BeesStatTmpl(const BeesStatTmpl &that);
BeesStatTmpl &operator=(const BeesStatTmpl &that);
void add_count(string idx, size_t amount = 1);
T& at(string idx);
T at(string idx) const;
friend ostream& operator<< <>(ostream &os, const BeesStatTmpl<T> &bs);
friend class BeesStats;
};
using BeesRates = BeesStatTmpl<double>;
struct BeesStats : public BeesStatTmpl<uint64_t> {
static BeesStats s_global;
BeesStats operator-(const BeesStats &that) const;
BeesRates operator/(double d) const;
explicit operator bool() const;
};
class BeesContext;
class BeesBlockData;
class BeesTracer {
function<void()> m_func;
BeesTracer *m_next_tracer = 0;
thread_local static BeesTracer *s_next_tracer;
public:
BeesTracer(function<void()> f);
~BeesTracer();
static void trace_now();
};
class BeesNote {
function<void(ostream &)> m_func;
BeesNote *m_prev;
Timer m_timer;
string m_name;
static mutex s_mutex;
static map<pid_t, BeesNote*> s_status;
thread_local static BeesNote *s_next;
thread_local static string s_name;
public:
BeesNote(function<void(ostream &)> f);
~BeesNote();
using ThreadStatusMap = map<pid_t, string>;
static ThreadStatusMap get_status();
static void set_name(const string &name);
static string get_name();
};
// C++ threads dumbed down even further
class BeesThread {
string m_name;
Timer m_timer;
shared_ptr<thread> m_thread_ptr;
public:
~BeesThread();
BeesThread(string name);
BeesThread(string name, function<void()> args);
void exec(function<void()> args);
void join();
void set_name(const string &name);
};
class BeesFileId {
uint64_t m_root;
uint64_t m_ino;
public:
uint64_t root() const { return m_root; }
uint64_t ino() const { return m_ino; }
bool operator<(const BeesFileId &that) const;
bool operator!=(const BeesFileId &that) const;
bool operator==(const BeesFileId &that) const;
operator bool() const;
BeesFileId(const BtrfsInodeOffsetRoot &bior);
BeesFileId(int fd);
BeesFileId(uint64_t root, uint64_t ino);
BeesFileId();
};
ostream& operator<<(ostream &os, const BeesFileId &bfi);
class BeesFileRange {
protected:
static mutex s_mutex;
mutable Fd m_fd;
mutable BeesFileId m_fid;
off_t m_begin, m_end;
mutable off_t m_file_size;
public:
BeesFileRange();
BeesFileRange(Fd fd, off_t begin, off_t end);
BeesFileRange(const BeesFileId &fid, off_t begin, off_t end);
BeesFileRange(const BeesBlockData &bbd);
operator BeesBlockData() const;
bool operator<(const BeesFileRange &that) const;
bool operator==(const BeesFileRange &that) const;
bool operator!=(const BeesFileRange &that) const;
bool empty() const;
bool is_same_file(const BeesFileRange &that) const;
bool overlaps(const BeesFileRange &that) const;
// If file ranges overlap, extends this to include that.
// Coalesce with empty bfr = non-empty bfr
bool coalesce(const BeesFileRange &that);
// Remove that from this, creating 0, 1, or 2 new objects
pair<BeesFileRange, BeesFileRange> subtract(const BeesFileRange &that) const;
off_t begin() const { return m_begin; }
off_t end() const { return m_end; }
off_t size() const;
// Lazy accessors
off_t file_size() const;
BeesFileId fid() const;
// Get the fd if there is one
Fd fd() const;
// Get the fd, opening it if necessary
Fd fd(const shared_ptr<BeesContext> &ctx) const;
BeesFileRange copy_closed() const;
// Is it defined?
operator bool() const { return !!m_fd || m_fid; }
// Make range larger
off_t grow_end(off_t delta);
off_t grow_begin(off_t delta);
friend ostream & operator<<(ostream &os, const BeesFileRange &bfr);
};
class BeesAddress {
public:
using Type = uint64_t;
private:
Type m_addr = ZERO;
bool magic_check(uint64_t flags);
public:
// Blocks with no physical address (not yet allocated, hole, or "other").
// PREALLOC blocks have a physical address so they're not magic enough to be handled here.
// Compressed blocks have a physical address but it's two-dimensional.
enum MagicValue {
ZERO, // BeesAddress uninitialized
DELALLOC, // delayed allocation
HOLE, // no extent present, no space allocated
UNUSABLE, // inline extent or unrecognized FIEMAP flags
LAST, // all further values are non-magic
};
BeesAddress(Type addr = ZERO) : m_addr(addr) {}
BeesAddress(MagicValue addr) : m_addr(addr) {}
BeesAddress& operator=(const BeesAddress &that) = default;
operator Type() const { return m_addr; }
bool operator==(const BeesAddress &that) const;
bool operator==(const MagicValue that) const { return *this == BeesAddress(that); }
bool operator!=(const BeesAddress &that) const { return !(*this == that); }
bool operator!=(const MagicValue that) const { return *this != BeesAddress(that); }
bool operator<(const BeesAddress &that) const;
static const Type c_offset_min = 1;
static const Type c_offset_max = BLOCK_SIZE_MAX_COMPRESSED_EXTENT / BLOCK_SIZE_CLONE;
// if this isn't 0x3f we will have problems
static const Type c_offset_mask = (c_offset_max - 1) | (c_offset_max);
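// Arithmetic check: c_offset_max = 131072 / 4096 = 32, so
// c_offset_mask = 31 | 32 = 0x3f as the comment above requires.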
static const Type c_compressed_mask = 1 << 11;
static const Type c_eof_mask = 1 << 10;
static const Type c_toxic_mask = 1 << 9;
static const Type c_all_mask = c_compressed_mask | c_eof_mask | c_offset_mask | c_toxic_mask;
bool is_compressed() const { return m_addr >= MagicValue::LAST && (m_addr & c_compressed_mask); }
bool has_compressed_offset() const { return m_addr >= MagicValue::LAST && (m_addr & c_compressed_mask) && (m_addr & c_offset_mask); }
bool is_toxic() const { return m_addr >= MagicValue::LAST && (m_addr & c_toxic_mask); }
bool is_unaligned_eof() const { return m_addr >= MagicValue::LAST && (m_addr & c_eof_mask); }
bool is_magic() const { return m_addr < MagicValue::LAST; }
Type get_compressed_offset() const;
Type get_physical_or_zero() const;
void set_toxic();
BeesAddress(int fd, off_t offset);
BeesAddress(int fd, off_t offset, shared_ptr<BeesContext> ctx);
BeesAddress(const Extent &e, off_t offset);
};
ostream & operator<<(ostream &os, const BeesAddress &ba);
class BeesStringFile {
Fd m_dir_fd;
string m_name;
size_t m_limit;
public:
BeesStringFile(Fd dir_fd, string name, size_t limit = 1024 * 1024);
string read();
void write(string contents);
};
class BeesHashTable {
shared_ptr<BeesContext> m_ctx;
public:
using HashType = uint64_t;
using AddrType = uint64_t;
struct Cell {
HashType e_hash;
AddrType e_addr;
Cell(const Cell &) = default;
Cell(HashType hash, AddrType addr) : e_hash(hash), e_addr(addr) { }
bool operator==(const Cell &e) const { return tie(e_hash, e_addr) == tie(e.e_hash, e.e_addr); }
bool operator!=(const Cell &e) const { return tie(e_hash, e_addr) != tie(e.e_hash, e.e_addr); }
bool operator<(const Cell &e) const { return tie(e_hash, e_addr) < tie(e.e_hash, e.e_addr); }
} __attribute__((packed));
private:
static const uint64_t c_cells_per_bucket = BLOCK_SIZE_HASHTAB_BUCKET / sizeof(Cell);
static const uint64_t c_buckets_per_extent = BLOCK_SIZE_HASHTAB_EXTENT / BLOCK_SIZE_HASHTAB_BUCKET;
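// With the sizes above: sizeof(Cell) == 16 (two packed uint64_t), so
// c_cells_per_bucket == 4096 / 16 == 256 and c_buckets_per_extent == 16 MiB / 4 KiB == 4096.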
public:
union Bucket {
Cell p_cells[c_cells_per_bucket];
uint8_t p_byte[BLOCK_SIZE_HASHTAB_BUCKET];
} __attribute__((packed));
union Extent {
Bucket p_buckets[BLOCK_SIZE_HASHTAB_EXTENT / BLOCK_SIZE_HASHTAB_BUCKET];
uint8_t p_byte[BLOCK_SIZE_HASHTAB_EXTENT];
} __attribute__((packed));
BeesHashTable(shared_ptr<BeesContext> ctx, string filename);
~BeesHashTable();
vector<Cell> find_cell(HashType hash);
bool push_random_hash_addr(HashType hash, AddrType addr);
void erase_hash_addr(HashType hash, AddrType addr);
bool push_front_hash_addr(HashType hash, AddrType addr);
void set_shared(bool shared);
private:
string m_filename;
Fd m_fd;
uint64_t m_size;
union {
void *m_void_ptr; // Save some casting
uint8_t *m_byte_ptr; // for pointer arithmetic
Cell *m_cell_ptr; // pointer to one table cell (entry)
Bucket *m_bucket_ptr; // all cells in one LRU unit
Extent *m_extent_ptr; // all buckets in one I/O unit
};
union {
void *m_void_ptr_end;
uint8_t *m_byte_ptr_end;
Cell *m_cell_ptr_end;
Bucket *m_bucket_ptr_end;
Extent *m_extent_ptr_end;
};
uint64_t m_buckets;
uint64_t m_extents;
uint64_t m_cells;
set<uint64_t> m_buckets_dirty;
set<uint64_t> m_buckets_missing;
BeesThread m_writeback_thread;
BeesThread m_prefetch_thread;
RateLimiter m_flush_rate_limit;
RateLimiter m_prefetch_rate_limit;
mutex m_extent_mutex;
mutex m_bucket_mutex;
condition_variable m_condvar;
set<HashType> m_toxic_hashes;
BeesStringFile m_stats_file;
LockSet<uint64_t> m_extent_lock_set;
DefaultBool m_shared;
void writeback_loop();
void prefetch_loop();
void try_mmap_flags(int flags);
pair<Cell *, Cell *> get_cell_range(HashType hash);
pair<uint8_t *, uint8_t *> get_extent_range(HashType hash);
void fetch_missing_extent(HashType hash);
void set_extent_dirty(HashType hash);
void flush_dirty_extents();
bool is_toxic_hash(HashType h) const;
bool using_shared_map() const { return false; }
BeesHashTable(const BeesHashTable &) = delete;
BeesHashTable &operator=(const BeesHashTable &) = delete;
};
ostream &operator<<(ostream &os, const BeesHashTable::Cell &bhte);
struct BeesCrawlState {
uint64_t m_root;
uint64_t m_objectid;
uint64_t m_offset;
uint64_t m_min_transid;
uint64_t m_max_transid;
time_t m_started;
BeesCrawlState();
bool operator<(const BeesCrawlState &that) const;
};
class BeesCrawl {
shared_ptr<BeesContext> m_ctx;
mutex m_mutex;
set<BeesFileRange> m_extents;
DefaultBool m_deferred;
mutex m_state_mutex;
BeesCrawlState m_state;
bool fetch_extents();
void fetch_extents_harder();
bool next_transid();
public:
BeesCrawl(shared_ptr<BeesContext> ctx, BeesCrawlState initial_state);
BeesFileRange peek_front();
BeesFileRange pop_front();
BeesCrawlState get_state();
void set_state(const BeesCrawlState &bcs);
};
class BeesRoots {
shared_ptr<BeesContext> m_ctx;
BeesStringFile m_crawl_state_file;
BeesCrawlState m_crawl_current;
map<uint64_t, shared_ptr<BeesCrawl>> m_root_crawl_map;
mutex m_mutex;
condition_variable m_condvar;
DefaultBool m_crawl_dirty;
Timer m_crawl_timer;
BeesThread m_crawl_thread;
BeesThread m_writeback_thread;
void insert_new_crawl();
void insert_root(const BeesCrawlState &bcs);
Fd open_root_nocache(uint64_t root);
Fd open_root_ino_nocache(uint64_t root, uint64_t ino);
uint64_t transid_min();
uint64_t transid_max();
void state_load();
void state_save();
void crawl_roots();
string crawl_state_filename() const;
BeesCrawlState crawl_state_get(uint64_t root);
void crawl_state_set_dirty();
void crawl_state_erase(const BeesCrawlState &bcs);
void crawl_thread();
void writeback_thread();
uint64_t next_root(uint64_t root = 0);
void current_state_set(const BeesCrawlState &bcs);
friend class BeesFdCache;
friend class BeesCrawl;
public:
BeesRoots(shared_ptr<BeesContext> ctx);
Fd open_root(uint64_t root);
Fd open_root_ino(uint64_t root, uint64_t ino);
Fd open_root_ino(const BeesFileId &bfi) { return open_root_ino(bfi.root(), bfi.ino()); }
};
struct BeesHash {
using Type = uint64_t;
BeesHash() : m_hash(0) { }
BeesHash(Type that) : m_hash(that) { }
operator Type() const { return m_hash; }
BeesHash& operator=(const Type that) { m_hash = that; return *this; }
private:
Type m_hash;
};
ostream & operator<<(ostream &os, const BeesHash &bh);
class BeesBlockData {
using Blob = vector<char>;
mutable Fd m_fd;
off_t m_offset;
off_t m_length;
mutable BeesAddress m_addr;
mutable Blob m_data;
mutable BeesHash m_hash;
mutable DefaultBool m_hash_done;
public:
// Constructor with the immutable fields
BeesBlockData(Fd fd, off_t offset, size_t read_length = BLOCK_SIZE_SUMS);
BeesBlockData();
// Non-lazy accessors
Fd fd() const { return m_fd; }
// Renaming
off_t begin() const { return m_offset; }
off_t end() const { return m_offset + m_length; }
off_t size() const { return m_length; }
bool empty() const { return !m_length; }
// Lazy accessors may modify const things
const Blob &data() const;
BeesHash hash() const;
BeesAddress addr() const;
bool is_data_zero() const;
bool is_data_equal(const BeesBlockData &that) const;
// Setters
BeesBlockData &addr(const BeesAddress &a);
friend ostream &operator<<(ostream &, const BeesBlockData &);
};
class BeesRangePair : public pair<BeesFileRange, BeesFileRange> {
public:
BeesRangePair(const BeesFileRange &src, const BeesFileRange &dst);
bool grow(shared_ptr<BeesContext> ctx, bool constrained);
BeesRangePair copy_closed() const;
bool operator<(const BeesRangePair &that) const;
friend ostream & operator<<(ostream &os, const BeesRangePair &brp);
};
class BeesWorkQueueBase {
string m_name;
protected:
static mutex s_mutex;
static set<BeesWorkQueueBase *> s_all_workers;
public:
virtual ~BeesWorkQueueBase();
BeesWorkQueueBase(const string &name);
string name() const;
void name(const string &new_name);
virtual size_t active_size() const = 0;
virtual list<string> peek_active(size_t count) const = 0;
static void for_each_work_queue(function<void(BeesWorkQueueBase *)> f);
};
template <class Task>
class BeesWorkQueue : public BeesWorkQueueBase {
WorkQueue<Task> m_active_queue;
public:
BeesWorkQueue(const string &name);
~BeesWorkQueue();
void push_active(const Task &task, size_t limit);
void push_active(const Task &task);
size_t active_size() const override;
list<string> peek_active(size_t count) const override;
Task pop();
};
class BeesTempFile {
shared_ptr<BeesContext> m_ctx;
Fd m_fd;
off_t m_end_offset;
void create();
void realign();
void resize(off_t new_end_offset);
public:
BeesTempFile(shared_ptr<BeesContext> ctx);
BeesFileRange make_hole(off_t count);
BeesFileRange make_copy(const BeesFileRange &src);
};
class BeesFdCache {
LRUCache<Fd, shared_ptr<BeesContext>, uint64_t> m_root_cache;
LRUCache<Fd, shared_ptr<BeesContext>, uint64_t, uint64_t> m_file_cache;
Timer m_root_cache_timer;
public:
BeesFdCache();
Fd open_root(shared_ptr<BeesContext> ctx, uint64_t root);
Fd open_root_ino(shared_ptr<BeesContext> ctx, uint64_t root, uint64_t ino);
void insert_root_ino(shared_ptr<BeesContext> ctx, Fd fd);
};
struct BeesResolveAddrResult {
BeesResolveAddrResult();
vector<BtrfsInodeOffsetRoot> m_biors;
DefaultBool m_is_toxic;
bool is_toxic() const { return m_is_toxic; }
};
class BeesContext : public enable_shared_from_this<BeesContext> {
shared_ptr<BeesContext> m_parent_ctx;
Fd m_home_fd;
shared_ptr<BeesFdCache> m_fd_cache;
shared_ptr<BeesHashTable> m_hash_table;
shared_ptr<BeesRoots> m_roots;
map<thread::id, shared_ptr<BeesTempFile>> m_tmpfiles;
LRUCache<BeesResolveAddrResult, BeesAddress> m_resolve_cache;
string m_root_path;
Fd m_root_fd;
string m_root_uuid;
mutable mutex m_blacklist_mutex;
set<BeesFileId> m_blacklist;
string m_uuid;
Timer m_total_timer;
void set_root_fd(Fd fd);
BeesResolveAddrResult resolve_addr_uncached(BeesAddress addr);
BeesFileRange scan_one_extent(const BeesFileRange &bfr, const Extent &e);
void rewrite_file_range(const BeesFileRange &bfr);
public:
BeesContext(shared_ptr<BeesContext> parent_ctx = nullptr);
void set_root_path(string path);
Fd root_fd() const { return m_root_fd; }
Fd home_fd() const { return m_home_fd; }
string root_path() const { return m_root_path; }
string root_uuid() const { return m_root_uuid; }
BeesFileRange scan_forward(const BeesFileRange &bfr);
BeesRangePair dup_extent(const BeesFileRange &src);
bool dedup(const BeesRangePair &brp);
void blacklist_add(const BeesFileId &fid);
bool is_blacklisted(const BeesFileId &fid) const;
BeesResolveAddrResult resolve_addr(BeesAddress addr);
void invalidate_addr(BeesAddress addr);
void dump_status();
void show_progress();
shared_ptr<BeesFdCache> fd_cache();
shared_ptr<BeesHashTable> hash_table();
shared_ptr<BeesRoots> roots();
shared_ptr<BeesTempFile> tmpfile();
const Timer &total_timer() const { return m_total_timer; }
// TODO: move the rest of the FD cache methods here
void insert_root_ino(Fd fd);
};
class BeesResolver {
shared_ptr<BeesContext> m_ctx;
BeesAddress m_addr;
vector<BtrfsInodeOffsetRoot> m_biors;
set<BeesFileRange> m_ranges;
unsigned m_bior_count;
// We found matching data, so we can dedup
DefaultBool m_found_data;
// We found matching data, so we *did* dedup
DefaultBool m_found_dup;
// We found matching hash, so the hash table is still correct
DefaultBool m_found_hash;
// We found matching physical address, so the hash table isn't totally wrong
DefaultBool m_found_addr;
// We found matching physical address, but data did not match
DefaultBool m_wrong_data;
// The whole thing is a placebo to avoid crippling btrfs performance bugs
DefaultBool m_is_toxic;
BeesFileRange chase_extent_ref(const BtrfsInodeOffsetRoot &bior, BeesBlockData &needle_bbd);
BeesBlockData adjust_offset(const BeesFileRange &haystack, const BeesBlockData &needle);
void find_matches(bool just_one, BeesBlockData &bbd);
// FIXME: Do we need these? We probably always have at least one BBD
BeesFileRange chase_extent_ref(const BtrfsInodeOffsetRoot &bior, BeesHash hash);
BeesBlockData adjust_offset(const BeesFileRange &haystack, bool inexact, BeesHash needle);
void find_matches(bool just_one, BeesHash hash);
public:
BeesResolver(shared_ptr<BeesContext> ctx, BeesAddress addr);
BeesAddress addr(BeesAddress new_addr);
// visitor returns true to stop loop, false to continue
bool for_each_extent_ref(BeesBlockData bbd, function<bool(const BeesFileRange &bfr)> visitor);
set<BeesFileRange> find_all_matches(BeesBlockData &bbd);
set<BeesFileRange> find_all_matches(BeesHash hash);
// TODO: Replace these with "for_each_extent_ref"
BeesFileRange find_one_match(BeesBlockData &bbd);
BeesFileRange find_one_match(BeesHash hash);
void replace_src(const BeesFileRange &src_bfr);
BeesFileRange replace_dst(const BeesFileRange &dst_bfr);
bool found_addr() const { return m_found_addr; }
bool found_data() const { return m_found_data; }
bool found_dup() const { return m_found_dup; }
bool found_hash() const { return m_found_hash; }
bool is_toxic() const { return m_is_toxic; }
size_t count() const { return m_bior_count; }
BeesAddress addr() const { return m_addr; }
bool operator<(const BeesResolver &that) const;
};
class BeesTooLong : public Timer {
using func_type = function<void(ostream &)>;
double m_limit;
func_type m_func;
public:
BeesTooLong(const func_type &func = [](ostream &os) { os << __PRETTY_FUNCTION__; }, double limit = BEES_TOO_LONG);
BeesTooLong(const string &s, double limit = BEES_TOO_LONG);
BeesTooLong &operator=(const func_type &s);
~BeesTooLong();
void check() const;
};
// And now, a giant pile of extern declarations
string pretty(double d);
extern RateLimiter bees_info_rate_limit;
void bees_sync(int fd);
string format_time(time_t t);
#endif

52
src/fiemap.cc Normal file
View File

@@ -0,0 +1,52 @@
#include "crucible/fd.h"
#include "crucible/fs.h"
#include "crucible/error.h"
#include "crucible/string.h"
#include <iostream>
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>
using namespace crucible;
using namespace std;
int
main(int argc, char **argv)
{
catch_all([&]() {
THROW_CHECK1(invalid_argument, argc, argc > 1);
string filename = argv[1];
cout << "File: " << filename << endl;
Fd fd = open_or_die(filename, O_RDONLY);
Fiemap fm;
fm.m_max_count = 100;
if (argc > 2) { fm.fm_start = stoull(argv[2], nullptr, 0); }
if (argc > 3) { fm.fm_length = stoull(argv[3], nullptr, 0); }
if (argc > 4) { fm.fm_flags = stoull(argv[4], nullptr, 0); }
fm.fm_length = min(fm.fm_length, FIEMAP_MAX_OFFSET - fm.fm_start);
uint64_t stop_at = fm.fm_start + fm.fm_length;
uint64_t last_byte = fm.fm_start;
do {
fm.do_ioctl(fd);
// cerr << fm;
uint64_t last_logical = FIEMAP_MAX_OFFSET;
for (auto &extent : fm.m_extents) {
if (extent.fe_logical > last_byte) {
cout << "Log " << to_hex(last_byte) << ".." << to_hex(extent.fe_logical) << " Hole" << endl;
}
cout << "Log " << to_hex(extent.fe_logical) << ".." << to_hex(extent.fe_logical + extent.fe_length)
<< " Phy " << to_hex(extent.fe_physical) << ".." << to_hex(extent.fe_physical + extent.fe_length)
<< " Flags " << fiemap_extent_flags_ntoa(extent.fe_flags) << endl;
last_logical = extent.fe_logical + extent.fe_length;
last_byte = last_logical;
}
fm.fm_start = last_logical;
} while (fm.fm_start < stop_at);
});
exit(EXIT_SUCCESS);
}
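// Example invocation (hypothetical path):
//   ./fiemap /mnt/data/file 0 0x100000
// prints the logical-to-physical extent map for the first 1 MiB of the file.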

40
src/fiewalk.cc Normal file
View File

@@ -0,0 +1,40 @@
#include "crucible/extentwalker.h"
#include "crucible/error.h"
#include "crucible/string.h"
#include <iostream>
#include <fcntl.h>
#include <unistd.h>
using namespace crucible;
using namespace std;
int
main(int argc, char **argv)
{
catch_all([&]() {
THROW_CHECK1(invalid_argument, argc, argc > 1);
string filename = argv[1];
cout << "File: " << filename << endl;
Fd fd = open_or_die(filename, O_RDONLY);
BtrfsExtentWalker ew(fd);
off_t pos = 0;
if (argc > 2) { pos = stoull(argv[2], nullptr, 0); }
ew.seek(pos);
do {
// cout << "\n\n>>>" << ew.current() << "<<<\n\n" << endl;
cout << ew.current() << endl;
} while (ew.next());
#if 0
cout << "\n\n\nAnd now, backwards...\n\n\n" << endl;
do {
cout << "\n\n>>>" << ew.current() << "<<<\n\n" << endl;
} while (ew.prev());
cout << "\n\n\nDone!\n\n\n" << endl;
#endif
});
exit(EXIT_SUCCESS);
}
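// Example invocation (hypothetical path):
//   ./fiewalk /mnt/data/file 0x10000
// walks the file's extents with BtrfsExtentWalker starting at offset 0x10000.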