mirror of
https://github.com/Zygo/bees.git
synced 2025-05-18 05:45:45 +02:00
Faster and more reliable toxic extent detection means we can now be much less paranoid about creating toxic extents. The paranoia has significant impact on dedupe hit rates because every extent that contains even one toxic hash is abandoned. The preloaded toxic hashes were chosen because they occur more frequently than any other block contents in typical filesystem data. The combination of these resulted in as much as 30% of duplicate extents being left untouched. Remove the preloaded toxic extent blacklist, and rely on the new kernel-CPU-usage-based workaround instead. Signed-off-by: Zygo Blaxell <bees@furryterror.org>
717 lines
22 KiB
C++
717 lines
22 KiB
C++
#include "bees.h"
|
|
|
|
#include "crucible/crc64.h"
|
|
#include "crucible/string.h"
|
|
|
|
#include <algorithm>
|
|
#include <random>
|
|
|
|
#include <sys/mman.h>
|
|
|
|
using namespace crucible;
|
|
using namespace std;
|
|
|
|
ostream &
|
|
operator<<(ostream &os, const BeesHash &bh)
|
|
{
|
|
return os << to_hex(BeesHash::Type(bh));
|
|
}
|
|
|
|
ostream &
|
|
operator<<(ostream &os, const BeesHashTable::Cell &bhte)
|
|
{
|
|
return os << "BeesHashTable::Cell { hash = " << BeesHash(bhte.e_hash) << ", addr = "
|
|
<< BeesAddress(bhte.e_addr) << " }";
|
|
}
|
|
|
|
#if 0
|
|
static
|
|
void
|
|
dump_bucket_locked(BeesHashTable::Cell *p, BeesHashTable::Cell *q)
|
|
{
|
|
for (auto i = p; i < q; ++i) {
|
|
BEESLOG("Entry " << i - p << " " << *i);
|
|
}
|
|
}
|
|
#endif
|
|
|
|
const bool VERIFY_CLEARS_BUGS = false;
|
|
|
|
bool
|
|
verify_cell_range(BeesHashTable::Cell *p, BeesHashTable::Cell *q, bool clear_bugs = VERIFY_CLEARS_BUGS)
|
|
{
|
|
// Must be called while holding m_bucket_mutex
|
|
bool bugs_found = false;
|
|
set<BeesHashTable::Cell> seen_it;
|
|
for (BeesHashTable::Cell *cell = p; cell < q; ++cell) {
|
|
if (cell->e_addr && cell->e_addr < 0x1000) {
|
|
BEESCOUNT(bug_hash_magic_addr);
|
|
BEESLOGDEBUG("Bad hash table address hash " << to_hex(cell->e_hash) << " addr " << to_hex(cell->e_addr));
|
|
if (clear_bugs) {
|
|
cell->e_addr = 0;
|
|
cell->e_hash = 0;
|
|
}
|
|
bugs_found = true;
|
|
}
|
|
if (cell->e_addr && !seen_it.insert(*cell).second) {
|
|
BEESCOUNT(bug_hash_duplicate_cell);
|
|
// BEESLOGDEBUG("Duplicate hash table entry:\nthis = " << *cell << "\nold = " << *seen_it.find(*cell));
|
|
BEESLOGDEBUG("Duplicate hash table entry: " << *cell);
|
|
if (clear_bugs) {
|
|
cell->e_addr = 0;
|
|
cell->e_hash = 0;
|
|
}
|
|
bugs_found = true;
|
|
}
|
|
}
|
|
return bugs_found;
|
|
}
|
|
|
|
pair<BeesHashTable::Cell *, BeesHashTable::Cell *>
|
|
BeesHashTable::get_cell_range(HashType hash)
|
|
{
|
|
THROW_CHECK1(runtime_error, m_buckets, m_buckets > 0);
|
|
THROW_CHECK1(runtime_error, m_bucket_ptr, m_bucket_ptr != nullptr);
|
|
Bucket *pp = &m_bucket_ptr[hash % m_buckets];
|
|
Cell *bp = pp[0].p_cells;
|
|
Cell *ep = pp[1].p_cells;
|
|
THROW_CHECK2(out_of_range, m_cell_ptr, bp, bp >= m_cell_ptr);
|
|
THROW_CHECK2(out_of_range, m_cell_ptr_end, ep, ep <= m_cell_ptr_end);
|
|
return make_pair(bp, ep);
|
|
}
|
|
|
|
pair<uint8_t *, uint8_t *>
|
|
BeesHashTable::get_extent_range(HashType hash)
|
|
{
|
|
THROW_CHECK1(runtime_error, m_buckets, m_buckets > 0);
|
|
THROW_CHECK1(runtime_error, m_bucket_ptr, m_bucket_ptr != nullptr);
|
|
Extent *iop = &m_extent_ptr[ (hash % m_buckets) / c_buckets_per_extent ];
|
|
uint8_t *bp = iop[0].p_byte;
|
|
uint8_t *ep = iop[1].p_byte;
|
|
THROW_CHECK2(out_of_range, m_byte_ptr, bp, bp >= m_byte_ptr);
|
|
THROW_CHECK2(out_of_range, m_byte_ptr_end, ep, ep <= m_byte_ptr_end);
|
|
return make_pair(bp, ep);
|
|
}
|
|
|
|
bool
|
|
BeesHashTable::flush_dirty_extent(uint64_t extent_index)
|
|
{
|
|
BEESNOTE("flushing extent #" << extent_index << " of " << m_extents << " extents");
|
|
|
|
auto lock = lock_extent_by_index(extent_index);
|
|
|
|
// Not dirty, nothing to do
|
|
if (!m_extent_metadata.at(extent_index).m_dirty) {
|
|
return false;
|
|
}
|
|
|
|
bool wrote_extent = false;
|
|
|
|
catch_all([&]() {
|
|
uint8_t *dirty_extent = m_extent_ptr[extent_index].p_byte;
|
|
uint8_t *dirty_extent_end = m_extent_ptr[extent_index + 1].p_byte;
|
|
THROW_CHECK1(out_of_range, dirty_extent, dirty_extent >= m_byte_ptr);
|
|
THROW_CHECK1(out_of_range, dirty_extent_end, dirty_extent_end <= m_byte_ptr_end);
|
|
THROW_CHECK2(out_of_range, dirty_extent_end, dirty_extent, dirty_extent_end - dirty_extent == BLOCK_SIZE_HASHTAB_EXTENT);
|
|
BEESTOOLONG("pwrite(fd " << m_fd << " '" << name_fd(m_fd)<< "', length " << to_hex(dirty_extent_end - dirty_extent) << ", offset " << to_hex(dirty_extent - m_byte_ptr) << ")");
|
|
// Copy the extent because we might be stuck writing for a while
|
|
vector<uint8_t> extent_copy(dirty_extent, dirty_extent_end);
|
|
|
|
// Mark extent non-dirty while we still hold the lock
|
|
m_extent_metadata.at(extent_index).m_dirty = false;
|
|
|
|
// Release the lock
|
|
lock.unlock();
|
|
|
|
// Write the extent (or not)
|
|
pwrite_or_die(m_fd, extent_copy, dirty_extent - m_byte_ptr);
|
|
BEESCOUNT(hash_extent_out);
|
|
|
|
wrote_extent = true;
|
|
});
|
|
|
|
BEESNOTE("flush rate limited after extent #" << extent_index << " of " << m_extents << " extents");
|
|
m_flush_rate_limit.sleep_for(BLOCK_SIZE_HASHTAB_EXTENT);
|
|
return wrote_extent;
|
|
}
|
|
|
|
void
|
|
BeesHashTable::flush_dirty_extents()
|
|
{
|
|
THROW_CHECK1(runtime_error, m_buckets, m_buckets > 0);
|
|
|
|
uint64_t wrote_extents = 0;
|
|
for (size_t extent_index = 0; extent_index < m_extents; ++extent_index) {
|
|
if (flush_dirty_extent(extent_index)) {
|
|
++wrote_extents;
|
|
}
|
|
}
|
|
|
|
BEESNOTE("idle after writing " << wrote_extents << " of " << m_extents << " extents");
|
|
unique_lock<mutex> lock(m_dirty_mutex);
|
|
m_dirty_condvar.wait(lock);
|
|
}
|
|
|
|
void
|
|
BeesHashTable::set_extent_dirty_locked(uint64_t extent_index)
|
|
{
|
|
// Must already be locked
|
|
m_extent_metadata.at(extent_index).m_dirty = true;
|
|
|
|
// Signal writeback thread
|
|
unique_lock<mutex> dirty_lock(m_dirty_mutex);
|
|
m_dirty_condvar.notify_one();
|
|
}
|
|
|
|
void
|
|
BeesHashTable::writeback_loop()
|
|
{
|
|
while (true) {
|
|
flush_dirty_extents();
|
|
}
|
|
}
|
|
|
|
static
|
|
string
|
|
percent(size_t num, size_t den)
|
|
{
|
|
if (den) {
|
|
return astringprintf("%u%%", num * 100 / den);
|
|
} else {
|
|
return "--%";
|
|
}
|
|
}
|
|
|
|
void
|
|
BeesHashTable::prefetch_loop()
|
|
{
|
|
bool not_locked = true;
|
|
while (true) {
|
|
size_t width = 64;
|
|
vector<size_t> occupancy(width, 0);
|
|
size_t occupied_count = 0;
|
|
size_t total_count = 0;
|
|
size_t compressed_count = 0;
|
|
size_t compressed_offset_count = 0;
|
|
size_t toxic_count = 0;
|
|
size_t unaligned_eof_count = 0;
|
|
|
|
for (uint64_t ext = 0; ext < m_extents; ++ext) {
|
|
BEESNOTE("prefetching hash table extent #" << ext << " of " << m_extents);
|
|
catch_all([&]() {
|
|
fetch_missing_extent_by_index(ext);
|
|
|
|
BEESNOTE("analyzing hash table extent #" << ext << " of " << m_extents);
|
|
bool duplicate_bugs_found = false;
|
|
auto lock = lock_extent_by_index(ext);
|
|
for (Bucket *bucket = m_extent_ptr[ext].p_buckets; bucket < m_extent_ptr[ext + 1].p_buckets; ++bucket) {
|
|
if (verify_cell_range(bucket[0].p_cells, bucket[1].p_cells)) {
|
|
duplicate_bugs_found = true;
|
|
}
|
|
size_t this_bucket_occupied_count = 0;
|
|
for (Cell *cell = bucket[0].p_cells; cell < bucket[1].p_cells; ++cell) {
|
|
if (cell->e_addr) {
|
|
++this_bucket_occupied_count;
|
|
BeesAddress a(cell->e_addr);
|
|
if (a.is_compressed()) {
|
|
++compressed_count;
|
|
if (a.has_compressed_offset()) {
|
|
++compressed_offset_count;
|
|
}
|
|
}
|
|
if (a.is_toxic()) {
|
|
++toxic_count;
|
|
}
|
|
if (a.is_unaligned_eof()) {
|
|
++unaligned_eof_count;
|
|
}
|
|
}
|
|
++total_count;
|
|
}
|
|
++occupancy.at(this_bucket_occupied_count * width / (1 + c_cells_per_bucket) );
|
|
// Count these instead of calculating the number so we get better stats in case of exceptions
|
|
occupied_count += this_bucket_occupied_count;
|
|
}
|
|
if (duplicate_bugs_found) {
|
|
set_extent_dirty_locked(ext);
|
|
}
|
|
});
|
|
}
|
|
|
|
BEESNOTE("calculating hash table statistics");
|
|
|
|
vector<string> histogram;
|
|
vector<size_t> thresholds;
|
|
size_t threshold = 1;
|
|
bool threshold_exceeded = false;
|
|
do {
|
|
threshold_exceeded = false;
|
|
histogram.push_back(string(width, ' '));
|
|
thresholds.push_back(threshold);
|
|
for (size_t x = 0; x < width; ++x) {
|
|
if (occupancy.at(x) >= threshold) {
|
|
histogram.back().at(x) = '#';
|
|
threshold_exceeded = true;
|
|
}
|
|
}
|
|
threshold *= 2;
|
|
} while (threshold_exceeded);
|
|
|
|
ostringstream out;
|
|
size_t count = histogram.size();
|
|
bool first_line = true;
|
|
for (auto it = histogram.rbegin(); it != histogram.rend(); ++it) {
|
|
out << *it << " " << thresholds.at(--count);
|
|
if (first_line) {
|
|
first_line = false;
|
|
out << " pages";
|
|
}
|
|
out << "\n";
|
|
}
|
|
|
|
size_t uncompressed_count = occupied_count - compressed_offset_count;
|
|
|
|
ostringstream graph_blob;
|
|
|
|
graph_blob << "Now: " << format_time(time(NULL)) << "\n";
|
|
graph_blob << "Uptime: " << m_ctx->total_timer().age() << " seconds\n";
|
|
graph_blob << "Version: " << BEES_VERSION << "\n";
|
|
|
|
graph_blob
|
|
<< "\nHash table page occupancy histogram (" << occupied_count << "/" << total_count << " cells occupied, " << (occupied_count * 100 / total_count) << "%)\n"
|
|
<< out.str() << "0% | 25% | 50% | 75% | 100% page fill\n"
|
|
<< "compressed " << compressed_count << " (" << percent(compressed_count, occupied_count) << ")\n"
|
|
<< "uncompressed " << uncompressed_count << " (" << percent(uncompressed_count, occupied_count) << ")"
|
|
<< " unaligned_eof " << unaligned_eof_count << " (" << percent(unaligned_eof_count, occupied_count) << ")"
|
|
<< " toxic " << toxic_count << " (" << percent(toxic_count, occupied_count) << ")";
|
|
|
|
graph_blob << "\n\n";
|
|
|
|
graph_blob << "TOTAL:\n";
|
|
auto thisStats = BeesStats::s_global;
|
|
graph_blob << "\t" << thisStats << "\n";
|
|
|
|
graph_blob << "\nRATES:\n";
|
|
auto avg_rates = thisStats / m_ctx->total_timer().age();
|
|
graph_blob << "\t" << avg_rates << "\n";
|
|
|
|
BEESLOGINFO(graph_blob.str());
|
|
catch_all([&]() {
|
|
m_stats_file.write(graph_blob.str());
|
|
});
|
|
|
|
if (not_locked) {
|
|
// Always do the mlock, whether shared or not
|
|
THROW_CHECK1(runtime_error, m_size, m_size > 0);
|
|
BEESLOGINFO("mlock(" << pretty(m_size) << ")...");
|
|
Timer lock_time;
|
|
catch_all([&]() {
|
|
BEESNOTE("mlock " << pretty(m_size));
|
|
DIE_IF_NON_ZERO(mlock(m_byte_ptr, m_size));
|
|
});
|
|
BEESLOGINFO("mlock(" << pretty(m_size) << ") done in " << lock_time << " sec");
|
|
not_locked = false;
|
|
}
|
|
|
|
BEESNOTE("idle " << BEES_HASH_TABLE_ANALYZE_INTERVAL << "s");
|
|
nanosleep(BEES_HASH_TABLE_ANALYZE_INTERVAL);
|
|
}
|
|
}
|
|
|
|
size_t
|
|
BeesHashTable::hash_to_extent_index(HashType hash)
|
|
{
|
|
auto pr = get_extent_range(hash);
|
|
uint64_t extent_index = reinterpret_cast<const Extent *>(pr.first) - m_extent_ptr;
|
|
THROW_CHECK2(runtime_error, extent_index, m_extents, extent_index < m_extents);
|
|
return extent_index;
|
|
}
|
|
|
|
BeesHashTable::ExtentMetaData::ExtentMetaData() :
|
|
m_mutex_ptr(make_shared<mutex>())
|
|
{
|
|
}
|
|
|
|
unique_lock<mutex>
|
|
BeesHashTable::lock_extent_by_index(uint64_t extent_index)
|
|
{
|
|
THROW_CHECK2(out_of_range, extent_index, m_extents, extent_index < m_extents);
|
|
return unique_lock<mutex>(*m_extent_metadata.at(extent_index).m_mutex_ptr);
|
|
}
|
|
|
|
unique_lock<mutex>
|
|
BeesHashTable::lock_extent_by_hash(HashType hash)
|
|
{
|
|
BEESTOOLONG("fetch_missing_extent for hash " << to_hex(hash));
|
|
return lock_extent_by_index(hash_to_extent_index(hash));
|
|
}
|
|
|
|
void
|
|
BeesHashTable::fetch_missing_extent_by_index(uint64_t extent_index)
|
|
{
|
|
BEESNOTE("checking hash extent #" << extent_index << " of " << m_extents << " extents");
|
|
auto lock = lock_extent_by_index(extent_index);
|
|
if (!m_extent_metadata.at(extent_index).m_missing) {
|
|
return;
|
|
}
|
|
|
|
// OK we have to read this extent
|
|
BEESNOTE("fetching hash extent #" << extent_index << " of " << m_extents << " extents");
|
|
BEESTRACE("Fetching hash extent #" << extent_index << " of " << m_extents << " extents");
|
|
BEESTOOLONG("Fetching hash extent #" << extent_index << " of " << m_extents << " extents");
|
|
|
|
uint8_t *dirty_extent = m_extent_ptr[extent_index].p_byte;
|
|
uint8_t *dirty_extent_end = m_extent_ptr[extent_index + 1].p_byte;
|
|
|
|
// If the read fails don't retry, just go with whatever data we have
|
|
m_extent_metadata.at(extent_index).m_missing = false;
|
|
|
|
catch_all([&]() {
|
|
BEESTOOLONG("pread(fd " << m_fd << " '" << name_fd(m_fd)<< "', length " << to_hex(dirty_extent_end - dirty_extent) << ", offset " << to_hex(dirty_extent - m_byte_ptr) << ")");
|
|
pread_or_die(m_fd, dirty_extent, dirty_extent_end - dirty_extent, dirty_extent - m_byte_ptr);
|
|
});
|
|
|
|
// Only count extents successfully read
|
|
BEESCOUNT(hash_extent_in);
|
|
}
|
|
|
|
void
|
|
BeesHashTable::fetch_missing_extent_by_hash(HashType hash)
|
|
{
|
|
uint64_t extent_index = hash_to_extent_index(hash);
|
|
BEESNOTE("waiting to fetch hash extent #" << extent_index << " of " << m_extents << " extents");
|
|
|
|
fetch_missing_extent_by_index(extent_index);
|
|
}
|
|
|
|
vector<BeesHashTable::Cell>
|
|
BeesHashTable::find_cell(HashType hash)
|
|
{
|
|
fetch_missing_extent_by_hash(hash);
|
|
BEESTOOLONG("find_cell hash " << BeesHash(hash));
|
|
vector<Cell> rv;
|
|
auto lock = lock_extent_by_hash(hash);
|
|
auto er = get_cell_range(hash);
|
|
// FIXME: Weed out zero addresses in the table due to earlier bugs
|
|
copy_if(er.first, er.second, back_inserter(rv), [=](const Cell &ip) { return ip.e_hash == hash && ip.e_addr >= 0x1000; });
|
|
BEESCOUNT(hash_lookup);
|
|
return rv;
|
|
}
|
|
|
|
// Move an entry to the end of the list. Used after an attempt to resolve
|
|
// an address in the hash table fails. Probably more correctly called
|
|
// push_back_hash_addr, except it never inserts. Shared hash tables
|
|
// never erase anything, since there is no way to tell if an entry is
|
|
// out of date or just belonging to the wrong filesystem.
|
|
void
|
|
BeesHashTable::erase_hash_addr(HashType hash, AddrType addr)
|
|
{
|
|
fetch_missing_extent_by_hash(hash);
|
|
BEESTOOLONG("erase hash " << to_hex(hash) << " addr " << addr);
|
|
auto lock = lock_extent_by_hash(hash);
|
|
auto er = get_cell_range(hash);
|
|
Cell mv(hash, addr);
|
|
Cell *ip = find(er.first, er.second, mv);
|
|
bool found = (ip < er.second);
|
|
if (found) {
|
|
// Lookups on invalid addresses really hurt us. Kill it with fire!
|
|
*ip = Cell(0, 0);
|
|
set_extent_dirty_locked(hash_to_extent_index(hash));
|
|
BEESCOUNT(hash_erase);
|
|
#if 0
|
|
if (verify_cell_range(er.first, er.second)) {
|
|
BEESLOGDEBUG("while erasing hash " << hash << " addr " << addr);
|
|
}
|
|
#endif
|
|
}
|
|
}
|
|
|
|
// If entry is already present in list, move it to the front of the
|
|
// list without dropping any entries, and return true. If entry is not
|
|
// present in list, insert it at the front of the list, possibly dropping
|
|
// the last entry in the list, and return false. Used to move duplicate
|
|
// hash blocks to the front of the list.
|
|
bool
|
|
BeesHashTable::push_front_hash_addr(HashType hash, AddrType addr)
|
|
{
|
|
fetch_missing_extent_by_hash(hash);
|
|
BEESTOOLONG("push_front_hash_addr hash " << BeesHash(hash) <<" addr " << BeesAddress(addr));
|
|
auto lock = lock_extent_by_hash(hash);
|
|
auto er = get_cell_range(hash);
|
|
Cell mv(hash, addr);
|
|
Cell *ip = find(er.first, er.second, mv);
|
|
bool found = (ip < er.second);
|
|
if (!found) {
|
|
// If no match found, get rid of an empty space instead
|
|
// If no empty spaces, ip will point to end
|
|
ip = find(er.first, er.second, Cell(0, 0));
|
|
}
|
|
if (ip > er.first) {
|
|
// Delete matching entry, first empty entry,
|
|
// or last entry whether empty or not
|
|
// move_backward(er.first, ip - 1, ip);
|
|
auto sp = ip;
|
|
auto dp = ip;
|
|
--sp;
|
|
// If we are deleting the last entry then don't copy it
|
|
if (ip == er.second) {
|
|
--sp;
|
|
--dp;
|
|
BEESCOUNT(hash_evict);
|
|
}
|
|
while (dp > er.first) {
|
|
*dp-- = *sp--;
|
|
}
|
|
}
|
|
// There is now a space at the front, insert there if different
|
|
if (er.first[0] != mv) {
|
|
er.first[0] = mv;
|
|
set_extent_dirty_locked(hash_to_extent_index(hash));
|
|
BEESCOUNT(hash_front);
|
|
}
|
|
#if 0
|
|
if (verify_cell_range(er.first, er.second)) {
|
|
BEESLOGDEBUG("while push_fronting hash " << hash << " addr " << addr);
|
|
}
|
|
#endif
|
|
return found;
|
|
}
|
|
|
|
// If entry is already present in list, returns true and does not
|
|
// modify list. If entry is not present in list, returns false and
|
|
// inserts at a random position in the list, possibly evicting the entry
|
|
// at the end of the list. Used to insert new unique (not-yet-duplicate)
|
|
// blocks in random order.
|
|
bool
|
|
BeesHashTable::push_random_hash_addr(HashType hash, AddrType addr)
|
|
{
|
|
fetch_missing_extent_by_hash(hash);
|
|
BEESTOOLONG("push_random_hash_addr hash " << BeesHash(hash) << " addr " << BeesAddress(addr));
|
|
auto lock = lock_extent_by_hash(hash);
|
|
auto er = get_cell_range(hash);
|
|
Cell mv(hash, addr);
|
|
Cell *ip = find(er.first, er.second, mv);
|
|
bool found = (ip < er.second);
|
|
|
|
thread_local default_random_engine generator;
|
|
thread_local uniform_int_distribution<int> distribution(0, c_cells_per_bucket - 1);
|
|
auto pos = distribution(generator);
|
|
|
|
int case_cond = 0;
|
|
#if 0
|
|
vector<Cell> saved(er.first, er.second);
|
|
#endif
|
|
|
|
if (found) {
|
|
// If hash already exists after pos, swap with pos
|
|
if (ip > er.first + pos) {
|
|
|
|
// move_backward(er.first + pos, ip - 1, ip);
|
|
auto sp = ip;
|
|
auto dp = ip;
|
|
--sp;
|
|
while (dp > er.first + pos) {
|
|
*dp-- = *sp--;
|
|
}
|
|
*dp = mv;
|
|
BEESCOUNT(hash_bump);
|
|
case_cond = 1;
|
|
goto ret_dirty;
|
|
}
|
|
// Hash already exists before (or at) pos, leave it there
|
|
BEESCOUNT(hash_already);
|
|
case_cond = 2;
|
|
goto ret;
|
|
}
|
|
|
|
// Find an empty space to back of pos
|
|
for (ip = er.first + pos; ip < er.second; ++ip) {
|
|
if (*ip == Cell(0, 0)) {
|
|
*ip = mv;
|
|
case_cond = 3;
|
|
goto ret_dirty;
|
|
}
|
|
}
|
|
|
|
// Find an empty space to front of pos
|
|
// if there is anything to front of pos
|
|
if (pos > 0) {
|
|
for (ip = er.first + pos - 1; ip >= er.first; --ip) {
|
|
if (*ip == Cell(0, 0)) {
|
|
*ip = mv;
|
|
case_cond = 4;
|
|
goto ret_dirty;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Evict something and insert at pos
|
|
move_backward(er.first + pos, er.second - 1, er.second);
|
|
er.first[pos] = mv;
|
|
BEESCOUNT(hash_evict);
|
|
case_cond = 5;
|
|
ret_dirty:
|
|
BEESCOUNT(hash_insert);
|
|
set_extent_dirty_locked(hash_to_extent_index(hash));
|
|
ret:
|
|
#if 0
|
|
if (verify_cell_range(er.first, er.second, false)) {
|
|
BEESLOG("while push_randoming (case " << case_cond << ") pos " << pos
|
|
<< " ip " << (ip - er.first) << " " << mv);
|
|
// dump_bucket_locked(saved.data(), saved.data() + saved.size());
|
|
// dump_bucket_locked(er.first, er.second);
|
|
}
|
|
#else
|
|
(void)case_cond;
|
|
#endif
|
|
return found;
|
|
}
|
|
|
|
void
|
|
BeesHashTable::try_mmap_flags(int flags)
|
|
{
|
|
if (!m_cell_ptr) {
|
|
THROW_CHECK1(out_of_range, m_size, m_size > 0);
|
|
Timer map_time;
|
|
catch_all([&]() {
|
|
BEESLOGINFO("mapping hash table size " << m_size << " with flags " << mmap_flags_ntoa(flags));
|
|
void *ptr = mmap_or_die(nullptr, m_size, PROT_READ | PROT_WRITE, flags, flags & MAP_ANONYMOUS ? -1 : int(m_fd), 0);
|
|
BEESLOGINFO("mmap done in " << map_time << " sec");
|
|
m_cell_ptr = static_cast<Cell *>(ptr);
|
|
void *ptr_end = static_cast<uint8_t *>(ptr) + m_size;
|
|
m_cell_ptr_end = static_cast<Cell *>(ptr_end);
|
|
});
|
|
}
|
|
}
|
|
|
|
void
|
|
BeesHashTable::open_file()
|
|
{
|
|
// OK open hash table
|
|
BEESNOTE("opening hash table '" << m_filename << "' target size " << m_size << " (" << pretty(m_size) << ")");
|
|
|
|
// Try to open existing hash table
|
|
Fd new_fd = openat(m_ctx->home_fd(), m_filename.c_str(), FLAGS_OPEN_FILE_RW, 0700);
|
|
|
|
// If that doesn't work, try to make a new one
|
|
if (!new_fd) {
|
|
string tmp_filename = m_filename + ".tmp";
|
|
BEESNOTE("creating new hash table '" << tmp_filename << "'");
|
|
BEESLOGINFO("Creating new hash table '" << tmp_filename << "'");
|
|
unlinkat(m_ctx->home_fd(), tmp_filename.c_str(), 0);
|
|
new_fd = openat_or_die(m_ctx->home_fd(), tmp_filename, FLAGS_CREATE_FILE, 0700);
|
|
BEESNOTE("truncating new hash table '" << tmp_filename << "' size " << m_size << " (" << pretty(m_size) << ")");
|
|
BEESLOGINFO("Truncating new hash table '" << tmp_filename << "' size " << m_size << " (" << pretty(m_size) << ")");
|
|
ftruncate_or_die(new_fd, m_size);
|
|
BEESNOTE("truncating new hash table '" << tmp_filename << "' -> '" << m_filename << "'");
|
|
BEESLOGINFO("Truncating new hash table '" << tmp_filename << "' -> '" << m_filename << "'");
|
|
renameat_or_die(m_ctx->home_fd(), tmp_filename, m_ctx->home_fd(), m_filename);
|
|
}
|
|
|
|
Stat st(new_fd);
|
|
off_t new_size = st.st_size;
|
|
|
|
THROW_CHECK1(invalid_argument, new_size, new_size > 0);
|
|
THROW_CHECK1(invalid_argument, new_size, (new_size % BLOCK_SIZE_HASHTAB_EXTENT) == 0);
|
|
m_size = new_size;
|
|
m_fd = new_fd;
|
|
}
|
|
|
|
BeesHashTable::BeesHashTable(shared_ptr<BeesContext> ctx, string filename, off_t size) :
|
|
m_ctx(ctx),
|
|
m_size(0),
|
|
m_void_ptr(nullptr),
|
|
m_void_ptr_end(nullptr),
|
|
m_buckets(0),
|
|
m_cells(0),
|
|
m_writeback_thread("hash_writeback"),
|
|
m_prefetch_thread("hash_prefetch"),
|
|
m_flush_rate_limit(BEES_FLUSH_RATE),
|
|
m_stats_file(m_ctx->home_fd(), "beesstats.txt")
|
|
{
|
|
// Sanity checks to protect the implementation from its weaknesses
|
|
THROW_CHECK2(invalid_argument, BLOCK_SIZE_HASHTAB_BUCKET, BLOCK_SIZE_HASHTAB_EXTENT, (BLOCK_SIZE_HASHTAB_EXTENT % BLOCK_SIZE_HASHTAB_BUCKET) == 0);
|
|
|
|
// There's more than one union
|
|
THROW_CHECK2(runtime_error, sizeof(Bucket), BLOCK_SIZE_HASHTAB_BUCKET, BLOCK_SIZE_HASHTAB_BUCKET == sizeof(Bucket));
|
|
THROW_CHECK2(runtime_error, sizeof(Bucket::p_byte), BLOCK_SIZE_HASHTAB_BUCKET, BLOCK_SIZE_HASHTAB_BUCKET == sizeof(Bucket::p_byte));
|
|
THROW_CHECK2(runtime_error, sizeof(Extent), BLOCK_SIZE_HASHTAB_EXTENT, BLOCK_SIZE_HASHTAB_EXTENT == sizeof(Extent));
|
|
THROW_CHECK2(runtime_error, sizeof(Extent::p_byte), BLOCK_SIZE_HASHTAB_EXTENT, BLOCK_SIZE_HASHTAB_EXTENT == sizeof(Extent::p_byte));
|
|
|
|
m_filename = filename;
|
|
m_size = size;
|
|
open_file();
|
|
|
|
// Now we know size we can compute stuff
|
|
|
|
BEESTRACE("hash table size " << m_size);
|
|
BEESTRACE("hash table bucket size " << BLOCK_SIZE_HASHTAB_BUCKET);
|
|
BEESTRACE("hash table extent size " << BLOCK_SIZE_HASHTAB_EXTENT);
|
|
|
|
BEESLOGINFO("opened hash table filename '" << filename << "' length " << m_size);
|
|
m_buckets = m_size / BLOCK_SIZE_HASHTAB_BUCKET;
|
|
m_cells = m_buckets * c_cells_per_bucket;
|
|
m_extents = (m_size + BLOCK_SIZE_HASHTAB_EXTENT - 1) / BLOCK_SIZE_HASHTAB_EXTENT;
|
|
BEESLOGINFO("\tcells " << m_cells << ", buckets " << m_buckets << ", extents " << m_extents);
|
|
|
|
BEESLOGINFO("\tflush rate limit " << BEES_FLUSH_RATE);
|
|
|
|
// Try to mmap that much memory
|
|
try_mmap_flags(MAP_PRIVATE | MAP_ANONYMOUS);
|
|
|
|
if (!m_cell_ptr) {
|
|
THROW_ERRNO("unable to mmap " << filename);
|
|
}
|
|
|
|
// Do unions work the way we think (and rely on)?
|
|
THROW_CHECK2(runtime_error, m_void_ptr, m_cell_ptr, m_void_ptr == m_cell_ptr);
|
|
THROW_CHECK2(runtime_error, m_void_ptr, m_byte_ptr, m_void_ptr == m_byte_ptr);
|
|
THROW_CHECK2(runtime_error, m_void_ptr, m_bucket_ptr, m_void_ptr == m_bucket_ptr);
|
|
THROW_CHECK2(runtime_error, m_void_ptr, m_extent_ptr, m_void_ptr == m_extent_ptr);
|
|
|
|
// Give all the madvise hints that the kernel understands
|
|
const struct madv_flag {
|
|
const char *name;
|
|
int value;
|
|
} madv_flags[] = {
|
|
{ .name = "MADV_HUGEPAGE", .value = MADV_HUGEPAGE },
|
|
{ .name = "MADV_DONTFORK", .value = MADV_DONTFORK },
|
|
{ .name = "MADV_DONTDUMP", .value = MADV_DONTDUMP },
|
|
{ .name = "", .value = 0 },
|
|
};
|
|
for (auto fp = madv_flags; fp->value; ++fp) {
|
|
BEESTOOLONG("madvise(" << fp->name << ")");
|
|
if (madvise(m_byte_ptr, m_size, fp->value)) {
|
|
BEESLOGWARN("madvise(..., " << fp->name << "): " << strerror(errno) << " (ignored)");
|
|
}
|
|
}
|
|
|
|
m_extent_metadata.resize(m_extents);
|
|
|
|
m_writeback_thread.exec([&]() {
|
|
writeback_loop();
|
|
});
|
|
|
|
m_prefetch_thread.exec([&]() {
|
|
prefetch_loop();
|
|
});
|
|
|
|
// Blacklist might fail if the hash table is not stored on a btrfs
|
|
catch_all([&]() {
|
|
m_ctx->blacklist_add(BeesFileId(m_fd));
|
|
});
|
|
}
|
|
|
|
BeesHashTable::~BeesHashTable()
|
|
{
|
|
if (m_cell_ptr && m_size) {
|
|
flush_dirty_extents();
|
|
catch_all([&]() {
|
|
DIE_IF_NON_ZERO(munmap(m_cell_ptr, m_size));
|
|
m_cell_ptr = nullptr;
|
|
m_size = 0;
|
|
});
|
|
}
|
|
}
|
|
|