From aa74a238b37305c7502950f2555db21779b35f78 Mon Sep 17 00:00:00 2001 From: Zygo Blaxell Date: Wed, 31 Oct 2018 23:03:01 -0400 Subject: [PATCH] hash: remove preloaded toxic hash blacklist Faster and more reliable toxic extent detection means we can now be much less paranoid about creating toxic extents. The paranoia has a significant impact on dedupe hit rates because every extent that contains even one toxic hash is abandoned. The preloaded toxic hashes were chosen because they occur more frequently than any other block contents in typical filesystem data. The combination of these resulted in as much as 30% of duplicate extents being left untouched. Remove the preloaded toxic extent blacklist, and rely on the new kernel-CPU-usage-based workaround instead. Signed-off-by: Zygo Blaxell --- src/bees-hash.cc | 23 ----------------------- src/bees.h | 2 -- 2 files changed, 25 deletions(-) diff --git a/src/bees-hash.cc b/src/bees-hash.cc index e774c11..45a46ef 100644 --- a/src/bees-hash.cc +++ b/src/bees-hash.cc @@ -384,25 +384,9 @@ BeesHashTable::fetch_missing_extent_by_hash(HashType hash) fetch_missing_extent_by_index(extent_index); } -bool -BeesHashTable::is_toxic_hash(BeesHashTable::HashType hash) const -{ - return m_toxic_hashes.find(hash) != m_toxic_hashes.end(); -} - vector BeesHashTable::find_cell(HashType hash) { - // This saves a lot of time prefilling the hash table, and there's no risk of eviction - if (is_toxic_hash(hash)) { - BEESCOUNT(hash_toxic); - BeesAddress toxic_addr(0x1000); - toxic_addr.set_toxic(); - Cell toxic_cell(hash, toxic_addr); - vector rv; - rv.push_back(toxic_cell); - return rv; - } fetch_missing_extent_by_hash(hash); BEESTOOLONG("find_cell hash " << BeesHash(hash)); vector rv; @@ -716,13 +700,6 @@ BeesHashTable::BeesHashTable(shared_ptr ctx, string filename, off_t catch_all([&]() { m_ctx->blacklist_add(BeesFileId(m_fd)); }); - - // Skip zero because we already weed that out before it gets near a hash function - for (unsigned i = 1; i < 256; 
++i) { - vector v(BLOCK_SIZE_SUMS, i); - HashType hash = Digest::CRC::crc64(v.data(), v.size()); - m_toxic_hashes.insert(hash); - } } BeesHashTable::~BeesHashTable() diff --git a/src/bees.h b/src/bees.h index 01283bb..536a912 100644 --- a/src/bees.h +++ b/src/bees.h @@ -441,7 +441,6 @@ private: BeesThread m_writeback_thread; BeesThread m_prefetch_thread; RateLimiter m_flush_rate_limit; - set m_toxic_hashes; BeesStringFile m_stats_file; // Mutex/condvar for the writeback thread @@ -468,7 +467,6 @@ private: void set_extent_dirty_locked(uint64_t extent_index); void flush_dirty_extents(); bool flush_dirty_extent(uint64_t extent_index); - bool is_toxic_hash(HashType h) const; size_t hash_to_extent_index(HashType ht); unique_lock lock_extent_by_hash(HashType ht);