From ce0367dafebd7fa39f3aa6ca9f731f425d6b928a Mon Sep 17 00:00:00 2001
From: Zygo Blaxell
Date: Wed, 27 Nov 2024 20:29:09 -0500
Subject: [PATCH] scan_one_extent: reduce the number of LOGICAL_INO calls
 before finding a duplicate block range

When we have multiple possible matches for a block, we proceed in three
phases:

1.  retrieve each match's extent refs and put them in a list,

2.  iterate over the list, converting viable block matches into range
matches,

3.  sort and flatten the list of range matches into a non-overlapping
list of ranges that cover all duplicate blocks exactly once.

The separation of phases 1 and 2 creates a performance issue when there
are many block matches in phase 1 and all the range matches in phase 2
are the same length.  Phase 1 retrieves the extent refs of every
possible matching block, even though phase 2 might find the longest
possible matching range almost immediately, leaving most of those refs
unused.

Fix this by moving the extent ref retrieval out of phase 1 and into the
phase 2 loop, and stop looping over matching blocks as soon as any
dedupe range is created.  This avoids running expensive LOGICAL_INO
ioctls over a large list of blocks in an attempt to improve a match
that cannot be improved, e.g. when all match ranges are 4K and the
content is extremely prevalent in the data.

If we find a matched block that is part of a short matching range, we
can still replace it with a block that is part of a long matching
range, because there is a good chance we will find a matching hash
block in the long range by looking up hashes after the end of the short
range.  In that case, overlapping dedupe ranges covering both blocks in
the target extent are inserted into the dedupe list, and the longest
matches are selected in phase 3.  This usually produces a result
similar to that of the old phase 1 loop, but _much_ more efficiently.

The operations left in phase 1 all use internal functions, not ioctls.

Signed-off-by: Zygo Blaxell
---
 src/bees-context.cc | 188 ++++++++++++++++++++------------------------
 1 file changed, 86 insertions(+), 102 deletions(-)

diff --git a/src/bees-context.cc b/src/bees-context.cc
index 2a85223..94989bd 100644
--- a/src/bees-context.cc
+++ b/src/bees-context.cc
@@ -451,13 +451,8 @@ BeesContext::scan_one_extent(const BeesFileRange &bfr, const Extent &e)
 		const auto found = hash_table->find_cell(hash);
 		BEESCOUNT(scan_lookup);
 
-		set<BeesResolver> resolved_addrs;
 		set<BeesAddress> found_addrs;
-
-		// We know that there is at least one copy of the data and where it is,
-		// but we don't want to do expensive LOGICAL_INO operations unless there
-		// are at least two distinct addresses to look at.
-		found_addrs.insert(addr);
+		list<BeesAddress> ordered_addrs;
 
 		for (const auto &i : found) {
 			BEESTRACE("found (hash, address): " << i);
@@ -466,6 +461,9 @@ BeesContext::scan_one_extent(const BeesFileRange &bfr, const Extent &e)
 			// Hash has to match
 			THROW_CHECK2(runtime_error, i.e_hash, hash, i.e_hash == hash);
 
+			// We know that there is at least one copy of the data and where it is.
+			// Filter out anything that can't possibly match before we pull out the
+			// LOGICAL_INO hammer.
 			BeesAddress found_addr(i.e_addr);
 
 			// If address already in hash table, move on to next extent.
@@ -484,15 +482,16 @@ BeesContext::scan_one_extent(const BeesFileRange &bfr, const Extent &e)
 				return;
 			}
 
-			// Block must have matching EOF alignment
-			if (found_addr.is_unaligned_eof() != addr.is_unaligned_eof()) {
-				BEESCOUNT(scan_malign);
+			// Address is a duplicate.
+			// Check this early so we don't have duplicate counts.
+			if (!found_addrs.insert(found_addr).second) {
+				BEESCOUNT(scan_twice);
 				continue;
 			}
 
-			// Address is a duplicate
-			if (!found_addrs.insert(found_addr).second) {
-				BEESCOUNT(scan_twice);
+			// Block must have matching EOF alignment
+			if (found_addr.is_unaligned_eof() != addr.is_unaligned_eof()) {
+				BEESCOUNT(scan_malign);
 				continue;
 			}
 
@@ -506,104 +505,89 @@ BeesContext::scan_one_extent(const BeesFileRange &bfr, const Extent &e)
 				return;
 			}
 
-			// Distinct address, go resolve it
-			bool abandon_extent = false;
-			catch_all([&]() {
-				BEESNOTE("resolving " << found_addr << " matched " << bbd);
-				BEESTRACE("resolving " << found_addr << " matched " << bbd);
-				BEESTRACE("BeesContext::scan_one_extent calling BeesResolver " << found_addr);
-				BeesResolver resolved(m_ctx, found_addr);
-				// Toxic extents are really toxic
-				if (resolved.is_toxic()) {
-					BEESLOGWARN("WORKAROUND: discovered toxic match at found_addr " << found_addr << " matching bbd " << bbd);
-					BEESCOUNT(scan_toxic_match);
-					// Make sure we never see this hash again.
-					// It has become toxic since it was inserted into the hash table.
-					found_addr.set_toxic();
-					hash_table->push_front_hash_addr(hash, found_addr);
-					abandon_extent = true;
-				} else if (!resolved.count()) {
-					BEESCOUNT(scan_resolve_zero);
-					// Didn't find a block at the table address, address is dead
-					BEESLOGDEBUG("Erasing stale addr " << addr << " hash " << hash);
-					hash_table->erase_hash_addr(hash, found_addr);
-				} else {
-					resolved_addrs.insert(resolved);
-					BEESCOUNT(scan_resolve_hit);
-				}
-			});
+			// Put this address in the list without changing hash table order
+			ordered_addrs.push_back(found_addr);
+		}
 
-			if (abandon_extent) {
+		// Cheap filtering is now out of the way, now for some heavy lifting
+		for (auto found_addr : ordered_addrs) {
+			// Hash table says there's a matching block on the filesystem.
+			// Go find refs to it.
+			BEESNOTE("resolving " << found_addr << " matched " << bbd);
+			BEESTRACE("resolving " << found_addr << " matched " << bbd);
+			BEESTRACE("BeesContext::scan_one_extent calling BeesResolver " << found_addr);
+			BeesResolver resolved(m_ctx, found_addr);
+			// Toxic extents are really toxic
+			if (resolved.is_toxic()) {
+				BEESLOGWARN("WORKAROUND: discovered toxic match at found_addr " << found_addr << " matching bbd " << bbd);
+				BEESCOUNT(scan_toxic_match);
+				// Make sure we never see this hash again.
+				// It has become toxic since it was inserted into the hash table.
+				found_addr.set_toxic();
+				hash_table->push_front_hash_addr(hash, found_addr);
 				return;
+			} else if (!resolved.count()) {
+				BEESCOUNT(scan_resolve_zero);
+				// Didn't find a block at the table address, address is dead
+				BEESLOGDEBUG("Erasing stale addr " << addr << " hash " << hash);
+				hash_table->erase_hash_addr(hash, found_addr);
+				continue;
+			} else {
+				BEESCOUNT(scan_resolve_hit);
 			}
-		}
 
-		// This shouldn't happen (often), so let's count it separately
-		if (resolved_addrs.size() > 2) {
-			BEESCOUNT(matched_3_or_more);
-		}
-		if (resolved_addrs.size() > 1) {
-			BEESCOUNT(matched_2_or_more);
-		}
-
-		// No need to do all this unless there are one or more distinct matches
-		if (!resolved_addrs.empty()) {
+			// `resolved` contains references to a block on the filesystem that still exists.
 			bar.at(bar_p) = 'M';
-			BEESCOUNT(matched_1_or_more);
-			BEESTRACE("resolved_addrs.size() = " << resolved_addrs.size());
-			BEESNOTE("resolving " << resolved_addrs.size() << " matches for hash " << hash);
-			BeesFileRange replaced_bfr;
+			BEESNOTE("finding one match (out of " << resolved.count() << ") at " << resolved.addr() << " for " << bbd);
+			BEESTRACE("finding one match (out of " << resolved.count() << ") at " << resolved.addr() << " for " << bbd);
+			auto replaced_brp = resolved.replace_dst(bbd);
+			BeesFileRange &replaced_bfr = replaced_brp.second;
+			BEESTRACE("next_p " << to_hex(next_p) << " -> replaced_bfr " << replaced_bfr);
 
-			for (auto it = resolved_addrs.begin(); it != resolved_addrs.end(); ++it) {
-				auto it_copy = *it;
-				BEESNOTE("finding one match (out of " << it_copy.count() << ") at " << it_copy.addr() << " for " << bbd);
-				BEESTRACE("finding one match (out of " << it_copy.count() << ") at " << it_copy.addr() << " for " << bbd);
-				// If we replaced extents containing the incoming addr,
-				// push the addr we kept to the front of the hash LRU.
-				auto replaced_brp = it_copy.replace_dst(bbd);
-				replaced_bfr = replaced_brp.second;
-				BEESTRACE("next_p " << to_hex(next_p) << " -> replaced_bfr " << replaced_bfr);
-
-				// If we did find a block, but not this hash, correct the hash table
-				if (it_copy.found_hash()) {
-					BEESCOUNT(scan_hash_hit);
-				} else {
-					BEESLOGDEBUG("Erasing stale hash " << hash << " addr " << it_copy.addr());
-					hash_table->erase_hash_addr(hash, it_copy.addr());
-					BEESCOUNT(scan_hash_miss);
-				}
-
-				if (it_copy.found_dup()) {
-					THROW_CHECK0(runtime_error, replaced_bfr);
-					BEESCOUNT(scan_dup_hit);
-
-					// Save this match. If a better match is found later,
-					// it will be replaced. (FIXME: not really...)
-					dedupe_list.push_back(replaced_brp);
-
-					// Push matching block to front of LRU
-					front_hash_list.push_back(make_pair(hash, it_copy.addr()));
-
-					off_t bar_p = (p - e.begin()) / BLOCK_SIZE_SUMS;
-					bar.at(bar_p) = '=';
-
-					// Invalidate resolve cache so we can count refs correctly
-					invalidate_addr_list.push_back(it_copy.addr());
-					invalidate_addr_list.push_back(bbd.addr());
-
-					// next_p may be past EOF so check p only
-					THROW_CHECK2(runtime_error, p, replaced_bfr, p < replaced_bfr.end());
-
-					// We may find duplicate ranges of various lengths, so make sure
-					// we don't pick a smaller one
-					next_p = max(next_p, replaced_bfr.end());
-				} else {
-					BEESCOUNT(scan_dup_miss);
-				}
+			// If we did find a block, but not this hash, correct the hash table and move on
+			if (resolved.found_hash()) {
+				BEESCOUNT(scan_hash_hit);
+			} else {
+				BEESLOGDEBUG("Erasing stale hash " << hash << " addr " << resolved.addr());
+				hash_table->erase_hash_addr(hash, resolved.addr());
+				BEESCOUNT(scan_hash_miss);
+				continue;
+			}
+
+			// We found a block and it was a duplicate
+			if (resolved.found_dup()) {
+				THROW_CHECK0(runtime_error, replaced_bfr);
+				BEESCOUNT(scan_dup_hit);
+
+				// Save this match. If a better match is found later,
+				// it will be replaced.
+				dedupe_list.push_back(replaced_brp);
+
+				// Push matching block to front of LRU
+				front_hash_list.push_back(make_pair(hash, resolved.addr()));
+
+				// This is the block that matched in the replaced bfr
+				bar.at(bar_p) = '=';
+
+				// Invalidate resolve cache so we can count refs correctly
+				invalidate_addr_list.push_back(resolved.addr());
+				invalidate_addr_list.push_back(bbd.addr());
+
+				// next_p may be past EOF so check p only
+				THROW_CHECK2(runtime_error, p, replaced_bfr, p < replaced_bfr.end());
+
+				// We may find duplicate ranges of various lengths, so make sure
+				// we don't pick a smaller one
+				next_p = max(next_p, replaced_bfr.end());
+
+				// Stop after one dedupe is found. If there's a longer matching range
+				// out there, we'll find a matching block after the end of this range,
+				// since the longer range is longer than this one.
+				break;
+			} else {
+				BEESCOUNT(scan_dup_miss);
 			}
-		} else {
-			BEESCOUNT(matched_0);
 		}
 	}
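
Note: the control-flow change is easier to see outside the bees codebase.
Below is a minimal standalone sketch of the phase 2 early exit, under
simplified assumptions: Range, resolve_refs(), resolve_calls, and the
candidate addresses are all hypothetical stand-ins invented for
illustration; resolve_refs() plays the role of the LOGICAL_INO-backed
BeesResolver step, and is deliberately modeled as always finding a 4K
match. This is not bees code.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

// One candidate dedupe range over the scanned extent, [begin, end).
struct Range {
	uint64_t begin, end;
};

static int resolve_calls = 0;

// Hypothetical stand-in for the expensive per-address ref lookup
// (BeesResolver / LOGICAL_INO in bees).
std::optional<Range> resolve_refs(uint64_t addr, uint64_t scan_pos)
{
	++resolve_calls;
	(void)addr;
	// Model the worst case for the old code: every candidate
	// resolves to an identical 4K match.
	return Range{scan_pos, scan_pos + 4096};
}

int main()
{
	// Phase 1: cheap filtering only -- collect candidates, no ioctls.
	std::vector<uint64_t> ordered_addrs{10, 20, 30, 40, 50};

	std::vector<Range> dedupe_list;
	uint64_t scan_pos = 0;

	// Phase 2: resolve refs lazily, stop at the first dedupe range.
	// If a longer matching range exists, the scan will match one of
	// its blocks again past r->end, so breaking early loses nothing.
	for (auto addr : ordered_addrs) {
		if (auto r = resolve_refs(addr, scan_pos)) {
			dedupe_list.push_back(*r);
			break;
		}
	}

	// Phase 3: order candidate ranges longest-first before flattening
	// them into a non-overlapping cover of the duplicate blocks.
	std::sort(dedupe_list.begin(), dedupe_list.end(),
		  [](const Range &a, const Range &b) {
			  return (a.end - a.begin) > (b.end - b.begin);
		  });

	std::cout << "expensive resolves: " << resolve_calls
		  << " of " << ordered_addrs.size() << " candidates\n";
	return 0;
}

With all five candidates resolving to the same-length 4K range, the old
structure would pay for five lookups before phase 2 even started; the
early break pays for one, which is the effect the patch is after.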