mirror of
https://github.com/Zygo/bees.git
synced 2025-07-05 18:12:27 +02:00
bees: remove local cruft, throw at github
This commit is contained in:
487
src/bees-resolve.cc
Normal file
487
src/bees-resolve.cc
Normal file
@ -0,0 +1,487 @@
|
||||
#include "bees.h"
|
||||
|
||||
#include "crucible/limits.h"
|
||||
#include "crucible/string.h"
|
||||
|
||||
using namespace crucible;
|
||||
using namespace std;
|
||||
|
||||
BeesAddress
|
||||
BeesResolver::addr(BeesAddress new_addr)
|
||||
{
|
||||
THROW_CHECK1(invalid_argument, new_addr, !new_addr.is_magic());
|
||||
|
||||
m_found_data = false;
|
||||
m_found_dup = false;
|
||||
m_found_hash = false;
|
||||
m_wrong_data = false;
|
||||
m_biors.clear();
|
||||
m_ranges.clear();
|
||||
m_addr = new_addr;
|
||||
m_bior_count = 0;
|
||||
|
||||
auto rv = m_ctx->resolve_addr(m_addr);
|
||||
m_biors = rv.m_biors;
|
||||
m_is_toxic = rv.m_is_toxic;
|
||||
m_bior_count = m_biors.size();
|
||||
|
||||
return m_addr;
|
||||
}
|
||||
|
||||
BeesResolver::BeesResolver(shared_ptr<BeesContext> ctx, BeesAddress new_addr) :
|
||||
m_ctx(ctx),
|
||||
m_bior_count(0)
|
||||
{
|
||||
addr(new_addr);
|
||||
}
|
||||
|
||||
BeesBlockData
|
||||
BeesResolver::adjust_offset(const BeesFileRange &haystack, const BeesBlockData &needle)
|
||||
{
|
||||
BEESTRACE("Searching for needle " << needle << "\n\tin haystack " << haystack);
|
||||
|
||||
BEESCOUNT(adjust_try);
|
||||
|
||||
// Constraint checks
|
||||
THROW_CHECK1(invalid_argument, needle.begin(), (needle.begin() & BLOCK_MASK_CLONE) == 0);
|
||||
THROW_CHECK1(invalid_argument, haystack.begin(), (haystack.begin() & BLOCK_MASK_CLONE) == 0);
|
||||
|
||||
// Need to know the precise dimensions of the haystack and needle
|
||||
off_t haystack_size = haystack.file_size();
|
||||
|
||||
// If the needle is not a full block then it can only match at EOF
|
||||
off_t needle_len = needle.size();
|
||||
bool is_unaligned_eof = needle_len & BLOCK_MASK_CLONE;
|
||||
BEESTRACE("is_unaligned_eof = " << is_unaligned_eof << ", needle_len = " << to_hex(needle_len) << ", haystack_size = " << to_hex(haystack_size));
|
||||
|
||||
// Unaligned EOF can only match at EOF, so only check there
|
||||
if (is_unaligned_eof) {
|
||||
BEESTRACE("Construct needle_bfr from " << needle);
|
||||
BeesFileRange needle_bfr(needle);
|
||||
|
||||
// Census
|
||||
if (haystack_size & BLOCK_MASK_CLONE) {
|
||||
BEESCOUNT(adjust_eof_haystack);
|
||||
}
|
||||
if (needle_bfr.end() & BLOCK_MASK_CLONE) {
|
||||
BEESCOUNT(adjust_eof_needle);
|
||||
}
|
||||
|
||||
// Non-aligned part of the lengths must be the same
|
||||
if ( (haystack_size & BLOCK_MASK_CLONE) != (needle_bfr.end() & BLOCK_MASK_CLONE) ) {
|
||||
BEESCOUNT(adjust_eof_fail);
|
||||
return BeesBlockData();
|
||||
}
|
||||
|
||||
// Read the haystack block
|
||||
BEESTRACE("Reading haystack (haystack_size = " << to_hex(haystack_size) << ")");
|
||||
BeesBlockData straw(haystack.fd(), haystack_size & ~BLOCK_MASK_CLONE, haystack_size & BLOCK_MASK_CLONE);
|
||||
|
||||
// It either matches or it doesn't
|
||||
BEESTRACE("Verifying haystack " << straw);
|
||||
if (straw.is_data_equal(needle)) {
|
||||
BEESCOUNT(adjust_eof_hit);
|
||||
m_found_data = true;
|
||||
m_found_hash = true;
|
||||
return straw;
|
||||
}
|
||||
|
||||
// Check for matching hash
|
||||
BEESTRACE("Verifying haystack hash");
|
||||
if (straw.hash() == needle.hash()) {
|
||||
// OK at least the hash is still valid
|
||||
m_found_hash = true;
|
||||
}
|
||||
|
||||
BEESCOUNT(adjust_eof_miss);
|
||||
// BEESLOG("adjust_eof_miss " << straw);
|
||||
return BeesBlockData();
|
||||
}
|
||||
|
||||
off_t lower_offset = haystack.begin();
|
||||
off_t upper_offset = haystack.end();
|
||||
bool is_compressed_offset = false;
|
||||
bool is_exact = false;
|
||||
bool is_legacy = false;
|
||||
if (m_addr.is_compressed()) {
|
||||
BtrfsExtentWalker ew(haystack.fd(), haystack.begin(), m_ctx->root_fd());
|
||||
BEESTRACE("haystack extent data " << ew);
|
||||
Extent e = ew.current();
|
||||
if (m_addr.has_compressed_offset()) {
|
||||
off_t coff = m_addr.get_compressed_offset();
|
||||
if (e.offset() > coff) {
|
||||
// this extent begins after the target block
|
||||
BEESCOUNT(adjust_offset_low);
|
||||
return BeesBlockData();
|
||||
}
|
||||
coff -= e.offset();
|
||||
if (e.size() <= coff) {
|
||||
// this extent ends before the target block
|
||||
BEESCOUNT(adjust_offset_high);
|
||||
return BeesBlockData();
|
||||
}
|
||||
lower_offset = e.begin() + coff;
|
||||
upper_offset = lower_offset + BLOCK_SIZE_CLONE;
|
||||
BEESCOUNT(adjust_offset_hit);
|
||||
is_compressed_offset = true;
|
||||
} else {
|
||||
lower_offset = e.begin();
|
||||
upper_offset = e.end();
|
||||
BEESCOUNT(adjust_legacy);
|
||||
is_legacy = true;
|
||||
}
|
||||
} else {
|
||||
BEESCOUNT(adjust_exact);
|
||||
is_exact = true;
|
||||
}
|
||||
|
||||
BEESTRACE("Checking haystack " << haystack << " offsets " << to_hex(lower_offset) << ".." << to_hex(upper_offset));
|
||||
|
||||
// Check all the blocks in the list
|
||||
for (off_t haystack_offset = lower_offset; haystack_offset < upper_offset; haystack_offset += BLOCK_SIZE_CLONE) {
|
||||
THROW_CHECK1(out_of_range, haystack_offset, (haystack_offset & BLOCK_MASK_CLONE) == 0);
|
||||
|
||||
// Straw cannot extend beyond end of haystack
|
||||
if (haystack_offset + needle.size() > haystack_size) {
|
||||
BEESCOUNT(adjust_needle_too_long);
|
||||
break;
|
||||
}
|
||||
|
||||
// Read the haystack
|
||||
BEESTRACE("straw " << name_fd(haystack.fd()) << ", offset " << to_hex(haystack_offset) << ", length " << needle.size());
|
||||
BeesBlockData straw(haystack.fd(), haystack_offset, needle.size());
|
||||
|
||||
BEESTRACE("straw = " << straw);
|
||||
|
||||
// Stop if we find a match
|
||||
if (straw.is_data_equal(needle)) {
|
||||
BEESCOUNT(adjust_hit);
|
||||
m_found_data = true;
|
||||
m_found_hash = true;
|
||||
if (is_compressed_offset) BEESCOUNT(adjust_compressed_offset_correct);
|
||||
if (is_legacy) BEESCOUNT(adjust_legacy_correct);
|
||||
if (is_exact) BEESCOUNT(adjust_exact_correct);
|
||||
return straw;
|
||||
}
|
||||
|
||||
if (straw.hash() != needle.hash()) {
|
||||
// Not the same hash or data, try next block
|
||||
BEESCOUNT(adjust_miss);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Found the hash but not the data. Yay!
|
||||
m_found_hash = true;
|
||||
BEESLOG("HASH COLLISION\n"
|
||||
<< "\tneedle " << needle << "\n"
|
||||
<< "\tstraw " << straw);
|
||||
BEESCOUNT(hash_collision);
|
||||
}
|
||||
|
||||
// Ran out of offsets to try
|
||||
BEESCOUNT(adjust_no_match);
|
||||
if (is_compressed_offset) BEESCOUNT(adjust_compressed_offset_wrong);
|
||||
if (is_legacy) BEESCOUNT(adjust_legacy_wrong);
|
||||
if (is_exact) BEESCOUNT(adjust_exact_wrong);
|
||||
m_wrong_data = true;
|
||||
return BeesBlockData();
|
||||
}
|
||||
|
||||
BeesFileRange
|
||||
BeesResolver::chase_extent_ref(const BtrfsInodeOffsetRoot &bior, BeesBlockData &needle_bbd)
|
||||
{
|
||||
BEESTRACE("chase_extent_ref bior " << bior << " needle_bbd " << needle_bbd);
|
||||
BEESNOTE("chase_extent_ref bior " << bior << " needle_bbd " << needle_bbd);
|
||||
BEESCOUNT(chase_try);
|
||||
|
||||
Fd file_fd = m_ctx->roots()->open_root_ino(bior.m_root, bior.m_inum);
|
||||
if (!file_fd) {
|
||||
// Delete snapshots generate craptons of these
|
||||
// BEESINFO("No FD in chase_extent_ref " << bior);
|
||||
BEESCOUNT(chase_no_fd);
|
||||
return BeesFileRange();
|
||||
}
|
||||
|
||||
BEESNOTE("searching at offset " << to_hex(bior.m_offset) << " in file " << name_fd(file_fd) << "\n\tfor " << needle_bbd);
|
||||
|
||||
BEESTRACE("bior file " << name_fd(file_fd));
|
||||
BEESTRACE("get file_addr " << bior);
|
||||
BeesAddress file_addr(file_fd, bior.m_offset, m_ctx);
|
||||
BEESTRACE("file_addr " << file_addr);
|
||||
|
||||
// ...or are we?
|
||||
if (file_addr.is_magic()) {
|
||||
BEESINFO("file_addr is magic: file_addr = " << file_addr << " bior = " << bior << " needle_bbd = " << needle_bbd);
|
||||
BEESCOUNT(chase_wrong_magic);
|
||||
return BeesFileRange();
|
||||
}
|
||||
THROW_CHECK1(invalid_argument, m_addr, !m_addr.is_magic());
|
||||
|
||||
// Did we get the physical block we asked for? The magic bits have to match too,
|
||||
// but the compressed offset bits do not.
|
||||
if (file_addr.get_physical_or_zero() != m_addr.get_physical_or_zero()) {
|
||||
// BEESINFO("found addr " << file_addr << " at " << name_fd(file_fd) << " offset " << to_hex(bior.m_offset) << " but looking for " << m_addr);
|
||||
// FIEMAP/resolve are working, but the data is old.
|
||||
BEESCOUNT(chase_wrong_addr);
|
||||
return BeesFileRange();
|
||||
}
|
||||
|
||||
// Calculate end of range, which is a sum block or less
|
||||
// It's a sum block because we have to compare content now
|
||||
off_t file_size = Stat(file_fd).st_size;
|
||||
off_t bior_offset = ranged_cast<off_t>(bior.m_offset);
|
||||
off_t end_offset = min(file_size, bior_offset + needle_bbd.size());
|
||||
BeesBlockData haystack_bbd(file_fd, bior_offset, end_offset - bior_offset);
|
||||
|
||||
BEESTRACE("matched haystack_bbd " << haystack_bbd << " file_addr " << file_addr);
|
||||
|
||||
// If the data was compressed and no offset was captured then
|
||||
// we won't get an exact address from resolve.
|
||||
// Search near the resolved address for a matching data block.
|
||||
// ...even if it's not compressed, we should do this sanity
|
||||
// check before considering the block as a duplicate candidate.
|
||||
auto new_bbd = adjust_offset(haystack_bbd, needle_bbd);
|
||||
if (new_bbd.empty()) {
|
||||
// matching offset search failed
|
||||
BEESCOUNT(chase_wrong_data);
|
||||
return BeesFileRange();
|
||||
}
|
||||
if (new_bbd.begin() == haystack_bbd.begin()) {
|
||||
BEESCOUNT(chase_uncorrected);
|
||||
} else {
|
||||
// corrected the bfr
|
||||
BEESCOUNT(chase_corrected);
|
||||
haystack_bbd = new_bbd;
|
||||
}
|
||||
|
||||
// We have found at least one duplicate block, so resolve was a success
|
||||
BEESCOUNT(chase_hit);
|
||||
|
||||
// Matching block
|
||||
BEESTRACE("Constructing dst_bfr { " << BeesFileId(haystack_bbd.fd()) << ", " << to_hex(haystack_bbd.begin()) << ".." << to_hex(haystack_bbd.end()) << " }");
|
||||
BeesFileRange dst_bfr(BeesFileId(haystack_bbd.fd()), haystack_bbd.begin(), haystack_bbd.end());
|
||||
|
||||
return dst_bfr;
|
||||
}
|
||||
|
||||
void
|
||||
BeesResolver::replace_src(const BeesFileRange &src_bfr)
|
||||
{
|
||||
BEESTRACE("replace_src src_bfr " << src_bfr);
|
||||
THROW_CHECK0(runtime_error, !m_is_toxic);
|
||||
BEESCOUNT(replacesrc_try);
|
||||
|
||||
// Open src, reuse it for all dst
|
||||
auto i_bfr = src_bfr;
|
||||
BEESNOTE("Opening src bfr " << i_bfr);
|
||||
BEESTRACE("Opening src bfr " << i_bfr);
|
||||
i_bfr.fd(m_ctx);
|
||||
|
||||
BeesBlockData bbd(i_bfr);
|
||||
|
||||
for_each_extent_ref(bbd, [&](const BeesFileRange &j) -> bool {
|
||||
// Open dst
|
||||
auto j_bfr = j;
|
||||
BEESNOTE("Opening dst bfr " << j_bfr);
|
||||
BEESTRACE("Opening dst bfr " << j_bfr);
|
||||
j_bfr.fd(m_ctx);
|
||||
|
||||
if (i_bfr.overlaps(j_bfr)) {
|
||||
BEESCOUNT(replacesrc_overlaps);
|
||||
return false; // i.e. continue
|
||||
}
|
||||
|
||||
// Make pair(src, dst)
|
||||
BEESTRACE("creating brp (" << i_bfr << ", " << j_bfr << ")");
|
||||
BeesRangePair brp(i_bfr, j_bfr);
|
||||
BEESTRACE("Found matching range: " << brp);
|
||||
|
||||
// Extend range at beginning
|
||||
BEESNOTE("Extending matching range: " << brp);
|
||||
// No particular reason to be constrained?
|
||||
if (brp.grow(m_ctx, true)) {
|
||||
BEESCOUNT(replacesrc_grown);
|
||||
}
|
||||
|
||||
// Dedup
|
||||
BEESNOTE("dedup " << brp);
|
||||
if (m_ctx->dedup(brp)) {
|
||||
BEESCOUNT(replacesrc_dedup_hit);
|
||||
m_found_dup = true;
|
||||
} else {
|
||||
BEESCOUNT(replacesrc_dedup_miss);
|
||||
}
|
||||
return false; // i.e. continue
|
||||
});
|
||||
}
|
||||
|
||||
void
|
||||
BeesResolver::find_matches(bool just_one, BeesBlockData &bbd)
|
||||
{
|
||||
// Walk through the (ino, offset, root) tuples until we find a match.
|
||||
BEESTRACE("finding all matches for " << bbd << " at " << m_addr << ": " << m_biors.size() << " found");
|
||||
THROW_CHECK0(runtime_error, !m_is_toxic);
|
||||
bool stop_now = false;
|
||||
for (auto ino_off_root : m_biors) {
|
||||
if (m_wrong_data) {
|
||||
return;
|
||||
}
|
||||
|
||||
BEESTRACE("ino_off_root " << ino_off_root);
|
||||
BeesFileId this_fid(ino_off_root.m_root, ino_off_root.m_inum);
|
||||
|
||||
// Silently ignore blacklisted files, e.g. BeesTempFile files
|
||||
if (m_ctx->is_blacklisted(this_fid)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Look at the old data
|
||||
catch_all([&]() {
|
||||
BEESTRACE("chase_extent_ref ino " << ino_off_root << " bbd " << bbd);
|
||||
auto new_range = chase_extent_ref(ino_off_root, bbd);
|
||||
if (new_range) {
|
||||
m_ranges.insert(new_range.copy_closed());
|
||||
stop_now = true;
|
||||
}
|
||||
});
|
||||
|
||||
if (just_one && stop_now) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool
|
||||
BeesResolver::for_each_extent_ref(BeesBlockData bbd, function<bool(const BeesFileRange &bfr)> visitor)
|
||||
{
|
||||
// Walk through the (ino, offset, root) tuples until we are told to stop
|
||||
BEESTRACE("for_each_extent_ref " << bbd << " at " << m_addr << ": " << m_biors.size() << " found");
|
||||
THROW_CHECK0(runtime_error, !m_is_toxic);
|
||||
bool stop_now = false;
|
||||
for (auto ino_off_root : m_biors) {
|
||||
BEESTRACE("ino_off_root " << ino_off_root);
|
||||
BeesFileId this_fid(ino_off_root.m_root, ino_off_root.m_inum);
|
||||
|
||||
// Silently ignore blacklisted files, e.g. BeesTempFile files
|
||||
if (m_ctx->is_blacklisted(this_fid)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Look at the old data
|
||||
catch_all([&]() {
|
||||
BEESTRACE("chase_extent_ref ino " << ino_off_root << " bbd " << bbd);
|
||||
auto new_range = chase_extent_ref(ino_off_root, bbd);
|
||||
// XXX: should we catch visitor's exceptions here?
|
||||
if (new_range) {
|
||||
stop_now = visitor(new_range);
|
||||
} else {
|
||||
// We have reliable block addresses now, so we guarantee we can hit the desired block.
|
||||
// Failure in chase_extent_ref means we are done, and don't need to look up all the
|
||||
// other references.
|
||||
stop_now = true;
|
||||
}
|
||||
});
|
||||
|
||||
if (stop_now) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return stop_now;
|
||||
}
|
||||
|
||||
BeesFileRange
|
||||
BeesResolver::replace_dst(const BeesFileRange &dst_bfr)
|
||||
{
|
||||
BEESTRACE("replace_dst dst_bfr " << dst_bfr);
|
||||
BEESCOUNT(replacedst_try);
|
||||
|
||||
// Open dst, reuse it for all src
|
||||
BEESNOTE("Opening dst bfr " << dst_bfr);
|
||||
BEESTRACE("Opening dst bfr " << dst_bfr);
|
||||
dst_bfr.fd(m_ctx);
|
||||
|
||||
BeesFileRange overlap_bfr;
|
||||
BEESTRACE("overlap_bfr " << overlap_bfr);
|
||||
|
||||
BeesBlockData bbd(dst_bfr);
|
||||
|
||||
for_each_extent_ref(bbd, [&](const BeesFileRange &src_bfr) -> bool {
|
||||
// Open src
|
||||
BEESNOTE("Opening src bfr " << src_bfr);
|
||||
BEESTRACE("Opening src bfr " << src_bfr);
|
||||
src_bfr.fd(m_ctx);
|
||||
|
||||
if (dst_bfr.overlaps(src_bfr)) {
|
||||
BEESCOUNT(replacedst_overlaps);
|
||||
return false; // i.e. continue
|
||||
}
|
||||
|
||||
// If dst is already occupying src, skip.
|
||||
// FIXME: BeesContext::scan_one_extent should be weeding these out, but does not.
|
||||
BeesBlockData src_bbd(src_bfr.fd(), src_bfr.begin(), min(BLOCK_SIZE_SUMS, src_bfr.size()));
|
||||
if (bbd.addr().get_physical_or_zero() == src_bbd.addr().get_physical_or_zero()) {
|
||||
BEESCOUNT(replacedst_same);
|
||||
return false; // i.e. continue
|
||||
}
|
||||
|
||||
// Make pair(src, dst)
|
||||
BEESTRACE("creating brp (" << src_bfr << ", " << dst_bfr << ")");
|
||||
BeesRangePair brp(src_bfr, dst_bfr);
|
||||
BEESTRACE("Found matching range: " << brp);
|
||||
|
||||
// Extend range at beginning
|
||||
BEESNOTE("Extending matching range: " << brp);
|
||||
// 'false' Has nasty loops, and may not be faster.
|
||||
// 'true' At best, keeps fragmentation constant...but can also make it worse
|
||||
if (brp.grow(m_ctx, true)) {
|
||||
BEESCOUNT(replacedst_grown);
|
||||
}
|
||||
|
||||
// Dedup
|
||||
BEESNOTE("dedup " << brp);
|
||||
if (m_ctx->dedup(brp)) {
|
||||
BEESCOUNT(replacedst_dedup_hit);
|
||||
m_found_dup = true;
|
||||
overlap_bfr = brp.second;
|
||||
// FIXME: find best range first, then dedup that
|
||||
return true; // i.e. break
|
||||
} else {
|
||||
BEESCOUNT(replacedst_dedup_miss);
|
||||
return false; // i.e. continue
|
||||
}
|
||||
});
|
||||
// BEESLOG("overlap_bfr after " << overlap_bfr);
|
||||
return overlap_bfr.copy_closed();
|
||||
}
|
||||
|
||||
BeesFileRange
|
||||
BeesResolver::find_one_match(BeesBlockData &bbd)
|
||||
{
|
||||
THROW_CHECK0(runtime_error, !m_is_toxic);
|
||||
find_matches(true, bbd);
|
||||
if (m_ranges.empty()) {
|
||||
return BeesFileRange();
|
||||
} else {
|
||||
return *m_ranges.begin();
|
||||
}
|
||||
}
|
||||
|
||||
set<BeesFileRange>
|
||||
BeesResolver::find_all_matches(BeesBlockData &bbd)
|
||||
{
|
||||
THROW_CHECK0(runtime_error, !m_is_toxic);
|
||||
find_matches(false, bbd);
|
||||
return m_ranges;
|
||||
}
|
||||
|
||||
bool
|
||||
BeesResolver::operator<(const BeesResolver &that) const
|
||||
{
|
||||
if (that.m_bior_count < m_bior_count) {
|
||||
return true;
|
||||
} else if (m_bior_count < that.m_bior_count) {
|
||||
return false;
|
||||
}
|
||||
return m_addr < that.m_addr;
|
||||
}
|
||||
|
Reference in New Issue
Block a user