1
0
mirror of https://github.com/Zygo/bees.git synced 2025-05-17 21:35:45 +02:00
bees/src/bees-resolve.cc
Zygo Blaxell be2c55119e bees: make exceptions less prominent in log output
Introduce a mechanism to suppress exceptions which do not produce a
full stack trace for common known cases where a loop should be aborted.
Use this mechanism to suppress the infamous "FIXME" exception.

Reduce the log level to at most NOTICE, and in some cases DEBUG.

Signed-off-by: Zygo Blaxell <bees@furryterror.org>
2019-01-06 01:48:35 -05:00

478 lines
14 KiB
C++

#include "bees.h"
#include "crucible/limits.h"
#include "crucible/string.h"
#include <utility>
using namespace crucible;
using namespace std;
// Point this resolver at new_addr and resolve it through the context.
//
// All per-address search state (found/wrong flags, cached references and
// matched ranges) is reset before the lookup, so that nothing from the
// previous address is left behind if resolve_addr() does not return normally.
//
// new_addr must not be a magic address; the THROW_CHECK1 guard below is
// given invalid_argument for that case.
// Returns the address the resolver now holds.
BeesAddress
BeesResolver::addr(BeesAddress new_addr)
{
	THROW_CHECK1(invalid_argument, new_addr, !new_addr.is_magic());

	// Forget everything learned about the previous address
	m_found_data = false;
	m_found_dup = false;
	m_found_hash = false;
	m_wrong_data = false;
	m_biors.clear();
	m_ranges.clear();
	m_addr = new_addr;
	m_bior_count = 0;

	auto rv = m_ctx->resolve_addr(m_addr);

	// rv is a local copy that is discarded at the end of this function,
	// so take ownership of its reference list instead of deep-copying it.
	m_biors = std::move(rv.m_biors);
	m_is_toxic = rv.m_is_toxic;
	m_bior_count = m_biors.size();
	return m_addr;
}
BeesResolver::BeesResolver(shared_ptr<BeesContext> ctx, BeesAddress new_addr) :
m_ctx(ctx),
m_bior_count(0)
{
addr(new_addr);
}
// Compare the block `needle` against the single block of `haystack` where it
// is expected to live, and report the block that matched.
//
// Two cases are handled:
//  - needle.size() has BLOCK_MASK_CLONE bits set (a partial block): only the
//    partial block at the very end of the haystack's file is read and compared.
//  - otherwise: one needle-sized block is read at haystack.begin(), or, when
//    m_addr is compressed, at the compressed offset stored in m_addr translated
//    through the extent reference found at haystack.begin().
//
// Returns the haystack block whose data equals the needle, or a
// default-constructed BeesBlockData when there is no match.
// Side effects: may set m_found_data, m_found_hash and m_wrong_data, and
// bumps the adjust_* / hash_collision counters.
// The THROW_CHECK1 lines guard the alignment of needle.begin(),
// haystack.begin() and the computed offset, and require m_addr to carry a
// compressed offset whenever m_addr is compressed.
BeesBlockData
BeesResolver::adjust_offset(const BeesFileRange &haystack, const BeesBlockData &needle)
{
BEESTRACE("Searching for needle " << needle << "\n\tin haystack " << haystack);
BEESCOUNT(adjust_try);
// Constraint checks
THROW_CHECK1(invalid_argument, needle.begin(), (needle.begin() & BLOCK_MASK_CLONE) == 0);
THROW_CHECK1(invalid_argument, haystack.begin(), (haystack.begin() & BLOCK_MASK_CLONE) == 0);
// Need to know the precise dimensions of the haystack and needle
off_t haystack_size = haystack.file_size();
// If the needle is not a full block then it can only match at EOF
off_t needle_len = needle.size();
bool is_unaligned_eof = needle_len & BLOCK_MASK_CLONE;
BEESTRACE("is_unaligned_eof = " << is_unaligned_eof << ", needle_len = " << to_hex(needle_len) << ", haystack_size = " << to_hex(haystack_size));
// Unaligned EOF can only match at EOF, so only check there
if (is_unaligned_eof) {
BEESTRACE("Construct needle_bfr from " << needle);
BeesFileRange needle_bfr(needle);
// Census: count which side(s) end on an unaligned boundary
if (haystack_size & BLOCK_MASK_CLONE) {
BEESCOUNT(adjust_eof_haystack);
}
if (needle_bfr.end() & BLOCK_MASK_CLONE) {
BEESCOUNT(adjust_eof_needle);
}
// Non-aligned part of the lengths must be the same
if ( (haystack_size & BLOCK_MASK_CLONE) != (needle_bfr.end() & BLOCK_MASK_CLONE) ) {
BEESCOUNT(adjust_eof_fail);
return BeesBlockData();
}
// Read the haystack block: it starts at the last aligned offset of the
// file and is as long as the unaligned remainder of the file size
BEESTRACE("Reading haystack (haystack_size = " << to_hex(haystack_size) << ")");
BeesBlockData straw(haystack.fd(), haystack_size & ~BLOCK_MASK_CLONE, haystack_size & BLOCK_MASK_CLONE);
// It either matches or it doesn't
BEESTRACE("Verifying haystack " << straw);
if (straw.is_data_equal(needle)) {
BEESCOUNT(adjust_eof_hit);
m_found_data = true;
m_found_hash = true;
return straw;
}
// Check for matching hash
BEESTRACE("Verifying haystack hash");
if (straw.hash() == needle.hash()) {
// OK at least the hash is still valid
// (note: unlike the aligned path below, m_wrong_data is not set here)
m_found_hash = true;
}
BEESCOUNT(adjust_eof_miss);
// BEESLOG("adjust_eof_miss " << straw);
return BeesBlockData();
}
// Aligned case: by default compare at the start of the haystack
off_t haystack_offset = haystack.begin();
// Exactly one of these two flags is set below; they only select which
// *_correct / *_wrong counter is bumped once the comparison is done
bool is_compressed_offset = false;
bool is_exact = false;
if (m_addr.is_compressed()) {
BtrfsExtentWalker ew(haystack.fd(), haystack.begin(), m_ctx->root_fd());
BEESTRACE("haystack extent data " << ew);
Extent e = ew.current();
THROW_CHECK1(runtime_error, m_addr, m_addr.has_compressed_offset());
off_t coff = m_addr.get_compressed_offset();
if (e.offset() > coff) {
// this extent begins after the target block
BEESCOUNT(adjust_offset_low);
return BeesBlockData();
}
// Make coff relative to the start of this extent reference
coff -= e.offset();
if (e.size() <= coff) {
// this extent ends before the target block
BEESCOUNT(adjust_offset_high);
return BeesBlockData();
}
haystack_offset = e.begin() + coff;
BEESCOUNT(adjust_offset_hit);
is_compressed_offset = true;
} else {
BEESCOUNT(adjust_exact);
is_exact = true;
}
BEESTRACE("Checking haystack " << haystack << " offset " << to_hex(haystack_offset));
// The offset computed above must still be block-aligned
THROW_CHECK1(out_of_range, haystack_offset, (haystack_offset & BLOCK_MASK_CLONE) == 0);
// Straw cannot extend beyond end of haystack
if (haystack_offset + needle.size() > haystack_size) {
BEESCOUNT(adjust_needle_too_long);
return BeesBlockData();
}
// Read the haystack
BEESTRACE("straw " << name_fd(haystack.fd()) << ", offset " << to_hex(haystack_offset) << ", length " << needle.size());
BeesBlockData straw(haystack.fd(), haystack_offset, needle.size());
BEESTRACE("straw = " << straw);
// Stop if we find a match
if (straw.is_data_equal(needle)) {
BEESCOUNT(adjust_hit);
m_found_data = true;
m_found_hash = true;
if (is_compressed_offset) BEESCOUNT(adjust_compressed_offset_correct);
if (is_exact) BEESCOUNT(adjust_exact_correct);
return straw;
}
if (straw.hash() != needle.hash()) {
// Neither the data nor the hash matches: plain miss
BEESCOUNT(adjust_miss);
return BeesBlockData();
}
// Found the hash but not the data. Yay!
m_found_hash = true;
#if 0
BEESLOGINFO("HASH COLLISION\n"
<< "\tneedle " << needle << "\n"
<< "\tstraw " << straw);
#endif
BEESCOUNT(hash_collision);
// Only one offset is tried, so a data mismatch here is final
BEESCOUNT(adjust_no_match);
if (is_compressed_offset) BEESCOUNT(adjust_compressed_offset_wrong);
if (is_exact) BEESCOUNT(adjust_exact_wrong);
// Same hash, different data at this address: find_matches() checks
// this flag and stops chasing further references
m_wrong_data = true;
return BeesBlockData();
}
// Follow one (root, inode, offset) reference and check whether the file it
// names still holds a block equal to needle_bbd at the address being resolved.
//
// Returns the matching range in that file, or a default-constructed
// BeesFileRange when: the file cannot be opened, the address found at the
// offset is magic, its get_physical_or_zero() differs from that of m_addr,
// or adjust_offset() finds no matching data.
// The THROW_CHECK1 below guards against m_addr itself being magic.
// Flags set by adjust_offset() (m_found_data, m_found_hash, m_wrong_data)
// are updated as a side effect of that call.
BeesFileRange
BeesResolver::chase_extent_ref(const BtrfsInodeOffsetRoot &bior, BeesBlockData &needle_bbd)
{
BEESTRACE("chase_extent_ref bior " << bior << " needle_bbd " << needle_bbd);
BEESNOTE("chase_extent_ref bior " << bior << " needle_bbd " << needle_bbd);
BEESCOUNT(chase_try);
// Open the file this reference points into
Fd file_fd = m_ctx->roots()->open_root_ino(bior.m_root, bior.m_inum);
if (!file_fd) {
// Deleted snapshots generate craptons of these
// BEESLOGDEBUG("No FD in chase_extent_ref " << bior);
BEESCOUNT(chase_no_fd);
return BeesFileRange();
}
BEESNOTE("searching at offset " << to_hex(bior.m_offset) << " in file " << name_fd(file_fd) << "\n\tfor " << needle_bbd);
BEESTRACE("bior file " << name_fd(file_fd));
BEESTRACE("get file_addr " << bior);
// Look up the address currently found at that offset in the file
BeesAddress file_addr(file_fd, bior.m_offset, m_ctx);
BEESTRACE("file_addr " << file_addr);
// A magic address here cannot be compared with m_addr; give up on this ref
if (file_addr.is_magic()) {
BEESLOGDEBUG("file_addr is magic: file_addr = " << file_addr << " bior = " << bior << " needle_bbd = " << needle_bbd);
BEESCOUNT(chase_wrong_magic);
return BeesFileRange();
}
THROW_CHECK1(invalid_argument, m_addr, !m_addr.is_magic());
// Did we get the physical block we asked for? The magic bits have to match too,
// but the compressed offset bits do not.
if (file_addr.get_physical_or_zero() != m_addr.get_physical_or_zero()) {
// BEESLOGDEBUG("found addr " << file_addr << " at " << name_fd(file_fd) << " offset " << to_hex(bior.m_offset) << " but looking for " << m_addr);
// FIEMAP/resolve are working, but the data is old.
BEESCOUNT(chase_wrong_addr);
return BeesFileRange();
}
// Calculate end of range, which is a sum block or less
// It's a sum block because we have to compare content now
off_t file_size = Stat(file_fd).st_size;
off_t bior_offset = ranged_cast<off_t>(bior.m_offset);
// Clamp to the file size so the haystack never extends past EOF
off_t end_offset = min(file_size, bior_offset + needle_bbd.size());
BeesBlockData haystack_bbd(file_fd, bior_offset, end_offset - bior_offset);
BEESTRACE("matched haystack_bbd " << haystack_bbd << " file_addr " << file_addr);
// If the data was compressed and no offset was captured then
// we won't get an exact address from resolve.
// Search near the resolved address for a matching data block.
// ...even if it's not compressed, we should do this sanity
// check before considering the block as a duplicate candidate.
auto new_bbd = adjust_offset(haystack_bbd, needle_bbd);
if (new_bbd.empty()) {
// matching offset search failed
BEESCOUNT(chase_no_data);
return BeesFileRange();
}
// Keep whichever block adjust_offset() actually matched
if (new_bbd.begin() == haystack_bbd.begin()) {
BEESCOUNT(chase_uncorrected);
} else {
// corrected the bfr
BEESCOUNT(chase_corrected);
haystack_bbd = new_bbd;
}
// We have found at least one duplicate block, so resolve was a success
BEESCOUNT(chase_hit);
// Matching block
BEESTRACE("Constructing dst_bfr { " << BeesFileId(haystack_bbd.fd()) << ", " << to_hex(haystack_bbd.begin()) << ".." << to_hex(haystack_bbd.end()) << " }");
BeesFileRange dst_bfr(BeesFileId(haystack_bbd.fd()), haystack_bbd.begin(), haystack_bbd.end());
return dst_bfr;
}
void
BeesResolver::replace_src(const BeesFileRange &src_bfr)
{
BEESTRACE("replace_src src_bfr " << src_bfr);
THROW_CHECK0(runtime_error, !m_is_toxic);
BEESCOUNT(replacesrc_try);
// Open src, reuse it for all dst
auto i_bfr = src_bfr;
BEESNOTE("Opening src bfr " << i_bfr);
BEESTRACE("Opening src bfr " << i_bfr);
i_bfr.fd(m_ctx);
BeesBlockData bbd(i_bfr);
for_each_extent_ref(bbd, [&](const BeesFileRange &j) -> bool {
// Open dst
auto j_bfr = j;
BEESNOTE("Opening dst bfr " << j_bfr);
BEESTRACE("Opening dst bfr " << j_bfr);
j_bfr.fd(m_ctx);
if (i_bfr.overlaps(j_bfr)) {
BEESCOUNT(replacesrc_overlaps);
return false; // i.e. continue
}
// Make pair(src, dst)
BEESTRACE("creating brp (" << i_bfr << ", " << j_bfr << ")");
BeesRangePair brp(i_bfr, j_bfr);
BEESTRACE("Found matching range: " << brp);
// Extend range at beginning
BEESNOTE("Extending matching range: " << brp);
// No particular reason to be constrained?
if (brp.grow(m_ctx, true)) {
BEESCOUNT(replacesrc_grown);
}
// Dedup
BEESNOTE("dedup " << brp);
if (m_ctx->dedup(brp)) {
BEESCOUNT(replacesrc_dedup_hit);
m_found_dup = true;
} else {
BEESCOUNT(replacesrc_dedup_miss);
}
return false; // i.e. continue
});
}
void
BeesResolver::find_matches(bool just_one, BeesBlockData &bbd)
{
// Walk through the (ino, offset, root) tuples until we find a match.
BEESTRACE("finding all matches for " << bbd << " at " << m_addr << ": " << m_biors.size() << " found");
THROW_CHECK0(runtime_error, !m_is_toxic);
bool stop_now = false;
for (auto ino_off_root : m_biors) {
if (m_wrong_data) {
return;
}
BEESTRACE("ino_off_root " << ino_off_root);
BeesFileId this_fid(ino_off_root.m_root, ino_off_root.m_inum);
// Silently ignore blacklisted files, e.g. BeesTempFile files
if (m_ctx->is_blacklisted(this_fid)) {
continue;
}
// Look at the old data
catch_all([&]() {
BEESTRACE("chase_extent_ref ino " << ino_off_root << " bbd " << bbd);
auto new_range = chase_extent_ref(ino_off_root, bbd);
if (new_range) {
m_ranges.insert(new_range.copy_closed());
stop_now = true;
}
});
if (just_one && stop_now) {
break;
}
}
}
// Chase each resolved (ino, offset, root) reference and pass every range
// that matches bbd to visitor, until visitor asks to stop.
//
// visitor returns true to stop the walk, false to continue.
// Returns true if the visitor stopped the walk, false if the references
// ran out first.  Blacklisted files are skipped.
// THROW_CHECK0 rejects use on a toxic address.
//
// FIXME: exceptions from chase_extent_ref (and from visitor) propagate to
// the caller for now; they are not wrapped in catch_all.  A proper fix
// requires a rewrite.
bool
BeesResolver::for_each_extent_ref(BeesBlockData bbd, function<bool(const BeesFileRange &bfr)> visitor)
{
	BEESTRACE("for_each_extent_ref " << bbd << " at " << m_addr << ": " << m_biors.size() << " found");
	THROW_CHECK0(runtime_error, !m_is_toxic);

	for (auto ref : m_biors) {
		BEESTRACE("ino_off_root " << ref);

		// Silently ignore blacklisted files, e.g. BeesTempFile files
		BeesFileId ref_fid(ref.m_root, ref.m_inum);
		if (m_ctx->is_blacklisted(ref_fid)) {
			continue;
		}

		// Look at the old data
		BEESTRACE("chase_extent_ref ino " << ref << " bbd " << bbd);
		auto found = chase_extent_ref(ref, bbd);

		// A failed chase does NOT end the walk.  Block addresses are
		// reliable now, so one might expect a miss to mean we are done,
		// but with a compressed extent some refs will not match if there
		// are two references to the same extent with a reference to a
		// different extent between them.  Keep looking.
		// XXX: should we catch visitor's exceptions here?
		if (found && visitor(found)) {
			return true;
		}
	}
	return false;
}
// Try to replace the data at dst_bfr with a reference to an existing copy.
//
// Each range reported by for_each_extent_ref() is treated as a candidate
// source: the pair (src, dst) is grown and handed to m_ctx->dedup().
// The walk stops at the first successful dedup.
//
// Returns copy_closed() of the second range of the pair that was deduped,
// or of a default-constructed BeesFileRange if no dedup succeeded.
// Sets m_found_dup on success.
// Throws runtime_error (after calling BeesTracer::set_silent()) when a
// candidate source has the same get_physical_or_zero() address as dst.
BeesFileRange
BeesResolver::replace_dst(const BeesFileRange &dst_bfr)
{
BEESTRACE("replace_dst dst_bfr " << dst_bfr);
BEESCOUNT(replacedst_try);
// Open dst, reuse it for all src
BEESNOTE("Opening dst bfr " << dst_bfr);
BEESTRACE("Opening dst bfr " << dst_bfr);
dst_bfr.fd(m_ctx);
// Result holder; stays default-constructed unless a dedup succeeds
BeesFileRange overlap_bfr;
BEESTRACE("overlap_bfr " << overlap_bfr);
BeesBlockData bbd(dst_bfr);
// Visitor contract (see for_each_extent_ref): return true to stop, false to continue
for_each_extent_ref(bbd, [&](const BeesFileRange &src_bfr) -> bool {
// Open src
BEESNOTE("Opening src bfr " << src_bfr);
BEESTRACE("Opening src bfr " << src_bfr);
src_bfr.fd(m_ctx);
// A range cannot be deduped onto itself
if (dst_bfr.overlaps(src_bfr)) {
BEESCOUNT(replacedst_overlaps);
return false; // i.e. continue
}
// If dst is already occupying src, skip.
// FIXME: BeesContext::scan_one_extent should be weeding these out, but does not.
BeesBlockData src_bbd(src_bfr.fd(), src_bfr.begin(), min(BLOCK_SIZE_SUMS, src_bfr.size()));
if (bbd.addr().get_physical_or_zero() == src_bbd.addr().get_physical_or_zero()) {
BEESCOUNT(replacedst_same);
// stop looping here, all the other srcs will probably fail this test too
// NOTE(review): set_silent() presumably keeps the tracer from logging a
// full trace for the exception thrown next -- confirm in BeesTracer
BeesTracer::set_silent();
throw runtime_error("FIXME: bailing out here, need to fix this further up the call stack");
}
// Make pair(src, dst)
BEESTRACE("creating brp (" << src_bfr << ", " << dst_bfr << ")");
BeesRangePair brp(src_bfr, dst_bfr);
BEESTRACE("Found matching range: " << brp);
// Extend range at beginning
BEESNOTE("Extending matching range: " << brp);
// 'false' Has nasty loops, and may not be faster.
// 'true' At best, keeps fragmentation constant...but can also make it worse
if (brp.grow(m_ctx, true)) {
BEESCOUNT(replacedst_grown);
}
// Dedup
BEESNOTE("dedup " << brp);
if (m_ctx->dedup(brp)) {
BEESCOUNT(replacedst_dedup_hit);
m_found_dup = true;
// Remember the dst side of the (possibly grown) pair for the caller
overlap_bfr = brp.second;
// FIXME: find best range first, then dedup that
return true; // i.e. break
} else {
BEESCOUNT(replacedst_dedup_miss);
return false; // i.e. continue
}
});
// BEESLOG("overlap_bfr after " << overlap_bfr);
return overlap_bfr.copy_closed();
}
// Run find_matches() in stop-at-first-hit mode and return one result.
//
// Returns the first element of m_ranges, or a default-constructed
// BeesFileRange if m_ranges is empty after the search.
// THROW_CHECK0 rejects use on a toxic address.
BeesFileRange
BeesResolver::find_one_match(BeesBlockData &bbd)
{
	THROW_CHECK0(runtime_error, !m_is_toxic);

	const bool stop_at_first = true;
	find_matches(stop_at_first, bbd);

	if (!m_ranges.empty()) {
		return *m_ranges.begin();
	}
	return BeesFileRange();
}
// Run find_matches() over every reference (no early stop) and return a
// copy of the accumulated set of matching ranges.
// THROW_CHECK0 rejects use on a toxic address.
set<BeesFileRange>
BeesResolver::find_all_matches(BeesBlockData &bbd)
{
	THROW_CHECK0(runtime_error, !m_is_toxic);

	const bool stop_at_first = false;
	find_matches(stop_at_first, bbd);

	return m_ranges;
}
// Strict weak ordering over resolvers.
//
// *this sorts before `that` when `that` has the smaller m_bior_count, or,
// when the counts are equal, when this m_addr is less than that.m_addr.
// NOTE(review): the original comment here read "Lowest count, highest
// address"; which end of the ordering that describes depends on how callers
// consume it -- confirm against the call sites.
bool
BeesResolver::operator<(const BeesResolver &that) const
{
	// First key: reference count, compared with the operands swapped
	if (that.m_bior_count < m_bior_count) {
		return true;
	}
	if (m_bior_count < that.m_bior_count) {
		return false;
	}
	// Counts tie: fall back to the address in natural order
	return m_addr < that.m_addr;
}