bees/src/bees-types.cc

#include "bees.h"

#include "crucible/crc64.h"
#include "crucible/limits.h"
#include "crucible/ntoa.h"
#include "crucible/string.h"

#include <fstream>
#include <inttypes.h>

using namespace crucible;
using namespace std;

// Writes a file id to the stream as "<root>:<ino>".
ostream &
operator<<(ostream &os, const BeesFileId &bfi)
{
	os << bfi.root();
	os << ":";
	os << bfi.ino();
	return os;
}

// Strict ordering of file ids: inode number is the major key, root the minor key.
bool
BeesFileId::operator<(const BeesFileId &that) const
{
	// Inode comes first so we get good locality when scanning across snapshots
	if (m_ino != that.m_ino) {
		return m_ino < that.m_ino;
	}
	return m_root < that.m_root;
}

// Two file ids are equal when both the root and the inode number match.
bool
BeesFileId::operator==(const BeesFileId &that) const
{
	if (m_root != that.m_root) {
		return false;
	}
	return m_ino == that.m_ino;
}

// Two file ids differ when either the root or the inode number differs.
bool
BeesFileId::operator!=(const BeesFileId &that) const
{
	const bool same_root = (m_root == that.m_root);
	const bool same_ino = (m_ino == that.m_ino);
	return !(same_root && same_ino);
}

// A file id is "set" only when both the root and the inode number are non-zero.
BeesFileId::operator bool() const
{
	if (!m_root) {
		return false;
	}
	return !!m_ino;
}

// Builds a file id from the root and inode fields of a BtrfsInodeOffsetRoot
// (its other fields are not used here).
BeesFileId::BeesFileId(const BtrfsInodeOffsetRoot &bior)
	: m_root(bior.m_root)
	, m_ino(bior.m_inum)
{
}

// Builds a file id from an explicit (root, inode) pair.
BeesFileId::BeesFileId(uint64_t root, uint64_t ino)
	: m_root(root)
	, m_ino(ino)
{
}

// Builds a file id by querying an open file descriptor:
// the root comes from btrfs_get_root_id(fd), the inode from Stat(fd).st_ino.
BeesFileId::BeesFileId(int fd)
	: m_root(btrfs_get_root_id(fd))
	, m_ino(Stat(fd).st_ino)
{
}

// Builds an unset file id (root 0, inode 0), which converts to false.
BeesFileId::BeesFileId()
	: m_root(0)
	, m_ino(0)
{
}

// Writes a human-readable description of a file range.
//
// A range whose end is numeric_limits<off_t>::max() is printed as
// "- [<begin>..eof]".  Any other range is printed as its size followed by
// "<begin>..<end>" in hex, where the opening bracket is "(" (with no begin
// offset) when the range starts at offset 0 and "[" otherwise, and the
// closing bracket is ")" when an fd is open and the range ends at or past
// the file size, and "]" otherwise.  The fid and the fd (with its name) are
// appended when present.
ostream &
operator<<(ostream &os, const BeesFileRange &bfr)
{
	const bool open_ended = (bfr.end() == numeric_limits<off_t>::max());
	if (open_ended) {
		os << "- [" << to_hex(bfr.begin()) << "..eof]";
	} else {
		os << pretty(bfr.size()) << " ";
		if (bfr.begin() == 0) {
			os << "(";
		} else {
			os << "[" << to_hex(bfr.begin());
		}
		os << ".." << to_hex(bfr.end());
		// file_size() is only consulted when there is an fd to stat
		const bool reaches_eof = !!bfr.m_fd && bfr.end() >= bfr.file_size();
		os << (reaches_eof ? ")" : "]");
	}

	if (bfr.m_fid) {
		os << " fid = " << bfr.m_fid;
	}

	if (!!bfr.m_fd) {
		os << " fd = " << bfr.m_fd;
		os << " '" << name_fd(bfr.m_fd) << "'";
	}

	return os;
}

// Writes a three-line description of a range pair: the size of the source
// range and both byte spans on the first line, then the source fd and its
// name, then the destination fd and its name.
ostream &
operator<<(ostream &os, const BeesRangePair &brp)
{
	const BeesFileRange &src = brp.first;
	const BeesFileRange &dst = brp.second;

	os << "BeesRangePair: " << pretty(src.size());
	os << " src[" << to_hex(src.begin()) << ".." << to_hex(src.end()) << "]";
	os << " dst[" << to_hex(dst.begin()) << ".." << to_hex(dst.end()) << "]";
	os << "\nsrc = " << src.fd() << " " << name_fd(src.fd());
	os << "\ndst = " << dst.fd() << " " << name_fd(dst.fd());
	return os;
}

// Orders ranges by file id, then by begin offset, then by end offset,
// so that file blocks are read in order.
//
// An alternative ordering that puts bigger ranges first was considered
// (might be faster to read big chunks first?) but it probably confuses
// crawl state, so it should only be adopted if there is a clear
// performance win.
bool
BeesFileRange::operator<(const BeesFileRange &that) const
{
	const BeesFileId lhs_fid = fid();
	const BeesFileId rhs_fid = that.fid();
	if (lhs_fid != rhs_fid) {
		return lhs_fid < rhs_fid;
	}
	if (m_begin != that.m_begin) {
		return m_begin < that.m_begin;
	}
	return m_end < that.m_end;
}

// Two ranges are equal when they cover the same byte span of the same file.
bool
BeesFileRange::operator==(const BeesFileRange &that) const
{
	// The offsets are cheap to compare and have the most variety, so test them first
	const bool same_span = (m_begin == that.m_begin) && (m_end == that.m_end);
	if (!same_span) {
		return false;
	}

	// Identical open fds prove the files are the same,
	// but different fds do not prove that they are distinct
	const bool both_open = !!m_fd && !!that.m_fd;
	if (both_open && m_fd == that.m_fd) {
		return true;
	}

	// Fall back to comparing the FileIds
	return fid() == that.fid();
}

// Negation of operator==.
bool
BeesFileRange::operator!=(const BeesFileRange &that) const
{
	const bool equal = operator==(that);
	return !equal;
}

// Returns true when the range covers no bytes.
// Throws invalid_argument if the range is inverted (begin > end).
bool
BeesFileRange::empty() const
{
	THROW_CHECK2(invalid_argument, m_begin, m_end, m_begin <= m_end);
	return !(m_begin < m_end);
}

// Returns the number of bytes covered by the range (end - begin).
// Throws invalid_argument if the range is inverted (begin > end).
off_t
BeesFileRange::size() const
{
	THROW_CHECK2(invalid_argument, m_begin, m_end, m_begin <= m_end);
	const off_t length = m_end - m_begin;
	return length;
}

off_t
BeesFileRange::file_size() const
{
	if (m_file_size <= 0) {
		Stat st(fd());
		m_file_size = st.st_size;
		// These checks could trigger on valid input, but that would mean we have
		// lost a race (e.g. a file was truncated while we were building a
		// matching range pair with it).  In such cases we should probably stop
		// whatever we were doing and backtrack to some higher level anyway.
		// Well, OK, but we call this function from exception handlers...
		THROW_CHECK1(invalid_argument, m_file_size, m_file_size >= 0);
		// THROW_CHECK2(invalid_argument, m_file_size, m_end, m_end <= m_file_size || m_end == numeric_limits<off_t>::max());
	}
	return m_file_size;
}

// Moves the end of the range forward by up to `delta` bytes, clamped to the
// file size, and returns the new end offset.  If the end is already at or
// beyond the file size it is pulled back to the file size.
//
// Throws invalid_argument if delta is not positive, and whatever file_size()
// throws if the file cannot be examined.
off_t
BeesFileRange::grow_end(off_t delta)
{
	THROW_CHECK1(invalid_argument, delta, delta > 0);

	// file_size() never returns a negative value, and delta > 0, so
	// (limit - delta) cannot overflow.  Comparing against it instead of
	// computing (m_end + delta) avoids signed overflow when m_end is very
	// large, e.g. the numeric_limits<off_t>::max() "eof" value.
	const off_t limit = file_size();
	if (m_end >= limit - delta) {
		m_end = limit;
	} else {
		m_end += delta;
	}

	THROW_CHECK2(runtime_error, m_file_size, m_end, m_end <= m_file_size);
	return m_end;
}

// Moves the beginning of the range backward by up to `delta` bytes, stopping
// at offset 0, and returns the new begin offset.
// Throws invalid_argument if delta is not positive.
off_t
BeesFileRange::grow_begin(off_t delta)
{
	THROW_CHECK1(invalid_argument, delta, delta > 0);
	// Never step back further than the distance to the start of the file
	m_begin = (delta < m_begin) ? (m_begin - delta) : 0;
	return m_begin;
}

// Builds a range covering exactly the given block: same fd, same begin and end.
BeesFileRange::BeesFileRange(const BeesBlockData &bbd)
	: m_fd(bbd.fd())
	, m_begin(bbd.begin())
	, m_end(bbd.end())
{
}

// Builds a range [begin, end) on an already-open file descriptor.
BeesFileRange::BeesFileRange(Fd fd, off_t begin, off_t end)
	: m_fd(fd)
	, m_begin(begin)
	, m_end(end)
{
}

// Builds a range [begin, end) identified only by file id; no fd is attached.
BeesFileRange::BeesFileRange(const BeesFileId &fid, off_t begin, off_t end)
	: m_fid(fid)
	, m_begin(begin)
	, m_end(end)
{
}

// Returns true when both ranges refer to the same file, regardless of offsets.
bool
BeesFileRange::is_same_file(const BeesFileRange &that) const
{
	// When both sides have an fd, matching fds settle the question cheaply
	const bool both_open = !!m_fd && !!that.m_fd;
	if (both_open && m_fd == that.m_fd) {
		return true;
	}

	// Otherwise fetch the fid of both files and compare those
	return fid() == that.fid();
}

bool
BeesFileRange::overlaps(const BeesFileRange &that) const
{
	// Determine whether the byte ranges overlap before doing syscalls on file descriptors

	pair<uint64_t, uint64_t> a(m_begin, m_end);
	pair<uint64_t, uint64_t> b(that.m_begin, that.m_end);

	// range a starts lower than or equal b
	if (b.first < a.first) {
		swap(a, b);
	}

	// if b starts within a, they overlap
	// (and the intersecting region is b.first..min(a.second, b.second))
	// (and the union region is a.first..max(a.second, b.second))
	if (b.first >= a.first && b.first < a.second) {
		return is_same_file(that);
	}

	return false;
}

bool
BeesFileRange::coalesce(const BeesFileRange &that)
{
	// Let's define coalesce-with-null as identity,
	// and coalesce-null-with-null as coalesced
	if (!*this) {
		operator=(that);
		return true;
	}
	if (!that) {
		return true;
	}

	// Can't coalesce different files
	if (!is_same_file(that)) return false;

	pair<uint64_t, uint64_t> a(m_begin, m_end);
	pair<uint64_t, uint64_t> b(that.m_begin, that.m_end);

	// range a starts lower than or equal b
	if (b.first < a.first) {
		swap(a, b);
	}

	// if b starts within a, they overlap
	// (and the intersecting region is b.first..min(a.second, b.second))
	// (and the union region is a.first..max(a.second, b.second))
	if (b.first >= a.first && b.first < a.second) {
		m_begin = a.first;
		m_end = max(a.second, b.second);
		return true;
	}

	return false;
}

// Converts the range into a BeesBlockData on the same fd, starting at the
// range's begin offset and spanning the range's length.
BeesFileRange::operator BeesBlockData() const
{
	BEESTRACE("operator BeesBlockData " << *this);
	const off_t length = m_end - m_begin;
	return BeesBlockData(m_fd, m_begin, length);
}

// Returns the fd currently attached to this range, without trying to open one.
Fd
BeesFileRange::fd() const
{
	const Fd &attached = m_fd;
	return attached;
}

// Returns the fd for this range, opening the file by fid through `ctx` when
// the range has a fid but no fd yet.
//
// If the range has both a fid and an fd, the fd is checked against the fid
// and invalid_argument is thrown on mismatch.  If there is no fid, or no
// context to open with, the current fd is returned as-is.
Fd
BeesFileRange::fd(const shared_ptr<BeesContext> &ctx) const
{
	// Without a fid we can't do much here: we're just stuck with our fd
	if (!m_fid) {
		return m_fd;
	}

	// We have both fid and fd, so make sure they match
	if (!!m_fd) {
		BeesFileId fd_fid(m_fd);
		THROW_CHECK2(invalid_argument, fd_fid, m_fid, fd_fid == m_fid);
		return m_fd;
	}

	// We have a fid but no fd: open by fid if we were given a context
	if (ctx) {
		m_fd = ctx->roots()->open_root_ino(m_fid);
	}

	return m_fd;
}

// Returns a copy of this range that is identified by fid only (built with
// the fid-based constructor, so no fd is attached).
BeesFileRange
BeesFileRange::copy_closed() const
{
	BeesFileRange closed(fid(), m_begin, m_end);
	return closed;
}

// Returns the file id of this range.  When no fid is cached but an fd is
// attached, the fid is derived from the fd and cached first.  With neither,
// the (unset) cached fid is returned.
BeesFileId
BeesFileRange::fid() const
{
	const bool can_derive = !m_fid && !!m_fd;
	if (can_derive) {
		m_fid = BeesFileId(m_fd);
	}
	return m_fid;
}

// Builds a (src, dst) pair and validates it.
//
// Throws invalid_argument if the ranges overlap, if their sizes differ, or
// (only when both ranges have an fd) if any pair of corresponding blocks has
// different data.
BeesRangePair::BeesRangePair(const BeesFileRange &src, const BeesFileRange &dst)
	: pair<BeesFileRange, BeesFileRange>(src, dst)
{
	BEESTRACE("checking constraints on " << *this);

	// The two ranges must not overlap to begin with
	THROW_CHECK2(invalid_argument, first, second, !first.overlaps(second));

	// The two ranges must be the same length to begin with
	THROW_CHECK2(invalid_argument, first, second, first.size() == second.size());

	// Content can only be verified when both files are open
	const bool both_open = first.fd() && second.fd();
	if (!both_open) {
		return;
	}

	// Walk both ranges in lockstep, comparing one sum-block at a time
	off_t first_pos = first.begin();
	off_t second_pos = second.begin();
	for (off_t remaining = first.size(); remaining != 0; ) {
		const off_t chunk = min(BLOCK_SIZE_SUMS, remaining);
		BeesBlockData first_bbd(first.fd(), first_pos, chunk);
		BeesBlockData second_bbd(second.fd(), second_pos, chunk);
		THROW_CHECK2(invalid_argument, first_bbd, second_bbd, first_bbd.is_data_equal(second_bbd));
		first_pos += chunk;
		second_pos += chunk;
		remaining -= chunk;
	}
}

// Orders pairs by destination range first, then by source range.
bool
BeesRangePair::operator<(const BeesRangePair &that) const
{
	// Destination is the major key
	if (second < that.second) {
		return true;
	}
	if (that.second < second) {
		return false;
	}
	// Destinations are equivalent: the source breaks the tie
	return first < that.first;
}

// Tries to extend a matching (src = first, dst = second) pair in both
// directions, one BLOCK_SIZE_CLONE step at a time, for as long as the blocks
// being added hold identical data and pass the checks below.
//
// Backward growth never crosses the beginning of the extent that contains
// second.begin() at entry.  Forward growth stops at the end of that extent
// when `constrained` is true; otherwise it continues into following extents
// until a hole is reached.  Growth in either direction also stops when:
//   - first.size() reaches BLOCK_SIZE_MAX_EXTENT,
//   - either range hits the beginning/end of its file,
//   - the grown ranges would overlap,
//   - the new blocks differ in content or share the same physical address,
//   - the new source block is all zeros in a non-compressed non-magic extent,
//   - the new source block's address resolves as toxic, or its hash has a
//     toxic entry in the hash table.
//
// Returns true if the pair grew in at least one direction.
// Throws invalid_argument if either begin offset is not clone-block aligned
// or if the ranges already overlap on entry.
bool
BeesRangePair::grow(shared_ptr<BeesContext> ctx, bool constrained)
{
	BEESTOOLONG("grow constrained = " << constrained << " *this = " << *this);
	BEESTRACE("grow constrained = " << constrained << " *this = " << *this);
	bool rv = false;
	Timer grow_backward_timer;

	THROW_CHECK1(invalid_argument, first.begin(), (first.begin() & BLOCK_MASK_CLONE) == 0);
	THROW_CHECK1(invalid_argument, second.begin(), (second.begin() & BLOCK_MASK_CLONE) == 0);

	// We should not be overlapping already
	THROW_CHECK2(invalid_argument, first, second, !first.overlaps(second));

	BtrfsExtentWalker ew_second(second.fd());

	// Stop on aligned extent boundary
	ew_second.seek(second.begin());

	Extent e_second = ew_second.current();
	BEESTRACE("e_second " << e_second);

	// Preread entire extent
	readahead(second.fd(), e_second.begin(), e_second.size());
	readahead(first.fd(), e_second.begin() + first.begin() - second.begin(), e_second.size());

	auto hash_table = ctx->hash_table();

	// Look backward
	BEESTRACE("grow_backward " << *this);
	while (first.size() < BLOCK_SIZE_MAX_EXTENT) {
		if (second.begin() <= e_second.begin()) {
#if 0
			if (constrained) {
				break;
			}
			BEESCOUNT(pairbackward_extent);
			ew_second.seek(second.begin() - min(BLOCK_SIZE_CLONE, second.begin()));
			e_second = ew_second.current();
			if (e_second.flags() & Extent::HOLE) {
				BEESCOUNT(pairbackward_hole);
				break;
			}
			readahead(second.fd(), e_second.begin(), e_second.size());
#else
			// This tends to repeatedly process extents that were recently processed.
			// We tend to catch duplicate blocks early since we scan them forwards.
			// Also, reading backwards is slow so we probably don't want to do it much.
			break;
#endif
		}
		BEESCOUNT(pairbackward_try);

		// Extend first range.  If we hit BOF we can go no further.
		BeesFileRange new_first = first;
		BEESTRACE("new_first = " << new_first);
		new_first.grow_begin(BLOCK_SIZE_CLONE);
		if (new_first.begin() == first.begin()) {
			BEESCOUNT(pairbackward_bof_first);
			break;
		}

		// Source extent cannot be toxic.
		// Going backward, the block being added starts at new_first.begin().
		BeesAddress first_addr(first.fd(), new_first.begin());
		if (!first_addr.is_magic()) {
			auto first_resolved = ctx->resolve_addr(first_addr);
			if (first_resolved.is_toxic()) {
				BEESLOGWARN("WORKAROUND: not growing matching pair backward because src addr is toxic:\n" << *this);
				BEESCOUNT(pairbackward_toxic_addr);
				break;
			}
		}

		// Extend second range.  If we hit BOF we can go no further.
		BeesFileRange new_second = second;
		BEESTRACE("new_second = " << new_second);
		new_second.grow_begin(BLOCK_SIZE_CLONE);
		if (new_second.begin() == second.begin()) {
			BEESCOUNT(pairbackward_bof_second);
			break;
		}

		// If the ranges now overlap we went too far
		if (new_first.overlaps(new_second)) {
			BEESCOUNT(pairbackward_overlap);
			break;
		}

		BEESTRACE("first " << first << " new_first " << new_first);
		BeesBlockData first_bbd(first.fd(), new_first.begin(), first.begin() - new_first.begin());
		BEESTRACE("first_bbd " << first_bbd);
		BEESTRACE("second " << second << " new_second " << new_second);
		BeesBlockData second_bbd(second.fd(), new_second.begin(), second.begin() - new_second.begin());
		BEESTRACE("second_bbd " << second_bbd);

		// Both blocks must have identical content
		if (!first_bbd.is_data_equal(second_bbd)) {
			BEESCOUNT(pairbackward_miss);
			break;
		}

		// Physical blocks must be distinct
		if (first_bbd.addr().get_physical_or_zero() == second_bbd.addr().get_physical_or_zero()) {
			BEESCOUNT(pairbackward_same);
			break;
		}

		// Source block cannot be zero in a non-compressed non-magic extent
		if (first_bbd.is_data_zero() && !first_addr.is_magic() && !first_addr.is_compressed()) {
			BEESCOUNT(pairbackward_zero);
			break;
		}

		// Source block cannot have a toxic hash
		auto found_hashes = hash_table->find_cell(first_bbd.hash());
		bool found_toxic = false;
		for (auto i : found_hashes) {
			if (BeesAddress(i.e_addr).is_toxic()) {
				found_toxic = true;
				break;
			}
		}
		if (found_toxic) {
			BEESLOGWARN("WORKAROUND: found toxic hash in " << first_bbd << " while extending backward:\n" << *this);
			BEESCOUNT(pairbackward_toxic_hash);
			break;
		}

		THROW_CHECK2(invalid_argument, new_first.size(), new_second.size(), new_first.size() == new_second.size());
		first = new_first;
		second = new_second;
		rv = true;
		BEESCOUNT(pairbackward_hit);
	}
	BEESCOUNT(pairbackward_stop);
	BEESCOUNTADD(pairbackward_ms, grow_backward_timer.age() * 1000);

	// Look forward
	BEESTRACE("grow_forward " << *this);
	Timer grow_forward_timer;
	while (first.size() < BLOCK_SIZE_MAX_EXTENT) {
		if (second.end() >= e_second.end()) {
			if (constrained) {
				break;
			}
			BEESCOUNT(pairforward_extent);
			ew_second.seek(second.end());
			e_second = ew_second.current();
			if (e_second.flags() & Extent::HOLE) {
				BEESCOUNT(pairforward_hole);
				break;
			}
			readahead(second.fd(), e_second.begin(), e_second.size());
		}
		BEESCOUNT(pairforward_try);

		// Extend first range.  If we hit EOF we can go no further.
		BeesFileRange new_first = first;
		BEESTRACE("new_first = " << new_first);
		new_first.grow_end(BLOCK_SIZE_CLONE);
		if (new_first.end() == first.end()) {
			BEESCOUNT(pairforward_eof_first);
			break;
		}

		// Source extent cannot be toxic.
		// Going forward, the block being added starts at first.end() (the
		// same offset first_bbd reads below).  new_first.begin() is not
		// usable here: grow_end() leaves the begin offset untouched, so it
		// would re-examine the first block of the range on every iteration
		// instead of the block we are about to add.
		BeesAddress first_addr(first.fd(), first.end());
		if (!first_addr.is_magic()) {
			auto first_resolved = ctx->resolve_addr(first_addr);
			if (first_resolved.is_toxic()) {
				BEESLOGWARN("WORKAROUND: not growing matching pair forward because src is toxic:\n" << *this);
				BEESCOUNT(pairforward_toxic);
				break;
			}
		}

		// Extend second range.  If we hit EOF we can go no further.
		BeesFileRange new_second = second;
		BEESTRACE("new_second = " << new_second);
		new_second.grow_end(BLOCK_SIZE_CLONE);
		if (new_second.end() == second.end()) {
			BEESCOUNT(pairforward_eof_second);
			break;
		}

		// If we have hit an unaligned EOF then it has to be the same unaligned EOF.
		// If we haven't hit EOF then the ends of the ranges are still aligned,
		// so the misalignment (zero) will be equal.
		if ((new_second.end() & BLOCK_MASK_CLONE) != (new_first.end() & BLOCK_MASK_CLONE)) {
			BEESCOUNT(pairforward_eof_malign);
			break;
		}

		// If the ranges now overlap we went too far
		if (new_first.overlaps(new_second)) {
			BEESCOUNT(pairforward_overlap);
			break;
		}

		BEESTRACE("first " << first << " new_first " << new_first);
		BeesBlockData first_bbd(first.fd(), first.end(), new_first.end() - first.end());
		BEESTRACE("first_bbd " << first_bbd);
		BEESTRACE("second " << second << " new_second " << new_second);
		BeesBlockData second_bbd(second.fd(), second.end(), new_second.end() - second.end());
		BEESTRACE("second_bbd " << second_bbd);

		// Both blocks must have identical content
		if (!first_bbd.is_data_equal(second_bbd)) {
			BEESCOUNT(pairforward_miss);
			break;
		}

		// Physical blocks must be distinct
		if (first_bbd.addr().get_physical_or_zero() == second_bbd.addr().get_physical_or_zero()) {
			BEESCOUNT(pairforward_same);
			break;
		}

		// Source block cannot be zero in a non-compressed non-magic extent
		if (first_bbd.is_data_zero() && !first_addr.is_magic() && !first_addr.is_compressed()) {
			BEESCOUNT(pairforward_zero);
			break;
		}

		// Source block cannot have a toxic hash
		auto found_hashes = hash_table->find_cell(first_bbd.hash());
		bool found_toxic = false;
		for (auto i : found_hashes) {
			if (BeesAddress(i.e_addr).is_toxic()) {
				found_toxic = true;
				break;
			}
		}
		if (found_toxic) {
			BEESLOGWARN("WORKAROUND: found toxic hash in " << first_bbd << " while extending forward:\n" << *this);
			BEESCOUNT(pairforward_toxic_hash);
			break;
		}

		// OK, next block
		THROW_CHECK2(invalid_argument, new_first.size(), new_second.size(), new_first.size() == new_second.size());
		first = new_first;
		second = new_second;
		rv = true;
		BEESCOUNT(pairforward_hit);
	}

	// Growth is supposed to stop before the ranges overlap; count it if it did not
	if (first.overlaps(second)) {
		BEESLOGTRACE("after grow, first " << first << "\n\toverlaps " << second);
		BEESCOUNT(bug_grow_pair_overlaps);
	}

	BEESCOUNT(pairforward_stop);
	BEESCOUNTADD(pairforward_ms, grow_forward_timer.age() * 1000);
	return rv;
}

// Returns a new pair built from the copy_closed() versions of both ranges.
// The BeesRangePair constructor runs again on the result, so its overlap and
// size checks apply.
BeesRangePair
BeesRangePair::copy_closed() const
{
	const BeesFileRange closed_src = first.copy_closed();
	const BeesFileRange closed_dst = second.copy_closed();
	return BeesRangePair(closed_src, closed_dst);
}

// Writes a compact description of a block address.
//
// Magic addresses are printed by name (ZERO, DELALLOC, HOLE, UNUSABLE).
// Any other address is printed as its physical address in hex (or "NIL" when
// that is 0x1000), followed by one-letter suffixes: "t" for toxic, "u" for
// unaligned EOF, and "z" for compressed, the latter followed by the
// compressed offset in hex when the address carries one.
ostream &
operator<<(ostream &os, const BeesAddress &ba)
{
	if (ba.is_magic()) {
		// Local aliases so the table macro can stringify the bare names
		enum {
			ZERO     = BeesAddress::MagicValue::ZERO,
			DELALLOC = BeesAddress::MagicValue::DELALLOC,
			HOLE     = BeesAddress::MagicValue::HOLE,
			UNUSABLE = BeesAddress::MagicValue::UNUSABLE,
		};
		static const bits_ntoa_table table[] = {
			NTOA_TABLE_ENTRY_ENUM(ZERO),
			NTOA_TABLE_ENTRY_ENUM(DELALLOC),
			NTOA_TABLE_ENTRY_ENUM(HOLE),
			NTOA_TABLE_ENTRY_ENUM(UNUSABLE),
			NTOA_TABLE_ENTRY_END()
		};
		os << bits_ntoa(static_cast<BeesAddress::Type>(ba), table);
		return os;
	}

	// Physical part of the address
	const auto physical = ba.get_physical_or_zero();
	if (physical != 0x1000) {
		os << to_hex(physical);
	} else {
		os << "NIL";
	}

	// Flag suffixes
	if (ba.is_toxic()) {
		os << "t";
	}
	if (ba.is_unaligned_eof()) {
		os << "u";
	}
	if (ba.is_compressed()) {
		os << "z";
		if (ba.has_compressed_offset()) {
			os << astringprintf("%" PRIx64, ba.get_compressed_offset());
		}
	}

	return os;
}

// Classifies extent flags that prevent this address from being an ordinary
// physical address.  When such a flag is found, m_addr is set to the matching
// magic value (HOLE, UNUSABLE or DELALLOC), a counter is bumped, and true is
// returned.  Returns false, leaving m_addr untouched, when the flags allow a
// physical address.
//
// NOTE(review): the masks below are `unsigned` while `flags` is uint64_t, so
// on platforms where unsigned is narrower than 64 bits the "unrecognized"
// test cannot see the upper flag bits - confirm this is intended.
bool
BeesAddress::magic_check(uint64_t flags)
{
	// HOLE is handled first and on its own: this one isn't FIEMAP
	if (flags & Extent::HOLE) {
		BEESCOUNT(addr_hole);
		m_addr = HOLE;
		return true;
	}

	// Compressed extents: these trigger extra processing steps
	static const unsigned encoded_bits = FIEMAP_EXTENT_ENCODED;

	// Extent is not yet on disk (try again with sync)
	static const unsigned pending_bits = FIEMAP_EXTENT_UNKNOWN | FIEMAP_EXTENT_DELALLOC;

	// Irrelevant to extent-same
	static const unsigned harmless_bits = FIEMAP_EXTENT_LAST | FIEMAP_EXTENT_SHARED;

	// We can't use extent-same on these
	static const unsigned blocked_bits = FIEMAP_EXTENT_NOT_ALIGNED | FIEMAP_EXTENT_DATA_INLINE;

	// Everything we know about (any other flag is a new feature we maybe can't cope with)
	static const unsigned known_bits = encoded_bits | pending_bits | harmless_bits | blocked_bits;

	if (flags & ~known_bits) {
		BEESLOGTRACE("Unrecognized flags in " << fiemap_extent_flags_ntoa(flags));
		// maybe we throw here?
		BEESCOUNT(addr_unrecognized);
		m_addr = UNUSABLE;
		return true;
	}

	if (flags & blocked_bits) {
		// we know these, but can't touch them
		BEESCOUNT(addr_unusable);
		m_addr = UNUSABLE;
		return true;
	}

	if (flags & pending_bits) {
		// delayed allocation, try again with force
		BEESCOUNT(addr_delalloc);
		m_addr = DELALLOC;
		return true;
	}

	// Nothing special about this extent
	return false;
}

// Builds the address of the block at file offset `offset` inside extent `e`.
//
// If the extent flags are magic (see magic_check) the address becomes the
// corresponding magic value.  Otherwise:
//   - for an encoded (compressed) extent, the address is the extent's
//     physical start with the compressed bit set and the block's position
//     within the extent (in clone blocks, plus one) stored in the offset bits;
//   - for any other extent, the address is the extent's physical start plus
//     the distance from the extent's beginning to `offset`.
// The unaligned-EOF bit is added when the extent is flagged as the last one,
// ends on an unaligned offset, and `offset` is in that final partial block.
//
// Throws invalid_argument if the extent or offset is misaligned, the extent
// is empty, a non-magic extent has no physical address, or the compressed
// offset does not fit the offset field.
BeesAddress::BeesAddress(const Extent &e, off_t offset) :
	m_addr(ZERO)
{
	BEESTRACE("BeesAddress " << e << " offset " << to_hex(offset));
	Type new_addr = 0;

	THROW_CHECK1(invalid_argument, e, (e.physical() & BLOCK_MASK_CLONE) == 0);
	THROW_CHECK1(invalid_argument, e, (e.begin() & BLOCK_MASK_CLONE) == 0);
	THROW_CHECK1(invalid_argument, e, (offset & BLOCK_MASK_CLONE) == 0);
	THROW_CHECK1(invalid_argument, e, e.end() > e.begin());

	if (magic_check(e.flags())) {
		BEESCOUNT(addr_magic);
		return;
	}

	// All addresses from here on are physical
	THROW_CHECK1(invalid_argument, e, e.physical() > 0);

	if (e.flags() & FIEMAP_EXTENT_ENCODED) {
		THROW_CHECK1(invalid_argument, e, (e.offset() & BLOCK_MASK_CLONE) == 0);
		THROW_CHECK1(invalid_argument, e, e.offset() >= 0 && e.offset() < BLOCK_SIZE_MAX_COMPRESSED_EXTENT);
		// Keep the full off_t width until after the range check below:
		// narrowing to int first could wrap an out-of-range value into
		// the accepted window and defeat the check.
		const off_t extent_offset = offset - e.begin() + e.offset();
		BEESTRACE("extent_offset = " << to_hex(extent_offset));
		THROW_CHECK1(invalid_argument, extent_offset, extent_offset >= 0 && extent_offset < BLOCK_SIZE_MAX_COMPRESSED_EXTENT);
		THROW_CHECK1(invalid_argument, extent_offset, (extent_offset & BLOCK_MASK_CLONE) == 0);
		// Stored offsets are 1-based in units of clone blocks
		unsigned offset_bits = (extent_offset / BLOCK_SIZE_CLONE) + 1;
		BEESTRACE("offset_bits = " << offset_bits);
		THROW_CHECK1(invalid_argument, offset_bits, offset_bits >= c_offset_min && offset_bits <= c_offset_max);
		THROW_CHECK1(invalid_argument, offset_bits, (offset_bits & ~c_offset_mask) == 0);
#if 1
		new_addr = e.physical() | c_compressed_mask | offset_bits;
		BEESCOUNT(addr_compressed_offset);
#else
		new_addr = e.physical() | c_compressed_mask;
		BEESCOUNT(addr_compressed);
#endif
	} else {
		new_addr = e.physical() + (offset - e.begin());
		BEESCOUNT(addr_uncompressed);
	}

	// Mark the final, partial block of the last extent as unaligned EOF
	if ((e.flags() & FIEMAP_EXTENT_LAST) && (e.end() & BLOCK_MASK_CLONE) != 0 && (offset & ~BLOCK_MASK_CLONE) == (e.end() & ~BLOCK_MASK_CLONE)) {
		new_addr |= c_eof_mask;
		BEESCOUNT(addr_eof_e);
	}

	m_addr = new_addr;
	BEESCOUNT(addr_block);
}

// Builds the address of the block at `offset` in the open file `fd` by
// looking up the extent at that offset with a BtrfsExtentWalker.
//
// Throws invalid_argument if `offset` uses any of the address flag bits or
// is not clone-block aligned; ranged_cast and the extent-based constructor
// may throw as well.
BeesAddress::BeesAddress(int fd, off_t offset)
	: m_addr(ZERO)
{
	BEESTOOLONG("BeesAddress(fd " << fd << " " << name_fd(fd) << " offset " << to_hex(offset) << ")");
	BEESTRACE("BeesAddress(fd " << fd << " " << name_fd(fd) << " offset " << to_hex(offset) << ")");

	const Type block_offset = ranged_cast<Type>(offset);

	THROW_CHECK1(invalid_argument, block_offset, (block_offset & c_all_mask) == 0);
	THROW_CHECK1(invalid_argument, block_offset, (block_offset & BLOCK_MASK_CLONE) == 0);

	// Time only the extent lookup
	Timer lookup_timer;
	BtrfsExtentWalker walker(fd, block_offset);
	Extent found = walker.current();
	BEESCOUNT(addr_from_fd);
	BEESCOUNTADD(addr_ms, lookup_timer.age() * 1000);

	// Delegate the actual address computation to the extent-based constructor
	*this = BeesAddress(found, offset);
}

// Same as the (fd, offset) constructor, except that the extent walker is
// also given the context's root fd.  `ctx` is dereferenced unconditionally.
//
// Throws invalid_argument if `offset` uses any of the address flag bits or
// is not clone-block aligned; ranged_cast and the extent-based constructor
// may throw as well.
BeesAddress::BeesAddress(int fd, off_t offset, shared_ptr<BeesContext> ctx)
	: m_addr(ZERO)
{
	BEESTOOLONG("BeesAddress(fd " << fd << " " << name_fd(fd) << " offset " << to_hex(offset) << " ctx " << ctx->root_path() << ")");
	BEESTRACE("BeesAddress(fd " << fd << " " << name_fd(fd) << " offset " << to_hex(offset) << " ctx " << ctx->root_path() << ")");

	const Type block_offset = ranged_cast<Type>(offset);

	THROW_CHECK1(invalid_argument, block_offset, (block_offset & c_all_mask) == 0);
	THROW_CHECK1(invalid_argument, block_offset, (block_offset & BLOCK_MASK_CLONE) == 0);

	// Time only the extent lookup
	Timer lookup_timer;
	BtrfsExtentWalker walker(fd, block_offset, ctx->root_fd());
	Extent found = walker.current();
	BEESCOUNT(addr_from_root_fd);
	BEESCOUNTADD(addr_ms, lookup_timer.age() * 1000);

	// Delegate the actual address computation to the extent-based constructor
	*this = BeesAddress(found, offset);
}

// Returns only the physical address: all flag bits and the compressed block
// offset are masked off.  Magic values yield zero.

BeesAddress::Type
BeesAddress::get_physical_or_zero() const
{
	if (is_magic()) {
		return 0;
	}
	const Type physical = m_addr & ~c_all_mask;
	return physical;
}

// A compressed block address has two fields: the beginning of the physical
// extent, and the distance (in CLONE blocks) from the start of the extent to
// the current block.  This returns that distance converted to bytes.
// Throws invalid_argument if has_compressed_offset() is not true.

BeesAddress::Type
BeesAddress::get_compressed_offset() const
{
	THROW_CHECK1(invalid_argument, *this, has_compressed_offset());
	// The stored field is 1-based, so take one off before scaling to bytes
	const auto stored_blocks = m_addr & c_offset_mask;
	return (stored_blocks - 1) * BLOCK_SIZE_CLONE;
}

// Sets the toxic flag bit on this address.
// Throws invalid_argument if the address is a magic value.
void
BeesAddress::set_toxic()
{
	THROW_CHECK1(invalid_argument, *this, !is_magic());
	m_addr = m_addr | c_toxic_mask;
}

// Compares two addresses.  When both sides agree on whether they carry a
// compressed offset, all bits are compared; otherwise the offset bits are
// ignored on both sides.
bool
BeesAddress::operator==(const BeesAddress &that) const
{
	const bool same_offset_kind = (has_compressed_offset() == that.has_compressed_offset());
	if (same_offset_kind) {
		return m_addr == that.m_addr;
	}

	// One side has an offset and the other doesn't: compare without checking
	// offset bits.  This returns the right result for comparisons between
	// magic and non-magic values, even though the math is all wrong.
	const auto lhs = m_addr & ~c_offset_mask;
	const auto rhs = that.m_addr & ~c_offset_mask;
	return lhs == rhs;
}

// Orders two addresses.  When both sides agree on whether they carry a
// compressed offset, all bits take part in the comparison; otherwise the
// offset bits are ignored on both sides.
bool
BeesAddress::operator<(const BeesAddress &that) const
{
	const bool same_offset_kind = (has_compressed_offset() == that.has_compressed_offset());
	if (same_offset_kind) {
		return m_addr < that.m_addr;
	}

	const auto lhs = m_addr & ~c_offset_mask;
	const auto rhs = that.m_addr & ~c_offset_mask;
	return lhs < rhs;
}

// Writes a one-line description of a block: its length, offset, fd and fd
// name, plus the address when it is not ZERO, the hash when it has been
// computed, and the size of the data buffer when data has been loaded.
ostream &
operator<<(ostream &os, const BeesBlockData &bbd)
{
	os << "BeesBlockData { " << pretty(bbd.m_length) << " " << to_hex(bbd.m_offset);
	os << " fd = " << bbd.m_fd << " '" << name_fd(bbd.m_fd) << "'";

	if (bbd.m_addr != BeesAddress::ZERO) {
		os << ", address = " << bbd.m_addr;
	}

	if (bbd.m_hash_done) {
		os << ", hash = " << bbd.m_hash;
	}

	if (!bbd.m_data.empty()) {
// Turn this on to debug BeesBlockData, but leave it off otherwise.
// It's a massive data leak that is only interesting to developers.
#if 0
		os << ", data[" << bbd.m_data.size() << "] = '";

		size_t max_print = 12;
		size_t to_print = min(bbd.m_data.size(), max_print);
		for (size_t i = 0; i < to_print; ++i) {
			uint8_t c = bbd.m_data[i];
			// We are ASCII heathens here
			if (c >= 32 && c < 127 && c != '\\') {
				os << c;
			} else {
				char buf[8];
				sprintf(buf, "\\x%02x", c);
				os << buf;
			}
		}
		os << "...'";
#else
		// Only the size of the loaded data is shown
		os << ", data[" << bbd.m_data.size() << "]";
#endif
	}

	os << " }";
	return os;
}

// Describes the block of `read_length` bytes at `offset` in `fd`.
// No data is read here.
//
// Throws invalid_argument if the length is zero or larger than
// BLOCK_SIZE_SUMS, or if the offset is not a multiple of BLOCK_SIZE_SUMS.
BeesBlockData::BeesBlockData(Fd fd, off_t offset, size_t read_length)
	: m_fd(fd)
	, m_offset(offset)
	, m_length(read_length)
{
	BEESTRACE("Constructing " << *this);

	// Length must be within (0, BLOCK_SIZE_SUMS]
	THROW_CHECK1(invalid_argument, m_length, m_length > 0);
	THROW_CHECK1(invalid_argument, m_length, m_length <= BLOCK_SIZE_SUMS);

	// Offset must sit on a sum-block boundary
	THROW_CHECK1(invalid_argument, m_offset, (m_offset % BLOCK_SIZE_SUMS) == 0);
}

// Builds an empty block description: offset 0, length 0.
BeesBlockData::BeesBlockData()
	: m_offset(0)
	, m_length(0)
{
}

// Returns the address of this block.  While the cached address is still ZERO
// it is looked up from the fd and offset, and the result is cached.
BeesAddress
BeesBlockData::addr() const
{
	const bool unresolved = (m_addr == BeesAddress::ZERO);
	if (unresolved) {
		m_addr = BeesAddress(fd(), m_offset);
	}
	return m_addr;
}

// Overrides the cached address of this block and returns *this so that
// calls can be chained.
BeesBlockData &
BeesBlockData::addr(const BeesAddress &a)
{
	BeesBlockData &self = *this;
	self.m_addr = a;
	return self;
}

// Returns the block's bytes, reading them from the fd on first use and
// caching them afterwards.
//
// Throws invalid_argument if the block has no size, and runtime_error if the
// buffer size after the read does not match the block size; pread_or_die may
// throw as well.
const BeesBlockData::Blob &
BeesBlockData::data() const
{
	// Already loaded: hand back the cached bytes
	if (!m_data.empty()) {
		return m_data;
	}

	THROW_CHECK1(invalid_argument, size(), size() > 0);
	BEESNOTE("Reading BeesBlockData " << *this);
	BEESTOOLONG("Reading BeesBlockData " << *this);
	Timer io_timer;

	Blob buffer(size());
	pread_or_die(m_fd, buffer, m_offset);
	THROW_CHECK2(runtime_error, buffer.size(), size(), ranged_cast<off_t>(buffer.size()) == size());
	m_data = buffer;

	BEESCOUNT(block_read);
	BEESCOUNTADD(block_bytes, buffer.size());
	BEESCOUNTADD(block_ms, io_timer.age() * 1000);

	return m_data;
}

// Returns the CRC64 of the block's data, computing and caching it on first use.
BeesHash
BeesBlockData::hash() const
{
	if (m_hash_done) {
		return m_hash;
	}

	// We can only dedup unaligned EOF blocks against other unaligned EOF blocks,
	// so the hash covers exactly the bytes we have: we do NOT round up to a
	// full sum block size.
	const Blob &bytes = data();

	// TODO:  It turns out that file formats with 4K block
	// alignment and embedded CRC64 do exist, and every block
	// of such files has the same hash.  Could use a subset
	// of SHA1 here instead.
	m_hash = Digest::CRC::crc64(bytes.data(), bytes.size());
	m_hash_done = true;
	BEESCOUNT(block_hash);

	return m_hash;
}

bool
BeesBlockData::is_data_zero() const
{
	// The CRC64 of zero is zero, so skip some work if we already know the CRC
	if (m_hash_done && m_hash != 0) {
		return false;
	}

	// OK read block (maybe) and check every byte
	for (auto c : data()) {
		if (c != '\0') {
			return false;
		}
	}

	BEESCOUNT(block_zero);
	return true;
}

bool
BeesBlockData::is_data_equal(const BeesBlockData &that) const
{
	BEESTRACE("is_data_equal this = " << *this << ", that = " << that);
	THROW_CHECK1(invalid_argument, size(), size() > 0);
	THROW_CHECK2(invalid_argument, size(), that.size(), size() == that.size());

	// skip some work if we already know the CRCs don't match
	if (m_hash_done && that.m_hash_done && m_hash != that.m_hash) {
		return false;
	}

	return data() == that.data();
}