mirror of
https://github.com/Zygo/bees.git
synced 2025-05-18 13:55:44 +02:00
Other btrfs utils use readahead() not posix_fadvise(). There does not appear to be a performance or correctness difference between the three (none, posix_fadvise, or readahead()). Signed-off-by: Zygo Blaxell <bees@furryterror.org>
1013 lines
27 KiB
C++
1013 lines
27 KiB
C++
#include "bees.h"
|
|
|
|
#include "crucible/crc64.h"
|
|
#include "crucible/limits.h"
|
|
#include "crucible/ntoa.h"
|
|
#include "crucible/string.h"
|
|
|
|
#include <fstream>
|
|
#include <inttypes.h>
|
|
|
|
using namespace crucible;
|
|
using namespace std;
|
|
|
|
// Print a file id as "<root>:<ino>".
ostream &
operator<<(ostream &os, const BeesFileId &bfi)
{
	os << bfi.root();
	os << ":";
	os << bfi.ino();
	return os;
}
|
|
|
|
bool
|
|
BeesFileId::operator<(const BeesFileId &that) const
|
|
{
|
|
// Order by inode first so we get good locality when scanning across snapshots
|
|
return tie(m_ino, m_root) < tie(that.m_ino, that.m_root);
|
|
}
|
|
|
|
bool
|
|
BeesFileId::operator==(const BeesFileId &that) const
|
|
{
|
|
return m_root == that.m_root && m_ino == that.m_ino;
|
|
}
|
|
|
|
bool
|
|
BeesFileId::operator!=(const BeesFileId &that) const
|
|
{
|
|
return m_root != that.m_root || m_ino != that.m_ino;
|
|
}
|
|
|
|
// A file id is "set" only when both root and inode are non-zero
// (the default constructor leaves both at zero).
BeesFileId::operator bool() const
{
	if (!m_root) {
		return false;
	}
	return !!m_ino;
}
|
|
|
|
// Build a file id from the root and inode fields of a BtrfsInodeOffsetRoot.
BeesFileId::BeesFileId(const BtrfsInodeOffsetRoot &bior) :
	m_root(bior.m_root), m_ino(bior.m_inum)
{
}
|
|
|
|
// Build a file id from an explicit root and inode number.
BeesFileId::BeesFileId(uint64_t root, uint64_t ino) :
	m_root(root), m_ino(ino)
{
}
|
|
|
|
// Build a file id from an open file descriptor: the root comes from
// btrfs_get_root_id(fd) and the inode number from Stat(fd).st_ino.
BeesFileId::BeesFileId(int fd) :
	m_root(btrfs_get_root_id(fd)), m_ino(Stat(fd).st_ino)
{
}
|
|
|
|
// Default file id: root and inode both zero, which operator bool() reports as unset.
BeesFileId::BeesFileId() :
	m_root(0), m_ino(0)
{
}
|
|
|
|
// Print a human-readable description of a file range: either "- [begin..eof]"
// when end() is the maximum off_t value, or the size followed by the byte
// offsets.  The file id and the fd (with its name) are appended when set.
ostream &
operator<<(ostream &os, const BeesFileRange &bfr)
{
	if (bfr.end() != numeric_limits<off_t>::max()) {
		os << pretty(bfr.size()) << " ";
		// "(" marks a range starting at offset 0, "[" any other start
		if (bfr.begin() == 0) {
			os << "(";
		} else {
			os << "[" << to_hex(bfr.begin());
		}
		os << ".." << to_hex(bfr.end());
		// ")" marks a range of an open file that reaches (or passes) the file size
		const bool reaches_eof = !!bfr.m_fd && bfr.end() >= bfr.file_size();
		os << (reaches_eof ? ")" : "]");
	} else {
		os << "- [" << to_hex(bfr.begin()) << "..eof]";
	}
	if (bfr.m_fid) {
		os << " fid = " << bfr.m_fid;
	}
	if (!!bfr.m_fd) {
		os << " fd = " << bfr.m_fd << " '" << name_fd(bfr.m_fd) << "'";
	}
	return os;
}
|
|
|
|
// Print a range pair: the size of the first range, the byte offsets of both
// ranges, then one line each for the src and dst fd with its name.
ostream &
operator<<(ostream &os, const BeesRangePair &brp)
{
	const BeesFileRange &src = brp.first;
	const BeesFileRange &dst = brp.second;
	os << "BeesRangePair: " << pretty(src.size());
	os << " src[" << to_hex(src.begin()) << ".." << to_hex(src.end()) << "]";
	os << " dst[" << to_hex(dst.begin()) << ".." << to_hex(dst.end()) << "]";
	os << "\nsrc = " << src.fd() << " " << name_fd(src.fd());
	os << "\ndst = " << dst.fd() << " " << name_fd(dst.fd());
	return os;
}
|
|
|
|
bool
|
|
BeesFileRange::operator<(const BeesFileRange &that) const
|
|
{
|
|
// Read file blocks in order
|
|
return make_tuple(fid(), m_begin, m_end) < make_tuple(that.fid(), that.m_begin, that.m_end);
|
|
// Faster to read big chunks first? Probably confuses the hell
|
|
// out of crawl state, so let's only keep this if there's a clear
|
|
// performance win.
|
|
// return make_tuple(that.size(), fid(), m_begin, m_end) < make_tuple(size(), that.fid(), that.m_begin, that.m_end);
|
|
}
|
|
|
|
bool
|
|
BeesFileRange::operator==(const BeesFileRange &that) const
|
|
{
|
|
// These fields are cheap to compare and have the most variety
|
|
if (m_begin != that.m_begin || m_end != that.m_end) {
|
|
return false;
|
|
}
|
|
// If they both have the same fd they're equal,
|
|
// but different fds are not necessarily distinct
|
|
if (!!m_fd && !!that.m_fd && m_fd == that.m_fd) {
|
|
return true;
|
|
}
|
|
// OK now we have to go check their FileIds
|
|
return fid() == that.fid();
|
|
}
|
|
|
|
bool
|
|
BeesFileRange::operator!=(const BeesFileRange &that) const
|
|
{
|
|
return !((*this) == that);
|
|
}
|
|
|
|
bool
|
|
BeesFileRange::empty() const
|
|
{
|
|
THROW_CHECK2(invalid_argument, m_begin, m_end, m_begin <= m_end);
|
|
return m_begin >= m_end;
|
|
}
|
|
|
|
off_t
|
|
BeesFileRange::size() const
|
|
{
|
|
THROW_CHECK2(invalid_argument, m_begin, m_end, m_begin <= m_end);
|
|
return m_end - m_begin;
|
|
}
|
|
|
|
off_t
|
|
BeesFileRange::file_size() const
|
|
{
|
|
if (m_file_size <= 0) {
|
|
Stat st(fd());
|
|
m_file_size = st.st_size;
|
|
// These checks could trigger on valid input, but that would mean we have
|
|
// lost a race (e.g. a file was truncated while we were building a
|
|
// matching range pair with it). In such cases we should probably stop
|
|
// whatever we were doing and backtrack to some higher level anyway.
|
|
// Well, OK, but we call this function from exception handlers...
|
|
THROW_CHECK1(invalid_argument, m_file_size, m_file_size >= 0);
|
|
// THROW_CHECK2(invalid_argument, m_file_size, m_end, m_end <= m_file_size || m_end == numeric_limits<off_t>::max());
|
|
}
|
|
return m_file_size;
|
|
}
|
|
|
|
off_t
|
|
BeesFileRange::grow_end(off_t delta)
|
|
{
|
|
THROW_CHECK1(invalid_argument, delta, delta > 0);
|
|
m_end = min(m_end + delta, file_size());
|
|
THROW_CHECK2(runtime_error, m_file_size, m_end, m_end <= m_file_size);
|
|
return m_end;
|
|
}
|
|
|
|
off_t
|
|
BeesFileRange::grow_begin(off_t delta)
|
|
{
|
|
THROW_CHECK1(invalid_argument, delta, delta > 0);
|
|
m_begin -= min(delta, m_begin);
|
|
return m_begin;
|
|
}
|
|
|
|
// Build a range covering the same fd and byte offsets as a block.
BeesFileRange::BeesFileRange(const BeesBlockData &bbd) :
	m_fd(bbd.fd()), m_begin(bbd.begin()), m_end(bbd.end())
{
}
|
|
|
|
// Build a range [begin, end) on an open file descriptor.
BeesFileRange::BeesFileRange(Fd fd, off_t begin, off_t end) :
	m_fd(fd), m_begin(begin), m_end(end)
{
}
|
|
|
|
// Build a range [begin, end) identified only by file id (no fd is set here).
BeesFileRange::BeesFileRange(const BeesFileId &fid, off_t begin, off_t end) :
	m_fid(fid), m_begin(begin), m_end(end)
{
}
|
|
|
|
bool
|
|
BeesFileRange::is_same_file(const BeesFileRange &that) const
|
|
{
|
|
// If we have two FDs, start by comparing those
|
|
if (!!m_fd && !!that.m_fd && m_fd == that.m_fd) {
|
|
return true;
|
|
}
|
|
// OK have to go fetch the fid from both files and compare them
|
|
return fid() == that.fid();
|
|
}
|
|
|
|
bool
|
|
BeesFileRange::overlaps(const BeesFileRange &that) const
|
|
{
|
|
// Determine whether the byte ranges overlap before doing syscalls on file descriptors
|
|
|
|
pair<uint64_t, uint64_t> a(m_begin, m_end);
|
|
pair<uint64_t, uint64_t> b(that.m_begin, that.m_end);
|
|
|
|
// range a starts lower than or equal b
|
|
if (b.first < a.first) {
|
|
swap(a, b);
|
|
}
|
|
|
|
// if b starts within a, they overlap
|
|
// (and the intersecting region is b.first..min(a.second, b.second))
|
|
// (and the union region is a.first..max(a.second, b.second))
|
|
if (b.first >= a.first && b.first < a.second) {
|
|
return is_same_file(that);
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
BeesFileRange::coalesce(const BeesFileRange &that)
|
|
{
|
|
// Let's define coalesce-with-null as identity,
|
|
// and coalesce-null-with-null as coalesced
|
|
if (!*this) {
|
|
operator=(that);
|
|
return true;
|
|
}
|
|
if (!that) {
|
|
return true;
|
|
}
|
|
|
|
// Can't coalesce different files
|
|
if (!is_same_file(that)) return false;
|
|
|
|
pair<uint64_t, uint64_t> a(m_begin, m_end);
|
|
pair<uint64_t, uint64_t> b(that.m_begin, that.m_end);
|
|
|
|
// range a starts lower than or equal b
|
|
if (b.first < a.first) {
|
|
swap(a, b);
|
|
}
|
|
|
|
// if b starts within a, they overlap
|
|
// (and the intersecting region is b.first..min(a.second, b.second))
|
|
// (and the union region is a.first..max(a.second, b.second))
|
|
if (b.first >= a.first && b.first < a.second) {
|
|
m_begin = a.first;
|
|
m_end = max(a.second, b.second);
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
// Convert the range into a BeesBlockData on the same fd, starting at m_begin
// and spanning m_end - m_begin bytes.
BeesFileRange::operator BeesBlockData() const
{
	BEESTRACE("operator BeesBlockData " << *this);
	const off_t length = m_end - m_begin;
	return BeesBlockData(m_fd, m_begin, length);
}
|
|
|
|
// Return the stored fd as-is; this overload never opens anything.
Fd
BeesFileRange::fd() const
{
	return m_fd;
}
|
|
|
|
// Return the fd for this range, opening it by file id through ctx when there
// is a fid but no fd yet.  When both fid and fd are present they are
// checked for consistency (invalid_argument on mismatch).
Fd
BeesFileRange::fd(const shared_ptr<BeesContext> &ctx) const
{
	// If we don't have a fid we can't do much here: we're just stuck with our fd
	if (!m_fid) {
		return m_fd;
	}

	if (!m_fd) {
		// If we don't have a fd, open by fid (needs a context)
		if (ctx) {
			Fd new_fd = ctx->roots()->open_root_ino(m_fid);
			m_fd = new_fd;
		}
		return m_fd;
	}

	// If we have both fid and fd, make sure they match
	BeesFileId fd_fid(m_fd);
	THROW_CHECK2(invalid_argument, fd_fid, m_fid, fd_fid == m_fid);
	return m_fd;
}
|
|
|
|
// Return a copy of this range built from the file id and offsets only
// (via the fid constructor, which does not take an fd).
BeesFileRange
BeesFileRange::copy_closed() const
{
	const BeesFileId id = fid();
	return BeesFileRange(id, m_begin, m_end);
}
|
|
|
|
// Return the file id, deriving and caching it from the fd when it is not
// set yet and an fd is available.
BeesFileId
BeesFileRange::fid() const
{
	const bool need_lookup = !m_fid && !!m_fd;
	if (need_lookup) {
		m_fid = BeesFileId(m_fd);
	}
	return m_fid;
}
|
|
|
|
// Build a (src, dst) pair and validate it: the ranges must not overlap and
// must have equal size.  When both ranges have an fd, the contents are also
// compared block by block.  Any violation throws invalid_argument.
BeesRangePair::BeesRangePair(const BeesFileRange &src, const BeesFileRange &dst) :
	pair<BeesFileRange, BeesFileRange>(src, dst)
{
	BEESTRACE("checking constraints on " << *this);

	// Must not initially overlap
	THROW_CHECK2(invalid_argument, first, second, !first.overlaps(second));

	// Must initially be equal
	THROW_CHECK2(invalid_argument, first, second, first.size() == second.size());

	// Can't check content unless open
	const bool both_open = !!first.fd() && !!second.fd();
	if (!both_open) {
		return;
	}

	// Must check every block individually
	off_t first_begin = first.begin();
	off_t second_begin = second.begin();
	for (off_t size = first.size(); size != 0; ) {
		// Compare at most BLOCK_SIZE_SUMS bytes per step
		off_t len = min(BLOCK_SIZE_SUMS, size);
		BeesBlockData first_bbd(first.fd(), first_begin, len);
		BeesBlockData second_bbd(second.fd(), second_begin, len);
		THROW_CHECK2(invalid_argument, first_bbd, second_bbd, first_bbd.is_data_equal(second_bbd));
		size -= len;
		first_begin += len;
		second_begin += len;
	}
}
|
|
|
|
bool
|
|
BeesRangePair::operator<(const BeesRangePair &that) const
|
|
{
|
|
// Order by destination then source
|
|
return tie(second, first) < tie(that.second, that.first);
|
|
}
|
|
|
|
bool
|
|
BeesRangePair::grow(shared_ptr<BeesContext> ctx, bool constrained)
|
|
{
|
|
BEESTOOLONG("grow constrained = " << constrained << " *this = " << *this);
|
|
BEESTRACE("grow constrained = " << constrained << " *this = " << *this);
|
|
bool rv = false;
|
|
Timer grow_backward_timer;
|
|
|
|
THROW_CHECK1(invalid_argument, first.begin(), (first.begin() & BLOCK_MASK_CLONE) == 0);
|
|
THROW_CHECK1(invalid_argument, second.begin(), (second.begin() & BLOCK_MASK_CLONE) == 0);
|
|
|
|
// We should not be overlapping already
|
|
THROW_CHECK2(invalid_argument, first, second, !first.overlaps(second));
|
|
|
|
BtrfsExtentWalker ew_second(second.fd());
|
|
|
|
// Stop on aligned extent boundary
|
|
ew_second.seek(second.begin());
|
|
|
|
Extent e_second = ew_second.current();
|
|
BEESTRACE("e_second " << e_second);
|
|
|
|
// Preread entire extent
|
|
readahead(second.fd(), e_second.begin(), e_second.size());
|
|
readahead(first.fd(), e_second.begin() + first.begin() - second.begin(), e_second.size());
|
|
|
|
auto hash_table = ctx->hash_table();
|
|
|
|
// Look backward
|
|
BEESTRACE("grow_backward " << *this);
|
|
while (first.size() < BLOCK_SIZE_MAX_EXTENT) {
|
|
if (second.begin() <= e_second.begin()) {
|
|
#if 0
|
|
if (constrained) {
|
|
break;
|
|
}
|
|
BEESCOUNT(pairbackward_extent);
|
|
ew_second.seek(second.begin() - min(BLOCK_SIZE_CLONE, second.begin()));
|
|
e_second = ew_second.current();
|
|
if (e_second.flags() & Extent::HOLE) {
|
|
BEESCOUNT(pairbackward_hole);
|
|
break;
|
|
}
|
|
readahead(second.fd(), e_second.begin(), e_second.size());
|
|
#else
|
|
// This tends to repeatedly process extents that were recently processed.
|
|
// We tend to catch duplicate blocks early since we scan them forwards.
|
|
// Also, reading backwards is slow so we probably don't want to do it much.
|
|
break;
|
|
#endif
|
|
}
|
|
BEESCOUNT(pairbackward_try);
|
|
|
|
// Extend first range. If we hit BOF we can go no further.
|
|
BeesFileRange new_first = first;
|
|
BEESTRACE("new_first = " << new_first);
|
|
new_first.grow_begin(BLOCK_SIZE_CLONE);
|
|
if (new_first.begin() == first.begin()) {
|
|
BEESCOUNT(pairbackward_bof_first);
|
|
break;
|
|
}
|
|
|
|
// Source extent cannot be toxic
|
|
BeesAddress first_addr(first.fd(), new_first.begin());
|
|
if (!first_addr.is_magic()) {
|
|
auto first_resolved = ctx->resolve_addr(first_addr);
|
|
if (first_resolved.is_toxic()) {
|
|
BEESLOGWARN("WORKAROUND: not growing matching pair backward because src addr is toxic:\n" << *this);
|
|
BEESCOUNT(pairbackward_toxic_addr);
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Extend second range. If we hit BOF we can go no further.
|
|
BeesFileRange new_second = second;
|
|
BEESTRACE("new_second = " << new_second);
|
|
new_second.grow_begin(BLOCK_SIZE_CLONE);
|
|
if (new_second.begin() == second.begin()) {
|
|
BEESCOUNT(pairbackward_bof_second);
|
|
break;
|
|
}
|
|
|
|
// If the ranges now overlap we went too far
|
|
if (new_first.overlaps(new_second)) {
|
|
BEESCOUNT(pairbackward_overlap);
|
|
break;
|
|
}
|
|
|
|
BEESTRACE("first " << first << " new_first " << new_first);
|
|
BeesBlockData first_bbd(first.fd(), new_first.begin(), first.begin() - new_first.begin());
|
|
BEESTRACE("first_bbd " << first_bbd);
|
|
BEESTRACE("second " << second << " new_second " << new_second);
|
|
BeesBlockData second_bbd(second.fd(), new_second.begin(), second.begin() - new_second.begin());
|
|
BEESTRACE("second_bbd " << second_bbd);
|
|
|
|
// Both blocks must have identical content
|
|
if (!first_bbd.is_data_equal(second_bbd)) {
|
|
BEESCOUNT(pairbackward_miss);
|
|
break;
|
|
}
|
|
|
|
// Physical blocks must be distinct
|
|
if (first_bbd.addr().get_physical_or_zero() == second_bbd.addr().get_physical_or_zero()) {
|
|
BEESCOUNT(pairbackward_same);
|
|
break;
|
|
}
|
|
|
|
// Source block cannot be zero in a non-compressed non-magic extent
|
|
if (first_bbd.is_data_zero() && !first_addr.is_magic() && !first_addr.is_compressed()) {
|
|
BEESCOUNT(pairbackward_zero);
|
|
break;
|
|
}
|
|
|
|
// Source block cannot have a toxic hash
|
|
auto found_hashes = hash_table->find_cell(first_bbd.hash());
|
|
bool found_toxic = false;
|
|
for (auto i : found_hashes) {
|
|
if (BeesAddress(i.e_addr).is_toxic()) {
|
|
found_toxic = true;
|
|
break;
|
|
}
|
|
}
|
|
if (found_toxic) {
|
|
BEESLOGWARN("WORKAROUND: found toxic hash in " << first_bbd << " while extending backward:\n" << *this);
|
|
BEESCOUNT(pairbackward_toxic_hash);
|
|
break;
|
|
}
|
|
|
|
THROW_CHECK2(invalid_argument, new_first.size(), new_second.size(), new_first.size() == new_second.size());
|
|
first = new_first;
|
|
second = new_second;
|
|
rv = true;
|
|
BEESCOUNT(pairbackward_hit);
|
|
}
|
|
BEESCOUNT(pairbackward_stop);
|
|
BEESCOUNTADD(pairbackward_ms, grow_backward_timer.age() * 1000);
|
|
|
|
// Look forward
|
|
BEESTRACE("grow_forward " << *this);
|
|
Timer grow_forward_timer;
|
|
while (first.size() < BLOCK_SIZE_MAX_EXTENT) {
|
|
if (second.end() >= e_second.end()) {
|
|
if (constrained) {
|
|
break;
|
|
}
|
|
BEESCOUNT(pairforward_extent);
|
|
ew_second.seek(second.end());
|
|
e_second = ew_second.current();
|
|
if (e_second.flags() & Extent::HOLE) {
|
|
BEESCOUNT(pairforward_hole);
|
|
break;
|
|
}
|
|
readahead(second.fd(), e_second.begin(), e_second.size());
|
|
}
|
|
BEESCOUNT(pairforward_try);
|
|
|
|
// Extend first range. If we hit EOF we can go no further.
|
|
BeesFileRange new_first = first;
|
|
BEESTRACE("new_first = " << new_first);
|
|
new_first.grow_end(BLOCK_SIZE_CLONE);
|
|
if (new_first.end() == first.end()) {
|
|
BEESCOUNT(pairforward_eof_first);
|
|
break;
|
|
}
|
|
|
|
// Source extent cannot be toxic
|
|
BeesAddress first_addr(first.fd(), new_first.begin());
|
|
if (!first_addr.is_magic()) {
|
|
auto first_resolved = ctx->resolve_addr(first_addr);
|
|
if (first_resolved.is_toxic()) {
|
|
BEESLOGWARN("WORKAROUND: not growing matching pair forward because src is toxic:\n" << *this);
|
|
BEESCOUNT(pairforward_toxic);
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Extend second range. If we hit EOF we can go no further.
|
|
BeesFileRange new_second = second;
|
|
BEESTRACE("new_second = " << new_second);
|
|
new_second.grow_end(BLOCK_SIZE_CLONE);
|
|
if (new_second.end() == second.end()) {
|
|
BEESCOUNT(pairforward_eof_second);
|
|
break;
|
|
}
|
|
|
|
// If we have hit an unaligned EOF then it has to be the same unaligned EOF.
|
|
// If we haven't hit EOF then the ends of the ranges are still aligned,
|
|
// so the misalignment (zero) will be equal.
|
|
if ((new_second.end() & BLOCK_MASK_CLONE) != (new_first.end() & BLOCK_MASK_CLONE)) {
|
|
BEESCOUNT(pairforward_eof_malign);
|
|
break;
|
|
}
|
|
|
|
// If the ranges now overlap we went too far
|
|
if (new_first.overlaps(new_second)) {
|
|
BEESCOUNT(pairforward_overlap);
|
|
break;
|
|
}
|
|
|
|
BEESTRACE("first " << first << " new_first " << new_first);
|
|
BeesBlockData first_bbd(first.fd(), first.end(), new_first.end() - first.end());
|
|
BEESTRACE("first_bbd " << first_bbd);
|
|
BEESTRACE("second " << second << " new_second " << new_second);
|
|
BeesBlockData second_bbd(second.fd(), second.end(), new_second.end() - second.end());
|
|
BEESTRACE("second_bbd " << second_bbd);
|
|
|
|
// Both blocks must have identical content
|
|
if (!first_bbd.is_data_equal(second_bbd)) {
|
|
BEESCOUNT(pairforward_miss);
|
|
break;
|
|
}
|
|
|
|
// Physical blocks must be distinct
|
|
if (first_bbd.addr().get_physical_or_zero() == second_bbd.addr().get_physical_or_zero()) {
|
|
BEESCOUNT(pairforward_same);
|
|
break;
|
|
}
|
|
|
|
// Source block cannot be zero in a non-compressed non-magic extent
|
|
if (first_bbd.is_data_zero() && !first_addr.is_magic() && !first_addr.is_compressed()) {
|
|
BEESCOUNT(pairforward_zero);
|
|
break;
|
|
}
|
|
|
|
// Source block cannot have a toxic hash
|
|
auto found_hashes = hash_table->find_cell(first_bbd.hash());
|
|
bool found_toxic = false;
|
|
for (auto i : found_hashes) {
|
|
if (BeesAddress(i.e_addr).is_toxic()) {
|
|
found_toxic = true;
|
|
break;
|
|
}
|
|
}
|
|
if (found_toxic) {
|
|
BEESLOGWARN("WORKAROUND: found toxic hash in " << first_bbd << " while extending forward:\n" << *this);
|
|
BEESCOUNT(pairforward_toxic_hash);
|
|
break;
|
|
}
|
|
|
|
// OK, next block
|
|
THROW_CHECK2(invalid_argument, new_first.size(), new_second.size(), new_first.size() == new_second.size());
|
|
first = new_first;
|
|
second = new_second;
|
|
rv = true;
|
|
BEESCOUNT(pairforward_hit);
|
|
}
|
|
|
|
if (first.overlaps(second)) {
|
|
BEESLOGTRACE("after grow, first " << first << "\n\toverlaps " << second);
|
|
BEESCOUNT(bug_grow_pair_overlaps);
|
|
}
|
|
|
|
BEESCOUNT(pairforward_stop);
|
|
BEESCOUNTADD(pairforward_ms, grow_forward_timer.age() * 1000);
|
|
return rv;
|
|
}
|
|
|
|
// Return a new pair built from copy_closed() copies of both ranges.
BeesRangePair
BeesRangePair::copy_closed() const
{
	const BeesFileRange closed_first = first.copy_closed();
	const BeesFileRange closed_second = second.copy_closed();
	return BeesRangePair(closed_first, closed_second);
}
|
|
|
|
// Print an address.  Magic addresses are printed by name through a
// bits_ntoa table.  Other addresses print the physical address in hex
// ("NIL" when it equals 0x1000) followed by flag suffixes: "t" toxic,
// "u" unaligned EOF, "z" compressed (plus the compressed offset in hex
// when there is one).
ostream &
operator<<(ostream &os, const BeesAddress &ba)
{
	if (ba.is_magic()) {
		enum {
			ZERO = BeesAddress::MagicValue::ZERO,
			DELALLOC = BeesAddress::MagicValue::DELALLOC,
			HOLE = BeesAddress::MagicValue::HOLE,
			UNUSABLE = BeesAddress::MagicValue::UNUSABLE,
		};
		static const bits_ntoa_table table[] = {
			NTOA_TABLE_ENTRY_ENUM(ZERO),
			NTOA_TABLE_ENTRY_ENUM(DELALLOC),
			NTOA_TABLE_ENTRY_ENUM(HOLE),
			NTOA_TABLE_ENTRY_ENUM(UNUSABLE),
			NTOA_TABLE_ENTRY_END()
		};
		return os << bits_ntoa(static_cast<BeesAddress::Type>(ba), table);
	}

	// Physical part
	const auto gpz = ba.get_physical_or_zero();
	if (gpz != 0x1000) {
		os << to_hex(gpz);
	} else {
		os << "NIL";
	}

	// Flag suffixes
	if (ba.is_toxic()) {
		os << "t";
	}
	if (ba.is_unaligned_eof()) {
		os << "u";
	}
	if (!ba.is_compressed()) {
		return os;
	}
	os << "z";
	if (ba.has_compressed_offset()) {
		os << astringprintf("%" PRIx64, ba.get_compressed_offset());
	}
	return os;
}
|
|
|
|
bool
|
|
BeesAddress::magic_check(uint64_t flags)
|
|
{
|
|
// This one isn't FIEMAP
|
|
if (flags & Extent::HOLE) {
|
|
m_addr = HOLE;
|
|
BEESCOUNT(addr_hole);
|
|
return true;
|
|
}
|
|
|
|
// These trigger extra processing steps for compressed extents
|
|
static const unsigned compressed_flags = FIEMAP_EXTENT_ENCODED;
|
|
|
|
// These indicate the extent is not yet on disk (try again with sync)
|
|
static const unsigned delalloc_flags = FIEMAP_EXTENT_UNKNOWN | FIEMAP_EXTENT_DELALLOC;
|
|
|
|
// These flags are irrelevant to extent-same
|
|
static const unsigned ignore_flags = FIEMAP_EXTENT_LAST | FIEMAP_EXTENT_SHARED;
|
|
|
|
// These flags mean we can't use extent-same
|
|
static const unsigned unusable_flags = FIEMAP_EXTENT_NOT_ALIGNED | FIEMAP_EXTENT_DATA_INLINE;
|
|
|
|
// All of the above (any other flag is a new feature we maybe can't cope with)
|
|
static const unsigned recognized_flags = compressed_flags | delalloc_flags | ignore_flags | unusable_flags;
|
|
|
|
if (flags & ~recognized_flags) {
|
|
BEESLOGTRACE("Unrecognized flags in " << fiemap_extent_flags_ntoa(flags));
|
|
m_addr = UNUSABLE;
|
|
// maybe we throw here?
|
|
BEESCOUNT(addr_unrecognized);
|
|
return true;
|
|
}
|
|
|
|
if (flags & unusable_flags) {
|
|
// we know these, but can't touch them
|
|
BEESCOUNT(addr_unusable);
|
|
m_addr = UNUSABLE;
|
|
return true;
|
|
}
|
|
|
|
if (flags & delalloc_flags) {
|
|
// delayed allocation, try again with force
|
|
BEESCOUNT(addr_delalloc);
|
|
m_addr = DELALLOC;
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
// Build the address of the block at file offset 'offset' inside extent 'e'.
//
// The extent's physical address and begin, and 'offset' itself, must be
// aligned to BLOCK_MASK_CLONE, and the extent must not be empty
// (invalid_argument otherwise).  If magic_check() classifies the extent
// flags as magic, the magic value it stored is kept.  Otherwise:
//  - encoded (FIEMAP_EXTENT_ENCODED) extents yield the extent's physical
//    address combined with c_compressed_mask and a 1-based block offset
//    within the extent;
//  - all other extents yield physical address + (offset - e.begin()).
// c_eof_mask is added when the block is the unaligned tail of the last extent.
BeesAddress::BeesAddress(const Extent &e, off_t offset) :
	m_addr(ZERO)
{
	BEESTRACE("BeesAddress " << e << " offset " << to_hex(offset));
	Type new_addr = 0;

	THROW_CHECK1(invalid_argument, e, (e.physical() & BLOCK_MASK_CLONE) == 0);
	THROW_CHECK1(invalid_argument, e, (e.begin() & BLOCK_MASK_CLONE) == 0);
	THROW_CHECK1(invalid_argument, e, (offset & BLOCK_MASK_CLONE) == 0);
	THROW_CHECK1(invalid_argument, e, e.end() > e.begin());

	if (magic_check(e.flags())) {
		BEESCOUNT(addr_magic);
		return;
	}

	// All addresses from here on are physical
	THROW_CHECK1(invalid_argument, e, e.physical() > 0);

	if (e.flags() & FIEMAP_EXTENT_ENCODED) {
		THROW_CHECK1(invalid_argument, e, (e.offset() & BLOCK_MASK_CLONE) == 0);
		THROW_CHECK1(invalid_argument, e, e.offset() >= 0 && e.offset() < BLOCK_SIZE_MAX_COMPRESSED_EXTENT);
		// Keep the full off_t width: narrowing to int before the range check
		// below could truncate an out-of-range value into the accepted range.
		off_t extent_offset = offset - e.begin() + e.offset();
		BEESTRACE("extent_offset = " << to_hex(extent_offset));
		THROW_CHECK1(invalid_argument, extent_offset, extent_offset >= 0 && extent_offset < BLOCK_SIZE_MAX_COMPRESSED_EXTENT);
		THROW_CHECK1(invalid_argument, extent_offset, (extent_offset & BLOCK_MASK_CLONE) == 0);
		// Offsets are stored 1-based, in units of BLOCK_SIZE_CLONE
		unsigned offset_bits = (extent_offset / BLOCK_SIZE_CLONE) + 1;
		BEESTRACE("offset_bits = " << offset_bits);
		THROW_CHECK1(invalid_argument, offset_bits, offset_bits >= c_offset_min && offset_bits <= c_offset_max);
		THROW_CHECK1(invalid_argument, offset_bits, (offset_bits & ~c_offset_mask) == 0);
#if 1
		new_addr = e.physical() | c_compressed_mask | offset_bits;
		BEESCOUNT(addr_compressed_offset);
#else
		new_addr = e.physical() | c_compressed_mask;
		BEESCOUNT(addr_compressed);
#endif
	} else {
		new_addr = e.physical() + (offset - e.begin());
		BEESCOUNT(addr_uncompressed);
	}

	// Mark the block containing an unaligned end of the last extent
	if ((e.flags() & FIEMAP_EXTENT_LAST) && (e.end() & BLOCK_MASK_CLONE) != 0 && (offset & ~BLOCK_MASK_CLONE) == (e.end() & ~BLOCK_MASK_CLONE)) {
		new_addr |= c_eof_mask;
		BEESCOUNT(addr_eof_e);
	}

	m_addr = new_addr;
	BEESCOUNT(addr_block);
}
|
|
|
|
// Look up the extent containing 'offset' in the file open on 'fd' and build
// the address from it.  The offset must fit in Type, must not use any of
// the c_all_mask bits and must be aligned to BLOCK_MASK_CLONE
// (invalid_argument otherwise).
BeesAddress::BeesAddress(int fd, off_t offset) :
	m_addr(ZERO)
{
	BEESTOOLONG("BeesAddress(fd " << fd << " " << name_fd(fd) << " offset " << to_hex(offset) << ")");
	BEESTRACE("BeesAddress(fd " << fd << " " << name_fd(fd) << " offset " << to_hex(offset) << ")");

	Type uoffset = ranged_cast<Type>(offset);
	THROW_CHECK1(invalid_argument, uoffset, (uoffset & c_all_mask) == 0);
	THROW_CHECK1(invalid_argument, uoffset, (uoffset & BLOCK_MASK_CLONE) == 0);

	// Time the extent lookup only
	Timer extentwalker_timer;
	BtrfsExtentWalker ew(fd, uoffset);
	Extent e = ew.current();
	BEESCOUNT(addr_from_fd);
	BEESCOUNTADD(addr_ms, extentwalker_timer.age() * 1000);

	// Delegate the actual address computation to the Extent constructor
	BeesAddress from_extent(e, offset);
	*this = from_extent;
}
|
|
|
|
// Same as BeesAddress(fd, offset), but the extent walker is also given the
// context's root fd.  The offset must fit in Type, must not use any of the
// c_all_mask bits and must be aligned to BLOCK_MASK_CLONE
// (invalid_argument otherwise).
BeesAddress::BeesAddress(int fd, off_t offset, shared_ptr<BeesContext> ctx) :
	m_addr(ZERO)
{
	BEESTOOLONG("BeesAddress(fd " << fd << " " << name_fd(fd) << " offset " << to_hex(offset) << " ctx " << ctx->root_path() << ")");
	BEESTRACE("BeesAddress(fd " << fd << " " << name_fd(fd) << " offset " << to_hex(offset) << " ctx " << ctx->root_path() << ")");

	Type uoffset = ranged_cast<Type>(offset);
	THROW_CHECK1(invalid_argument, uoffset, (uoffset & c_all_mask) == 0);
	THROW_CHECK1(invalid_argument, uoffset, (uoffset & BLOCK_MASK_CLONE) == 0);

	// Time the extent lookup only
	Timer extentwalker_timer;
	BtrfsExtentWalker ew(fd, uoffset, ctx->root_fd());
	Extent e = ew.current();
	BEESCOUNT(addr_from_root_fd);
	BEESCOUNTADD(addr_ms, extentwalker_timer.age() * 1000);

	// Delegate the actual address computation to the Extent constructor
	BeesAddress from_extent(e, offset);
	*this = from_extent;
}
|
|
|
|
// Get just the physical address with no extra bits or compressed block offset (magic values become zero)
|
|
|
|
// Returns m_addr with every c_all_mask bit cleared, or 0 for magic addresses.
BeesAddress::Type
BeesAddress::get_physical_or_zero() const
{
	if (is_magic()) {
		return 0;
	}
	return m_addr & ~c_all_mask;
}
|
|
|
|
// A compressed block address is divided into two fields:
|
|
// the beginning of the physical extent,
|
|
// and the distance (in CLONE blocks) from the start of the extent to the current block.
|
|
// Throws an exception if has_compressed_offset is not true.
|
|
|
|
// Returns the byte distance encoded in the offset field of the address.
// The field is 1-based and counted in BLOCK_SIZE_CLONE units.
// Throws invalid_argument when has_compressed_offset() is false.
BeesAddress::Type
BeesAddress::get_compressed_offset() const
{
	THROW_CHECK1(invalid_argument, *this, has_compressed_offset());
	const Type offset_field = m_addr & c_offset_mask;
	return (offset_field - 1) * BLOCK_SIZE_CLONE;
}
|
|
|
|
void
|
|
BeesAddress::set_toxic()
|
|
{
|
|
THROW_CHECK1(invalid_argument, *this, !is_magic());
|
|
m_addr |= c_toxic_mask;
|
|
}
|
|
|
|
bool
|
|
BeesAddress::operator==(const BeesAddress &that) const
|
|
{
|
|
// If one side has an offset and the other doesn't, compare without checking offset bits
|
|
// This returns the right result for comparisons between magic and non-magic values,
|
|
// even though the math is all wrong.
|
|
if (has_compressed_offset() != that.has_compressed_offset()) {
|
|
return (m_addr & ~c_offset_mask) == (that.m_addr & ~c_offset_mask);
|
|
} else {
|
|
return m_addr == that.m_addr;
|
|
}
|
|
}
|
|
|
|
bool
|
|
BeesAddress::operator<(const BeesAddress &that) const
|
|
{
|
|
if (has_compressed_offset() != that.has_compressed_offset()) {
|
|
return (m_addr & ~c_offset_mask) < (that.m_addr & ~c_offset_mask);
|
|
} else {
|
|
return m_addr < that.m_addr;
|
|
}
|
|
}
|
|
|
|
// Print a block description: length, offset, fd and its name, then the
// address (unless it is still ZERO), the hash (if already computed) and
// the size of the data buffer (if already read).
ostream &
operator<<(ostream &os, const BeesBlockData &bbd)
{
	os << "BeesBlockData { " << pretty(bbd.m_length) << " " << to_hex(bbd.m_offset);
	os << " fd = " << bbd.m_fd << " '" << name_fd(bbd.m_fd) << "'";
	if (bbd.m_addr != BeesAddress::ZERO) {
		os << ", address = " << bbd.m_addr;
	}
	if (bbd.m_hash_done) {
		os << ", hash = " << bbd.m_hash;
	}
	if (!bbd.m_data.empty()) {
		// Turn this on to debug BeesBlockData, but leave it off otherwise.
		// It's a massive data leak that is only interesting to developers.
#if 0
		os << ", data[" << bbd.m_data.size() << "] = '";

		size_t max_print = 12;
		size_t to_print = min(bbd.m_data.size(), max_print);
		for (size_t i = 0; i < to_print; ++i) {
			uint8_t c = bbd.m_data[i];
			// We are ASCII heathens here
			if (c >= 32 && c < 127 && c != '\\') {
				os << c;
			} else {
				char buf[8];
				sprintf(buf, "\\x%02x", c);
				os << buf;
			}
		}
		os << "...'";
#else
		os << ", data[" << bbd.m_data.size() << "]";
#endif
	}
	os << " }";
	return os;
}
|
|
|
|
// Describe a block of read_length bytes at 'offset' in the file open on fd.
// The length must be in 1..BLOCK_SIZE_SUMS and the offset a multiple of
// BLOCK_SIZE_SUMS (invalid_argument otherwise).
BeesBlockData::BeesBlockData(Fd fd, off_t offset, size_t read_length) :
	m_fd(fd), m_offset(offset), m_length(read_length)
{
	BEESTRACE("Constructing " << *this);
	THROW_CHECK1(invalid_argument, m_length, m_length > 0);
	THROW_CHECK1(invalid_argument, m_length, m_length <= BLOCK_SIZE_SUMS);
	THROW_CHECK1(invalid_argument, m_offset, (m_offset % BLOCK_SIZE_SUMS) == 0);
}
|
|
|
|
// Default block: offset and length both zero.
BeesBlockData::BeesBlockData() :
	m_offset(0), m_length(0)
{
}
|
|
|
|
// Return the address of this block, looking it up from fd() and the block
// offset (and caching it) while the stored address is still ZERO.
BeesAddress
BeesBlockData::addr() const
{
	const bool unresolved = m_addr == BeesAddress::ZERO;
	if (unresolved) {
		m_addr = BeesAddress(fd(), m_offset);
	}
	return m_addr;
}
|
|
|
|
// Store a known address for this block; returns *this to allow chaining.
BeesBlockData &
BeesBlockData::addr(const BeesAddress &a)
{
	m_addr = a;
	return *this;
}
|
|
|
|
// Return the block contents, reading them from the file on first use and
// caching them in m_data.  Throws invalid_argument if the block size is not
// positive and runtime_error if the buffer size does not match size() after
// the read.
const BeesBlockData::Blob &
BeesBlockData::data() const
{
	// Already cached: nothing to read
	if (!m_data.empty()) {
		return m_data;
	}

	THROW_CHECK1(invalid_argument, size(), size() > 0);
	BEESNOTE("Reading BeesBlockData " << *this);
	BEESTOOLONG("Reading BeesBlockData " << *this);
	Timer read_timer;

	Blob rv(size());
	pread_or_die(m_fd, rv, m_offset);
	THROW_CHECK2(runtime_error, rv.size(), size(), ranged_cast<off_t>(rv.size()) == size());
	m_data = rv;

	// Account for the read: count, bytes and elapsed milliseconds
	BEESCOUNT(block_read);
	BEESCOUNTADD(block_bytes, rv.size());
	BEESCOUNTADD(block_ms, read_timer.age() * 1000);

	return m_data;
}
|
|
|
|
// Return the CRC64 of the block contents, computing it on first use and
// caching it in m_hash.
BeesHash
BeesBlockData::hash() const
{
	if (m_hash_done) {
		return m_hash;
	}

	// We can only dedup unaligned EOF blocks against other unaligned EOF blocks,
	// so we do NOT round up to a full sum block size.
	const Blob &blob = data();
	// TODO:  It turns out that file formats with 4K block
	// alignment and embedded CRC64 do exist, and every block
	// of such files has the same hash.  Could use a subset
	// of SHA1 here instead.
	m_hash = Digest::CRC::crc64(blob.data(), blob.size());
	m_hash_done = true;
	BEESCOUNT(block_hash);

	return m_hash;
}
|
|
|
|
bool
|
|
BeesBlockData::is_data_zero() const
|
|
{
|
|
// The CRC64 of zero is zero, so skip some work if we already know the CRC
|
|
if (m_hash_done && m_hash != 0) {
|
|
return false;
|
|
}
|
|
|
|
// OK read block (maybe) and check every byte
|
|
for (auto c : data()) {
|
|
if (c != '\0') {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
BEESCOUNT(block_zero);
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
BeesBlockData::is_data_equal(const BeesBlockData &that) const
|
|
{
|
|
BEESTRACE("is_data_equal this = " << *this << ", that = " << that);
|
|
THROW_CHECK1(invalid_argument, size(), size() > 0);
|
|
THROW_CHECK2(invalid_argument, size(), that.size(), size() == that.size());
|
|
|
|
// skip some work if we already know the CRCs don't match
|
|
if (m_hash_done && that.m_hash_done && m_hash != that.m_hash) {
|
|
return false;
|
|
}
|
|
|
|
return data() == that.data();
|
|
}
|
|
|