1
0
mirror of https://github.com/Zygo/bees.git synced 2025-05-17 21:35:45 +02:00
bees/lib/extentwalker.cc
Zygo Blaxell e3747320cf BtrfsExtentWalker: use a buffer at least as large as a btrfs metadata page to avoid EOVERFLOW
We are getting a lot of exceptions when an inline extent is too large for the
TREE_SEARCH_V2 buffer.  This disrupts ExtentWalker's extent boundary
search when there is an inline extent at the beginning of a file:

	# fiemap foo
	Log 0x0..0x1000 Phy 0x0..0x1000 Flags FIEMAP_EXTENT_NOT_ALIGNED|FIEMAP_EXTENT_DATA_INLINE
	Log 0x1000..0x2000 Phy 0x7307f9000..0x7307fa000 Flags 0
	Log 0x2000..0x3000 Phy 0x731078000..0x731079000 Flags 0
	Log 0x3000..0x5000 Phy 0x73127d000..0x73127f000 Flags FIEMAP_EXTENT_ENCODED
	Log 0x5000..0x6000 Phy 0x73137a000..0x73137b000 Flags 0
	Log 0x6000..0x7000 Phy 0x731683000..0x731684000 Flags 0
	Log 0x7000..0x8000 Phy 0x73224f000..0x732250000 Flags 0
	Log 0x8000..0x9000 Phy 0x7323c9000..0x7323ca000 Flags 0
	Log 0x9000..0xb000 Phy 0x732425000..0x732427000 Flags FIEMAP_EXTENT_ENCODED
	Log 0xb000..0xc000 Phy 0x732598000..0x732599000 Flags 0
	Log 0xc000..0xd000 Phy 0x7325d5000..0x7325d6000 Flags FIEMAP_EXTENT_LAST

	# fiewalk foo
	exception type std::system_error: BTRFS_IOC_TREE_SEARCH_V2: /tmp/foo at fs.cc:844: Value too large for defined data type

Normally crawlers simply skip over inline extents, but ExtentWalker will
seek backward from the first non-inline extent to confirm that it has
an accurate starting block for the target extent.  This fails when it
encounters the first inline extent.

strace reveals that buffer size is too small for the first extent,
as seen here:

	ioctl(3, BTRFS_IOC_TREE_SEARCH_V2, {key={tree_id=258, min_objectid=78897856, max_objectid=UINT64_MAX, min_offset=0, max_offset=UINT64_MAX, min_transid=0, max_transid=UINT64_MAX, min_type=BTRFS_EXTENT_DATA_KEY, max_type=BTRFS_EXTENT_DATA_KEY, nr_items=16}, buf_size=1360} => {buf_size=1418}) = -1 EOVERFLOW (Value too large for defined data type)

Fix this by increasing the buffer size until it can handle the largest
possible object on the largest possible btrfs metadata page (65536 bytes).
BtrfsExtentWalker already has optimizations to minimize the allocation
cost, so we don't need any changes there.

Signed-off-by: Zygo Blaxell <bees@furryterror.org>
2019-06-12 22:48:05 -04:00

634 lines
18 KiB
C++

#include "crucible/extentwalker.h"
#include "crucible/chatter.h"
#include "crucible/error.h"
#include "crucible/fs.h"
#include "crucible/limits.h"
#include "crucible/string.h"
namespace crucible {
using namespace std;
const off_t ExtentWalker::sc_step_size;
// fm_start, fm_length, fm_flags, m_extents
// fe_logical, fe_physical, fe_length, fe_flags
static const off_t MAX_OFFSET = numeric_limits<off_t>::max();
static const off_t FIEMAP_BLOCK_SIZE = 4096;
static bool __ew_do_log = getenv("EXTENTWALKER_DEBUG");
#define EWLOG(x) do { \
if (__ew_do_log) { \
CHATTER(x); \
} \
} while (0)
ostream &
operator<<(ostream &os, const Extent &e)
{
os << "Extent {"
<< " begin = " << to_hex(e.m_begin)
<< ", end = " << to_hex(e.m_end)
<< ", physical = " << to_hex(e.m_physical)
<< ", flags = ";
if (e.m_flags & Extent::HOLE) {
os << "Extent::HOLE|";
}
if (e.m_flags & Extent::PREALLOC) {
os << "Extent::PREALLOC|";
}
if (e.m_flags & Extent::OBSCURED) {
os << "Extent::OBSCURED|";
}
if (e.m_flags & ~(Extent::HOLE | Extent::PREALLOC | Extent::OBSCURED)) {
os << fiemap_extent_flags_ntoa(e.m_flags & ~(Extent::HOLE | Extent::PREALLOC | Extent::OBSCURED));
}
if (e.m_physical_len) {
os << ", physical_len = " << to_hex(e.m_physical_len);
}
if (e.m_logical_len) {
os << ", logical_len = " << to_hex(e.m_logical_len);
}
if (e.m_offset) {
os << ", offset = " << to_hex(e.m_offset);
}
return os << " }";
}
ostream &
operator<<(ostream &os, const ExtentWalker::Vec &v)
{
os << "ExtentWalker::Vec {";
for (auto e : v) {
os << "\n\t" << e;
}
return os << "}";
}
ostream &
operator<<(ostream &os, const ExtentWalker &ew)
{
return os << "ExtentWalker {"
<< " fd = " << name_fd(ew.m_fd)
<< ", stat.st_size = " << to_hex(ew.m_stat.st_size)
<< ", extents = " << ew.m_extents
<< ", current = [" << ew.m_current - ew.m_extents.begin()
<< "] }";
}
Extent::operator bool() const
{
THROW_CHECK2(invalid_argument, m_begin, m_end, m_end >= m_begin);
return m_end > m_begin;
}
off_t
Extent::size() const
{
THROW_CHECK2(invalid_argument, m_begin, m_end, m_end >= m_begin);
return m_end - m_begin;
}
bool
Extent::operator==(const Extent &that) const
{
return m_begin == that.m_begin && m_end == that.m_end && m_physical == that.m_physical && m_flags == that.m_flags;
}
bool
Extent::compressed() const
{
return m_flags & FIEMAP_EXTENT_ENCODED;
}
uint64_t
Extent::bytenr() const
{
return compressed() ? m_physical : m_physical - m_offset;
}
ExtentWalker::ExtentWalker(Fd fd) :
m_fd(fd),
m_current(m_extents.begin())
{
}
ExtentWalker::ExtentWalker(Fd fd, off_t initial_pos) :
m_fd(fd),
m_current(m_extents.begin())
{
seek(initial_pos);
}
ExtentWalker::Itr
ExtentWalker::find_in_cache(off_t pos)
{
EWLOG("find_in_cache " << to_hex(pos));
// EOF is an annoying special case
if (pos >= m_stat.st_size) {
if (!m_extents.empty() && m_extents.rbegin()->m_end == m_stat.st_size) {
auto i = m_extents.end();
return --i;
}
}
for (auto vi = m_extents.begin(); vi != m_extents.end(); ++vi) {
if (pos >= vi->m_begin && pos < vi->m_end) {
EWLOG("pos " << to_hex(pos) << " in " << *vi);
if (vi == m_extents.begin() && !(m_extents.begin()->m_begin == 0)) {
// Must have an extent before pos, unless
// there can be no extent before pos because pos == 0
EWLOG("can't match first unless begin is BOF");
break;
}
auto ni = vi;
++ni;
if (ni == m_extents.end() && !(vi->m_end >= m_stat.st_size)) {
// Must have an extent after pos, unless
// there can be no extent after pos because pos >= EOF
EWLOG("can't match last unless end past EOF " << to_hex(m_stat.st_size));
break;
}
// Extent surrounded on either side by other known extents
return vi;
}
}
EWLOG("find_in_cache failed: " << *this);
return m_extents.end();
}
void
ExtentWalker::run_fiemap(off_t pos)
{
ostringstream log;
CHATTER_UNWIND("Log of run_fiemap: " << log.str());
EWLOG("pos = " << to_hex(pos));
THROW_CHECK1(invalid_argument, pos, (pos & (FIEMAP_BLOCK_SIZE - 1)) == 0);
Vec fm;
off_t step_size = pos;
off_t begin = pos - min(pos, sc_step_size);
// This loop should not run forever
int loop_count = 0;
int loop_limit = 99;
while (true) {
if (loop_count == 90) {
EWLOG(log.str());
}
THROW_CHECK1(runtime_error, loop_count, loop_count < loop_limit);
++loop_count;
// Get file size every time in case it changes under us
m_stat.fstat(m_fd);
// Get fiemap begin..EOF
fm = get_extent_map(begin);
EWLOG("fiemap result loop count #" << loop_count << ":" << fm);
// This algorithm seeks at least three extents: one before,
// one after, and one containing pos. Files which contain
// two or fewer extents will cause an obvious problem with that,
// so handle those cases separately.
// FIEMAP lies, and we catch it in a lie about the size of the
// second extent. To work around this, try getting more than 3.
// 0..2(ish) extents
if (fm.size() < sc_extent_fetch_min) {
// If we are not at beginning of file, move backward
if (begin > 0) {
step_size /= 2;
auto next_begin = (begin - min(step_size, begin)) & ~(FIEMAP_BLOCK_SIZE - 1);
EWLOG("step backward " << to_hex(begin) << " -> " << to_hex(next_begin) << " extents size " << fm.size());
if (begin == next_begin) {
EWLOG("step backward stopped");
break;
}
begin = next_begin;
continue;
}
// We are at beginning of file and have too few extents.
// Zero extents? Entire file is a hole.
if (fm.empty()) {
EWLOG("zero extents");
break;
}
// We know we have the beginning of the file and at least
// one extent. If the last extent is EOF then we have the
// whole file in the buffer. If the last extent is NOT
// EOF then fiemap did something we didn't expect.
THROW_CHECK1(runtime_error, fm.rbegin()->flags(), fm.rbegin()->flags() & FIEMAP_EXTENT_LAST);
break;
}
// We have at least three extents, so there is now a first and last.
// We want pos to be between first and last. There doesn't have
// to be an extent between these (it could be a hole).
auto &first_extent = fm.at(sc_extent_fetch_min - 2);
auto &last_extent = *fm.rbegin();
EWLOG("first_extent = " << first_extent);
EWLOG("last_extent = " << last_extent);
// First extent must end on or before pos
if (first_extent.end() > pos) {
// Can we move backward?
if (begin > 0) {
step_size /= 2;
auto next_begin = (begin - min(step_size, begin)) & ~(FIEMAP_BLOCK_SIZE - 1);
EWLOG("step backward " << to_hex(begin) << " -> " << to_hex(next_begin) << " extents size " << fm.size());
if (begin == next_begin) {
EWLOG("step backward stopped");
break;
}
begin = next_begin;
continue;
}
// We are as far back as we can go, so there must be no
// extent before pos (i.e. file starts with a hole).
EWLOG("no extent before pos");
break;
}
// First extent ends on or before pos.
// If last extent is EOF then we have the entire file in the buffer.
// pos could be in last extent, so skip the later checks that
// insist pos be located prior to the last extent.
if (last_extent.flags() & FIEMAP_EXTENT_LAST) {
break;
}
// Don't have EOF, must have an extent after pos.
if (last_extent.begin() <= pos) {
step_size /= 2;
auto new_begin = (begin + step_size) & ~(FIEMAP_BLOCK_SIZE - 1);
EWLOG("step forward " << to_hex(begin) << " -> " << to_hex(new_begin));
if (begin == new_begin) {
EWLOG("step forward stopped");
break;
}
begin = new_begin;
continue;
}
// Last extent begins after pos, first extent ends on or before pos.
// All other cases should have been handled before here.
THROW_CHECK2(runtime_error, pos, first_extent, first_extent.end() <= pos);
THROW_CHECK2(runtime_error, pos, last_extent, last_extent.begin() > pos);
// We should probably stop now
break;
}
// Fill in holes so there are Extent records over entire range
auto fmi = fm.begin();
off_t ipos = begin;
Vec new_vec;
// If we mapped the entire file and there are no extents,
// the entire file is a hole.
bool last_extent_is_last = (begin == 0 && fm.empty());
while (fmi != fm.end()) {
Extent new_extent(*fmi);
THROW_CHECK2(runtime_error, ipos, new_extent.m_begin, ipos <= new_extent.m_begin);
if (new_extent.m_begin > ipos) {
Extent hole_extent;
hole_extent.m_begin = ipos;
hole_extent.m_end = fmi->begin();
hole_extent.m_physical = 0;
hole_extent.m_flags = Extent::HOLE;
new_vec.push_back(hole_extent);
ipos += hole_extent.size();
}
THROW_CHECK2(runtime_error, ipos, new_extent.m_begin, ipos == new_extent.m_begin);
new_vec.push_back(new_extent);
ipos += new_extent.size();
last_extent_is_last = fmi->flags() & FIEMAP_EXTENT_LAST;
++fmi;
}
// If we have run out of extents before EOF, insert a hole at the end
if (last_extent_is_last && ipos < m_stat.st_size) {
Extent hole_extent;
hole_extent.m_begin = ipos;
hole_extent.m_end = m_stat.st_size;
hole_extent.m_physical = 0;
hole_extent.m_flags = Extent::HOLE;
if (!new_vec.empty() && new_vec.rbegin()->m_flags & FIEMAP_EXTENT_LAST) {
new_vec.rbegin()->m_flags &= ~(FIEMAP_EXTENT_LAST);
hole_extent.m_flags |= FIEMAP_EXTENT_LAST;
}
new_vec.push_back(hole_extent);
ipos += new_vec.size();
}
THROW_CHECK1(runtime_error, new_vec.size(), !new_vec.empty());
// Allow last extent to extend beyond desired range (e.g. at EOF)
// ...but that's not what this does
// THROW_CHECK3(runtime_error, ipos, new_vec.rbegin()->m_end, m_stat.st_size, ipos <= new_vec.rbegin()->m_end);
// If we have the last extent in the file, truncate it to the file size.
if (ipos >= m_stat.st_size) {
THROW_CHECK2(runtime_error, new_vec.rbegin()->m_begin, m_stat.st_size, m_stat.st_size > new_vec.rbegin()->m_begin);
THROW_CHECK2(runtime_error, new_vec.rbegin()->m_end, m_stat.st_size, m_stat.st_size <= new_vec.rbegin()->m_end);
new_vec.rbegin()->m_end = m_stat.st_size;
}
// Verify contiguous, ascending order, at least one Extent
THROW_CHECK1(runtime_error, new_vec, !new_vec.empty());
ipos = new_vec.begin()->m_begin;
bool last_flag_last = false;
for (auto e : new_vec) {
THROW_CHECK1(runtime_error, new_vec, e.m_begin == ipos);
THROW_CHECK1(runtime_error, e, e.size() > 0);
THROW_CHECK1(runtime_error, new_vec, !last_flag_last);
ipos += e.size();
last_flag_last = e.m_flags & FIEMAP_EXTENT_LAST;
}
THROW_CHECK1(runtime_error, new_vec, !last_extent_is_last || new_vec.rbegin()->m_end == ipos);
m_extents = new_vec;
m_current = m_extents.begin();
}
void
ExtentWalker::reset()
{
m_extents.clear();
m_current = m_extents.begin();
}
void
ExtentWalker::seek(off_t pos)
{
CHATTER_UNWIND("seek " << to_hex(pos));
THROW_CHECK1(out_of_range, pos, pos >= 0);
Itr rv = find_in_cache(pos);
if (rv != m_extents.end()) {
m_current = rv;
return;
}
run_fiemap(pos);
m_current = find_in_cache(pos);
}
Extent
ExtentWalker::current()
{
THROW_CHECK2(invalid_argument, *this, m_extents.size(), m_current != m_extents.end());
CHATTER_UNWIND("current " << *m_current);
return *m_current;
}
bool
ExtentWalker::next()
{
CHATTER_UNWIND("next");
THROW_CHECK1(invalid_argument, (m_current != m_extents.end()), m_current != m_extents.end());
if (current().m_end >= m_stat.st_size) {
CHATTER_UNWIND("next EOF");
return false;
}
auto next_pos = current().m_end;
if (next_pos >= m_stat.st_size) {
CHATTER_UNWIND("next next_pos = " << next_pos << " m_stat.st_size = " << m_stat.st_size);
return false;
}
seek(next_pos);
THROW_CHECK1(runtime_error, (m_current != m_extents.end()), m_current != m_extents.end());
// FIEMAP is full of lies, so this check keeps failing
// THROW_CHECK2(runtime_error, current().m_begin, next_pos, current().m_begin == next_pos);
// Just ensure that pos is in the next extent somewhere.
THROW_CHECK2(runtime_error, current(), next_pos, current().m_begin <= next_pos);
THROW_CHECK2(runtime_error, current(), next_pos, current().m_end > next_pos);
return true;
}
bool
ExtentWalker::prev()
{
CHATTER_UNWIND("prev");
THROW_CHECK1(invalid_argument, (m_current != m_extents.end()), m_current != m_extents.end());
auto prev_iter = m_current;
if (prev_iter->m_begin == 0) {
CHATTER_UNWIND("prev BOF");
return false;
}
THROW_CHECK1(invalid_argument, (prev_iter != m_extents.begin()), prev_iter != m_extents.begin());
--prev_iter;
CHATTER_UNWIND("prev seeking to " << *prev_iter << "->m_begin");
auto prev_end = current().m_begin;
seek(prev_iter->m_begin);
THROW_CHECK1(runtime_error, (m_current != m_extents.end()), m_current != m_extents.end());
THROW_CHECK2(runtime_error, current().m_end, prev_end, current().m_end == prev_end);
return true;
}
ExtentWalker::~ExtentWalker()
{
}
BtrfsExtentWalker::BtrfsExtentWalker(Fd fd) :
ExtentWalker(fd),
m_tree_id(0)
{
}
BtrfsExtentWalker::BtrfsExtentWalker(Fd fd, off_t initial_pos) :
ExtentWalker(fd),
m_tree_id(0)
{
seek(initial_pos);
}
void
BtrfsExtentWalker::set_root_fd(Fd root_fd)
{
m_root_fd = root_fd;
}
BtrfsExtentWalker::BtrfsExtentWalker(Fd fd, off_t initial_pos, Fd root_fd) :
ExtentWalker(fd),
m_tree_id(0)
{
set_root_fd(root_fd);
seek(initial_pos);
}
BtrfsExtentWalker::Vec
BtrfsExtentWalker::get_extent_map(off_t pos)
{
BtrfsIoctlSearchKey sk(65536);
if (!m_root_fd) {
m_root_fd = m_fd;
}
if (!m_tree_id) {
m_tree_id = btrfs_get_root_id(m_fd);
}
sk.tree_id = m_tree_id;
sk.min_objectid = m_stat.st_ino;
sk.max_objectid = numeric_limits<uint64_t>::max();
sk.min_offset = ranged_cast<uint64_t>(pos);
sk.max_offset = numeric_limits<uint64_t>::max();
sk.min_transid = 0;
sk.max_transid = numeric_limits<uint64_t>::max();
sk.min_type = sk.max_type = BTRFS_EXTENT_DATA_KEY;
sk.nr_items = sc_extent_fetch_max;
CHATTER_UNWIND("sk " << sk << " root_fd " << name_fd(m_root_fd));
sk.do_ioctl(m_root_fd);
Vec rv;
bool past_eof = false;
for (auto i : sk.m_result) {
// If we're seeing extents from the next file then we're past EOF on this file
if (i.objectid > m_stat.st_ino) {
past_eof = true;
break;
}
// Ignore things that aren't EXTENT_DATA_KEY
if (i.type != BTRFS_EXTENT_DATA_KEY) {
continue;
}
// Hmmmkay we shouldn't be seeing these
if (i.objectid < m_stat.st_ino) {
THROW_ERROR(out_of_range, "objectid " << i.objectid << " < m_stat.st_ino " << m_stat.st_ino);
continue;
}
Extent e;
e.m_begin = i.offset;
auto compressed = call_btrfs_get(btrfs_stack_file_extent_compression, i.m_data);
// FIEMAP told us about compressed extents and we can too
if (compressed) {
e.m_flags |= FIEMAP_EXTENT_ENCODED;
}
auto type = call_btrfs_get(btrfs_stack_file_extent_type, i.m_data);
off_t len = -1;
switch (type) {
default:
cerr << "Unhandled file extent type " << type << " in root " << m_tree_id << " ino " << m_stat.st_ino << endl;
break;
case BTRFS_FILE_EXTENT_INLINE:
len = ranged_cast<off_t>(call_btrfs_get(btrfs_stack_file_extent_ram_bytes, i.m_data));
e.m_flags |= FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED;
// Inline extents are never obscured, so don't bother filling in m_physical_len, etc.
break;
case BTRFS_FILE_EXTENT_PREALLOC:
e.m_flags |= Extent::PREALLOC;
// fallthrough
case BTRFS_FILE_EXTENT_REG: {
e.m_physical = call_btrfs_get(btrfs_stack_file_extent_disk_bytenr, i.m_data);
// This is the length of the full extent (decompressed)
off_t ram = ranged_cast<off_t>(call_btrfs_get(btrfs_stack_file_extent_ram_bytes, i.m_data));
// This is the length of the part of the extent appearing in the file (decompressed)
len = ranged_cast<off_t>(call_btrfs_get(btrfs_stack_file_extent_num_bytes, i.m_data));
// This is the offset from start of on-disk extent to the part we see in the file (decompressed)
// May be negative due to the kind of bug we're stuck with forever, so no cast range check
off_t offset = call_btrfs_get(btrfs_stack_file_extent_offset, i.m_data);
// If there is a physical address there must be size too
if (e.m_physical) {
THROW_CHECK1(runtime_error, ram, ram > 0);
THROW_CHECK1(runtime_error, len, len > 0);
THROW_CHECK2(runtime_error, offset, ram, offset < ram);
} else {
// There are two kinds of hole in btrfs. This is the other one.
e.m_flags |= Extent::HOLE;
}
// Partially obscured extent
// FIXME: sometimes this happens:
// i.type == BTRFS_EXTENT_DATA_KEY
// type = 0x1
// compressed = 0x0
// REG start 0x0 offset 0x0 num 0x20000 ram 0x21000 gen 1101121
// btrfs_file_extent_item {
// generation = 1101121
// ram_bytes = 135168
// compression = 0x0
// encryption = 0x0
// other_encoding = 0x0
// type = 0x1
// disk_bytenr = 0x0
// disk_num_bytes = 0x0
// offset = 0x0
// num_bytes = 0x20000
// }
if (ram != len || offset != 0) {
e.m_flags |= Extent::OBSCURED;
// cerr << e << "\nram = " << ram << ", len = " << len << ", offset = " << offset << endl;
}
e.m_physical_len = ram;
e.m_logical_len = len;
e.m_offset = offset;
// To maintain compatibility with FIEMAP we ignore the offset for compressed extents.
// At some point we'll grow out of this.
if (!compressed) {
e.m_physical += offset;
}
break;
}
}
if (len > 0) {
e.m_end = e.m_begin + len;
if (e.m_end >= m_stat.st_size) {
e.m_flags |= FIEMAP_EXTENT_LAST;
}
// FIXME: no FIEMAP_EXTENT_SHARED
// WONTFIX: non-trivial to replicate LOGIAL_INO
rv.push_back(e);
}
}
// Plug a hole at EOF
if (past_eof && !rv.empty()) {
rv.rbegin()->m_flags |= FIEMAP_EXTENT_LAST;
}
return rv;
}
ExtentWalker::Vec
ExtentWalker::get_extent_map(off_t pos)
{
Fiemap fm;
fm.fm_start = ranged_cast<uint64_t>(pos);
fm.fm_length = ranged_cast<uint64_t>(numeric_limits<off_t>::max() - pos);
fm.m_max_count = fm.m_min_count = sc_extent_fetch_max;
fm.do_ioctl(m_fd);
Vec rv;
for (auto i : fm.m_extents) {
Extent e;
e.m_begin = ranged_cast<off_t>(i.fe_logical);
e.m_end = ranged_cast<off_t>(i.fe_logical + i.fe_length);
e.m_physical = i.fe_physical;
e.m_flags = i.fe_flags;
rv.push_back(e);
}
return rv;
}
};