1
0
mirror of https://github.com/Zygo/bees.git synced 2025-05-17 21:35:45 +02:00
bees/src/bees-roots.cc
Zygo Blaxell 5bdad7fc93 crucible: progress: a progress tracker for worker queues
The task queue can become very large with many subvols, requiring hours
for the queue to clear.  'beescrawl.dat' saves in the meantime will save
the work currently scheduled, not the work currently completed.

Fix by tracking progress with ProgressTracker.  ProgressTracker::begin()
gives the last completed crawl position.  ProgressTracker::end() gives
the last scheduled crawl position.  begin() does not advance if any item
between begin() and end() is not yet completed.  In between are crawled
extents that are on the task queue but not yet processed.
The file 'beescrawl.dat' saves the begin() position while the extent
scanning task queue is fed from the end() position.

Also remove an unused method crawl_state_get() and repurpose the
operator<(BeesCrawlState) that nobody was using.

Signed-off-by: Zygo Blaxell <bees@furryterror.org>
2018-02-28 23:49:39 -05:00

1109 lines
30 KiB
C++

#include "bees.h"
#include "crucible/cache.h"
#include "crucible/ntoa.h"
#include "crucible/string.h"
#include "crucible/task.h"
#include <fstream>
#include <tuple>
using namespace crucible;
using namespace std;
BeesRoots::ScanMode BeesRoots::s_scan_mode = BeesRoots::SCAN_MODE_ZERO;
// Format a time_t as "YYYY-MM-DD-HH-MM-SS" in local time.
// Returns "(invalid)" if the time cannot be converted or formatted.
string
format_time(time_t t)
{
	// localtime() returns a pointer to shared static storage, which is a
	// data race when multiple threads (crawl, writeback, Task workers)
	// log timestamps concurrently.  Use the reentrant localtime_r.
	struct tm tm_buf;
	if (!localtime_r(&t, &tm_buf)) {
		return "(invalid)";
	}
	char buf[1024];
	if (!strftime(buf, sizeof(buf), "%Y-%m-%d-%H-%M-%S", &tm_buf)) {
		// strftime returned 0: output did not fit (cannot happen with
		// this format and buffer size, but don't return garbage).
		return "(invalid)";
	}
	return buf;
}
// Human-readable dump of a crawl state, including how long ago it started.
ostream &
operator<<(ostream &os, const BeesCrawlState &bcs)
{
	const time_t now = time(NULL);
	const auto age_seconds = now - bcs.m_started;
	os << "BeesCrawlState ";
	os << bcs.m_root << ":" << bcs.m_objectid << " offset " << to_hex(bcs.m_offset);
	os << " transid " << bcs.m_min_transid << ".." << bcs.m_max_transid;
	os << " started " << format_time(bcs.m_started) << " (" << age_seconds << "s ago)";
	return os;
}
// Default crawl state: all identifiers and the transid range are zero
// (nothing crawled yet); the start timestamp is the construction time.
BeesCrawlState::BeesCrawlState() :
	m_root(0),
	m_objectid(0),
	m_offset(0),
	m_min_transid(0),
	m_max_transid(0),
	m_started(time(NULL))
{
}
// Scheduling order for crawl states: oldest min_transid first, then by
// position within the subvol (objectid, offset), then root, then
// max_transid.  Lexicographic comparison via std::tie.
bool
BeesCrawlState::operator<(const BeesCrawlState &that) const
{
	return tie(m_min_transid, m_objectid, m_offset, m_root, m_max_transid)
		< tie(that.m_min_transid, that.m_objectid, that.m_offset, that.m_root, that.m_max_transid);
}
// Translate a ScanMode enum value to its symbolic name (e.g. "SCAN_MODE_ZERO")
// using crucible's ntoa table machinery.  Unknown values are formatted
// by bits_ntoa's fallback.
string
BeesRoots::scan_mode_ntoa(BeesRoots::ScanMode mode)
{
	static const bits_ntoa_table table[] = {
		NTOA_TABLE_ENTRY_ENUM(SCAN_MODE_ZERO),
		NTOA_TABLE_ENTRY_ENUM(SCAN_MODE_ONE),
		NTOA_TABLE_ENTRY_ENUM(SCAN_MODE_TWO),
		NTOA_TABLE_ENTRY_ENUM(SCAN_MODE_COUNT),
		NTOA_TABLE_ENTRY_END()
	};
	return bits_ntoa(mode, table);
}
// Set the global scan mode for all BeesRoots instances.
// Throws invalid_argument for SCAN_MODE_COUNT or anything beyond it.
void
BeesRoots::set_scan_mode(ScanMode mode)
{
	THROW_CHECK1(invalid_argument, mode, mode < SCAN_MODE_COUNT);
	s_scan_mode = mode;
	BEESLOGINFO("Scan mode set to " << mode << " (" << scan_mode_ntoa(mode) << ")");
}
// Choose the crawl state file name: keep the legacy UUID-qualified name
// if such a file already exists in the home dir, otherwise use the new
// fixed name "beescrawl.dat".
string
BeesRoots::crawl_state_filename() const
{
	// Legacy filename included UUID
	const string legacy_name = "beescrawl." + m_ctx->root_uuid() + ".dat";
	struct stat buf;
	if (fstatat(m_ctx->home_fd(), legacy_name.c_str(), &buf, AT_SYMLINK_NOFOLLOW) == 0) {
		return legacy_name;
	}
	// Use new filename
	return "beescrawl.dat";
}
// Serialize one "root ... start_ts ..." line per crawler into ofs.
// Uses the *completed* (begin) position from the ProgressTracker so a
// restart resumes from work actually finished, not merely scheduled.
// Caller must hold m_mutex for a consistent snapshot.
ostream &
BeesRoots::state_to_stream(ostream &ofs)
{
	// const ref: iterating by value would copy each pair (and bump the
	// shared_ptr refcount) on every iteration.
	for (const auto &i : m_root_crawl_map) {
		const auto ibcs = i.second->get_state_begin();
		// max_transid == 0 means this crawler has never started; skip it.
		if (ibcs.m_max_transid) {
			ofs << "root " << ibcs.m_root << " ";
			ofs << "objectid " << ibcs.m_objectid << " ";
			ofs << "offset " << ibcs.m_offset << " ";
			ofs << "min_transid " << ibcs.m_min_transid << " ";
			ofs << "max_transid " << ibcs.m_max_transid << " ";
			ofs << "started " << ibcs.m_started << " ";
			ofs << "start_ts " << format_time(ibcs.m_started) << "\n";
		}
	}
	return ofs;
}
// Write the crawl state to 'beescrawl.dat' if it changed since the last
// save.  Called periodically from writeback_thread().
void
BeesRoots::state_save()
{
	BEESNOTE("saving crawl state");
	BEESLOGINFO("Saving crawl state");
	BEESTOOLONG("Saving crawl state");
	Timer save_time;
	unique_lock<mutex> lock(m_mutex);
	// We don't have ofstreamat or ofdstream in C++11, so we're building a string and writing it with raw syscalls.
	ostringstream ofs;
	if (!m_crawl_dirty) {
		BEESLOGINFO("Nothing to save");
		return;
	}
	// Serialize under the lock so we capture a consistent snapshot.
	state_to_stream(ofs);
	if (ofs.str().empty()) {
		BEESLOGWARN("Crawl state empty!");
		m_crawl_dirty = false;
		return;
	}
	// Drop the lock for the (potentially slow) file write.
	lock.unlock();
	m_crawl_state_file.write(ofs.str());
	// Renaming things is hard after release
	// Migrate a legacy UUID-qualified state file to the fixed name.
	if (m_crawl_state_file.name() != "beescrawl.dat") {
		renameat(m_ctx->home_fd(), m_crawl_state_file.name().c_str(), m_ctx->home_fd(), "beescrawl.dat");
		m_crawl_state_file.name("beescrawl.dat");
	}
	BEESNOTE("relocking crawl state");
	lock.lock();
	// Not really correct but probably close enough
	// (state may have been dirtied again while the lock was released).
	m_crawl_dirty = false;
	BEESLOGINFO("Saved crawl state in " << save_time << "s");
}
// Mark the in-memory crawl state as needing a write to beescrawl.dat.
// Thread-safe; called from scan Tasks as well as the crawl machinery.
void
BeesRoots::crawl_state_set_dirty()
{
	const lock_guard<mutex> hold(m_mutex);
	m_crawl_dirty = true;
}
// Remove the crawler for bcs.m_root (used when a subvol disappears).
// Refuses to remove the final entry: its state carries our max_transid.
void
BeesRoots::crawl_state_erase(const BeesCrawlState &bcs)
{
	unique_lock<mutex> lock(m_mutex);
	// Do not delete the last entry, it holds our max_transid
	if (m_root_crawl_map.size() < 2) {
		BEESCOUNT(crawl_no_empty);
		return;
	}
	// Single find() instead of count() followed by erase(key): one map
	// lookup rather than two.
	const auto found = m_root_crawl_map.find(bcs.m_root);
	if (found != m_root_crawl_map.end()) {
		m_root_crawl_map.erase(found);
		m_crawl_dirty = true;
	}
}
// Lowest min_transid across all crawlers, i.e. the oldest generation
// that still has unscheduled work somewhere.  Returns 0 when there are
// no crawlers at all.
uint64_t
BeesRoots::transid_min()
{
	BEESNOTE("Calculating transid_min");
	unique_lock<mutex> lock(m_mutex);
	if (m_root_crawl_map.empty()) {
		return 0;
	}
	uint64_t rv = numeric_limits<uint64_t>::max();
	// const ref: avoid copying the pair (and a shared_ptr refcount
	// bump) on every iteration.
	for (const auto &i : m_root_crawl_map) {
		rv = min(rv, i.second->get_state_end().m_min_transid);
	}
	return rv;
}
// Query the filesystem for the newest transid: start from the top-level
// FS tree's transid, then walk every ROOT_BACKREF item in the root tree
// and take the maximum transid seen.  Also feeds the result into
// m_transid_re so the cached transid_max() stays fresh.
uint64_t
BeesRoots::transid_max_nocache()
{
	uint64_t rv = 0;
	uint64_t root = BTRFS_FS_TREE_OBJECTID;
	BEESNOTE("Calculating transid_max (" << rv << " as of root " << root << ")");
	BEESTRACE("Calculating transid_max...");
	rv = btrfs_get_root_transid(root);
	// XXX: Do we need any of this? Or is
	// m_transid_re.update(btrfs_get_root_transid(BTRFS_FS_TREE_OBJECTID)) good enough?
	BtrfsIoctlSearchKey sk;
	sk.tree_id = BTRFS_ROOT_TREE_OBJECTID;
	sk.min_type = sk.max_type = BTRFS_ROOT_BACKREF_KEY;
	sk.min_objectid = root;
	while (true) {
		sk.nr_items = 1024;
		sk.do_ioctl(m_ctx->root_fd());
		// Empty result set: we have seen every backref item.
		if (sk.m_result.empty()) {
			break;
		}
		for (auto i : sk.m_result) {
			// Advance the search key past this item for the next ioctl.
			sk.next_min(i);
			if (i.type == BTRFS_ROOT_BACKREF_KEY) {
				if (i.transid > rv) {
					BEESLOGDEBUG("transid_max root " << i.objectid << " parent " << i.offset << " transid " << i.transid);
					BEESCOUNT(transid_max_miss);
				}
				root = i.objectid;
			}
			if (i.transid > rv) {
				rv = i.transid;
			}
		}
	}
	m_transid_re.update(rv);
	return rv;
}
// Cached estimate of the filesystem's current transid, maintained by the
// RateEstimator that transid_max_nocache() updates.  Cheap to call.
uint64_t
BeesRoots::transid_max()
{
	return m_transid_re.count();
}
// Pop up to BEES_MAX_CRAWL_BATCH extents from one crawler and queue a
// scan_forward Task for each.  Returns the number of Tasks queued
// (0 means the crawler had nothing to give).
size_t
BeesRoots::crawl_batch(shared_ptr<BeesCrawl> this_crawl)
{
	// Copy what the Task lambdas need into locals so they capture values
	// (and shared ownership) rather than `this`.
	auto ctx_copy = m_ctx;
	size_t batch_count = 0;
	auto subvol = this_crawl->get_state_begin().m_root;
	ostringstream oss;
	oss << "crawl_" << subvol;
	auto task_title = oss.str();
	while (batch_count < BEES_MAX_CRAWL_BATCH) {
		auto this_range = this_crawl->pop_front();
		if (!this_range) {
			break;
		}
		// Hold the ProgressTracker position until the Task finishes so
		// the saved begin() position cannot advance past unfinished work.
		auto this_hold = this_crawl->hold_state(this_range);
		// shared_from_this keeps BeesRoots alive for the Task's lifetime.
		auto shared_this_copy = shared_from_this();
		Task(task_title, [ctx_copy, this_hold, this_range, shared_this_copy]() {
			BEESNOTE("scan_forward " << this_range);
			ctx_copy->scan_forward(this_range);
			shared_this_copy->crawl_state_set_dirty();
		}).run();
		BEESCOUNT(crawl_scan);
		++batch_count;
	}
	return batch_count;
}
// Schedule one batch of extents to scan according to the active scan
// mode.  Returns true when work was queued (caller should call again),
// false when every crawler is out of data for the current transid range.
bool
BeesRoots::crawl_roots()
{
	BEESNOTE("Crawling roots");
	unique_lock<mutex> lock(m_mutex);
	// Work from a copy because BeesCrawl might change the world under us
	auto crawl_map_copy = m_root_crawl_map;
	lock.unlock();
	// Nothing to crawl? Seems suspicious...
	// Check the snapshot taken under the lock: the original code read
	// m_root_crawl_map here after unlocking, an unsynchronized access.
	if (crawl_map_copy.empty()) {
		BEESLOGINFO("idle: crawl map is empty!");
	}
	switch (s_scan_mode) {
	case SCAN_MODE_ZERO: {
		// Scan the same inode/offset tuple in each subvol (good for snapshots)
		BeesFileRange first_range;
		shared_ptr<BeesCrawl> first_crawl;
		// const refs avoid copying map pairs / bumping refcounts per iteration
		for (const auto &i : crawl_map_copy) {
			const auto &this_crawl = i.second;
			auto this_range = this_crawl->peek_front();
			if (this_range) {
				if (!first_range || this_range < first_range) {
					first_crawl = this_crawl;
					first_range = this_range;
				}
			}
		}
		if (!first_crawl) {
			return false;
		}
		auto batch_count = crawl_batch(first_crawl);
		if (batch_count) {
			return true;
		}
		break;
	}
	case SCAN_MODE_ONE: {
		// Scan each subvol one extent at a time (good for continuous forward progress)
		size_t batch_count = 0;
		for (const auto &i : crawl_map_copy) {
			batch_count += crawl_batch(i.second);
		}
		if (batch_count) {
			return true;
		}
		break;
	}
	case SCAN_MODE_TWO: {
		// Scan oldest crawl first (requires maximum amount of temporary space)
		vector<shared_ptr<BeesCrawl>> crawl_vector;
		for (const auto &i : crawl_map_copy) {
			crawl_vector.push_back(i.second);
		}
		sort(crawl_vector.begin(), crawl_vector.end(), [&](const shared_ptr<BeesCrawl> &a, const shared_ptr<BeesCrawl> &b) -> bool {
			auto a_state = a->get_state_end();
			auto b_state = b->get_state_end();
			return tie(a_state.m_started, a_state.m_root) < tie(b_state.m_started, b_state.m_root);
		});
		size_t batch_count = 0;
		// Stop at the first crawler that yields any work.
		for (const auto &i : crawl_vector) {
			batch_count += crawl_batch(i);
			if (batch_count) {
				return true;
			}
		}
		break;
	}
	case SCAN_MODE_COUNT: assert(false); break;
	}
	// All crawlers are empty: wait for the next batch of transids.
	BEESCOUNT(crawl_done);
	auto want_transid = m_transid_re.count() + m_transid_factor;
	auto ran_out_time = m_crawl_timer.lap();
	BEESLOGINFO("Crawl master ran out of data after " << ran_out_time << "s, waiting about " << m_transid_re.seconds_until(want_transid) << "s for transid " << want_transid << "...");
	// Do not run again
	return false;
}
// Body of the "crawl_transid" thread: sets up the crawl_master Task,
// then loops forever tracking transid_max, creating/removing crawlers,
// and (re)starting the crawl task when it has gone idle.
void
BeesRoots::crawl_thread()
{
	BEESNOTE("creating crawl task");
	// Create the Task that does the crawling
	// Capture shared_this by value so the Task keeps BeesRoots alive.
	auto shared_this = shared_from_this();
	m_crawl_task = Task("crawl_master", [shared_this]() {
		auto tqs = TaskMaster::get_queue_count();
		BEESNOTE("queueing extents to scan, " << tqs << " of " << BEES_MAX_QUEUE_SIZE);
		bool run_again = false;
		// Feed the worker queue until it is full or crawl_roots runs dry.
		while (tqs < BEES_MAX_QUEUE_SIZE) {
			run_again = shared_this->crawl_roots();
			tqs = TaskMaster::get_queue_count();
			if (!run_again) {
				break;
			}
		}
		if (run_again) {
			// Queue was full but there is more work: reschedule ourselves.
			shared_this->m_crawl_task.run();
		} else {
			shared_this->m_task_running = false;
		}
	});
	// Monitor transid_max and wake up roots when it changes
	BEESNOTE("tracking transid");
	auto last_count = m_transid_re.count();
	while (true) {
		// Measure current transid
		catch_all([&]() {
			m_transid_re.update(transid_max_nocache());
		});
		// Make sure we have a full complement of crawlers
		catch_all([&]() {
			insert_new_crawl();
		});
		// Don't hold root FDs open too long.
		// The open FDs prevent snapshots from being deleted.
		// cleaner_kthread just keeps skipping over the open dir and all its children.
		// Even open files are a problem if they're big enough.
		auto new_count = m_transid_re.count();
		if (new_count != last_count) {
			m_ctx->fd_cache()->clear();
		}
		last_count = new_count;
		// If no crawl task is running, start a new one
		// atomic exchange guarantees only one crawl_master runs at a time.
		bool already_running = m_task_running.exchange(true);
		if (!already_running) {
			auto resumed_after_time = m_crawl_timer.lap();
			BEESLOGINFO("Crawl master resumed after " << resumed_after_time << "s at transid " << new_count);
			m_crawl_task.run();
		}
		// Sleep roughly long enough for m_transid_factor more transids.
		auto poll_time = m_transid_re.seconds_for(m_transid_factor);
		BEESLOGDEBUG("Polling " << poll_time << "s for next " << m_transid_factor << " transid " << m_transid_re);
		BEESNOTE("waiting " << poll_time << "s for next " << m_transid_factor << " transid " << m_transid_re);
		nanosleep(poll_time);
	}
}
// Body of the "crawl_writeback" thread: periodically persist the crawl
// state via state_save(), forever.
void
BeesRoots::writeback_thread()
{
	while (true) {
		// NOTE(review): m_crawl_dirty is read here without m_mutex; this
		// only affects the status text, but confirm it is intentional.
		BEESNOTE("idle, " << (m_crawl_dirty ? "dirty" : "clean"));
		// catch_all: a failed save is logged but must not kill the thread.
		catch_all([&]() {
			BEESNOTE("saving crawler state");
			state_save();
		});
		nanosleep(BEES_WRITEBACK_INTERVAL);
	}
}
// Ensure a crawler exists for new_bcs.m_root, creating one with the
// given initial state if necessary, then mark it not-deferred so it is
// eligible to produce work again.  Called for both newly discovered
// subvols and entries loaded from beescrawl.dat.
void
BeesRoots::insert_root(const BeesCrawlState &new_bcs)
{
	unique_lock<mutex> lock(m_mutex);
	// Single find+insert instead of count()/insert()/find(): one or two
	// map lookups rather than three, with the same observable behavior.
	auto found = m_root_crawl_map.find(new_bcs.m_root);
	if (found == m_root_crawl_map.end()) {
		auto new_bcp = make_shared<BeesCrawl>(m_ctx, new_bcs);
		const auto rv = m_root_crawl_map.insert(make_pair(new_bcs.m_root, new_bcp));
		THROW_CHECK0(runtime_error, rv.second);
		found = rv.first;
		m_crawl_dirty = true;
	}
	// insert_root is also how deferred crawlers get woken up.
	found->second->deferred(false);
}
// Reconcile the crawler map with the filesystem: walk every subvol via
// next_root(), creating crawlers for new ones, and erase crawlers whose
// subvols no longer exist.
void
BeesRoots::insert_new_crawl()
{
	BEESNOTE("adding crawlers for new subvols and removing crawlers for removed subvols");
	BeesCrawlState new_bcs;
	// Avoid a wasted loop iteration by starting from root 5
	new_bcs.m_root = BTRFS_FS_TREE_OBJECTID;
	new_bcs.m_min_transid = transid_min();
	new_bcs.m_max_transid = transid_max();
	unique_lock<mutex> lock(m_mutex);
	// Start by assuming every known root is stale; the walk below removes
	// the ones that still exist.
	set<uint64_t> excess_roots;
	// const ref: avoid copying the pair/shared_ptr on each iteration.
	for (const auto &i : m_root_crawl_map) {
		excess_roots.insert(i.first);
	}
	lock.unlock();
	// next_root returns 0 when there are no more subvols.
	while (new_bcs.m_root) {
		excess_roots.erase(new_bcs.m_root);
		insert_root(new_bcs);
		BEESCOUNT(crawl_create);
		new_bcs.m_root = next_root(new_bcs.m_root);
	}
	for (const auto &i : excess_roots) {
		new_bcs.m_root = i;
		crawl_state_erase(new_bcs);
	}
}
// Parse 'beescrawl.dat' and create a crawler for each saved state line.
// Line format: space-separated "key value" pairs, e.g.
//   root N objectid N offset N min_transid N max_transid N started N start_ts ...
void
BeesRoots::state_load()
{
	BEESNOTE("loading crawl state");
	BEESLOGINFO("loading crawl state");
	string crawl_data = m_crawl_state_file.read();
	for (auto line : split("\n", crawl_data)) {
		BEESLOGDEBUG("Read line: " << line);
		map<string, uint64_t> d;
		auto words = split(" ", line);
		// Consume words in (key, value) pairs: it1 is the key, it the value.
		for (auto it = words.begin(); it < words.end(); ++it) {
			auto it1 = it;
			++it;
			// A key with no following value is a corrupt line.
			THROW_CHECK1(out_of_range, words.size(), it < words.end());
			string key = *it1;
			uint64_t val = from_hex(*it);
			BEESTRACE("key " << key << " val " << val);
			// Duplicate keys on one line are also corruption.
			auto result = d.insert(make_pair(key, val));
			THROW_CHECK0(runtime_error, result.second);
		}
		BeesCrawlState loaded_state;
		loaded_state.m_root = d.at("root");
		loaded_state.m_objectid = d.at("objectid");
		loaded_state.m_offset = d.at("offset");
		// Accept the older key names gen_current/gen_next for compatibility.
		loaded_state.m_min_transid = d.count("gen_current") ? d.at("gen_current") : d.at("min_transid");
		loaded_state.m_max_transid = d.count("gen_next") ? d.at("gen_next") : d.at("max_transid");
		if (d.count("started")) {
			loaded_state.m_started = d.at("started");
		}
		BEESLOGDEBUG("loaded_state " << loaded_state);
		insert_root(loaded_state);
	}
}
// Construct the crawl subsystem: pick the state file name, then start
// the crawl thread, which in turn measures transid, loads saved state,
// spawns the writeback thread, and enters the crawl loop.
BeesRoots::BeesRoots(shared_ptr<BeesContext> ctx) :
	m_ctx(ctx),
	m_crawl_state_file(ctx->home_fd(), crawl_state_filename()),
	m_crawl_thread("crawl_transid"),
	m_writeback_thread("crawl_writeback"),
	m_task_running(false)
{
	// Capture by reference is safe: the threads are members and cannot
	// outlive this object.
	m_crawl_thread.exec([&]() {
		// Measure current transid before creating any crawlers
		catch_all([&]() {
			m_transid_re.update(transid_max_nocache());
		});
		// Make sure we have a full complement of crawlers
		catch_all([&]() {
			state_load();
		});
		// Start persisting state only after it has been loaded.
		m_writeback_thread.exec([&]() {
			writeback_thread();
		});
		crawl_thread();
	});
}
// Open a directory Fd for the root of subvol `rootid` by walking
// ROOT_BACKREF items up to the filesystem root, then opening each path
// component back down.  Recurses via open_root() (cached) for parents.
// Returns an empty Fd if no usable path is found.
Fd
BeesRoots::open_root_nocache(uint64_t rootid)
{
	BEESTRACE("open_root_nocache " << rootid);
	BEESNOTE("open_root_nocache " << rootid);
	// Stop recursion at the root of the filesystem tree
	if (rootid == BTRFS_FS_TREE_OBJECTID) {
		return m_ctx->root_fd();
	}
	// Find backrefs for this rootid and follow up to root
	BtrfsIoctlSearchKey sk;
	sk.tree_id = BTRFS_ROOT_TREE_OBJECTID;
	sk.min_objectid = sk.max_objectid = rootid;
	sk.min_type = sk.max_type = BTRFS_ROOT_BACKREF_KEY;
	BEESTRACE("sk " << sk);
	while (sk.min_objectid <= rootid) {
		sk.nr_items = 1024;
		sk.do_ioctl(m_ctx->root_fd());
		if (sk.m_result.empty()) {
			break;
		}
		for (auto i : sk.m_result) {
			sk.next_min(i);
			if (i.type == BTRFS_ROOT_BACKREF_KEY && i.objectid == rootid) {
				// Backref item: parent root in i.offset, plus the dir
				// inode and name of this subvol within that parent.
				auto dirid = call_btrfs_get(btrfs_stack_root_ref_dirid, i.m_data);
				auto name_len = call_btrfs_get(btrfs_stack_root_ref_name_len, i.m_data);
				auto name_start = sizeof(struct btrfs_root_ref);
				auto name_end = name_len + name_start;
				THROW_CHECK2(runtime_error, i.m_data.size(), name_end, i.m_data.size() >= name_end);
				string name(i.m_data.data() + name_start, i.m_data.data() + name_end);
				auto parent_rootid = i.offset;
				// BEESLOG("parent_rootid " << parent_rootid << " dirid " << dirid << " name " << name);
				BEESTRACE("parent_rootid " << parent_rootid << " dirid " << dirid << " name " << name);
				BEESCOUNT(root_parent_open_try);
				// Recursive (cached) open of the parent subvol.
				Fd parent_fd = open_root(parent_rootid);
				if (!parent_fd) {
					BEESLOGTRACE("no parent_fd");
					BEESCOUNT(root_parent_open_fail);
					continue;
				}
				BEESCOUNT(root_parent_open_ok);
				// If the subvol is not directly in the parent's top dir,
				// resolve the containing directory via INO_PATH.
				if (dirid != BTRFS_FIRST_FREE_OBJECTID) {
					BEESTRACE("dirid " << dirid << " root " << rootid << " INO_PATH");
					BtrfsIoctlInoPathArgs ino(dirid);
					if (!ino.do_ioctl_nothrow(parent_fd)) {
						BEESLOGINFO("dirid " << dirid << " inode path lookup failed in parent_fd " << name_fd(parent_fd) << ": " << strerror(errno));
						BEESCOUNT(root_parent_path_fail);
						continue;
					}
					if (ino.m_paths.empty()) {
						BEESLOGINFO("dirid " << dirid << " inode has no paths in parent_fd " << name_fd(parent_fd));
						BEESCOUNT(root_parent_path_empty);
						continue;
					}
					// Theoretically there is only one, so don't bother looping.
					BEESTRACE("dirid " << dirid << " path " << ino.m_paths.at(0));
					parent_fd = openat(parent_fd, ino.m_paths.at(0).c_str(), FLAGS_OPEN_DIR);
					if (!parent_fd) {
						BEESLOGTRACE("no parent_fd from dirid");
						BEESCOUNT(root_parent_path_open_fail);
						continue;
					}
				}
				// BEESLOG("openat(" << name_fd(parent_fd) << ", " << name << ")");
				BEESTRACE("openat(" << name_fd(parent_fd) << ", " << name << ")");
				Fd rv = openat(parent_fd, name.c_str(), FLAGS_OPEN_DIR);
				if (!rv) {
					BEESLOGTRACE("open failed for name " << name << ": " << strerror(errno));
					BEESCOUNT(root_open_fail);
					continue;
				}
				BEESCOUNT(root_found);
				// Verify correct root ID
				// Throw exceptions here because these are very rare events
				// and unlike the file open case, we don't have alternatives to try
				auto new_root_id = btrfs_get_root_id(rv);
				THROW_CHECK2(runtime_error, new_root_id, rootid, new_root_id == rootid);
				Stat st(rv);
				THROW_CHECK1(runtime_error, st.st_ino, st.st_ino == BTRFS_FIRST_FREE_OBJECTID);
				// BEESLOGDEBUG("open_root_nocache " << rootid << ": " << name_fd(rv));
				BEESCOUNT(root_ok);
				return rv;
			}
		}
	}
	BEESLOGDEBUG("No path for rootid " << rootid);
	BEESCOUNT(root_notfound);
	return Fd();
}
// Cached wrapper for open_root_nocache: the FdCache calls back into the
// uncached version on a miss.
Fd
BeesRoots::open_root(uint64_t rootid)
{
	if (rootid != BTRFS_ROOT_TREE_OBJECTID) {
		return m_ctx->fd_cache()->open_root(m_ctx, rootid);
	}
	// Ignore some of the crap that comes out of LOGICAL_INO
	return Fd();
}
// Return the next subvol id after `root`, found by scanning ROOT_BACKREF
// items in the root tree.  Returns 0 when there are no more subvols.
uint64_t
BeesRoots::next_root(uint64_t root)
{
	BEESNOTE("Next root from " << root);
	BEESTRACE("Next root from " << root);
	// BTRFS_FS_TREE_OBJECTID has no backref keys so we can't find it that way
	if (root < BTRFS_FS_TREE_OBJECTID) {
		// BEESLOGDEBUG("First root is BTRFS_FS_TREE_OBJECTID = " << BTRFS_FS_TREE_OBJECTID);
		return BTRFS_FS_TREE_OBJECTID;
	}
	BtrfsIoctlSearchKey sk;
	sk.tree_id = BTRFS_ROOT_TREE_OBJECTID;
	sk.min_type = sk.max_type = BTRFS_ROOT_BACKREF_KEY;
	// Start strictly after the current root.
	sk.min_objectid = root + 1;
	while (true) {
		sk.nr_items = 1024;
		sk.do_ioctl(m_ctx->root_fd());
		if (sk.m_result.empty()) {
			return 0;
		}
		for (auto i : sk.m_result) {
			sk.next_min(i);
			if (i.type == BTRFS_ROOT_BACKREF_KEY) {
				// BEESLOGDEBUG("Found root " << i.objectid << " parent " << i.offset << " transid " << i.transid);
				return i.objectid;
			}
		}
	}
}
// Open the file with inode `ino` in subvol `root`, trying each path
// INO_PATH reports and validating inode, root, device, and inode flags.
// Returns an empty Fd if no path passes validation.
Fd
BeesRoots::open_root_ino_nocache(uint64_t root, uint64_t ino)
{
	BEESTRACE("opening root " << root << " ino " << ino);
	Fd root_fd = open_root(root);
	if (!root_fd) {
		BEESCOUNT(open_no_root);
		return root_fd;
	}
	BEESTOOLONG("open_root_ino(root " << root << ", ino " << ino << ")");
	BEESTRACE("looking up ino " << ino);
	BtrfsIoctlInoPathArgs ipa(ino);
	if (!ipa.do_ioctl_nothrow(root_fd)) {
		// ENOENT is normal (file deleted since the crawl saw it).
		if (errno == ENOENT) {
			BEESCOUNT(open_lookup_enoent);
		} else {
			BEESLOGINFO("Lookup root " << root << " ino " << ino << " failed: " << strerror(errno));
			BEESCOUNT(open_lookup_error);
		}
		return Fd();
	}
	BEESTRACE("searching paths for root " << root << " ino " << ino);
	Fd rv;
	if (ipa.m_paths.empty()) {
		BEESLOGWARN("No paths for root " << root << " ino " << ino);
		BEESCOUNT(open_lookup_empty);
	}
	BEESCOUNT(open_lookup_ok);
	for (auto file_path : ipa.m_paths) {
		BEESTRACE("Looking up root " << root << " ino " << ino << " in dir " << name_fd(root_fd) << " path " << file_path);
		BEESCOUNT(open_file);
		// Just open file RO. root can do the dedup ioctl without
		// opening in write mode, and if we do open in write mode,
		// we can't exec the file while we have it open.
		const char *fp_cstr = file_path.c_str();
		rv = openat(root_fd, fp_cstr, FLAGS_OPEN_FILE);
		if (!rv) {
			// errno == ENOENT is the most common error case.
			// No need to report it.
			if (errno == ENOENT) {
				BEESCOUNT(open_fail_enoent);
			} else {
				BEESLOGWARN("Could not open path '" << file_path << "' at root " << root << " " << name_fd(root_fd) << ": " << strerror(errno));
				BEESCOUNT(open_fail_error);
			}
			continue;
		}
		// Correct inode?
		Stat file_stat(rv);
		if (file_stat.st_ino != ino) {
			BEESLOGWARN("Opening " << name_fd(root_fd) << "/" << file_path << " found wrong inode " << file_stat.st_ino << " instead of " << ino);
			rv = Fd();
			BEESCOUNT(open_wrong_ino);
			break;
		}
		// Correct root?
		auto file_root = btrfs_get_root_id(rv);
		if (file_root != root) {
			BEESLOGWARN("Opening " << name_fd(root_fd) << "/" << file_path << " found wrong root " << file_root << " instead of " << root);
			rv = Fd();
			BEESCOUNT(open_wrong_root);
			break;
		}
		// Same filesystem?
		Stat root_stat(root_fd);
		if (root_stat.st_dev != file_stat.st_dev) {
			BEESLOGWARN("Opening root " << name_fd(root_fd) << " path " << file_path << " found path st_dev " << file_stat.st_dev << " but root st_dev is " << root_stat.st_dev);
			rv = Fd();
			BEESCOUNT(open_wrong_dev);
			break;
		}
		// The kernel rejects dedup requests with
		// src and dst that have different datasum flags
		// (datasum is a flag in the inode).
		//
		// We can detect the common case where a file is
		// marked with nodatacow (which implies nodatasum).
		// nodatacow files are arguably out of scope for dedup,
		// since dedup would just make them datacow again.
		// To handle these we pretend we couldn't open them.
		//
		// A less common case is nodatasum + datacow files.
		// Those are available for dedup but we have to solve
		// some other problems before we can dedup them. They
		// require a separate hash table namespace from datasum
		// + datacow files, and we have to create nodatasum
		// temporary files when we rewrite extents.
		//
		// FIXME: the datasum flag is scooped up by
		// TREE_SEARCH_V2 during crawls. We throw the inode
		// items away when we should be examining them for the
		// nodatasum flag.
		int attr = ioctl_iflags_get(rv);
		if (attr & FS_NOCOW_FL) {
			BEESLOGWARN("Opening " << name_fd(rv) << " found FS_NOCOW_FL flag in " << to_hex(attr));
			rv = Fd();
			BEESCOUNT(open_wrong_flags);
			break;
		}
		BEESCOUNT(open_hit);
		return rv;
	}
	// All of the paths we tried were wrong.
	BEESCOUNT(open_no_path);
	return Fd();
}
// Cached wrapper for open_root_ino_nocache via the context's FdCache.
Fd
BeesRoots::open_root_ino(uint64_t root, uint64_t ino)
{
	return m_ctx->fd_cache()->open_root_ino(m_ctx, root, ino);
}
// Accessor for the transid RateEstimator (used by other components to
// wait for or predict transid changes).
RateEstimator &
BeesRoots::transid_re()
{
	return m_transid_re;
}
// Construct a crawler for one subvol with its initial (possibly
// restored-from-disk) crawl state.
BeesCrawl::BeesCrawl(shared_ptr<BeesContext> ctx, BeesCrawlState initial_state) :
	m_ctx(ctx),
	m_state(initial_state)
{
}
// Advance this crawl to the next transid range once the current range is
// exhausted.  Returns true if a new crawl pass was started, false if we
// are already caught up with transid_max (crawl becomes deferred until
// insert_root wakes it).
bool
BeesCrawl::next_transid()
{
	auto roots = m_ctx->roots();
	auto next_transid = roots->transid_max();
	auto crawl_state = get_state_end();
	// If we are already at transid_max then we are still finished
	m_finished = crawl_state.m_max_transid >= next_transid;
	if (m_finished) {
		m_deferred = true;
	} else {
		// Log performance stats from the old crawl
		auto current_time = time(NULL);
		// Start new crawl
		// New range is (old max, current transid_max]; position resets to 0.
		crawl_state.m_min_transid = crawl_state.m_max_transid;
		crawl_state.m_max_transid = next_transid;
		crawl_state.m_objectid = 0;
		crawl_state.m_offset = 0;
		crawl_state.m_started = current_time;
		BEESCOUNT(crawl_restart);
		set_state(crawl_state);
		m_deferred = false;
		BEESLOGINFO("Crawl started " << crawl_state);
	}
	return !m_finished;
}
// Run one TREE_SEARCH_V2 pass over this subvol's EXTENT_DATA items,
// filter results by generation and type, and push scannable extents
// into m_extents.  Returns false only when no further progress is
// possible (deferred); returns true otherwise, including when the
// current range is exhausted and a new transid range was started.
// Caller (fetch_extents_harder) holds m_mutex.
bool
BeesCrawl::fetch_extents()
{
	THROW_CHECK1(runtime_error, m_extents.size(), m_extents.empty());
	// insert_root will undefer us. Until then, nothing.
	if (m_deferred) {
		return false;
	}
	auto old_state = get_state_end();
	// We can't scan an empty transid interval.
	if (m_finished || old_state.m_max_transid <= old_state.m_min_transid) {
		BEESTRACE("Crawl finished " << get_state_end());
		return next_transid();
	}
	BEESNOTE("crawling " << get_state_end());
	Timer crawl_timer;
	// Buffer sized for BEES_MAX_CRAWL_SIZE extent items plus headers.
	BtrfsIoctlSearchKey sk(BEES_MAX_CRAWL_SIZE * (sizeof(btrfs_file_extent_item) + sizeof(btrfs_ioctl_search_header)));
	sk.tree_id = old_state.m_root;
	sk.min_objectid = old_state.m_objectid;
	sk.min_type = sk.max_type = BTRFS_EXTENT_DATA_KEY;
	sk.min_offset = old_state.m_offset;
	sk.min_transid = old_state.m_min_transid;
	// Don't set max_transid here. We want to see old extents with
	// new references, and max_transid filtering in the kernel locks
	// the filesystem while slowing us down.
	// sk.max_transid = old_state.m_max_transid;
	sk.max_transid = numeric_limits<uint64_t>::max();
	sk.nr_items = BEES_MAX_CRAWL_SIZE;
	// Lock in the old state
	set_state(old_state);
	BEESTRACE("Searching crawl sk " << static_cast<btrfs_ioctl_search_key&>(sk));
	bool ioctl_ok = false;
	{
		BEESNOTE("searching crawl sk " << static_cast<btrfs_ioctl_search_key&>(sk));
		BEESTOOLONG("Searching crawl sk " << static_cast<btrfs_ioctl_search_key&>(sk));
		Timer crawl_timer;
		ioctl_ok = sk.do_ioctl_nothrow(m_ctx->root_fd());
		BEESCOUNTADD(crawl_ms, crawl_timer.age() * 1000);
	}
	if (ioctl_ok) {
		BEESCOUNT(crawl_search);
	} else {
		BEESLOGWARN("Search ioctl failed: " << strerror(errno));
		BEESCOUNT(crawl_fail);
	}
	// No results (or a failed ioctl) ends this transid range.
	if (!ioctl_ok || sk.m_result.empty()) {
		BEESCOUNT(crawl_empty);
		BEESLOGINFO("Crawl finished " << get_state_end());
		return next_transid();
	}
	// BEESLOGINFO("Crawling " << sk.m_result.size() << " results from " << get_state_end());
	auto results_left = sk.m_result.size();
	BEESNOTE("crawling " << results_left << " results from " << get_state_end());
	// Per-batch classification counters (for the commented log line below).
	size_t count_other = 0;
	size_t count_inline = 0;
	size_t count_unknown = 0;
	size_t count_data = 0;
	size_t count_low = 0;
	size_t count_high = 0;
	// NOTE(review): last_bfr appears unused in this function — confirm.
	BeesFileRange last_bfr;
	for (auto i : sk.m_result) {
		sk.next_min(i);
		--results_left;
		BEESCOUNT(crawl_items);
		BEESTRACE("i = " << i);
		// We need the "+ 1" and objectid rollover that next_min does.
		auto new_state = get_state_end();
		new_state.m_objectid = sk.min_objectid;
		new_state.m_offset = sk.min_offset;
		// Saving state here means we can skip a search result
		// if we are interrupted. Not saving state here means we
		// can fail to make forward progress in cases where there
		// is a lot of metadata we can't process. Favor forward
		// progress over losing search results.
		set_state(new_state);
		// Ignore things that aren't EXTENT_DATA_KEY
		if (i.type != BTRFS_EXTENT_DATA_KEY) {
			++count_other;
			BEESCOUNT(crawl_nondata);
			continue;
		}
		auto gen = call_btrfs_get(btrfs_stack_file_extent_generation, i.m_data);
		if (gen < get_state_end().m_min_transid) {
			BEESCOUNT(crawl_gen_low);
			++count_low;
			// We want (need?) to scan these anyway?
			// The header generation refers to the transid
			// of the metadata page holding the current ref.
			// This includes anything else in that page that
			// happened to be modified, regardless of how
			// old it is.
			// The file_extent_generation refers to the
			// transid of the extent item's page, which is
			// a different approximation of what we want.
			// Combine both of these filters to minimize
			// the number of times we unnecessarily re-read
			// an extent.
			continue;
		}
		if (gen > get_state_end().m_max_transid) {
			BEESCOUNT(crawl_gen_high);
			++count_high;
			// We have to filter these here because we can't
			// do it in the kernel.
			continue;
		}
		auto type = call_btrfs_get(btrfs_stack_file_extent_type, i.m_data);
		switch (type) {
		default:
			BEESLOGDEBUG("Unhandled file extent type " << type << " in root " << get_state_end().m_root << " ino " << i.objectid << " offset " << to_hex(i.offset));
			++count_unknown;
			BEESCOUNT(crawl_unknown);
			break;
		case BTRFS_FILE_EXTENT_INLINE:
			// Ignore these for now.
			// BEESLOGDEBUG("Ignored file extent type INLINE in root " << get_state_end().m_root << " ino " << i.objectid << " offset " << to_hex(i.offset));
			++count_inline;
			// TODO: replace with out-of-line dup extents
			BEESCOUNT(crawl_inline);
			break;
		case BTRFS_FILE_EXTENT_PREALLOC:
			BEESCOUNT(crawl_prealloc);
			// fallthrough
		case BTRFS_FILE_EXTENT_REG: {
			auto physical = call_btrfs_get(btrfs_stack_file_extent_disk_bytenr, i.m_data);
			auto ram = call_btrfs_get(btrfs_stack_file_extent_ram_bytes, i.m_data);
			auto len = call_btrfs_get(btrfs_stack_file_extent_num_bytes, i.m_data);
			auto offset = call_btrfs_get(btrfs_stack_file_extent_offset, i.m_data);
			BEESTRACE("Root " << get_state_end().m_root << " ino " << i.objectid << " physical " << to_hex(physical)
				<< " logical " << to_hex(i.offset) << ".." << to_hex(i.offset + len)
				<< " gen " << gen);
			++count_data;
			// physical == 0 is a hole; nothing to dedup there.
			if (physical) {
				THROW_CHECK1(runtime_error, ram, ram > 0);
				THROW_CHECK1(runtime_error, len, len > 0);
				THROW_CHECK2(runtime_error, offset, ram, offset < ram);
				BeesFileId bfi(get_state_end().m_root, i.objectid);
				if (m_ctx->is_blacklisted(bfi)) {
					BEESCOUNT(crawl_blacklisted);
				} else {
					BeesFileRange bfr(bfi, i.offset, i.offset + len);
					// BEESNOTE("pushing bfr " << bfr << " limit " << BEES_MAX_QUEUE_SIZE);
					m_extents.insert(bfr);
					BEESCOUNT(crawl_push);
				}
			} else {
				BEESCOUNT(crawl_hole);
			}
			break;
		}
		}
	}
	// BEESLOGINFO("Crawled inline " << count_inline << " data " << count_data << " other " << count_other << " unknown " << count_unknown << " gen_low " << count_low << " gen_high " << count_high << " " << get_state_end() << " in " << crawl_timer << "s");
	return true;
}
// Keep fetching batches until at least one extent is queued, or until
// fetch_extents() reports that no further progress is possible.
void
BeesCrawl::fetch_extents_harder()
{
	BEESNOTE("fetch_extents_harder " << get_state_end() << " with " << m_extents.size() << " extents");
	for (;;) {
		if (!m_extents.empty()) {
			return;
		}
		if (!fetch_extents()) {
			return;
		}
	}
}
// Return (but do not consume) the next extent to scan, or a default
// BeesFileRange if this crawl can produce nothing more.
BeesFileRange
BeesCrawl::peek_front()
{
	unique_lock<mutex> lock(m_mutex);
	fetch_extents_harder();
	if (!m_extents.empty()) {
		return *m_extents.begin();
	}
	return BeesFileRange();
}
// Like peek_front(), but also removes the returned extent from the
// queue.  Returns a default BeesFileRange when nothing is left.
BeesFileRange
BeesCrawl::pop_front()
{
	unique_lock<mutex> lock(m_mutex);
	fetch_extents_harder();
	if (m_extents.empty()) {
		return BeesFileRange();
	}
	const auto first_it = m_extents.begin();
	auto rv = *first_it;
	m_extents.erase(first_it);
	return rv;
}
// ProgressTracker begin(): the last *completed* crawl position.  This is
// what gets persisted to beescrawl.dat.
BeesCrawlState
BeesCrawl::get_state_begin()
{
	return m_state.begin();
}
// ProgressTracker end(): the last *scheduled* crawl position, from which
// the extent scanning task queue is fed.
BeesCrawlState
BeesCrawl::get_state_end()
{
	return m_state.end();
}
// Create a ProgressHolder pinned at bfr's position.  While the holder is
// alive (captured by the scan Task), begin() cannot advance past this
// position, so incomplete work is never recorded as done.
ProgressTracker<BeesCrawlState>::ProgressHolder
BeesCrawl::hold_state(const BeesFileRange &bfr)
{
	auto bcs = m_state.end();
	bcs.m_objectid = bfr.fid().ino();
	bcs.m_offset = bfr.begin();
	return m_state.hold(bcs);
}
// Advance the scheduled (end) crawl position and flag the global crawl
// state as needing a save.
void
BeesCrawl::set_state(const BeesCrawlState &bcs)
{
	m_state.set(bcs);
	m_ctx->roots()->crawl_state_set_dirty();
}
// Set or clear the deferred flag (deferred crawls produce no extents
// until insert_root() reactivates them after a transid bump).
// NOTE(review): this guards m_deferred with m_state_mutex while readers
// in fetch_extents run under m_mutex — confirm this is intentional.
void
BeesCrawl::deferred(bool def_setting)
{
	unique_lock<mutex> lock(m_state_mutex);
	m_deferred = def_setting;
}