1
0
mirror of https://github.com/Zygo/bees.git synced 2025-05-17 21:35:45 +02:00

context: workaround to prevent LOGICAL_INO and btrfs balance from running concurrently

This avoids some kernel bugs.  One of them is fixed in 5.3.4 and later:

	efad8a853a "Btrfs: fix use-after-free when using the tree modification log"

There are apparently others in current kernels, so for now just put bees
on pause until the balance is done.

At some point we may want to provide an option to disable this
workaround; however, running bees and balance at the same time makes
neither particularly fast, so maybe we'll just leave it this way.

Signed-off-by: Zygo Blaxell <bees@furryterror.org>
This commit is contained in:
Zygo Blaxell 2019-11-19 16:01:31 -05:00
parent 44f446e17e
commit c4f0e4abee
2 changed files with 57 additions and 2 deletions

View File

@ -812,27 +812,68 @@ BeesResolveAddrResult::BeesResolveAddrResult()
{
}
void
BeesContext::wait_for_balance()
{
Timer balance_timer;
BEESNOTE("WORKAROUND: waiting for balance to stop");
while (true) {
btrfs_ioctl_balance_args args;
memset_zero<btrfs_ioctl_balance_args>(&args);
const int ret = ioctl(root_fd(), BTRFS_IOC_BALANCE_PROGRESS, &args);
if (ret < 0) {
// Either can't get balance status or not running, exit either way
break;
}
if (!(args.state & BTRFS_BALANCE_STATE_RUNNING)) {
// Balance not running, doesn't matter if paused or cancelled
break;
}
BEESLOGDEBUG("WORKAROUND: Waiting " << balance_timer << "s for balance to stop");
unique_lock<mutex> lock(m_abort_mutex);
if (m_abort_requested) {
// Force the calling function to stop. We cannot
// proceed to LOGICAL_INO while balance is running
// until the bugs are fixed, and it's probably
// not going to be particularly fast to have
// both bees and balance banging the disk anyway.
BeesTracer::set_silent();
throw std::runtime_error("Stop requested while balance running");
}
m_abort_condvar.wait_for(lock, chrono::duration<double>(BEES_BALANCE_POLL_INTERVAL));
}
}
BeesResolveAddrResult
BeesContext::resolve_addr_uncached(BeesAddress addr)
{
THROW_CHECK1(invalid_argument, addr, !addr.is_magic());
THROW_CHECK0(invalid_argument, !!root_fd());
#if 0
// If we look at per-thread CPU usage we get a better estimate of
// how badly btrfs is performing without confounding factors like
// transaction latency, competing threads, and freeze/SIGSTOP
// pausing the bees process.
#if 0
// There can be only one of these running at a time, or the slow
// backrefs bug will kill the whole system. Also it looks like there
// are so many locks held while LOGICAL_INO runs that there is no
// point in trying to run two of them on the same filesystem.
// ...but it works most of the time, and the performance hit from
// not running resolve in multiple threads is significant.
BEESNOTE("waiting to resolve addr " << addr);
static mutex s_resolve_mutex;
unique_lock<mutex> lock(s_resolve_mutex);
#endif
// Is there a bug where resolve and balance cause a crash (BUG_ON at fs/btrfs/ctree.c:1227)?
// Apparently yes, and more than one.
// Wait for the balance to finish before we run LOGICAL_INO
wait_for_balance();
// Time how long this takes
Timer resolve_timer;
@ -944,12 +985,18 @@ BeesContext::stop()
Timer stop_timer;
BEESLOGNOTICE("Stopping bees...");
BEESNOTE("setting stop_request flag");
BEESNOTE("aborting blocked tasks");
BEESLOGDEBUG("Aborting blocked tasks");
unique_lock<mutex> abort_lock(m_abort_mutex);
m_abort_requested = true;
m_abort_condvar.notify_all();
abort_lock.unlock();
BEESNOTE("pausing work queue");
BEESLOGDEBUG("Pausing work queue");
TaskMaster::set_thread_count(0);
BEESNOTE("setting stop_request flag");
BEESLOGDEBUG("Setting stop_request flag");
unique_lock<mutex> lock(m_stop_mutex);
m_stop_requested = true;

View File

@ -109,6 +109,9 @@ const size_t BEES_MAX_CRAWL_BATCH = 128;
// Wait this many transids between crawls
const size_t BEES_TRANSID_FACTOR = 10;
// Wait this long for a balance to stop
const double BEES_BALANCE_POLL_INTERVAL = 60.0;
// Flags
const int FLAGS_OPEN_COMMON = O_NOFOLLOW | O_NONBLOCK | O_CLOEXEC | O_NOATIME | O_LARGEFILE | O_NOCTTY;
const int FLAGS_OPEN_DIR = FLAGS_OPEN_COMMON | O_RDONLY | O_DIRECTORY;
@ -729,12 +732,17 @@ class BeesContext : public enable_shared_from_this<BeesContext> {
bool m_stop_requested = false;
bool m_stop_status = false;
mutable mutex m_abort_mutex;
condition_variable m_abort_condvar;
bool m_abort_requested = false;
BeesThread m_progress_thread;
BeesThread m_status_thread;
void set_root_fd(Fd fd);
BeesResolveAddrResult resolve_addr_uncached(BeesAddress addr);
void wait_for_balance();
BeesFileRange scan_one_extent(const BeesFileRange &bfr, const Extent &e);
void rewrite_file_range(const BeesFileRange &bfr);