From f4464c68960efeebb61697983074c769444b1efc Mon Sep 17 00:00:00 2001 From: Zygo Blaxell Date: Sat, 24 Nov 2018 00:35:16 -0500 Subject: [PATCH] roots: quick fix for task scheduling bug leading to loss of crawl_master The crawl_master task had a simple atomic variable that was supposed to prevent duplicate crawl_master tasks from ending up in the queue; however, this had a race condition that could lead to m_task_running being set with no crawl_master task running to clear it. This would in turn prevent crawl_thread from scheduling any further crawl_master tasks, and bees would eventually stop doing any more work. A proper fix is to modify the Task class and its friends such that Task::run() guarantees that 1) at most one instance of a Task is ever scheduled or running at any time, and 2) if a Task is scheduled while an instance of the Task is running, the scheduling is deferred until after the current instance completes. This is part of a fairly large planned change set, but it's not ready to push now. So instead, unconditionally push a new crawl_master Task into the queue on every poll, then silently and quickly exit if the queue is too full or the supply of new extents is empty. Drop the scheduling-related members of BeesRoots as they will not be needed when the proper fix lands. 
Fixes: 4f0bc78a "crawl: don't block a Task waiting for new transids" Signed-off-by: Zygo Blaxell --- src/bees-roots.cc | 21 +++++++++++---------- src/bees.h | 1 - 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/bees-roots.cc b/src/bees-roots.cc index 7b9c5cd..cf471fa 100644 --- a/src/bees-roots.cc +++ b/src/bees-roots.cc @@ -392,15 +392,22 @@ BeesRoots::crawl_thread() m_crawl_task = Task("crawl_master", [shared_this]() { auto tqs = TaskMaster::get_queue_count(); BEESNOTE("queueing extents to scan, " << tqs << " of " << BEES_MAX_QUEUE_SIZE); +#if 0 bool run_again = true; while (tqs < BEES_MAX_QUEUE_SIZE && run_again) { run_again = shared_this->crawl_roots(); tqs = TaskMaster::get_queue_count(); } +#else + bool run_again = false; + while (tqs < BEES_MAX_QUEUE_SIZE) { + run_again = shared_this->crawl_roots(); + tqs = TaskMaster::get_queue_count(); + if (!run_again) break; + } +#endif if (run_again) { shared_this->m_crawl_task.run(); - } else { - shared_this->m_task_running = false; } }); @@ -429,12 +436,7 @@ BeesRoots::crawl_thread() last_count = new_count; // If no crawl task is running, start a new one - bool already_running = m_task_running.exchange(true); - if (!already_running) { - auto resumed_after_time = m_crawl_timer.lap(); - BEESLOGINFO("Crawl master resumed after " << resumed_after_time << "s at transid " << new_count); - m_crawl_task.run(); - } + m_crawl_task.run(); auto poll_time = m_transid_re.seconds_for(m_transid_factor); BEESLOGDEBUG("Polling " << poll_time << "s for next " << m_transid_factor << " transid " << m_transid_re); @@ -554,8 +556,7 @@ BeesRoots::BeesRoots(shared_ptr ctx) : m_ctx(ctx), m_crawl_state_file(ctx->home_fd(), crawl_state_filename()), m_crawl_thread("crawl_transid"), - m_writeback_thread("crawl_writeback"), - m_task_running(false) + m_writeback_thread("crawl_writeback") { m_root_ro_cache.func([&](uint64_t root) -> bool { diff --git a/src/bees.h b/src/bees.h index 1f9ade2..2b0260a 100644 --- a/src/bees.h +++ 
b/src/bees.h @@ -525,7 +525,6 @@ class BeesRoots : public enable_shared_from_this<BeesRoots> { BeesThread m_writeback_thread; RateEstimator m_transid_re; size_t m_transid_factor = BEES_TRANSID_FACTOR; - atomic<bool> m_task_running; Task m_crawl_task; bool m_workaround_btrfs_send = false; LRUCache<bool, uint64_t> m_root_ro_cache;