mirror of
https://github.com/Zygo/bees.git
synced 2025-06-16 01:26:18 +02:00
roots: quick fix for task scheduling bug leading to loss of crawl_master
The crawl_master task had a simple atomic variable that was supposed
to prevent duplicate crawl_master tasks from ending up in the queue;
however, this had a race condition that could lead to m_task_running
being set with no crawl_master task running to clear it. This would in
turn prevent crawl_thread from scheduling any further crawl_master tasks,
and bees would eventually stop doing any more work.
A proper fix is to modify the Task class and its friends such that
Task::run() guarantees that 1) at most one instance of a Task is ever
scheduled or running at any time, and 2) if a Task is scheduled while
an instance of the Task is running, the scheduling is deferred until
after the current instance completes. This is part of a fairly large
planned change set, but it's not ready to push now.
So instead, unconditionally push a new crawl_master Task into the queue
on every poll, then silently and quickly exit if the queue is too full
or the supply of new extents is empty. Drop the scheduling-related
members of BeesRoots as they will not be needed when the proper fix lands.
Fixes: 4f0bc78a
"crawl: don't block a Task waiting for new transids"
Signed-off-by: Zygo Blaxell <bees@furryterror.org>
This commit is contained in:
@@ -399,15 +399,22 @@ BeesRoots::crawl_thread()
 	m_crawl_task = Task("crawl_master", [shared_this]() {
 		auto tqs = TaskMaster::get_queue_count();
 		BEESNOTE("queueing extents to scan, " << tqs << " of " << BEES_MAX_QUEUE_SIZE);
+#if 0
 		bool run_again = true;
 		while (tqs < BEES_MAX_QUEUE_SIZE && run_again) {
 			run_again = shared_this->crawl_roots();
 			tqs = TaskMaster::get_queue_count();
 		}
+#else
+		bool run_again = false;
+		while (tqs < BEES_MAX_QUEUE_SIZE) {
+			run_again = shared_this->crawl_roots();
+			tqs = TaskMaster::get_queue_count();
+			if (!run_again) break;
+		}
+#endif
 		if (run_again) {
 			shared_this->m_crawl_task.run();
 		} else {
 			shared_this->m_task_running = false;
 		}
 	});
@@ -436,12 +443,7 @@ BeesRoots::crawl_thread()
 		last_count = new_count;

-		// If no crawl task is running, start a new one
-		bool already_running = m_task_running.exchange(true);
-		if (!already_running) {
-			auto resumed_after_time = m_crawl_timer.lap();
-			BEESLOGINFO("Crawl master resumed after " << resumed_after_time << "s at transid " << new_count);
-			m_crawl_task.run();
-		}
+		m_crawl_task.run();

		auto poll_time = m_transid_re.seconds_for(m_transid_factor);
		BEESLOGDEBUG("Polling " << poll_time << "s for next " << m_transid_factor << " transid " << m_transid_re);
@@ -561,8 +563,7 @@ BeesRoots::BeesRoots(shared_ptr<BeesContext> ctx) :
 	m_ctx(ctx),
 	m_crawl_state_file(ctx->home_fd(), crawl_state_filename()),
 	m_crawl_thread("crawl_transid"),
-	m_writeback_thread("crawl_writeback"),
-	m_task_running(false)
+	m_writeback_thread("crawl_writeback")
 {

 	m_root_ro_cache.func([&](uint64_t root) -> bool {
@@ -535,7 +535,6 @@ class BeesRoots : public enable_shared_from_this<BeesRoots> {
 	BeesThread m_writeback_thread;
 	RateEstimator m_transid_re;
 	size_t m_transid_factor = BEES_TRANSID_FACTOR;
-	atomic<bool> m_task_running;
 	Task m_crawl_task;
 	bool m_workaround_btrfs_send = false;
 	LRUCache<bool, uint64_t> m_root_ro_cache;
Reference in New Issue
Block a user