mirror of
https://github.com/Zygo/bees.git
synced 2025-06-16 01:26:18 +02:00
roots: quick fix for task scheduling bug leading to loss of crawl_master
The crawl_master task had a simple atomic variable that was supposed
to prevent duplicate crawl_master tasks from ending up in the queue;
however, this had a race condition that could lead to m_task_running
being set with no crawl_master task running to clear it. This would in
turn prevent crawl_thread from scheduling any further crawl_master tasks,
and bees would eventually stop doing any more work.
A proper fix is to modify the Task class and its friends such that
Task::run() guarantees that 1) at most one instance of a Task is ever
scheduled or running at any time, and 2) if a Task is scheduled while
an instance of the Task is running, the scheduling is deferred until
after the current instance completes. This is part of a fairly large
planned change set, but it's not ready to push now.
So instead, unconditionally push a new crawl_master Task into the queue
on every poll, then silently and quickly exit if the queue is too full
or the supply of new extents is empty. Drop the scheduling-related
members of BeesRoots as they will not be needed when the proper fix lands.
Fixes: 4f0bc78a
"crawl: don't block a Task waiting for new transids"
Signed-off-by: Zygo Blaxell <bees@furryterror.org>
This commit is contained in:
@@ -399,15 +399,22 @@ BeesRoots::crawl_thread()
 	m_crawl_task = Task("crawl_master", [shared_this]() {
 		auto tqs = TaskMaster::get_queue_count();
 		BEESNOTE("queueing extents to scan, " << tqs << " of " << BEES_MAX_QUEUE_SIZE);
+#if 0
 		bool run_again = true;
 		while (tqs < BEES_MAX_QUEUE_SIZE && run_again) {
 			run_again = shared_this->crawl_roots();
 			tqs = TaskMaster::get_queue_count();
 		}
+#else
+		bool run_again = false;
+		while (tqs < BEES_MAX_QUEUE_SIZE) {
+			run_again = shared_this->crawl_roots();
+			tqs = TaskMaster::get_queue_count();
+			if (!run_again) break;
+		}
+#endif
 		if (run_again) {
 			shared_this->m_crawl_task.run();
 		} else {
 			shared_this->m_task_running = false;
 		}
 	});
@@ -436,12 +443,7 @@ BeesRoots::crawl_thread()
 		last_count = new_count;

-		// If no crawl task is running, start a new one
-		bool already_running = m_task_running.exchange(true);
-		if (!already_running) {
-			auto resumed_after_time = m_crawl_timer.lap();
-			BEESLOGINFO("Crawl master resumed after " << resumed_after_time << "s at transid " << new_count);
-			m_crawl_task.run();
-		}
+		m_crawl_task.run();

		auto poll_time = m_transid_re.seconds_for(m_transid_factor);
		BEESLOGDEBUG("Polling " << poll_time << "s for next " << m_transid_factor << " transid " << m_transid_re);
@@ -561,8 +563,7 @@ BeesRoots::BeesRoots(shared_ptr<BeesContext> ctx) :
 	m_ctx(ctx),
 	m_crawl_state_file(ctx->home_fd(), crawl_state_filename()),
 	m_crawl_thread("crawl_transid"),
-	m_writeback_thread("crawl_writeback"),
-	m_task_running(false)
+	m_writeback_thread("crawl_writeback")
 {

 	m_root_ro_cache.func([&](uint64_t root) -> bool {
@@ -535,7 +535,6 @@ class BeesRoots : public enable_shared_from_this<BeesRoots> {
 	BeesThread m_writeback_thread;
 	RateEstimator m_transid_re;
 	size_t m_transid_factor = BEES_TRANSID_FACTOR;
-	atomic<bool> m_task_running;
 	Task m_crawl_task;
 	bool m_workaround_btrfs_send = false;
 	LRUCache<bool, uint64_t> m_root_ro_cache;
Reference in New Issue
Block a user