1
0
mirror of https://github.com/Zygo/bees.git synced 2025-06-17 10:06:16 +02:00

bees: handle SIGTERM and SIGINT, force immediate flush and exit

Capture SIGINT and SIGTERM and shut down, preserving current completed
crawl and hash table state.

  * Executing tasks are completed, queued tasks are paused.
  * Crawl state is saved.
  * The crawl master and crawl writeback threads are terminated.
  * The task queue is flushed.
  * Dirty hash table extents are flushed.
  * Hash prefetch and writeback threads are terminated.
  * Hash table is deallocated.
  * FD caches and tmpfiles are destroyed.
  * Assuming the above didn't crash or deadlock, bees exits.

The above order isn't the fastest, but it does roughly follow the
shared_ptr dependencies and avoids data races--especially those that
might lead to bees reporting an extent scanned when it was only queued
for future scanning that did not occur.

In case of a violation of the expected shared_ptr dependency order,
exceptions in BeesContext child object accessor methods (e.g. roots(),
hash_table(), etc.) prevent any further progress in threads that somehow
remain unexpectedly active.

Move some threads from main into BeesContext so they can be stopped
via BeesContext.  The main thread now runs a loop waiting for signals.

A slow FD leak was discovered in TempFile handling.  This has not been
fixed yet, but an implementation detail of the C++ runtime library makes
the leak so slow it may never be important enough to fix.

Signed-off-by: Zygo Blaxell <bees@furryterror.org>
This commit is contained in:
Zygo Blaxell
2018-12-02 00:51:35 -05:00
parent cbc6725f0f
commit 570b3f7de0
5 changed files with 423 additions and 64 deletions

View File

@ -1,5 +1,6 @@
#include "bees.h"
#include "crucible/cleanup.h"
#include "crucible/limits.h"
#include "crucible/string.h"
#include "crucible/task.h"
@ -13,9 +14,13 @@
// struct rusage
#include <sys/resource.h>
// struct sigset
#include <signal.h>
using namespace crucible;
using namespace std;
static inline
const char *
getenv_or_die(const char *name)
@ -82,17 +87,15 @@ BeesContext::dump_status()
if (!status_charp) return;
string status_file(status_charp);
BEESLOGINFO("Writing status to file '" << status_file << "' every " << BEES_STATUS_INTERVAL << " sec");
while (1) {
BEESNOTE("waiting " << BEES_STATUS_INTERVAL);
sleep(BEES_STATUS_INTERVAL);
Timer total_timer;
while (!m_stop_status) {
BEESNOTE("writing status to file '" << status_file << "'");
ofstream ofs(status_file + ".tmp");
auto thisStats = BeesStats::s_global;
ofs << "TOTAL:\n";
ofs << "\t" << thisStats << "\n";
auto avg_rates = thisStats / m_total_timer.age();
auto avg_rates = thisStats / total_timer.age();
ofs << "RATES:\n";
ofs << "\t" << avg_rates << "\n";
@ -113,42 +116,65 @@ BeesContext::dump_status()
BEESNOTE("renaming status file '" << status_file << "'");
rename((status_file + ".tmp").c_str(), status_file.c_str());
BEESNOTE("idle " << BEES_STATUS_INTERVAL);
unique_lock<mutex> lock(m_stop_mutex);
if (m_stop_status) {
return;
}
m_stop_condvar.wait_for(lock, chrono::duration<double>(BEES_STATUS_INTERVAL));
}
}
void
BeesContext::show_progress()
{
auto lastProgressStats = BeesStats::s_global;
auto lastStats = lastProgressStats;
auto lastStats = BeesStats::s_global;
Timer stats_timer;
while (1) {
sleep(BEES_PROGRESS_INTERVAL);
Timer all_timer;
while (!stop_requested()) {
BEESNOTE("idle " << BEES_PROGRESS_INTERVAL);
if (stats_timer.age() > BEES_STATS_INTERVAL) {
stats_timer.lap();
auto thisStats = BeesStats::s_global;
auto avg_rates = lastStats / BEES_STATS_INTERVAL;
BEESLOGINFO("TOTAL: " << thisStats);
BEESLOGINFO("RATES: " << avg_rates);
lastStats = thisStats;
unique_lock<mutex> lock(m_stop_mutex);
if (m_stop_requested) {
return;
}
m_stop_condvar.wait_for(lock, chrono::duration<double>(BEES_PROGRESS_INTERVAL));
BEESLOGINFO("ACTIVITY:");
// Snapshot stats and timer state
auto thisStats = BeesStats::s_global;
auto deltaStats = thisStats - lastProgressStats;
if (deltaStats) {
BEESLOGINFO("\t" << deltaStats / BEES_PROGRESS_INTERVAL);
};
lastProgressStats = thisStats;
auto stats_age = stats_timer.age();
auto all_age = all_timer.age();
stats_timer.lap();
auto avg_rates = thisStats / stats_age;
BEESNOTE("logging event counter totals for last " << all_timer);
BEESLOGINFO("TOTAL COUNTS (" << all_age << "s):\n\t" << thisStats);
BEESNOTE("logging event counter rates for last " << all_timer);
BEESLOGINFO("TOTAL RATES (" << all_age << "s):\n\t" << avg_rates);
BEESNOTE("logging event counter delta counts for last " << stats_age);
BEESLOGINFO("DELTA COUNTS (" << stats_age << "s):");
auto deltaStats = thisStats - lastStats;
BEESLOGINFO("\t" << deltaStats / stats_age);
BEESNOTE("logging event counter delta rates for last " << stats_age);
BEESLOGINFO("DELTA RATES (" << stats_age << "s):");
auto deltaRates = deltaStats / stats_age;
BEESLOGINFO("\t" << deltaRates);
BEESNOTE("logging current thread status");
BEESLOGINFO("THREADS:");
for (auto t : BeesNote::get_status()) {
BEESLOGINFO("\ttid " << t.first << ": " << t.second);
}
lastStats = thisStats;
}
}
@ -171,11 +197,19 @@ BeesContext::home_fd()
}
BeesContext::BeesContext(shared_ptr<BeesContext> parent) :
m_parent_ctx(parent)
m_parent_ctx(parent),
m_progress_thread("progress_report"),
m_status_thread("status_report")
{
if (m_parent_ctx) {
m_fd_cache = m_parent_ctx->fd_cache();
}
m_progress_thread.exec([=]() {
show_progress();
});
m_status_thread.exec([=]() {
dump_status();
});
}
bool
@ -745,7 +779,7 @@ BeesContext::scan_forward(const BeesFileRange &bfr)
Extent e;
catch_all([&]() {
while (true) {
while (!stop_requested()) {
e = ew.current();
catch_all([&]() {
@ -881,11 +915,106 @@ BeesContext::set_root_fd(Fd fd)
m_resolve_cache.func([&](BeesAddress addr) -> BeesResolveAddrResult {
return resolve_addr_uncached(addr);
});
}
// Start queue producers
// Describe this exception.  Every BeesHalt reports the same fixed
// message, held in static storage so the returned pointer stays valid
// after the exception object is gone.
const char *
BeesHalt::what() const noexcept
{
	static const char stop_message[] = "bees stop requested";
	return stop_message;
}
// Bring up this context's child objects in a fixed order: FD cache,
// hash table, then roots.  The accessors are called only for their
// side effect of creating the object on first use; their return values
// are discarded.  Each accessor throws BeesHalt if a stop has already
// been requested.
void
BeesContext::start()
{
BEESLOGNOTICE("Starting bees main loop...");
BEESNOTE("starting BeesContext");
// Force these to exist now so we don't have recursive locking
// operations trying to access them later.  The order of the calls
// below is deliberate; do not rearrange them.
fd_cache();
hash_table();
// Kick off the crawlers.  This comes last, after the FD cache and
// hash table have been created above.
roots();
}
BEESLOGINFO("returning from set_root_fd in " << name_fd(fd));
// Shut down this BeesContext step by step and block until the progress
// and status threads have been joined.
//
// Sequence:
//   1. Pause the work queue (TaskMaster thread count set to 0).
//   2. Set m_stop_requested and wake every waiter on m_stop_condvar.
//   3. Stop and release the crawlers (m_roots), if they were created.
//   4. Cancel the work queue.
//   5. Stop and release the hash table, if it was created.
//   6. Drop the tmpfiles, then clear and release the FD cache.
//   7. Join the progress thread, then set m_stop_status, notify, and
//      join the status thread.
//
// The elapsed time of the whole shutdown is logged at the end.
void
BeesContext::stop()
{
	Timer stop_timer;
	BEESLOGNOTICE("Stopping bees...");
	BEESLOGWARN("WARNING: This feature is EXPERIMENTAL and may not work!");

	BEESNOTE("pausing work queue");
	BEESLOGDEBUG("Pausing work queue");
	TaskMaster::set_thread_count(0);

	// Publish the stop request under the mutex and wake any thread
	// sleeping on the condvar so it re-checks the flag promptly.
	BEESNOTE("setting stop_request flag");
	BEESLOGDEBUG("Setting stop_request flag");
	unique_lock<mutex> lock(m_stop_mutex);
	m_stop_requested = true;
	m_stop_condvar.notify_all();
	lock.unlock();

	// Stop crawlers first so we get good progress persisted on disk
	BEESNOTE("stopping crawlers");
	BEESLOGDEBUG("Stopping crawlers");
	if (m_roots) {
		m_roots->stop();
		m_roots.reset();
	} else {
		BEESLOGDEBUG("Crawlers not running");
	}

	BEESNOTE("cancelling work queue");
	BEESLOGDEBUG("Cancelling work queue");
	TaskMaster::cancel();

	BEESNOTE("stopping hash table");
	BEESLOGDEBUG("Stopping hash table");
	if (m_hash_table) {
		m_hash_table->stop();
		m_hash_table.reset();
	} else {
		BEESLOGDEBUG("Hash table not running");
	}

	BEESNOTE("closing tmpfiles");
	BEESLOGDEBUG("Closing tmpfiles");
	m_tmpfiles.clear();

	BEESNOTE("closing FD caches");
	BEESLOGDEBUG("Closing FD caches");
	if (m_fd_cache) {
		m_fd_cache->clear();
		BEESNOTE("destroying FD caches");
		BEESLOGDEBUG("Destroying FD caches");
		m_fd_cache.reset();
	}

	BEESNOTE("waiting for progress thread");
	BEESLOGDEBUG("Waiting for progress thread");
	m_progress_thread.join();

	// The status thread is told to stop through its own flag
	// (m_stop_status), set only here after the progress thread has
	// been joined.
	// XXX: nobody can see this BEESNOTE because we are killing the
	// thread that publishes it
	BEESNOTE("waiting for status thread");
	BEESLOGDEBUG("Waiting for status thread");
	lock.lock();
	m_stop_status = true;
	m_stop_condvar.notify_all();
	lock.unlock();
	m_status_thread.join();

	BEESLOGNOTICE("bees stopped in " << stop_timer << " sec");
}
bool
BeesContext::stop_requested() const
{
unique_lock<mutex> lock(m_stop_mutex);
return m_stop_requested;
}
void
@ -910,22 +1039,32 @@ BeesContext::is_blacklisted(const BeesFileId &fid) const
shared_ptr<BeesTempFile>
BeesContext::tmpfile()
{
// There need be only one, this is not a high-contention path
static mutex s_mutex;
unique_lock<mutex> lock(s_mutex);
// FIXME: this whole thing leaks FDs (quite slowly). Make a pool instead.
unique_lock<mutex> lock(m_stop_mutex);
if (m_stop_requested) {
throw BeesHalt();
}
if (!m_tmpfiles[this_thread::get_id()]) {
m_tmpfiles[this_thread::get_id()] = make_shared<BeesTempFile>(shared_from_this());
// We know we are the only possible accessor of this,
// so drop the lock to avoid a deadlock loop
lock.unlock();
auto rv = make_shared<BeesTempFile>(shared_from_this());
lock.lock();
m_tmpfiles[this_thread::get_id()] = rv;
}
auto rv = m_tmpfiles[this_thread::get_id()];
return rv;
return m_tmpfiles[this_thread::get_id()];
}
shared_ptr<BeesFdCache>
BeesContext::fd_cache()
{
static mutex s_mutex;
unique_lock<mutex> lock(s_mutex);
unique_lock<mutex> lock(m_stop_mutex);
if (m_stop_requested) {
throw BeesHalt();
}
if (!m_fd_cache) {
m_fd_cache = make_shared<BeesFdCache>();
}
@ -936,8 +1075,10 @@ BeesContext::fd_cache()
shared_ptr<BeesRoots>
BeesContext::roots()
{
static mutex s_mutex;
unique_lock<mutex> lock(s_mutex);
unique_lock<mutex> lock(m_stop_mutex);
if (m_stop_requested) {
throw BeesHalt();
}
if (!m_roots) {
m_roots = make_shared<BeesRoots>(shared_from_this());
}
@ -948,8 +1089,10 @@ BeesContext::roots()
shared_ptr<BeesHashTable>
BeesContext::hash_table()
{
static mutex s_mutex;
unique_lock<mutex> lock(s_mutex);
unique_lock<mutex> lock(m_stop_mutex);
if (m_stop_requested) {
throw BeesHalt();
}
if (!m_hash_table) {
m_hash_table = make_shared<BeesHashTable>(shared_from_this(), "beeshash.dat");
}