mirror of
https://github.com/Zygo/bees.git
synced 2025-05-17 21:35:45 +02:00
context: better detection for toxic extents
We detect toxic extents by measuring how long the LOGICAL_INO ioctl takes to run. If it is above some threshold, we consider the extent toxic, and blacklist it; otherwise, we process the extent normally. The detector was using the execution time of the ioctl, which detects toxic extents, but it also detects pauses of the bees process and transaction commit latency due to load. This leads to a significant number of false positives. The detection threshold was also very long, burning a lot of kernel CPU before the detection was triggered. Use the per-thread system CPU statistics to measure the kernel CPU usage of the LOGICAL_INO call directly. This is much more reliable because it is not confounded by other threads, and it's faster because we can set the time threshold two orders of magnitude lower. Also remove the lock and mutex added in "context: serialize LOGICAL_INO calls" because we theoretically no longer need it (but leave the code there with #if 0 in case we do need it in practice). Signed-off-by: Zygo Blaxell <bees@furryterror.org>
This commit is contained in:
parent
9a97699dd9
commit
542371684c
@ -8,6 +8,11 @@
|
|||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
// round
|
||||||
|
#include <cmath>
|
||||||
|
// struct rusage
|
||||||
|
#include <sys/resource.h>
|
||||||
|
|
||||||
using namespace crucible;
|
using namespace crucible;
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
@ -766,6 +771,12 @@ BeesContext::resolve_addr_uncached(BeesAddress addr)
|
|||||||
THROW_CHECK1(invalid_argument, addr, !addr.is_magic());
|
THROW_CHECK1(invalid_argument, addr, !addr.is_magic());
|
||||||
THROW_CHECK0(invalid_argument, !!root_fd());
|
THROW_CHECK0(invalid_argument, !!root_fd());
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
// If we look at per-thread CPU usage we get a better estimate of
|
||||||
|
// how badly btrfs is performing without confounding factors like
|
||||||
|
// transaction latency, competing threads, and freeze/SIGSTOP
|
||||||
|
// pausing the bees process.
|
||||||
|
|
||||||
// There can be only one of these running at a time, or the slow
|
// There can be only one of these running at a time, or the slow
|
||||||
// backrefs bug will kill the whole system. Also it looks like there
|
// backrefs bug will kill the whole system. Also it looks like there
|
||||||
// are so many locks held while LOGICAL_INO runs that there is no
|
// are so many locks held while LOGICAL_INO runs that there is no
|
||||||
@ -773,12 +784,17 @@ BeesContext::resolve_addr_uncached(BeesAddress addr)
|
|||||||
BEESNOTE("waiting to resolve addr " << addr);
|
BEESNOTE("waiting to resolve addr " << addr);
|
||||||
static mutex s_resolve_mutex;
|
static mutex s_resolve_mutex;
|
||||||
unique_lock<mutex> lock(s_resolve_mutex);
|
unique_lock<mutex> lock(s_resolve_mutex);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Time how long this takes
|
||||||
Timer resolve_timer;
|
Timer resolve_timer;
|
||||||
|
|
||||||
// There is no performance benefit if we restrict the buffer size.
|
|
||||||
BtrfsIoctlLogicalInoArgs log_ino(addr.get_physical_or_zero());
|
BtrfsIoctlLogicalInoArgs log_ino(addr.get_physical_or_zero());
|
||||||
|
|
||||||
|
// Get this thread's system CPU usage
|
||||||
|
struct rusage usage_before;
|
||||||
|
DIE_IF_MINUS_ONE(getrusage(RUSAGE_THREAD, &usage_before));
|
||||||
|
|
||||||
{
|
{
|
||||||
BEESTOOLONG("Resolving addr " << addr << " in " << root_path() << " refs " << log_ino.m_iors.size());
|
BEESTOOLONG("Resolving addr " << addr << " in " << root_path() << " refs " << log_ino.m_iors.size());
|
||||||
BEESNOTE("resolving addr " << addr << " with LOGICAL_INO");
|
BEESNOTE("resolving addr " << addr << " with LOGICAL_INO");
|
||||||
@ -790,16 +806,27 @@ BeesContext::resolve_addr_uncached(BeesAddress addr)
|
|||||||
BEESCOUNTADD(resolve_ms, resolve_timer.age() * 1000);
|
BEESCOUNTADD(resolve_ms, resolve_timer.age() * 1000);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Prevent unavoidable performance bug from crippling the rest of the system
|
// Again!
|
||||||
|
struct rusage usage_after;
|
||||||
|
DIE_IF_MINUS_ONE(getrusage(RUSAGE_THREAD, &usage_after));
|
||||||
|
|
||||||
|
double sys_usage_delta =
|
||||||
|
(usage_after.ru_stime.tv_sec + usage_after.ru_stime.tv_usec / 1000000.0) -
|
||||||
|
(usage_before.ru_stime.tv_sec + usage_before.ru_stime.tv_usec / 1000000.0);
|
||||||
|
|
||||||
|
double user_usage_delta =
|
||||||
|
(usage_after.ru_utime.tv_sec + usage_after.ru_utime.tv_usec / 1000000.0) -
|
||||||
|
(usage_before.ru_utime.tv_sec + usage_before.ru_utime.tv_usec / 1000000.0);
|
||||||
|
|
||||||
auto rt_age = resolve_timer.age();
|
auto rt_age = resolve_timer.age();
|
||||||
|
|
||||||
// Avoid performance bug
|
// Avoid performance bug
|
||||||
BeesResolveAddrResult rv;
|
BeesResolveAddrResult rv;
|
||||||
rv.m_biors = log_ino.m_iors;
|
rv.m_biors = log_ino.m_iors;
|
||||||
if (rt_age < BEES_TOXIC_DURATION && log_ino.m_iors.size() < BEES_MAX_EXTENT_REF_COUNT) {
|
if (sys_usage_delta < BEES_TOXIC_SYS_DURATION && log_ino.m_iors.size() < BEES_MAX_EXTENT_REF_COUNT) {
|
||||||
rv.m_is_toxic = false;
|
rv.m_is_toxic = false;
|
||||||
} else {
|
} else {
|
||||||
BEESLOGWARN("WORKAROUND: toxic address " << addr << " in " << root_path() << " with " << log_ino.m_iors.size() << " refs took " << rt_age << "s in LOGICAL_INO");
|
BEESLOGNOTICE("WORKAROUND: toxic address: addr = " << addr << ", sys_usage_delta = " << round(sys_usage_delta* 1000.0) / 1000.0 << ", user_usage_delta = " << round(user_usage_delta * 1000.0) / 1000.0 << ", rt_age = " << rt_age << ", refs " << log_ino.m_iors.size());
|
||||||
BEESCOUNT(resolve_toxic);
|
BEESCOUNT(resolve_toxic);
|
||||||
rv.m_is_toxic = true;
|
rv.m_is_toxic = true;
|
||||||
}
|
}
|
||||||
|
@ -88,11 +88,8 @@ const double BEES_DEFAULT_THREAD_FACTOR = 1.0;
|
|||||||
// Log warnings when an operation takes too long
|
// Log warnings when an operation takes too long
|
||||||
const double BEES_TOO_LONG = 5.0;
|
const double BEES_TOO_LONG = 5.0;
|
||||||
|
|
||||||
// Avoid any extent where LOGICAL_INO takes this long
|
// Avoid any extent where LOGICAL_INO takes this much kernel CPU time
|
||||||
const double BEES_TOXIC_DURATION = 9.9;
|
const double BEES_TOXIC_SYS_DURATION = 0.1;
|
||||||
// EXPERIMENT: Kernel v4.14+ may let us ignore toxicity
|
|
||||||
// NOPE: kernel 4.14 has the same toxicity problems as any previous kernel
|
|
||||||
// const double BEES_TOXIC_DURATION = 99.9;
|
|
||||||
|
|
||||||
// How long between hash table histograms
|
// How long between hash table histograms
|
||||||
const double BEES_HASH_TABLE_ANALYZE_INTERVAL = BEES_STATS_INTERVAL;
|
const double BEES_HASH_TABLE_ANALYZE_INTERVAL = BEES_STATS_INTERVAL;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user