mirror of
https://github.com/Zygo/bees.git
synced 2025-05-17 13:25:45 +02:00
BeesStringFile: figure out when to call--or _not_ call--fsync
Older kernel versions featured some bugs in btrfs `fsync`, which could leave behind "ghost dirents", orphan filename items that did not have a corresponding inode. These dirents were created during log replay on the first mount after a crash, due to several different bugs in the log tree and its use over the years. The last known bug of this kind was fixed in kernel 5.16. As of this writing, no fixes for this bug have been backported to any earlier LTS kernel. Some filesystems, including btrfs, will flush the contents of a new file before renaming it over an old file. On paper, btrfs can do this very cheaply since the contents of the new file are not referenced, and the old file not dereferenced, until a tree commit which includes both actions atomically; however, in real life, btrfs provides `fsync`-like semantics and uses the log-tree infrastructure to implement them, which compromises performance and acts as a magnet for bugs. The benefit of this trade-off is that `rename` can be used as a synchronization point for data outside of the btrfs filesystem, which would not happen if everything `rename` does were simply deferred to the next tree commit. The cost of this trade-off is that for the first 8 years of its existence, bees would trigger the bug so often that the project recommended its users put $BEESHOME in its own subvol to make it easy to remove ghost dirents left behind by the bug. Some other filesystems, such as xfs, don't have any special semantics for `rename`, and require `fsync` to avoid garbage or missing data after a crash. Even filesystems which do have a special case for `rename` can be configured to turn it off. btrfs will silently delete data from files in the event that an unrecoverable data block write error occurs. Kernel version 6.2 adds important new and unexpected cases where this can happen on filesystems using raid56 data, but it also happens in all usable btrfs versions (the silent deletion behavior was introduced in kernel version 3.9).
Unrecoverable write errors are currently reported to userspace only through `fsync`. Since the failed extents are deleted, they cannot be detected via csum failures or scrub after the fact--and it's too late by then, the data is already gone. `fsync` is the last opportunity to detect the write failure before the `rename`. If the error is not detected, the contents of the file will be silently discarded in btrfs. The impact on bees is that scans will abruptly restart from zero after a crash combined with some other reasonably common failures. Putting all of this together leads to a rather complex workaround: if the filesystem under $BEESHOME (specifically, the filesystem where BeesStringFile objects such as `beescrawl.dat` are written) is a btrfs filesystem, and the host kernel is a version prior to 5.16, then don't call `fsync` before `rename`. In all other cases, do call `fsync`, and prevent dependent writes (i.e. the following `rename`) in the event of errors. Since present kernel versions still require `fsync`, we don't need an upper bound on the kernel version check until someone fixes btrfs `rename` (or perhaps adds a flag to `renameat2` which prevents use of the log tree) in the kernel. Once that fix happens, we can drop the `fsync` call for kernels after that fixed version. Signed-off-by: Zygo Blaxell <bees@furryterror.org>
This commit is contained in:
parent
962d94567c
commit
3e7eb43b51
89
src/bees.cc
89
src/bees.cc
@ -4,6 +4,7 @@
|
|||||||
#include "crucible/process.h"
|
#include "crucible/process.h"
|
||||||
#include "crucible/string.h"
|
#include "crucible/string.h"
|
||||||
#include "crucible/task.h"
|
#include "crucible/task.h"
|
||||||
|
#include "crucible/uname.h"
|
||||||
|
|
||||||
#include <cctype>
|
#include <cctype>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
@ -11,17 +12,19 @@
|
|||||||
|
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
#include <regex>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
|
|
||||||
// PRIx64
|
// PRIx64
|
||||||
#include <inttypes.h>
|
#include <inttypes.h>
|
||||||
|
|
||||||
#include <sched.h>
|
|
||||||
#include <sys/fanotify.h>
|
|
||||||
|
|
||||||
#include <linux/fs.h>
|
#include <linux/fs.h>
|
||||||
#include <sys/ioctl.h>
|
#include <sys/ioctl.h>
|
||||||
|
|
||||||
|
// statfs
|
||||||
|
#include <linux/magic.h>
|
||||||
|
#include <sys/statfs.h>
|
||||||
|
|
||||||
// setrlimit
|
// setrlimit
|
||||||
#include <sys/time.h>
|
#include <sys/time.h>
|
||||||
#include <sys/resource.h>
|
#include <sys/resource.h>
|
||||||
@ -391,6 +394,73 @@ BeesStringFile::read()
|
|||||||
return read_string(fd, st.st_size);
|
return read_string(fd, st.st_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static
|
||||||
|
void
|
||||||
|
bees_fsync(int const fd)
|
||||||
|
{
|
||||||
|
|
||||||
|
// Note that when btrfs renames a temporary over an existing file,
|
||||||
|
// it flushes the temporary, so we get the right behavior if we
|
||||||
|
// just do nothing here (except when the file is first created;
|
||||||
|
// however, in that case the result is the same as if the file
|
||||||
|
// did not exist, was empty, or was filled with garbage).
|
||||||
|
//
|
||||||
|
// Kernel versions prior to 5.16 had bugs which would put ghost
|
||||||
|
// dirents in $BEESHOME if there was a crash when we called
|
||||||
|
// fsync() here.
|
||||||
|
//
|
||||||
|
// Some other filesystems will throw our data away if we don't
|
||||||
|
// call fsync, so we do need to call fsync() on those filesystems.
|
||||||
|
//
|
||||||
|
// Newer btrfs kernel versions rely on fsync() to report
|
||||||
|
// unrecoverable write errors. If we don't check the fsync()
|
||||||
|
// result, we'll lose the data when we rename(). Kernel 6.2 added
|
||||||
|
// a number of new root causes for the class of "unrecoverable
|
||||||
|
// write errors" so we need to check this now.
|
||||||
|
|
||||||
|
BEESNOTE("checking filesystem type for " << name_fd(fd));
|
||||||
|
// LSB deprecated statfs without providing a replacement that
|
||||||
|
// can fill in the f_type field.
|
||||||
|
struct statfs stf = { 0 };
|
||||||
|
DIE_IF_NON_ZERO(fstatfs(fd, &stf));
|
||||||
|
if (stf.f_type != BTRFS_SUPER_MAGIC) {
|
||||||
|
BEESLOGONCE("Using fsync on non-btrfs filesystem type " << to_hex(stf.f_type));
|
||||||
|
BEESNOTE("fsync non-btrfs " << name_fd(fd));
|
||||||
|
DIE_IF_NON_ZERO(fsync(fd));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool did_uname = false;
|
||||||
|
static bool do_fsync = false;
|
||||||
|
|
||||||
|
if (!did_uname) {
|
||||||
|
Uname uname;
|
||||||
|
const string version(uname.release);
|
||||||
|
static const regex version_re(R"/(^(\d+)\.(\d+)\.)/", regex::optimize | regex::ECMAScript);
|
||||||
|
smatch m;
|
||||||
|
// Last known bug in the fsync-rename use case was fixed in kernel 5.16
|
||||||
|
static const auto min_major = 5, min_minor = 16;
|
||||||
|
if (regex_search(version, m, version_re)) {
|
||||||
|
const auto major = stoul(m[1]);
|
||||||
|
const auto minor = stoul(m[2]);
|
||||||
|
if (tie(major, minor) > tie(min_major, min_minor)) {
|
||||||
|
BEESLOGONCE("Using fsync on btrfs because kernel version is " << major << "." << minor);
|
||||||
|
do_fsync = true;
|
||||||
|
} else {
|
||||||
|
BEESLOGONCE("Not using fsync on btrfs because kernel version is " << major << "." << minor);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
BEESLOGONCE("Not using fsync on btrfs because can't parse kernel version '" << version << "'");
|
||||||
|
}
|
||||||
|
did_uname = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (do_fsync) {
|
||||||
|
BEESNOTE("fsync btrfs " << name_fd(fd));
|
||||||
|
DIE_IF_NON_ZERO(fsync(fd));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
BeesStringFile::write(string contents)
|
BeesStringFile::write(string contents)
|
||||||
{
|
{
|
||||||
@ -406,19 +476,8 @@ BeesStringFile::write(string contents)
|
|||||||
Fd ofd = openat_or_die(m_dir_fd, tmpname, FLAGS_CREATE_FILE, S_IRUSR | S_IWUSR);
|
Fd ofd = openat_or_die(m_dir_fd, tmpname, FLAGS_CREATE_FILE, S_IRUSR | S_IWUSR);
|
||||||
BEESNOTE("writing " << tmpname << " in " << name_fd(m_dir_fd));
|
BEESNOTE("writing " << tmpname << " in " << name_fd(m_dir_fd));
|
||||||
write_or_die(ofd, contents);
|
write_or_die(ofd, contents);
|
||||||
#if 0
|
|
||||||
// This triggers too many btrfs bugs. I wish I was kidding.
|
|
||||||
// Forget snapshots, balance, compression, and dedupe:
|
|
||||||
// the system call you have to fear on btrfs is fsync().
|
|
||||||
// Also note that when bees renames a temporary over an
|
|
||||||
// existing file, it flushes the temporary, so we get
|
|
||||||
// the right behavior if we just do nothing here
|
|
||||||
// (except when the file is first created; however,
|
|
||||||
// in that case the result is the same as if the file
|
|
||||||
// did not exist, was empty, or was filled with garbage).
|
|
||||||
BEESNOTE("fsyncing " << tmpname << " in " << name_fd(m_dir_fd));
|
BEESNOTE("fsyncing " << tmpname << " in " << name_fd(m_dir_fd));
|
||||||
DIE_IF_NON_ZERO(fsync(ofd));
|
bees_fsync(ofd);
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
BEESNOTE("renaming " << tmpname << " to " << m_name << " in FD " << name_fd(m_dir_fd));
|
BEESNOTE("renaming " << tmpname << " to " << m_name << " in FD " << name_fd(m_dir_fd));
|
||||||
BEESTRACE("renaming " << tmpname << " to " << m_name << " in FD " << name_fd(m_dir_fd));
|
BEESTRACE("renaming " << tmpname << " to " << m_name << " in FD " << name_fd(m_dir_fd));
|
||||||
|
@ -134,6 +134,14 @@ const int FLAGS_OPEN_FANOTIFY = O_RDWR | O_NOATIME | O_CLOEXEC | O_LARGEFILE;
|
|||||||
#define BEESLOGINFO(x) BEESLOG(LOG_INFO, x)
|
#define BEESLOGINFO(x) BEESLOG(LOG_INFO, x)
|
||||||
#define BEESLOGDEBUG(x) BEESLOG(LOG_DEBUG, x)
|
#define BEESLOGDEBUG(x) BEESLOG(LOG_DEBUG, x)
|
||||||
|
|
||||||
|
// Log a message through BEESLOGNOTICE at most once per expansion site.
//
// Each place this macro is expanded gets its own function-local static
// flag, so distinct call sites log independently of one another.  The flag
// is set before the message is emitted.  It is a plain bool with no locking
// in this macro.
//
// x is the stream-style message expression forwarded to BEESLOGNOTICE.
#define BEESLOGONCE(x) do { \
	static bool already_logged = false; \
	if (!already_logged) { \
		already_logged = true; \
		BEESLOGNOTICE(x); \
	} \
} while (0)
|
||||||
|
|
||||||
#define BEESCOUNT(stat) do { \
|
#define BEESCOUNT(stat) do { \
|
||||||
BeesStats::s_global.add_count(#stat); \
|
BeesStats::s_global.add_count(#stat); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user