From 593a55e829af233130d4e7114f0a0e1763bae947 Mon Sep 17 00:00:00 2001 From: Zygo Blaxell Date: Thu, 5 Mar 2020 17:43:29 -0500 Subject: [PATCH 1/2] btrsame: not really part of bees, but a useful tool nonetheless Signed-off-by: Zygo Blaxell --- src/Makefile | 7 +- src/btrsame.cc | 313 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 319 insertions(+), 1 deletion(-) create mode 100644 src/btrsame.cc diff --git a/src/Makefile b/src/Makefile index 90cbaad..193cd68 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,5 +1,7 @@ BEES = ../bin/bees PROGRAMS = \ + ../bin/bees \ + ../bin/btrsame \ ../bin/fiemap \ ../bin/fiewalk \ @@ -35,7 +37,7 @@ depends.mk: $(BEES_OBJS:%.o=.depends/%.dep) include depends.mk -$(BEES_OBJS) fiemap.o fiewalk.o: %.o: %.cc +$(BEES_OBJS) fiemap.o fiewalk.o btrsame.o: %.o: %.cc $(CXX) $(BEES_CXXFLAGS) -o $@ -c $< $(PROGRAMS): ../bin/%: %.o @@ -47,5 +49,8 @@ bees-version.o: %.o: %.c $(BEES): $(BEES_OBJS) bees-version.o $(CXX) $(BEES_CXXFLAGS) $(BEES_LDFLAGS) -o $@ $^ $(LIBS) +../bin/btrsame: btrsame.o + $(CXX) $(BEES_CXXFLAGS) $(BEES_LDFLAGS) -o $@ $^ $(LIBS) + clean: rm -fv *.o bees-version.c diff --git a/src/btrsame.cc b/src/btrsame.cc new file mode 100644 index 0000000..8e250d4 --- /dev/null +++ b/src/btrsame.cc @@ -0,0 +1,313 @@ +#include "crucible/error.h" +#include "crucible/fd.h" +#include "crucible/fs.h" +#include "crucible/string.h" +#include "crucible/time.h" + +#include +#include +#include +#include +#include + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE /* for readahead() */ +#endif +#include + +#include +#include + +using namespace crucible; +using namespace std; + +// until we reboot some machines +// it's time to reboot some machines +// not until the latest memory leak is fixed +// OK Go +#if 1 +#define EXTENT_SAME_CLASS BtrfsExtentSame +static const bool ALWAYS_ALIGN = false; +#else +#define EXTENT_SAME_CLASS BtrfsExtentSameByClone +static const bool ALWAYS_ALIGN = false; +#endif + +static const int EXTENT_ALIGNMENT = 4096; + +// Not much point in exceeding this. btrfs should (?) merge adjacent +// extents, non-adjacent extents have to be separate anyway, and it has +// really nasty latency when it gets big. +// const off_t max_step_size = BTRFS_MAX_DEDUPE_LEN; +const off_t max_step_size = 128 * 1024 * 1024; + +// Not a good idea to go below 4K +const off_t min_step_size = 4096; + +// Give up fairly early - 1MB +const off_t max_contiguous_differences = 1 * 1024 * 1024; + +struct PhysicalBlockRange { + uint64_t m_start, m_end; + + PhysicalBlockRange(int fd, uint64_t offset, uint64_t len = 4096); +}; + +PhysicalBlockRange::PhysicalBlockRange(int fd, uint64_t offset, uint64_t len) : + m_start(0), m_end(0) +{ + Fiemap emap(offset, len); + emap.do_ioctl(fd); + + if (emap.m_extents.empty()) { + // No extents in range, we are in a hole after the last extent + m_start = 0; + m_end = len; + return; + } + + const FiemapExtent &fe = emap.m_extents.at(0); + + if (offset < fe.fe_logical) { + // Extent begins after offset, we are in a hole before the extent + m_start = 0; + m_end = fe.fe_logical - offset; + return; + } + + // TODO: reject preallocated and delallocated extents too + // TODO: well preallocated might be OK for dedup + + uint64_t extent_offset = offset - fe.fe_logical; + + m_start = fe.fe_physical + extent_offset; + uint64_t phys_length = fe.fe_length - extent_offset; + m_end = m_start + phys_length; +} + +static +bool +verbose() +{ + static bool done = false; + static bool verbose; + if (!done) { + verbose = getenv("BTRSAME_VERBOSE"); + done = true; + } + return verbose; +} + +static +bool +bees_same_file(Fd incumbent_fd, Fd candidate_fd) +{ + Stat incumbent_stat(incumbent_fd); + Stat candidate_stat(candidate_fd); + off_t common_size = min(incumbent_stat.st_size, candidate_stat.st_size); + + // If we are using clone instead of extent-same then we can ignore + // the alignment restriction for the last block of the dest file. + // This only works when both files are the same size. + if (ALWAYS_ALIGN || candidate_stat.st_size != incumbent_stat.st_size) { + common_size &= ~(EXTENT_ALIGNMENT - 1); + } + + if (verbose()) { + cerr << "A size " << incumbent_stat.st_size << ", B size " << candidate_stat.st_size << ", common size " << common_size << endl; + } + + off_t total_deduped = 0; + int status_ok = 0, status_err = 0, status_different = 0; + off_t step_size = max_step_size; + uint64_t contiguous_differences = 0; + uint64_t total_differences = 0; + uint64_t total_shared = 0; + uint64_t total_holes = 0; + + bool fatal_error = false; + + off_t p, len; + ostringstream oss; + Timer timer; + for (p = 0; p < common_size && !fatal_error; ) { + off_t this_step_size = step_size; + len = min(common_size - p, step_size); + + if (timer > 1.0) { + cerr << oss.str() << flush; + timer.reset(); + } + oss.str(""); + oss << "\r" + << "total " << common_size + << (total_deduped ? " **DUP** " : " dup ") << total_deduped + << " diff " << total_differences + << " shared " << total_shared + << " holes " << total_holes + << " off " << p + << " len " << len + << ' '; + + PhysicalBlockRange incumbent_pbr(incumbent_fd, p, len); + PhysicalBlockRange candidate_pbr(candidate_fd, p, len); + + if (incumbent_pbr.m_start == candidate_pbr.m_start) { + off_t shared_len = min(incumbent_pbr.m_end - incumbent_pbr.m_start, candidate_pbr.m_end - candidate_pbr.m_start); + this_step_size = max(min_step_size, min(shared_len, common_size - p)); + total_shared += this_step_size; + contiguous_differences = 0; + len = shared_len; + // At this point, if we see anything shared, it's because we already deduped the whole thing + // unless it's a hole. We do have those. + if (incumbent_pbr.m_start) { + // break; + } else { + total_holes += shared_len; + } + } else { + + // These might be triggering a locking bug + // ...actually we found the locking bug and it's somewhere else + // ...though there may be a memory leak bug so let's try turning this back off again + // ...nope, seems to be a kernel bug triggered by git + // ...but we still run out of RAM, hard, on some machines running this code. But not all. + // ...let's avoid the scary syscalls until we prove they work, OK? + // ...ok let's go looking for scary syscall behavior now + // ...no hangs but they don't seem to be helping, or are helping negatively + // ...long stalls here, see if they go away + // OK we were totally doing the wrong thing here. "common_size - p", indeed. + // DIE_IF_MINUS_ONE(posix_fadvise(incumbent_fd, p, common_size - p, POSIX_FADV_WILLNEED)); + // DIE_IF_MINUS_ONE(posix_fadvise(candidate_fd, p, common_size - p, POSIX_FADV_WILLNEED)); + DIE_IF_MINUS_ONE(readahead(incumbent_fd, p, len)); + DIE_IF_MINUS_ONE(readahead(candidate_fd, p, len)); + + EXTENT_SAME_CLASS bes(incumbent_fd, p, len); + bes.add(candidate_fd, p); + bes.do_ioctl(); + + int status = bes.m_info[0].status; + + if (status == 0) { + ++status_ok; + total_deduped += bes.m_info[0].bytes_deduped; + contiguous_differences = 0; + if (step_size * 2 <= max_step_size) { + step_size *= 2; + } + } else { + if (status < 0) { + oss << " (" << strerror(-status) << ", errno = " << -status << ")" << endl; + ++status_err; + switch (-status) { + case EXDEV: + oss << " (fatal error)" << endl; + THROW_ERRNO(-status); + break; + } + } else if (status == BTRFS_SAME_DATA_DIFFERS) { + ++status_different; + } else { + ++status_err; + } + if (step_size > min_step_size) { + step_size = min_step_size; + continue; + } else { + total_differences += step_size; + contiguous_differences += step_size; + if (total_deduped == 0 && contiguous_differences > max_contiguous_differences) { + oss << " (giving up, contiguous_differences = " << contiguous_differences + << ", max_contiguous_differences = " << max_contiguous_differences << ")" << endl; + break; + } + } + } + } + + p += len; + } + cerr << oss.str() << "\r" + << "total " << common_size + << (total_deduped ? " **DUP** " : " dup ") << total_deduped + << " diff " << total_differences + << " shared " << total_shared + << " holes " << total_holes + << " off " << p + << " len " << len + << " " + << endl; + + return status_ok > 0 && status_err == 0 && status_different == 0; +} + +int +main(int argc, char **argv) +{ + if (argc != 3) { + cerr << "Usage: " << argv[0] << " file1 file2" << endl; + cerr << "Uses the BTRFS_EXTENT_SAME ioctl to deduplicate file1 and file2" << endl; + exit(EXIT_FAILURE); + } + + if (verbose()) { + cerr << "A: " << argv[1] << endl; + } + Fd incumbent_fd = open_or_die(argv[1], O_RDONLY); + + if (verbose()) { + cerr << "B: " << argv[2] << endl; + } + Fd candidate_fd = open_or_die(argv[2], O_RDWR); + + int rv; + if (bees_same_file(incumbent_fd, candidate_fd)) { + rv = EXIT_SUCCESS; + } else { + // any run that doesn't end with terminate() is success + // rv = EXIT_FAILURE; + rv = EXIT_SUCCESS; + } + +// Let's try not doing this to see if our memory leaks go away +// OK we have memory leak fixes, bring on the extra testing +// OK it's slow and unnecessary in this context +#if 0 + catch_all([&]() { + PhysicalBlockRange pbr(incumbent_fd, 0); + cerr << "pbr = " << to_hex(pbr.m_start) << ".." << to_hex(pbr.m_end) << endl; + set inodes_seen; + set paths_seen; + if (pbr.m_start) { + BtrfsIoctlLogicalInoArgs lia(pbr.m_start); + lia.do_ioctl(incumbent_fd); + // cerr << &lia; + // [0] = BtrfsInodeOffsetRoot { .m_inum = 10544359, .m_offset = 0x0, .m_root = 257}, + for (auto i : lia.m_iors) { + auto seen_inode = inodes_seen.insert(i.m_inum); + if (!seen_inode.second) { + continue; + } + cerr << "Root " << i.m_root << " Inode " << i.m_inum << " Offset " << to_hex(i.m_offset) << "\n"; + catch_all([&]() { + // cerr << "Inode " << i.m_inum << ":\n"; + BtrfsIoctlInoPathArgs ipa(i.m_inum); + ipa.do_ioctl(incumbent_fd); + for (auto p : ipa.m_paths) { + auto seen_path = paths_seen.insert(p); + if (seen_path.second) { + cerr << "\tPath " << p << "\n"; + } + } + // Not useful without the rest of the root tree + // BtrfsIoctlInoLookupArgs ila(BTRFS_FIRST_FREE_OBJECTID); + // ila.do_ioctl(incumbent_fd); + // cerr << "ila = '" << ila.name << "'\n"; + }); + } + } + }); +#endif + + return rv; +} From 78134bfd785894ea8cf49b41744e76324f383e16 Mon Sep 17 00:00:00 2001 From: Zygo Blaxell Date: Thu, 5 Mar 2020 17:47:28 -0500 Subject: [PATCH 2/2] btrsame: clean out some cruft That reboot we were waiting for happened in 2015. Also, never use the clone for dedupe. Kernels old enough to not have dedupe have far too many bugs anyway. Signed-off-by: Zygo Blaxell --- src/btrsame.cc | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/src/btrsame.cc b/src/btrsame.cc index 8e250d4..0947f89 100644 --- a/src/btrsame.cc +++ b/src/btrsame.cc @@ -21,24 +21,14 @@ using namespace crucible; using namespace std; -// until we reboot some machines -// it's time to reboot some machines -// not until the latest memory leak is fixed -// OK Go -#if 1 #define EXTENT_SAME_CLASS BtrfsExtentSame static const bool ALWAYS_ALIGN = false; -#else -#define EXTENT_SAME_CLASS BtrfsExtentSameByClone -static const bool ALWAYS_ALIGN = false; -#endif static const int EXTENT_ALIGNMENT = 4096; -// Not much point in exceeding this. btrfs should (?) merge adjacent -// extents, non-adjacent extents have to be separate anyway, and it has -// really nasty latency when it gets big. // const off_t max_step_size = BTRFS_MAX_DEDUPE_LEN; +// btrfs maximum extent size is 128M, there is nothing to gain by going larger; +// however, going smaller will create a bunch of adjacent split extent refs. const off_t max_step_size = 128 * 1024 * 1024; // Not a good idea to go below 4K