From 84f91af50307ff4fc7312961b4aff485ab1565db Mon Sep 17 00:00:00 2001 From: Zygo Blaxell Date: Sun, 20 Nov 2022 12:01:16 -0500 Subject: [PATCH] context: don't let multiple worker Tasks get stuck on a single extent or inode When two Tasks attempt to lock the same extent, append the later Task to the earlier Task's post-exec work queue. This will guarantee that all Tasks which attempt to manipulate the same extent will execute sequentially, and free up threads to process other extents. Similarly, if two scanner threads operate on the same inode, any dedupe they perform will lock out other scanner threads in btrfs. Avoid this by serializing Task objects that reference the same file. This does theoretically use an unbounded amount of memory, but in practice a Task that encounters a contended extent or inode quickly stops spawning new Tasks that might increase the queue size, and all Tasks that might contend for the same lock(s) end up on a single FIFO queue. Note that the scope of inode locks is intentionally global, i.e. when an inode is locked, it locks every inode with the same number in every subvol. This avoids significant lock contention and task queue growth when the same inode with the same file extents appear in snapshots. Fixes: https://github.com/Zygo/bees/issues/158 Signed-off-by: Zygo Blaxell --- docs/event-counters.md | 2 ++ src/bees-context.cc | 43 ++++++++++++++++++++++++++++++------------ src/bees-roots.cc | 12 ++++++++++++ src/bees.h | 8 +++++--- 4 files changed, 50 insertions(+), 15 deletions(-) diff --git a/docs/event-counters.md b/docs/event-counters.md index ab226d4..a81094b 100644 --- a/docs/event-counters.md +++ b/docs/event-counters.md @@ -363,6 +363,8 @@ scanf The `scanf` event group consists of operations related to `BeesContext::scan_forward`. This is the entry point where `crawl` schedules new data for scanning. + * `scanf_deferred_extent`: Two tasks attempted to scan the same extent at the same time, so one was deferred. + * `scanf_deferred_inode`: Two tasks attempted to scan the same inode at the same time, so one was deferred. * `scanf_extent`: A btrfs extent item was scanned. * `scanf_extent_ms`: Total thread-seconds spent scanning btrfs extent items. * `scanf_total`: A logical byte range of a file was scanned. diff --git a/src/bees-context.cc b/src/bees-context.cc index c49a903..370f0c2 100644 --- a/src/bees-context.cc +++ b/src/bees-context.cc @@ -89,7 +89,6 @@ BeesContext::dump_status() for (auto t : BeesNote::get_status()) { ofs << "\ttid " << t.first << ": " << t.second << "\n"; } - #if 0 // Huge amount of data, not a lot of information (yet) ofs << "WORKERS:\n"; @@ -678,7 +677,13 @@ BeesContext::scan_one_extent(const BeesFileRange &bfr, const Extent &e) return bfr; } -BeesFileRange +shared_ptr +BeesContext::get_inode_mutex(const uint64_t inode) +{ + return m_inode_locks(inode); +} + +void BeesContext::scan_forward(const BeesFileRange &bfr_in) { BEESTRACE("scan_forward " << bfr_in); @@ -689,7 +694,7 @@ BeesContext::scan_forward(const BeesFileRange &bfr_in) // Silently filter out blacklisted files if (is_blacklisted(bfr_in.fid())) { BEESCOUNT(scan_blacklisted); - return bfr_in; + return; } // Reconstitute FD @@ -703,31 +708,36 @@ BeesContext::scan_forward(const BeesFileRange &bfr_in) if (!bfr.fd()) { // BEESLOGINFO("No FD in " << root_path() << " for " << bfr); BEESCOUNT(scan_no_fd); - return bfr; + return; } // Sanity check if (bfr.begin() >= bfr.file_size()) { BEESLOGWARN("past EOF: " << bfr); BEESCOUNT(scan_eof); - return bfr; + return; } BtrfsExtentWalker ew(bfr.fd(), bfr.begin(), root_fd()); - BeesFileRange return_bfr(bfr); - Extent e; + bool start_over = false; catch_all([&]() { - while (!stop_requested()) { + while (!stop_requested() && !start_over) { e = ew.current(); catch_all([&]() { uint64_t extent_bytenr = e.bytenr(); - BEESNOTE("waiting for extent bytenr " << to_hex(extent_bytenr)); - auto extent_lock = m_extent_lock_set.make_lock(extent_bytenr); + auto extent_mutex = m_extent_locks(extent_bytenr); + const auto extent_lock = extent_mutex->try_lock(Task::current_task()); + if (!extent_lock) { + // BEESLOGDEBUG("Deferring extent bytenr " << to_hex(extent_bytenr) << " from " << bfr); + BEESCOUNT(scanf_deferred_extent); + start_over = true; + return; + } Timer one_extent_timer; - return_bfr = scan_one_extent(bfr, e); + scan_one_extent(bfr, e); BEESCOUNTADD(scanf_extent_ms, one_extent_timer.age() * 1000); BEESCOUNT(scanf_extent); }); @@ -745,7 +755,7 @@ BeesContext::scan_forward(const BeesFileRange &bfr_in) BEESCOUNTADD(scanf_total_ms, scan_timer.age() * 1000); BEESCOUNT(scanf_total); - return return_bfr; + return; } BeesResolveAddrResult::BeesResolveAddrResult() @@ -875,6 +885,15 @@ BeesContext::start() BEESLOGNOTICE("Starting bees main loop..."); BEESNOTE("starting BeesContext"); + m_extent_locks.func([](uint64_t bytenr) { + return make_shared(); + (void)bytenr; + }); + m_inode_locks.func([](const uint64_t fid) { + return make_shared(); + (void)fid; + }); + m_progress_thread = make_shared("progress_report"); m_progress_thread = make_shared("progress_report"); m_status_thread = make_shared("status_report"); m_progress_thread->exec([=]() { diff --git a/src/bees-roots.cc b/src/bees-roots.cc index 10fb64a..1b6afb6 100644 --- a/src/bees-roots.cc +++ b/src/bees-roots.cc @@ -243,6 +243,18 @@ BeesFileCrawl::crawl_one_extent() { BEESNOTE("crawl_one_extent m_offset " << to_hex(m_offset) << " state " << m_state); BEESTRACE("crawl_one_extent m_offset " << to_hex(m_offset) << " state " << m_state); + + // Only one thread can dedupe a file. btrfs will lock others out. + // Inodes are usually full of shared extents, especially in the case of snapshots, + // so when we lock an inode, we'll lock the same inode number in all subvols at once. + auto inode_mutex = m_ctx->get_inode_mutex(m_bedf.objectid()); + auto inode_lock = inode_mutex->try_lock(Task::current_task()); + if (!inode_lock) { + BEESCOUNT(scanf_deferred_inode); + // Returning false here means we won't reschedule ourselves, but inode_mutex will do that + return false; + } + // If we hit an exception here we don't try to catch it. // It will mean the file or subvol was deleted or there's metadata corruption, // and we should stop trying to scan the inode in that case. diff --git a/src/bees.h b/src/bees.h index b91aaef..271cbae 100644 --- a/src/bees.h +++ b/src/bees.h @@ -726,7 +726,8 @@ class BeesContext : public enable_shared_from_this { Timer m_total_timer; - LockSet m_extent_lock_set; + NamedPtr m_extent_locks; + NamedPtr m_inode_locks; mutable mutex m_stop_mutex; condition_variable m_stop_condvar; @@ -751,7 +752,7 @@ public: Fd home_fd(); string root_path() const { return m_root_path; } - BeesFileRange scan_forward(const BeesFileRange &bfr); + void scan_forward(const BeesFileRange &bfr); bool is_root_ro(uint64_t root); BeesRangePair dup_extent(const BeesFileRange &src, const shared_ptr &tmpfile); @@ -761,6 +762,8 @@ public: void blacklist_erase(const BeesFileId &fid); bool is_blacklisted(const BeesFileId &fid) const; + shared_ptr get_inode_mutex(uint64_t inode); + BeesResolveAddrResult resolve_addr(BeesAddress addr); void invalidate_addr(BeesAddress addr); @@ -777,7 +780,6 @@ public: shared_ptr tmpfile(); const Timer &total_timer() const { return m_total_timer; } - LockSet &extent_lock_set() { return m_extent_lock_set; } }; class BeesResolver {