mirror of
https://github.com/Zygo/bees.git
synced 2025-05-17 21:35:45 +02:00
crawl: change scan order to make forward progress at all times
Previously, the scan order processed each subvol in order. This required very large amounts of temporary disk space, as a full filesystem scan was required before any shared extents could be deduped. If the hash table RAM was underprovisioned, this meant that some shared dup blocks were removed from the hash table before they could be deduped. Currently, the scan order takes whichever unscanned extent comes first (lowest inode/offset) across all subvols. This works well if--and only if--the subvols are either empty or children of a common ancestor. It forces the same inode/offset pairs to be read at close to the same time from each subvol. When a new snapshot is created, this ordering diverts scanning to the new subvol until it catches up to the existing subvols. For large filesystems with frequent snapshot creation, this means that the scanner never reaches the end of all subvols. Each new subvol effectively resets the current scan position for the entire filesystem to zero. This prevents bees from ever completing the first filesystem scan. Change the order again, so that we now read one unscanned extent from each subvol in round-robin fashion. When a new subvol is created, we share scan time between old and new subvols. This ensures we eventually finish scanning the initial subvols and enter the incremental scanning state. The cost of this change is more repeated reading of shared extents at scan time, with less benefit from disk-device-level caching; however, the only way to really fix this problem is to implement scanning on tree 2 (the btrfs extent tree) instead of the subvol trees. Signed-off-by: Zygo Blaxell <bees@furryterror.org>
This commit is contained in:
parent
7ecead1700
commit
c1e31004b6
@ -477,11 +477,6 @@ BeesResolver::find_all_matches(BeesBlockData &bbd)
|
||||
bool
|
||||
BeesResolver::operator<(const BeesResolver &that) const
|
||||
{
|
||||
if (that.m_bior_count < m_bior_count) {
|
||||
return true;
|
||||
} else if (m_bior_count < that.m_bior_count) {
|
||||
return false;
|
||||
}
|
||||
return m_addr < that.m_addr;
|
||||
// Lowest count, highest address
|
||||
return tie(that.m_bior_count, m_addr) < tie(m_bior_count, that.m_addr);
|
||||
}
|
||||
|
||||
|
@ -42,8 +42,8 @@ BeesCrawlState::BeesCrawlState() :
|
||||
bool
|
||||
BeesCrawlState::operator<(const BeesCrawlState &that) const
|
||||
{
|
||||
return tie(m_root, m_objectid, m_offset, m_min_transid, m_max_transid)
|
||||
< tie(that.m_root, that.m_objectid, that.m_offset, that.m_min_transid, that.m_max_transid);
|
||||
return tie(m_objectid, m_offset, m_root, m_min_transid, m_max_transid)
|
||||
< tie(that.m_objectid, that.m_offset, that.m_root, that.m_min_transid, that.m_max_transid);
|
||||
}
|
||||
|
||||
string
|
||||
@ -208,15 +208,15 @@ BeesRoots::crawl_roots()
|
||||
auto crawl_map_copy = m_root_crawl_map;
|
||||
lock.unlock();
|
||||
|
||||
#if 0
|
||||
// Scan the same inode/offset tuple in each subvol (good for snapshots)
|
||||
BeesFileRange first_range;
|
||||
shared_ptr<BeesCrawl> first_crawl;
|
||||
for (auto i : crawl_map_copy) {
|
||||
auto this_crawl = i.second;
|
||||
auto this_range = this_crawl->peek_front();
|
||||
if (this_range) {
|
||||
auto tuple_this = make_tuple(this_range.fid().ino(), this_range.fid().root(), this_range.begin());
|
||||
auto tuple_first = make_tuple(first_range.fid().ino(), first_range.fid().root(), first_range.begin());
|
||||
if (!first_range || tuple_this < tuple_first) {
|
||||
if (!first_range || this_range < first_range) {
|
||||
first_crawl = this_crawl;
|
||||
first_range = this_range;
|
||||
}
|
||||
@ -234,6 +234,27 @@ BeesRoots::crawl_roots()
|
||||
THROW_CHECK2(runtime_error, first_range, first_range_popped, first_range == first_range_popped);
|
||||
return;
|
||||
}
|
||||
#else
|
||||
// Scan each subvol one extent at a time (good for continuous forward progress)
|
||||
bool crawled = false;
|
||||
for (auto i : crawl_map_copy) {
|
||||
auto this_crawl = i.second;
|
||||
auto this_range = this_crawl->peek_front();
|
||||
if (this_range) {
|
||||
catch_all([&]() {
|
||||
// BEESINFO("scan_forward " << this_range);
|
||||
m_ctx->scan_forward(this_range);
|
||||
});
|
||||
crawled = true;
|
||||
BEESCOUNT(crawl_scan);
|
||||
m_crawl_current = this_crawl->get_state();
|
||||
auto this_range_popped = this_crawl->pop_front();
|
||||
THROW_CHECK2(runtime_error, this_range, this_range_popped, this_range == this_range_popped);
|
||||
}
|
||||
}
|
||||
|
||||
if (crawled) return;
|
||||
#endif
|
||||
|
||||
BEESLOG("Crawl ran out of data after " << m_crawl_timer.lap() << "s, waiting for more...");
|
||||
BEESCOUNT(crawl_done);
|
||||
|
Loading…
x
Reference in New Issue
Block a user