From 8d08a3c06f340a8678727518f957141e08150696 Mon Sep 17 00:00:00 2001 From: Zygo Blaxell Date: Fri, 15 Nov 2024 14:35:01 -0500 Subject: [PATCH] readahead: inject some sanity at the foundation of an insane architecture This solves some of the worst problems with bees reads: 1. The kernel readahead doesn't work. More precisely, it's much better adapted for a very different use case: a single thread alternating between reading a file sequentially and processing the data that was read. bees has multiple threads which compete for access to IO and then issue reads in random order immediately after the call to readahead. The kernel uses idle ioprio scheduling for the readaheads, so the readaheads get preempted by the random reads, or the kernel cancels the readaheads because the data access pattern isn't sequential after the readahead was issued. 2. Seeking drives perform terribly with multiple competing readers, especially with btrfs striped profiles where the iops are broken into tiny stripe-sized pieces. At one point I intended to read the btrfs device map and figure out which devices can be read in parallel, but to make that useful, the user needs to have an array with multiple drives in single profile, or 4+ drives in raid1 profile. In all other cases, the elaborate calculations always return the same result: there can be only one reader at a time. This commit fixes both problems: 1. Don't use the kernel readahead. Use normal reads into a dummy buffer instead. 2. Allow only one thread to readahead at any time. Once the read is completed, the data is in the page cache, and all the random-order small reads that bees does will hit the page cache, not a spinning disk. In some cases we need to read two things close together, so add a `bees_readahead_pair` which holds one lock across both reads. 
Signed-off-by: Zygo Blaxell --- src/bees-types.cc | 4 ++-- src/bees.cc | 29 ++++++++++++++++++++++++----- src/bees.h | 1 + 3 files changed, 27 insertions(+), 7 deletions(-) diff --git a/src/bees-types.cc b/src/bees-types.cc index 2b59bb6..2baad0e 100644 --- a/src/bees-types.cc +++ b/src/bees-types.cc @@ -349,8 +349,8 @@ BeesRangePair::grow(shared_ptr ctx, bool constrained) BEESTRACE("e_second " << e_second); // Preread entire extent - bees_readahead(second.fd(), e_second.begin(), e_second.size()); - bees_readahead(first.fd(), e_second.begin() + first.begin() - second.begin(), e_second.size()); + bees_readahead_pair(second.fd(), e_second.begin(), e_second.size(), + first.fd(), e_second.begin() + first.begin() - second.begin(), e_second.size()); auto hash_table = ctx->hash_table(); diff --git a/src/bees.cc b/src/bees.cc index 787ab5c..b547b8d 100644 --- a/src/bees.cc +++ b/src/bees.cc @@ -214,8 +214,9 @@ BeesTooLong::operator=(const func_type &f) return *this; } +static void -bees_readahead(int const fd, const off_t offset, const size_t size) +bees_readahead_nolock(int const fd, const off_t offset, const size_t size) { Timer readahead_timer; BEESNOTE("readahead " << name_fd(fd) << " offset " << to_hex(offset) << " len " << pretty(size)); @@ -225,10 +226,8 @@ bees_readahead(int const fd, const off_t offset, const size_t size) DIE_IF_NON_ZERO(readahead(fd, offset, size)); #else // Make sure this data is in page cache by brute force - // This isn't necessary and it might even be slower, - // but the btrfs kernel code does readahead with lower ioprio - // and might discard the readahead request entirely, - // so it's maybe, *maybe*, worth doing both. + // The btrfs kernel code does readahead with lower ioprio + // and might discard the readahead request entirely. 
BEESNOTE("emulating readahead " << name_fd(fd) << " offset " << to_hex(offset) << " len " << pretty(size)); auto working_size = size; auto working_offset = offset; @@ -249,6 +248,26 @@ bees_readahead(int const fd, const off_t offset, const size_t size) BEESCOUNTADD(readahead_ms, readahead_timer.age() * 1000); } +void +bees_readahead_pair(int fd, off_t offset, size_t size, int fd2, off_t offset2, size_t size2) +{ + BEESNOTE("waiting to readahead " << name_fd(fd) << " offset " << to_hex(offset) << " len " << pretty(size) + << ", " << name_fd(fd2) << " offset " << to_hex(offset2) << " len " << pretty(size2)); + static mutex only_one; + unique_lock m_lock(only_one); + bees_readahead_nolock(fd, offset, size); + bees_readahead_nolock(fd2, offset2, size2); +} + +void +bees_readahead(int const fd, const off_t offset, const size_t size) +{ + BEESNOTE("waiting to readahead " << name_fd(fd) << " offset " << to_hex(offset) << " len " << pretty(size)); + static mutex only_one; + unique_lock m_lock(only_one); + bees_readahead_nolock(fd, offset, size); +} + void bees_unreadahead(int const fd, off_t offset, size_t size) { diff --git a/src/bees.h b/src/bees.h index 116b747..c39f46b 100644 --- a/src/bees.h +++ b/src/bees.h @@ -868,6 +868,7 @@ extern const char *BEES_VERSION; extern thread_local default_random_engine bees_generator; string pretty(double d); void bees_readahead(int fd, off_t offset, size_t size); +void bees_readahead_pair(int fd, off_t offset, size_t size, int fd2, off_t offset2, size_t size2); void bees_unreadahead(int fd, off_t offset, size_t size); string format_time(time_t t);