From a353d8cc6efffc9bc8bc92fac38cf393522f0a88 Mon Sep 17 00:00:00 2001 From: Zygo Blaxell Date: Tue, 22 Jun 2021 22:13:22 -0400 Subject: [PATCH] hash: use POSIX_FADV_WILLNEED and POSIX_FADV_DONTNEED The hash table is one of the few cases in bees where a non-trivial amount of page cache memory will be used in a predictable way, so we can advise the kernel about our IO demands in advance. Use WILLNEED to prefetch hash table pages at startup. Use DONTNEED to trigger writeback on hash table pages at shutdown. Signed-off-by: Zygo Blaxell --- src/bees-hash.cc | 43 +++++++++++++++++++++++++++++++++++-------- src/bees.cc | 10 ++++++++++ src/bees.h | 4 ++++ 3 files changed, 49 insertions(+), 8 deletions(-) diff --git a/src/bees-hash.cc b/src/bees-hash.cc index 29a9081..d3373ee 100644 --- a/src/bees-hash.cc +++ b/src/bees-hash.cc @@ -115,8 +115,9 @@ BeesHashTable::flush_dirty_extent(uint64_t extent_index) bool wrote_extent = false; catch_all([&]() { - uint8_t *dirty_extent = m_extent_ptr[extent_index].p_byte; - uint8_t *dirty_extent_end = m_extent_ptr[extent_index + 1].p_byte; + uint8_t *const dirty_extent = m_extent_ptr[extent_index].p_byte; + uint8_t *const dirty_extent_end = m_extent_ptr[extent_index + 1].p_byte; + const size_t dirty_extent_offset = dirty_extent - m_byte_ptr; THROW_CHECK1(out_of_range, dirty_extent, dirty_extent >= m_byte_ptr); THROW_CHECK1(out_of_range, dirty_extent_end, dirty_extent_end <= m_byte_ptr_end); THROW_CHECK2(out_of_range, dirty_extent_end, dirty_extent, dirty_extent_end - dirty_extent == BLOCK_SIZE_HASHTAB_EXTENT); @@ -131,9 +132,13 @@ BeesHashTable::flush_dirty_extent(uint64_t extent_index) lock.unlock(); // Write the extent (or not) - pwrite_or_die(m_fd, extent_copy, dirty_extent - m_byte_ptr); + pwrite_or_die(m_fd, extent_copy, dirty_extent_offset); BEESCOUNT(hash_extent_out); + // Nope, this causes a _dramatic_ loss of performance. + // const size_t dirty_extent_size = dirty_extent_end - dirty_extent; + // bees_unreadahead(m_fd, dirty_extent_offset, dirty_extent_size); + wrote_extent = true; }); @@ -155,6 +160,8 @@ BeesHashTable::flush_dirty_extents(bool slowly) unique_lock lock(m_stop_mutex); if (m_stop_requested) { BEESLOGDEBUG("Stop requested in hash table flush_dirty_extents"); + // This function is called by another thread with !slowly, + // so we just get out of the way here. break; } m_stop_condvar.wait_for(lock, sleep_time); @@ -197,6 +204,11 @@ BeesHashTable::writeback_loop() m_dirty_condvar.wait(lock); } } + catch_all([&]() { + // trigger writeback on our way out + BEESTOOLONG("unreadahead hash table size " << pretty(m_size)); + bees_unreadahead(m_fd, 0, m_size); + }); BEESLOGDEBUG("Exited hash table writeback_loop"); } @@ -225,6 +237,7 @@ BeesHashTable::prefetch_loop() size_t toxic_count = 0; size_t unaligned_eof_count = 0; + m_prefetch_running = true; for (uint64_t ext = 0; ext < m_extents && !m_stop_requested; ++ext) { BEESNOTE("prefetching hash table extent #" << ext << " of " << m_extents); catch_all([&]() { @@ -266,6 +279,7 @@ BeesHashTable::prefetch_loop() } }); } + m_prefetch_running = false; BEESNOTE("calculating hash table statistics"); @@ -394,18 +408,29 @@ BeesHashTable::fetch_missing_extent_by_index(uint64_t extent_index) BEESTRACE("Fetching hash extent #" << extent_index << " of " << m_extents << " extents"); BEESTOOLONG("Fetching hash extent #" << extent_index << " of " << m_extents << " extents"); - uint8_t *dirty_extent = m_extent_ptr[extent_index].p_byte; - uint8_t *dirty_extent_end = m_extent_ptr[extent_index + 1].p_byte; + uint8_t *const dirty_extent = m_extent_ptr[extent_index].p_byte; + uint8_t *const dirty_extent_end = m_extent_ptr[extent_index + 1].p_byte; + const size_t dirty_extent_size = dirty_extent_end - dirty_extent; + const size_t dirty_extent_offset = dirty_extent - m_byte_ptr; // If the read fails don't retry, just go with whatever data we have m_extent_metadata.at(extent_index).m_missing = false; catch_all([&]() { BEESTOOLONG("pread(fd " << m_fd << " '" << name_fd(m_fd)<< "', length " << to_hex(dirty_extent_end - dirty_extent) << ", offset " << to_hex(dirty_extent - m_byte_ptr) << ")"); - pread_or_die(m_fd, dirty_extent, dirty_extent_end - dirty_extent, dirty_extent - m_byte_ptr); + pread_or_die(m_fd, dirty_extent, dirty_extent_size, dirty_extent_offset); // Only count extents successfully read BEESCOUNT(hash_extent_in); + + // Won't need that again + bees_unreadahead(m_fd, dirty_extent_offset, dirty_extent_size); + + // If we are in prefetch, give the kernel a hint about the next extent + if (m_prefetch_running) { + // XXX: don't call this if bees_readahead is implemented by pread() + bees_readahead(m_fd, dirty_extent_offset + dirty_extent_size, dirty_extent_size); + } }); } @@ -753,10 +778,12 @@ BeesHashTable::~BeesHashTable() // into the same trap (and maybe throw an exception) here. // flush_dirty_extents(false); catch_all([&]() { + // drop the memory mapping + BEESTOOLONG("unmap handle table size " << pretty(m_size)); DIE_IF_NON_ZERO(munmap(m_cell_ptr, m_size)); - m_cell_ptr = nullptr; - m_size = 0; }); + m_cell_ptr = nullptr; + m_size = 0; } BEESLOGDEBUG("BeesHashTable destroyed"); } diff --git a/src/bees.cc b/src/bees.cc index 9a01096..f84c7b0 100644 --- a/src/bees.cc +++ b/src/bees.cc @@ -252,6 +252,16 @@ bees_readahead(int const fd, off_t offset, size_t size) BEESCOUNTADD(readahead_ms, readahead_timer.age() * 1000); } +void +bees_unreadahead(int const fd, off_t offset, size_t size) +{ + Timer unreadahead_timer; + BEESNOTE("unreadahead " << name_fd(fd) << " offset " << to_hex(offset) << " len " << pretty(size)); + BEESTOOLONG("unreadahead " << name_fd(fd) << " offset " << to_hex(offset) << " len " << pretty(size)); + DIE_IF_NON_ZERO(posix_fadvise(fd, offset, size, POSIX_FADV_DONTNEED)); + BEESCOUNTADD(readahead_unread_ms, unreadahead_timer.age() * 1000); +} + BeesStringFile::BeesStringFile(Fd dir_fd, string name, size_t limit) : m_dir_fd(dir_fd), m_name(name), diff --git a/src/bees.h b/src/bees.h index d0380d9..2821cf9 100644 --- a/src/bees.h +++ b/src/bees.h @@ -462,6 +462,9 @@ private: RateLimiter m_flush_rate_limit; BeesStringFile m_stats_file; + // Prefetch readahead hint + bool m_prefetch_running = false; + // Mutex/condvar for the writeback thread mutex m_dirty_mutex; condition_variable m_dirty_condvar; @@ -887,6 +890,7 @@ extern const char *BEES_VERSION; string pretty(double d); void bees_sync(int fd); void bees_readahead(int fd, off_t offset, size_t size); +void bees_unreadahead(int fd, off_t offset, size_t size); string format_time(time_t t); #endif