commit cca0ee26a8023bacc1d08e4c6a15e7ead40eb15c Author: Zygo Blaxell Date: Tue Nov 15 23:32:44 2016 -0500 bees: remove local cruft, throw at github diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..91fdf51 --- /dev/null +++ b/.gitignore @@ -0,0 +1,11 @@ +*.[ao] +*.bak +*.new +*.so* +Doxyfile +depends.mk +doxygen_* +html/ +latex/ +make.log +make.log.new diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..ffb614c --- /dev/null +++ b/Makefile @@ -0,0 +1,15 @@ +default install all: lib src test + +clean: + git clean -dfx + +.PHONY: lib src + +lib: + $(MAKE) -C lib + +src: lib + $(MAKE) -C src + +test: lib src + $(MAKE) -C test diff --git a/bin/.gitignore b/bin/.gitignore new file mode 100644 index 0000000..72e8ffc --- /dev/null +++ b/bin/.gitignore @@ -0,0 +1 @@ +* diff --git a/include/crucible/backtrace.h b/include/crucible/backtrace.h new file mode 100644 index 0000000..69b9646 --- /dev/null +++ b/include/crucible/backtrace.h @@ -0,0 +1,29 @@ +#ifndef CRUCIBLE_BACKTRACE_H +#define CRUCIBLE_BACKTRACE_H + +#include +#include + +#include + +namespace crucible { + using namespace std; + + class Backtrace { + vector m_buffer; + mutable vector m_result_stringvec; + mutable char **m_result_cpp; + int m_result_size; + int m_desired_size; + public: + Backtrace(int size = 99); + ~Backtrace(); + const vector &strings() const; + const vector &voids() const; + void symbols_fd(int fd) const; + bool overflowed() const; + }; + +} + +#endif // CRUCIBLE_BACKTRACE_H diff --git a/include/crucible/bencode.h b/include/crucible/bencode.h new file mode 100644 index 0000000..616ba53 --- /dev/null +++ b/include/crucible/bencode.h @@ -0,0 +1,76 @@ +#ifndef CRUCIBLE_BENCODE_H +#define CRUCIBLE_BENCODE_H + +#include "crucible/error.h" + +#include + +#include +#include +#include +#include +#include +#include + +namespace crucible { + using namespace std; + + // So...much...forward declaration... 
+ struct bencode_variant; + typedef shared_ptr bencode_variant_ptr; + + struct bencode_variant { + virtual ~bencode_variant(); + virtual ostream& print(ostream &os, const string &parent = "") const = 0; + virtual bencode_variant_ptr at(size_t i) const; + virtual bencode_variant_ptr at(const string &s) const; + virtual operator string() const; + }; + + ostream& operator<<(ostream &os, const bencode_variant_ptr &p); + + // ie + struct bencode_int : public bencode_variant { + ~bencode_int(); + bencode_int(int64_t i); + ostream & print(ostream &os, const string &parent = "") const override; + private: + int64_t m_i; + }; + + // :contents + struct bencode_string : public bencode_variant { + ~bencode_string(); + bencode_string(string s); + ostream & print(ostream &os, const string &parent = "") const override; + operator string() const override; + private: + string m_s; + }; + + // le + struct bencode_list : public bencode_variant { + ~bencode_list(); + bencode_list(const vector &l); + ostream & print(ostream &os, const string &parent = "") const override; + using bencode_variant::at; + bencode_variant_ptr at(size_t i) const override; + private: + vector m_l; + }; + + // de (lexicographically sorted pairs of , key is a string) + struct bencode_dict : public bencode_variant { + ~bencode_dict(); + bencode_dict(const map &m); + ostream& print(ostream &os, const string &parent = "") const override; + using bencode_variant::at; + bencode_variant_ptr at(const string &key) const override; + private: + map m_m; + }; + + bencode_variant_ptr bencode_decode_stream(istream &is); +}; + +#endif diff --git a/include/crucible/bool.h b/include/crucible/bool.h new file mode 100644 index 0000000..75f6cbe --- /dev/null +++ b/include/crucible/bool.h @@ -0,0 +1,13 @@ +#ifndef CRUCIBLE_BOOL_H +#define CRUCIBLE_BOOL_H + +namespace crucible { + struct DefaultBool { + bool m_b; + DefaultBool(bool init = false) : m_b(init) {} + operator bool() const { return m_b; } + bool &operator=(const bool 
&that) { return m_b = that; } + }; +} + +#endif // CRUCIBLE_BOOL_H diff --git a/include/crucible/btrfs.h b/include/crucible/btrfs.h new file mode 100644 index 0000000..044b682 --- /dev/null +++ b/include/crucible/btrfs.h @@ -0,0 +1,205 @@ +#ifndef CRUCIBLE_BTRFS_H +#define CRUCIBLE_BTRFS_H + +// Copied from Linux kernel sources as of 3.15 or so. +// These are probably missing from /usr/include at the moment. + +// NULL +#include + +// _IOWR macro and friends +#include + +// __u64 typedef and friends +#include + +// try Linux headers first +#include + +// Supply any missing definitions +#define mutex not_mutex +#include +// Repair the damage +#undef min +#undef max +#undef mutex + +#ifndef BTRFS_FIRST_FREE_OBJECTID + + #define BTRFS_ROOT_TREE_OBJECTID 1ULL + #define BTRFS_EXTENT_TREE_OBJECTID 2ULL + #define BTRFS_CHUNK_TREE_OBJECTID 3ULL + #define BTRFS_DEV_TREE_OBJECTID 4ULL + #define BTRFS_FS_TREE_OBJECTID 5ULL + #define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL + #define BTRFS_CSUM_TREE_OBJECTID 7ULL + #define BTRFS_QUOTA_TREE_OBJECTID 8ULL + #define BTRFS_UUID_TREE_OBJECTID 9ULL + #define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL + #define BTRFS_BALANCE_OBJECTID -4ULL + #define BTRFS_ORPHAN_OBJECTID -5ULL + #define BTRFS_TREE_LOG_OBJECTID -6ULL + #define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL + #define BTRFS_TREE_RELOC_OBJECTID -8ULL + #define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL + #define BTRFS_EXTENT_CSUM_OBJECTID -10ULL + #define BTRFS_FREE_SPACE_OBJECTID -11ULL + #define BTRFS_FREE_INO_OBJECTID -12ULL + #define BTRFS_MULTIPLE_OBJECTIDS -255ULL + #define BTRFS_FIRST_FREE_OBJECTID 256ULL + #define BTRFS_LAST_FREE_OBJECTID -256ULL + #define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL + #define BTRFS_DEV_ITEMS_OBJECTID 1ULL + + #define BTRFS_INODE_ITEM_KEY 1 + #define BTRFS_INODE_REF_KEY 12 + #define BTRFS_INODE_EXTREF_KEY 13 + #define BTRFS_XATTR_ITEM_KEY 24 + #define BTRFS_ORPHAN_ITEM_KEY 48 + #define BTRFS_DIR_LOG_ITEM_KEY 60 + #define BTRFS_DIR_LOG_INDEX_KEY 72 + #define 
BTRFS_DIR_ITEM_KEY 84 + #define BTRFS_DIR_INDEX_KEY 96 + #define BTRFS_EXTENT_DATA_KEY 108 + #define BTRFS_CSUM_ITEM_KEY 120 + #define BTRFS_EXTENT_CSUM_KEY 128 + #define BTRFS_ROOT_ITEM_KEY 132 + #define BTRFS_ROOT_BACKREF_KEY 144 + #define BTRFS_ROOT_REF_KEY 156 + #define BTRFS_EXTENT_ITEM_KEY 168 + #define BTRFS_METADATA_ITEM_KEY 169 + #define BTRFS_TREE_BLOCK_REF_KEY 176 + #define BTRFS_EXTENT_DATA_REF_KEY 178 + #define BTRFS_EXTENT_REF_V0_KEY 180 + #define BTRFS_SHARED_BLOCK_REF_KEY 182 + #define BTRFS_SHARED_DATA_REF_KEY 184 + #define BTRFS_BLOCK_GROUP_ITEM_KEY 192 + #define BTRFS_FREE_SPACE_INFO_KEY 198 + #define BTRFS_FREE_SPACE_EXTENT_KEY 199 + #define BTRFS_FREE_SPACE_BITMAP_KEY 200 + #define BTRFS_DEV_EXTENT_KEY 204 + #define BTRFS_DEV_ITEM_KEY 216 + #define BTRFS_CHUNK_ITEM_KEY 228 + #define BTRFS_BALANCE_ITEM_KEY 248 + #define BTRFS_QGROUP_STATUS_KEY 240 + #define BTRFS_QGROUP_INFO_KEY 242 + #define BTRFS_QGROUP_LIMIT_KEY 244 + #define BTRFS_QGROUP_RELATION_KEY 246 + #define BTRFS_DEV_STATS_KEY 249 + #define BTRFS_DEV_REPLACE_KEY 250 + #define BTRFS_UUID_KEY_SUBVOL 251 + #define BTRFS_UUID_KEY_RECEIVED_SUBVOL 252 + #define BTRFS_STRING_ITEM_KEY 253 + +#endif + +#ifndef BTRFS_DEFRAG_RANGE_START_IO + + // For some reason uapi has BTRFS_DEFRAG_RANGE_COMPRESS and + // BTRFS_DEFRAG_RANGE_START_IO but not btrfs_ioctl_defrag_range_args + // Never mind, it's too broken to be useful anyway + struct btrfs_ioctl_defrag_range_args { + /* start of the defrag operation */ + __u64 start; + + /* number of bytes to defrag, use (u64)-1 to say all */ + __u64 len; + + /* + * flags for the operation, which can include turning + * on compression for this one defrag + */ + __u64 flags; + + /* + * any extent bigger than this will be considered + * already defragged. 
Use 0 to take the kernel default + * Use 1 to say every single extent must be rewritten + */ + __u32 extent_thresh; + + /* + * which compression method to use if turning on compression + * for this defrag operation. If unspecified, zlib will + * be used + */ + __u32 compress_type; + + /* spare for later */ + __u32 unused[4]; + }; + +#endif + +#ifndef BTRFS_IOC_CLONE_RANGE + + struct btrfs_ioctl_clone_range_args { + __s64 src_fd; + __u64 src_offset, src_length; + __u64 dest_offset; + }; + + // We definitely have this + #define BTRFS_IOCTL_MAGIC 0x94 + + #define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int) + + #define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \ + struct btrfs_ioctl_clone_range_args) +#endif + +#ifndef BTRFS_SAME_DATA_DIFFERS + + #define BTRFS_SAME_DATA_DIFFERS 1 + /* For extent-same ioctl */ + struct btrfs_ioctl_same_extent_info { + __s64 fd; /* in - destination file */ + __u64 logical_offset; /* in - start of extent in destination */ + __u64 bytes_deduped; /* out - total # of bytes we were able + * to dedupe from this file */ + /* status of this dedupe operation: + * 0 if dedup succeeds + * < 0 for error + * == BTRFS_SAME_DATA_DIFFERS if data differs + */ + __s32 status; /* out - see above description */ + __u32 reserved; + }; + + struct btrfs_ioctl_same_args { + __u64 logical_offset; /* in - start of extent in source */ + __u64 length; /* in - length of extent */ + __u16 dest_count; /* in - total elements in info array */ + __u16 reserved1; + __u32 reserved2; + struct btrfs_ioctl_same_extent_info info[0]; + }; + + #define BTRFS_IOC_FILE_EXTENT_SAME _IOWR(BTRFS_IOCTL_MAGIC, 54, \ + struct btrfs_ioctl_same_args) + +#endif + +#ifndef BTRFS_MAX_DEDUPE_LEN + #define BTRFS_MAX_DEDUPE_LEN (16 * 1024 * 1024) +#endif + +#ifndef BTRFS_IOC_TREE_SEARCH_V2 + + /* + * Extended version of TREE_SEARCH ioctl that can return more than 4k of bytes. + * The allocated size of the buffer is set in buf_size. 
+ */ + struct btrfs_ioctl_search_args_v2 { + struct btrfs_ioctl_search_key key; /* in/out - search parameters */ + __u64 buf_size; /* in - size of buffer + * out - on EOVERFLOW: needed size + * to store item */ + __u64 buf[0]; /* out - found items */ + }; + + #define BTRFS_IOC_TREE_SEARCH_V2 _IOWR(BTRFS_IOCTL_MAGIC, 17, \ + struct btrfs_ioctl_search_args_v2) +#endif + +#endif // CRUCIBLE_BTRFS_H diff --git a/include/crucible/cache.h b/include/crucible/cache.h new file mode 100644 index 0000000..a5b9581 --- /dev/null +++ b/include/crucible/cache.h @@ -0,0 +1,221 @@ +#ifndef CRUCIBLE_CACHE_H +#define CRUCIBLE_CACHE_H + +#include "crucible/lockset.h" + +#include +#include +#include +#include +#include + +namespace crucible { + using namespace std; + + template + class LRUCache { + public: + using Key = tuple; + using Func = function; + using Time = unsigned; + using Value = pair; + private: + Func m_fn; + Time m_ctr; + map m_map; + LockSet m_lockset; + size_t m_max_size; + mutex m_mutex; + + void check_overflow(); + public: + LRUCache(Func f = Func(), size_t max_size = 100); + + void func(Func f); + void max_size(size_t new_max_size); + + Return operator()(Arguments... args); + Return refresh(Arguments... args); + void expire(Arguments... args); + void prune(function predicate); + void insert(const Return &r, Arguments... 
args); + void clear(); + }; + + template + LRUCache::LRUCache(Func f, size_t max_size) : + m_fn(f), + m_ctr(0), + m_max_size(max_size) + { + } + + template + void + LRUCache::check_overflow() // once the map exceeds m_max_size, evicts the half of the entries with the lowest timestamps + { + if (m_map.size() <= m_max_size) return; + vector> map_contents; + map_contents.reserve(m_map.size()); + for (const auto &i : m_map) { // by reference: avoids copying every cached value + map_contents.push_back(make_pair(i.first, i.second.first)); + } + sort(map_contents.begin(), map_contents.end(), [](const pair &a, const pair &b) { + return a.second < b.second; + }); + for (size_t i = 0; i < map_contents.size() / 2; ++i) { + m_map.erase(map_contents[i].first); + } + } + + template + void + LRUCache::max_size(size_t new_max_size) + { + unique_lock lock(m_mutex); + m_max_size = new_max_size; + check_overflow(); + } + + template + void + LRUCache::func(Func func) + { + unique_lock lock(m_mutex); + m_fn = func; + } + + template + void + LRUCache::clear() + { + unique_lock lock(m_mutex); + m_map.clear(); + } + + template + void + LRUCache::prune(function pred) // erases every entry whose cached value satisfies pred + { + unique_lock lock(m_mutex); + for (auto it = m_map.begin(); it != m_map.end(); ) { // no increment here: the body advances it + if (pred(it->second.second)) { + it = m_map.erase(it); // erase() returns the iterator following the removed entry + } else { + ++it; + } + } + } + + template + Return + LRUCache::operator()(Arguments... args) + { + Key k(args...); + bool inserted = false; + + // Do we have it cached? + unique_lock lock(m_mutex); + auto found = m_map.find(k); + if (found == m_map.end()) { + // No, release cache lock and acquire key lock + lock.unlock(); + typename LockSet::Lock key_lock(m_lockset, k); + + // Did item appear in cache while we were waiting for key? + lock.lock(); + found = m_map.find(k); + if (found == m_map.end()) { + + // No, we hold key and cache locks, but item not in cache. 
+ // Release cache lock and call function + auto ctr_copy = m_ctr++; + lock.unlock(); + Value v(ctr_copy, m_fn(args...)); + + // Reacquire cache lock and insert return value + lock.lock(); + tie(found, inserted) = m_map.insert(make_pair(k, v)); + + // We hold a lock on this key so we are the ones to insert it + THROW_CHECK0(runtime_error, inserted); + + // Release key lock and clean out overflow + key_lock.unlock(); + check_overflow(); + } + } + + // Item should be in cache now + THROW_CHECK0(runtime_error, found != m_map.end()); + + // We are using this object so update the timestamp + if (!inserted) { + found->second.first = m_ctr++; + } + return found->second.second; + } + + template + void + LRUCache::expire(Arguments... args) + { + Key k(args...); + unique_lock lock(m_mutex); + m_map.erase(k); + } + + template + Return + LRUCache::refresh(Arguments... args) + { + expire(args...); + return operator()(args...); + } + + template + void + LRUCache::insert(const Return &r, Arguments... args) + { + Key k(args...); + bool inserted = false; + + // Do we have it cached? + unique_lock lock(m_mutex); + auto found = m_map.find(k); + if (found == m_map.end()) { + // No, release cache lock and acquire key lock + lock.unlock(); + typename LockSet::Lock key_lock(m_lockset, k); + + // Did item appear in cache while we were waiting for key? + lock.lock(); + found = m_map.find(k); + if (found == m_map.end()) { + + // No, we hold key and cache locks, but item not in cache. 
+ // Keep cache lock held and insert the provided return value + auto ctr_copy = m_ctr++; + Value v(ctr_copy, r); + tie(found, inserted) = m_map.insert(make_pair(k, v)); + + // We hold a lock on this key so we are the ones to insert it + THROW_CHECK0(runtime_error, inserted); + + // Release key lock and clean out overflow + key_lock.unlock(); + check_overflow(); + } + } + + // Item should be in cache now + THROW_CHECK0(runtime_error, found != m_map.end()); + + // We are using this object so update the timestamp + if (!inserted) { + found->second.first = m_ctr++; + } + } +} + +#endif // CRUCIBLE_CACHE_H diff --git a/include/crucible/chatter.h b/include/crucible/chatter.h new file mode 100644 index 0000000..6486202 --- /dev/null +++ b/include/crucible/chatter.h @@ -0,0 +1,156 @@ +#ifndef CRUCIBLE_CHATTER_H +#define CRUCIBLE_CHATTER_H + +#include +#include +#include +#include +#include +#include + + /** \brief Chatter wraps a std::ostream reference with a destructor that + writes a newline, and inserts timestamp, pid, and tid prefixes on output. + + Typical usage is expressions like the following: + + int six = 6, nine = 9; \n + Chatter() << "What you get when you multiply" << six + << "by" << nine << '?'; \n + Chatter() << "forty two!"; + + which results in output like the following: + + What you get when you multiply 6 by 9 ?\n + forty two! + + Note that newlines and timestamps are injected automatically in + the output by the Chatter destructor. You can also use std::endl + explicitly, although it will not have the effect of flushing the + buffer. 
+ */ + +namespace crucible { + using namespace std; + + class Chatter { + string m_name; + ostream &m_os; + ostringstream m_oss; + + public: + Chatter(string name, ostream &os = cerr); + Chatter(Chatter &&c); + ostream &get_os() { return m_oss; } + + template Chatter &operator<<(const T& arg); + + ~Chatter(); + }; + + template + struct ChatterTraits { + Chatter &operator()(Chatter &c, const Argument &arg) + { + c.get_os() << arg; + return c; + } + }; + + template + Chatter & + Chatter::operator<<(const T& arg) + { + return ChatterTraits()(*this, arg); + } + + template + struct ChatterTraits { + Chatter &operator()(Chatter &c, const Argument *arg) + { + if (arg) { + c.get_os() << "(pointer to " << typeid(*arg).name() << ")(" << reinterpret_cast(arg) << ")"; + } else { + c.get_os() << "(NULL pointer to " << typeid(arg).name() << ')'; + } + return c; + } + }; + + template <> + struct ChatterTraits { + Chatter & + operator()(Chatter &c, const char *arg) + { + c.get_os() << arg; + return c; + } + }; + + template <> + struct ChatterTraits { + Chatter & + operator()(Chatter &c, ostream & arg) + { + c.get_os() << arg; + return c; + } + }; + + class ChatterBox { + string m_file; + int m_line; + string m_pretty_function; + bool m_enabled; + ostream& m_os; + + static set s_boxes; + + public: + ChatterBox(string file, int line, string pretty_function, ostream &os = cerr); + ~ChatterBox(); + + template Chatter operator<<(const T &t) + { + Chatter c(m_pretty_function, m_os); + c << t; + return c; + } + + bool enabled() const { return m_enabled; } + void set_enable(bool en); + + static set& all_boxes(); + }; + + class ChatterUnwinder { + function m_func; + public: + ChatterUnwinder(function f); + ~ChatterUnwinder(); + }; +}; + +#define CHATTER(x) do { \ + using namespace crucible; \ + static ChatterBox crucible_chatterbox_cb(__FILE__, __LINE__, __func__); \ + if (crucible_chatterbox_cb.enabled()) { \ + crucible_chatterbox_cb << x; \ + } \ +} while (0) + +#define CHATTER_TRACE(x) 
do { \ + using namespace crucible; \ + static ChatterBox crucible_chatterbox_cb(__FILE__, __LINE__, __func__); \ + if (crucible_chatterbox_cb.enabled()) { \ + crucible_chatterbox_cb << __FILE__ << ":" << __LINE__ << ": " << x; \ + } \ +} while (0) + +#define WTF_C(x, y) x##y +#define SRSLY_WTF_C(x, y) WTF_C(x, y) +#define CHATTER_UNWIND(x) \ + crucible::ChatterUnwinder SRSLY_WTF_C(chatterUnwinder_, __LINE__) ([&]() { \ + CHATTER_TRACE(x); \ + }) + +#endif // CRUCIBLE_CHATTER_H diff --git a/include/crucible/crc64.h b/include/crucible/crc64.h new file mode 100644 index 0000000..0bd8d11 --- /dev/null +++ b/include/crucible/crc64.h @@ -0,0 +1,16 @@ +#ifndef CRUCIBLE_CRC64_H +#define CRUCIBLE_CRC64_H + +#include +#include + +namespace crucible { + namespace Digest { + namespace CRC { + uint64_t crc64(const char *s); + uint64_t crc64(const void *p, size_t len); + }; + }; +}; + +#endif diff --git a/include/crucible/error.h b/include/crucible/error.h new file mode 100644 index 0000000..0b57869 --- /dev/null +++ b/include/crucible/error.h @@ -0,0 +1,161 @@ +#ifndef CRUCIBLE_ERROR_H +#define CRUCIBLE_ERROR_H + +// Common error-handling idioms for C library calls + +#include +#include +#include +#include +#include +#include + +#include + +namespace crucible { + using namespace std; + + // Common error-handling idioms for C library calls + + template T die_if_minus_errno(const char *expr, T rv) + { + if (rv < 0) { + throw system_error(error_code(-rv, system_category()), expr); + } + return rv; + } + + template T die_if_minus_one(const char *expr, T rv) + { + if (rv == -1) { + throw system_error(error_code(errno, system_category()), expr); + } + return rv; + } + + template T die_if_zero(const char *expr, T rv) + { + if (rv == 0) { + throw system_error(error_code(errno, system_category()), expr); + } + return rv; + } + + template T die_if_non_zero(const char *expr, T rv) + { + if (rv != 0) { + throw system_error(error_code(errno, system_category()), expr); + } + return rv; + } + 
+ // Usage: catch_all([&]() { /* insert body here */ } ); + // Executes body with exceptions caught and reported to cerr. + // Returns: + // 0 if f() returns + // non-zero if f() throws an exception + // -1 for unknown exception + // 1 for std::exception or class derived thereof + + void set_catch_explainer(function f); + void default_catch_explainer(string s); + int catch_all(const function &f, const function &explainer = default_catch_explainer); + + // catch_and_explain traps the exception, calls the explainer, then rethrows the original exception + void catch_and_explain(const function &f, const function &explainer = default_catch_explainer); +}; + +// Non-negative on success, -errno (a negative value) on error. +// NOTE(review): pthread functions return a positive errno on failure, which this check does not catch - confirm callers. +#define DIE_IF_MINUS_ERRNO(expr) crucible::die_if_minus_errno(#expr, expr) + +// -1 on error, all other values mean success. +#define DIE_IF_MINUS_ONE(expr) crucible::die_if_minus_one(#expr, expr) + +// 0 (or NULL) on error, all other values mean success. +#define DIE_IF_ZERO(expr) crucible::die_if_zero(#expr, expr) + +// 0 (or NULL) on success, all other values mean error. 
+#define DIE_IF_NON_ZERO(expr) crucible::die_if_non_zero(#expr, expr) + +// macro for throwing an error +#define THROW_ERROR(type, expr) do { \ + std::ostringstream _te_oss; \ + _te_oss << expr; \ + throw type(_te_oss.str()); \ +} while (0) + +// macro for throwing a system_error with errno +#define THROW_ERRNO(expr) do { \ + std::ostringstream _te_oss; \ + _te_oss << expr; \ + throw std::system_error(std::error_code(errno, std::system_category()), _te_oss.str()); \ +} while (0) + +// macro for throwing a system_error with some other variable +#define THROW_ERRNO_VALUE(value, expr) do { \ + std::ostringstream _te_oss; \ + _te_oss << expr; \ + throw std::system_error(std::error_code((value), std::system_category()), _te_oss.str()); \ +} while (0) + +// macros for checking a constraint +#define CHECK_CONSTRAINT(value, expr) do { \ + if (!(expr)) { \ + THROW_ERROR(out_of_range, #value << " = " << value << " failed constraint check (" << #expr << ")"); \ + } \ +} while(0) + +#define THROW_CHECK0(type, expr) do { \ + if (!(expr)) { \ + THROW_ERROR(type, "failed constraint check (" << #expr << ")"); \ + } \ +} while(0) + +#define THROW_CHECK1(type, value, expr) do { \ + if (!(expr)) { \ + THROW_ERROR(type, #value << " = " << (value) << " failed constraint check (" << #expr << ")"); \ + } \ +} while(0) + +#define THROW_CHECK2(type, value1, value2, expr) do { \ + if (!(expr)) { \ + THROW_ERROR(type, #value1 << " = " << (value1) << ", " #value2 << " = " << (value2) \ + << " failed constraint check (" << #expr << ")"); \ + } \ +} while(0) + +#define THROW_CHECK3(type, value1, value2, value3, expr) do { \ + if (!(expr)) { \ + THROW_ERROR(type, #value1 << " = " << (value1) << ", " #value2 << " = " << (value2) << ", " #value3 << " = " << (value3) \ + << " failed constraint check (" << #expr << ")"); \ + } \ +} while(0) + +#define THROW_CHECK_BIN_OP(type, value1, op, value2) do { \ + if (!((value1) op (value2))) { \ + THROW_ERROR(type, "failed constraint check " << #value1 << " 
(" << (value1) << ") " << #op << " " << #value2 << " (" << (value2) << ")"); \ + } \ +} while(0) + +#define THROW_CHECK_PREFIX_OP(type, op, value1) do { \ + if (!(op (value1))) { \ + THROW_ERROR(type, "failed constraint check " << #op << " " << #value1 << " (" << (value1) << ")"); \ + } \ +} while(0) + +#define THROW_CHECK_RANGE(type, value_min, value_test, value_max) do { \ + if ((value_test) < (value_min) || (value_max) < (value_test)) { \ + THROW_ERROR(type, "failed constraint check " << #value_min << " (" << (value_min) << ") <= " #value_test << " (" << (value_test) \ + << ") <= " << #value_max << " (" << (value_max) << ")"); \ + } \ +} while(0) + +#define THROW_CHECK_ARRAY_RANGE(type, value_min, value_test, value_max) do { \ + if ((value_test) < (value_min) || !((value_test) < (value_max))) { \ + THROW_ERROR(type, "failed constraint check " << #value_min << " (" << (value_min) << ") <= " #value_test << " (" << (value_test) \ + << ") < " << #value_max << " (" << (value_max) << ")"); \ + } \ +} while(0) + +#endif // CRUCIBLE_ERROR_H diff --git a/include/crucible/execpipe.h b/include/crucible/execpipe.h new file mode 100644 index 0000000..ac14729 --- /dev/null +++ b/include/crucible/execpipe.h @@ -0,0 +1,28 @@ +#ifndef CRUCIBLE_EXECPIPE_H +#define CRUCIBLE_EXECPIPE_H + +#include "crucible/fd.h" + +#include +#include +#include + +namespace crucible { + using namespace std; + + void redirect_stdin(const Fd &child_fd); + void redirect_stdin_stdout(const Fd &child_fd); + void redirect_stdin_stdout_stderr(const Fd &child_fd); + void redirect_stdout(const Fd &child_fd); + void redirect_stdout_stderr(const Fd &child_fd); + + // Open a pipe (actually socketpair) to child process, then execute code in that process. + // e.g. popen([] () { system("echo Hello, World!"); }); + // Forked process will exit when function returns. 
+ Fd popen(function f, function import_fd_fn = redirect_stdin_stdout); + + // Read all the data from fd into a string + string read_all(Fd fd, size_t max_bytes = numeric_limits::max(), size_t chunk_bytes = 4096); +}; + +#endif // CRUCIBLE_EXECPIPE_H diff --git a/include/crucible/extentwalker.h b/include/crucible/extentwalker.h new file mode 100644 index 0000000..6f12182 --- /dev/null +++ b/include/crucible/extentwalker.h @@ -0,0 +1,101 @@ +#ifndef CRUCIBLE_EXTENTWALKER_H +#define CRUCIBLE_EXTENTWALKER_H + +#include "crucible/fd.h" + +namespace crucible { + using namespace std; + + // FIXME: ExtentCursor is probably a better name + struct Extent { + off_t m_begin; + off_t m_end; + uint64_t m_physical; + uint64_t m_flags; + + // Btrfs extent reference details + off_t m_physical_len; + off_t m_logical_len; + off_t m_offset; + + // fiemap flags are uint32_t, so bits 32..63 are OK for us + + // no extent here + static const uint64_t HOLE = (1ULL << 32); + + // extent is physical space full of zeros + static const uint64_t PREALLOC = (1ULL << 33); + + // extent's physical (RAM) size does not match logical (can we know this?) 
+ static const uint64_t OBSCURED = (1ULL << 34); + + operator bool() const; + off_t size() const; + off_t begin() const { return m_begin; } + off_t end() const { return m_end; } + uint64_t flags() const { return m_flags; } + uint64_t physical() const { return m_physical; } + off_t physical_len() const { return m_physical_len; } + off_t logical_len() const { return m_logical_len; } + off_t offset() const { return m_offset; } + bool operator==(const Extent &that) const; + bool operator!=(const Extent &that) const { return !(*this == that); } + + Extent(); + Extent(const Extent &e) = default; + }; + + class ExtentWalker { + public: + using Vec = vector; + using Itr = Vec::iterator; + + protected: + Fd m_fd; + Stat m_stat; + + virtual Vec get_extent_map(off_t pos); + + static const unsigned sc_extent_fetch_max = 64; + static const unsigned sc_extent_fetch_min = 4; + static const off_t sc_step_size = 0x1000 * (sc_extent_fetch_max / 2); + + private: + Vec m_extents; + Itr m_current; + + Itr find_in_cache(off_t pos); + void run_fiemap(off_t pos); + + public: + ExtentWalker(Fd fd = Fd()); + ExtentWalker(Fd fd, off_t initial_pos); + virtual ~ExtentWalker(); + + void reset(); + Extent current(); + bool next(); + bool prev(); + void seek(off_t new_pos); + + friend ostream & operator<<(ostream &os, const ExtentWalker &ew); + }; + + class BtrfsExtentWalker : public ExtentWalker { + uint64_t m_tree_id; + Fd m_root_fd; + + protected: + Vec get_extent_map(off_t pos) override; + + public: + BtrfsExtentWalker(Fd fd); + BtrfsExtentWalker(Fd fd, off_t initial_pos); + BtrfsExtentWalker(Fd fd, off_t initial_pos, Fd root_fd); + void set_root_fd(Fd fd); + }; + + ostream &operator<<(ostream &os, const Extent &e); +}; + +#endif // CRUCIBLE_EXTENTWALKER_H diff --git a/include/crucible/fd.h b/include/crucible/fd.h new file mode 100644 index 0000000..bd4265e --- /dev/null +++ b/include/crucible/fd.h @@ -0,0 +1,178 @@ +#ifndef CRUCIBLE_FD_H +#define CRUCIBLE_FD_H + +#include 
"crucible/resource.h" + +#include + +#include +#include + +// open +#include +#include +#include + +// socket +#include + +// pread/pwrite +#include + +namespace crucible { + using namespace std; + + // IOHandle is a file descriptor owner object. It closes them when destroyed. + // Most of the functions here don't use it because these functions don't own FDs. + // All good names for such objects are taken. + class IOHandle { + IOHandle(const IOHandle &) = delete; + IOHandle(IOHandle &&) = delete; + IOHandle& operator=(IOHandle &&) = delete; + IOHandle& operator=(const IOHandle &) = delete; + protected: + int m_fd; + IOHandle& operator=(int that) { m_fd = that; return *this; } + public: + virtual ~IOHandle(); + IOHandle(int fd); + IOHandle(); + + void close(); + int get_fd() const { return m_fd; } + int release_fd(); + }; + + template <> + struct ResourceTraits { + int get_key(const IOHandle &res) const { return res.get_fd(); } + shared_ptr make_resource(int fd) const { return make_shared(fd); } + bool is_null_key(const int &key) const { return key < 0; } + int get_null_key() const { return -1; } + }; + + typedef ResourceHandle Fd; + + // Functions named "foo_or_die" throw exceptions on failure. 
+ + // Attempt to open the file with the given mode + int open_or_die(const string &file, int flags = O_RDONLY, mode_t mode = 0777); + int openat_or_die(int dir_fd, const string &file, int flags = O_RDONLY, mode_t mode = 0777); + + // Decode open parameters + string o_flags_ntoa(int flags); + string o_mode_ntoa(mode_t mode); + + // mmap with its one weird error case + void *mmap_or_die(void *addr, size_t length, int prot, int flags, int fd, off_t offset); + // Decode mmap parameters + string mmap_prot_ntoa(int prot); + string mmap_flags_ntoa(int flags); + + // Unlink, rename + void unlink_or_die(const string &file); + void rename_or_die(const string &from, const string &to); + void renameat_or_die(int fromfd, const string &frompath, int tofd, const string &topath); + + // Read or write structs: + // There is a template specialization to read or write strings + // Three-arg version of read_or_die/write_or_die throws an error on incomplete read/writes + // Four-arg version returns number of bytes read/written through reference arg + + void read_or_die(int fd, void *buf, size_t size); + template void read_or_die(int fd, T& buf) + { + return read_or_die(fd, static_cast(&buf), sizeof(buf)); + } + + void read_partial_or_die(int fd, void *buf, size_t size_wanted, size_t &size_read); + template void read_partial_or_die(int fd, T& buf, size_t &size_read) + { + return read_partial_or_die(fd, static_cast(&buf), sizeof(buf), size_read); + } + + void pread_or_die(int fd, void *buf, size_t size, off_t offset); + template void pread_or_die(int fd, T& buf, off_t offset) + { + return pread_or_die(fd, static_cast(&buf), sizeof(buf), offset); + } + + void write_or_die(int fd, const void *buf, size_t size); + template void write_or_die(int fd, const T& buf) + { + return write_or_die(fd, static_cast(&buf), sizeof(buf)); + } + + void write_partial_or_die(int fd, const void *buf, size_t size_wanted, size_t &size_written); + template void write_partial_or_die(int fd, const T& buf, size_t 
&size_written) + { + return write_partial_or_die(fd, static_cast(&buf), sizeof(buf), size_written); + } + + void pwrite_or_die(int fd, const void *buf, size_t size, off_t offset); + template void pwrite_or_die(int fd, const T& buf, off_t offset) + { + return pwrite_or_die(fd, static_cast(&buf), sizeof(buf), offset); + } + + // Specialization for strings which reads/writes the string content, not the struct string + template<> void write_or_die(int fd, const string& str); + template<> void pread_or_die(int fd, string& str, off_t offset); + template<> void pread_or_die>(int fd, vector& str, off_t offset); + template<> void pread_or_die>(int fd, vector& str, off_t offset); + + // A different approach to reading a simple string + string read_string(int fd, size_t size); + + // A lot of Unix API wants you to initialize a struct and call + // one function to fill it, another function to throw it away, + // and has some unknown third thing you have to do when there's + // an error. That's also a C++ object with an exception-throwing + // constructor. + struct Stat : public stat { + Stat(); + Stat(int f); + Stat(const string &filename); + Stat &fstat(int fd); + Stat &lstat(const string &filename); + }; + + string st_mode_ntoa(mode_t mode); + + // Because it's not trivial to do correctly + string readlink_or_die(const string &path); + + // Determine the name of a FD by readlink through /proc/self/fd/ + string name_fd(int fd); + + // Returns Fd objects because it does own them. + pair socketpair_or_die(int domain = AF_UNIX, int type = SOCK_STREAM, int protocol = 0); + + // like unique_lock but for flock instead of mutexes...and not trying + // to hide the many and subtle differences between those two things *at all*. 
+ class Flock { + int m_fd; + bool m_locked; + Flock(const Flock &) = delete; + Flock(Flock &&) = delete; + Flock &operator=(const Flock &) = delete; + Flock &operator=(Flock &&) = delete; + public: + Flock(); + Flock(int fd); + Flock(int fd, bool init_locked_state); + ~Flock(); + void lock(); + void try_lock(); + void unlock(); + bool owns_lock(); + operator bool(); + int fd(); + }; + + // Doesn't use Fd objects because it's usually just used to replace stdin/stdout/stderr. + void dup2_or_die(int fd_in, int fd_out); + +} + +#endif // CRUCIBLE_FD_H diff --git a/include/crucible/fs.h b/include/crucible/fs.h new file mode 100644 index 0000000..503c0c7 --- /dev/null +++ b/include/crucible/fs.h @@ -0,0 +1,246 @@ +#ifndef CRUCIBLE_FS_H +#define CRUCIBLE_FS_H + +#include "crucible/error.h" + +// Terribly Linux-specific FS-wrangling functions + +// BTRFS +#include "crucible/btrfs.h" + +// FIEMAP_* structs and flags +#include + +#include +#include +#include + +#include +#include + +namespace crucible { + using namespace std; + + // wrapper around fallocate(...FALLOC_FL_PUNCH_HOLE...) 
+ void punch_hole(int fd, off_t offset, off_t len); + + struct BtrfsExtentInfo : public btrfs_ioctl_same_extent_info { + BtrfsExtentInfo(int dst_fd, off_t dst_offset); + }; + + struct BtrfsExtentSame : public btrfs_ioctl_same_args { + virtual ~BtrfsExtentSame(); + BtrfsExtentSame(int src_fd, off_t src_offset, off_t src_length); + void add(int fd, off_t offset); + virtual void do_ioctl(); + + int m_fd; + vector m_info; + }; + + struct BtrfsExtentSameByClone : public BtrfsExtentSame { + using BtrfsExtentSame::BtrfsExtentSame; + void do_ioctl() override; + }; + + ostream & operator<<(ostream &os, const btrfs_ioctl_same_extent_info *info); + ostream & operator<<(ostream &os, const btrfs_ioctl_same_args *info); + ostream & operator<<(ostream &os, const BtrfsExtentSame &bes); + + struct BtrfsInodeOffsetRoot { + uint64_t m_inum; + uint64_t m_offset; + uint64_t m_root; + }; + + ostream & operator<<(ostream &os, const BtrfsInodeOffsetRoot &p); + + struct BtrfsDataContainer : public btrfs_data_container { + BtrfsDataContainer(size_t size = 64 * 1024); + void *prepare(); + + size_t get_size() const; + decltype(bytes_left) get_bytes_left() const; + decltype(bytes_missing) get_bytes_missing() const; + decltype(elem_cnt) get_elem_cnt() const; + decltype(elem_missed) get_elem_missed() const; + + vector m_data; + }; + + struct BtrfsIoctlLogicalInoArgs : public btrfs_ioctl_logical_ino_args { + BtrfsIoctlLogicalInoArgs(uint64_t logical, size_t buf_size = 64 * 1024); + virtual void do_ioctl(int fd); + virtual bool do_ioctl_nothrow(int fd); + + BtrfsDataContainer m_container; + vector m_iors; + }; + + ostream & operator<<(ostream &os, const BtrfsIoctlLogicalInoArgs &p); + + struct BtrfsIoctlInoPathArgs : public btrfs_ioctl_ino_path_args { + BtrfsIoctlInoPathArgs(uint64_t inode, size_t buf_size = 64 * 1024); + virtual void do_ioctl(int fd); + virtual bool do_ioctl_nothrow(int fd); + + BtrfsDataContainer m_container; + vector m_paths; + }; + + ostream & operator<<(ostream &os, const 
BtrfsIoctlInoPathArgs &p); + + struct BtrfsIoctlInoLookupArgs : public btrfs_ioctl_ino_lookup_args { + BtrfsIoctlInoLookupArgs(uint64_t objectid); + virtual void do_ioctl(int fd); + virtual bool do_ioctl_nothrow(int fd); + // use objectid = BTRFS_FIRST_FREE_OBJECTID + // this->treeid is the rootid for the path (we get the path too) + }; + + struct BtrfsIoctlDefragRangeArgs : public btrfs_ioctl_defrag_range_args { + BtrfsIoctlDefragRangeArgs(); + virtual void do_ioctl(int fd); + virtual bool do_ioctl_nothrow(int fd); + }; + + ostream & operator<<(ostream &os, const BtrfsIoctlDefragRangeArgs *p); + + // in btrfs/ctree.h, but that's a nightmare to #include here + typedef enum { + BTRFS_COMPRESS_NONE = 0, + BTRFS_COMPRESS_ZLIB = 1, + BTRFS_COMPRESS_LZO = 2, + BTRFS_COMPRESS_TYPES = 2, + BTRFS_COMPRESS_LAST = 3, + } btrfs_compression_type; + + struct FiemapExtent : public fiemap_extent { + FiemapExtent(); + FiemapExtent(const fiemap_extent &that); + operator bool() const; + off_t begin() const; + off_t end() const; + }; + + struct Fiemap : public fiemap { + + // Get entire file + Fiemap(uint64_t start = 0, uint64_t length = FIEMAP_MAX_OFFSET); + + void do_ioctl(int fd); + + vector m_extents; + uint64_t m_min_count = (4096 - sizeof(fiemap)) / sizeof(fiemap_extent); + uint64_t m_max_count = 16 * 1024 * 1024 / sizeof(fiemap_extent); + }; + + ostream & operator<<(ostream &os, const fiemap_extent *info); + ostream & operator<<(ostream &os, const FiemapExtent &info); + ostream & operator<<(ostream &os, const fiemap *info); + ostream & operator<<(ostream &os, const Fiemap &info); + + string fiemap_extent_flags_ntoa(unsigned long flags); + + // Helper functions + void btrfs_clone_range(int src_fd, off_t src_offset, off_t src_length, int dst_fd, off_t dst_offset); + bool btrfs_extent_same(int src_fd, off_t src_offset, off_t src_length, int dst_fd, off_t dst_offset); + + struct BtrfsIoctlSearchHeader : public btrfs_ioctl_search_header { + BtrfsIoctlSearchHeader(); + vector 
m_data; + size_t set_data(const vector &v, size_t offset); + }; + + ostream & operator<<(ostream &os, const btrfs_ioctl_search_header &hdr); + ostream & operator<<(ostream &os, const BtrfsIoctlSearchHeader &hdr); + + struct BtrfsIoctlSearchKey : public btrfs_ioctl_search_key { + BtrfsIoctlSearchKey(size_t buf_size = 1024 * 1024); + virtual bool do_ioctl_nothrow(int fd); + virtual void do_ioctl(int fd); + + // Copy objectid/type/offset so we move forward + void next_min(const BtrfsIoctlSearchHeader& ref); + + size_t m_buf_size; + vector m_result; + }; + + ostream & operator<<(ostream &os, const btrfs_ioctl_search_key &key); + ostream & operator<<(ostream &os, const BtrfsIoctlSearchKey &key); + + string btrfs_search_type_ntoa(unsigned type); + string btrfs_search_objectid_ntoa(unsigned objectid); + + uint64_t btrfs_get_root_id(int fd); + uint64_t btrfs_get_root_transid(int fd); + + template + const T* + get_struct_ptr(vector &v, size_t offset = 0) + { + // OK so sometimes btrfs overshoots a little + if (offset + sizeof(T) > v.size()) { + v.resize(offset + sizeof(T), 0); + } + THROW_CHECK2(invalid_argument, v.size(), offset + sizeof(T), offset + sizeof(T) <= v.size()); + return reinterpret_cast(v.data() + offset); + } + + template + R + call_btrfs_get(R (*func)(const A*), vector &v, size_t offset = 0) + { + return func(get_struct_ptr(v, offset)); + } + + template struct btrfs_get_le; + + template<> struct btrfs_get_le<__le64> { + uint64_t operator()(const void *p) { return get_unaligned_le64(p); } + }; + + template<> struct btrfs_get_le<__le32> { + uint32_t operator()(const void *p) { return get_unaligned_le32(p); } + }; + + template<> struct btrfs_get_le<__le16> { + uint16_t operator()(const void *p) { return get_unaligned_le16(p); } + }; + + template<> struct btrfs_get_le<__le8> { + uint8_t operator()(const void *p) { return get_unaligned_le8(p); } + }; + + template + T + btrfs_get_member(T S::* member, vector &v, size_t offset = 0) + { + const S *sp = 
reinterpret_cast(NULL); + const T *spm = &(sp->*member); + auto member_offset = reinterpret_cast(spm) - reinterpret_cast(sp); + return btrfs_get_le()(get_struct_ptr(v, offset + member_offset)); + } + + struct Statvfs : public statvfs { + Statvfs(); + Statvfs(string path); + Statvfs(int fd); + unsigned long size() const; + unsigned long free() const; + unsigned long available() const; + }; + + ostream &hexdump(ostream &os, const vector &v); + + struct BtrfsIoctlFsInfoArgs : public btrfs_ioctl_fs_info_args { + BtrfsIoctlFsInfoArgs(); + void do_ioctl(int fd); + string uuid() const; + }; + + ostream & operator<<(ostream &os, const BtrfsIoctlFsInfoArgs &a); +}; + +#endif // CRUCIBLE_FS_H diff --git a/include/crucible/interp.h b/include/crucible/interp.h new file mode 100644 index 0000000..3f61224 --- /dev/null +++ b/include/crucible/interp.h @@ -0,0 +1,106 @@ +#ifndef CRUCIBLE_INTERP_H +#define CRUCIBLE_INTERP_H + +#include "crucible/error.h" + +#include +#include +#include +#include + +namespace crucible { + using namespace std; + + struct ArgList : public vector { + ArgList(const char **argv); + // using vector::vector ... 
doesn't work: + // error: ‘std::vector >::vector’ names constructor + // Still doesn't work in 4.9 because it can't manage a conversion + ArgList(const vector &&that); + }; + + struct ArgActor { + struct ArgActorBase { + virtual void predicate(void *obj, string arg); + }; + + template + struct ArgActorDerived { + function m_func; + + ArgActorDerived(decltype(m_func) func) : + m_func(func) + { + } + + void predicate(void *obj, string arg) override + { + T &op = *(reinterpret_cast(obj)); + m_func(op, obj); + } + }; + + template + ArgActor(T, function func) : + m_actor(make_shared(ArgActorDerived(func))) + { + } + + ArgActor() = default; + + void predicate(void *t, string arg) + { + if (m_actor) { + m_actor->predicate(t, arg); + } else { + THROW_ERROR(invalid_argument, "null m_actor for predicate arg '" << arg << "'"); + } + } + + private: + shared_ptr m_actor; + }; + + struct ArgParser { + ~ArgParser(); + ArgParser(); + + void add_opt(string opt, ArgActor actor); + + template + void + parse(T t, const ArgList &args) + { + void *vt = &t; + parse_backend(vt, args); + } + + private: + void parse_backend(void *t, const ArgList &args); + map m_string_opts; + }; + + struct Command { + virtual ~Command(); + virtual int exec(const ArgList &args) = 0; + }; + + struct Proc : public Command { + int exec(const ArgList &args) override; + Proc(const function &f); + private: + function m_cmd; + }; + + struct Interp { + virtual ~Interp(); + Interp(const map > &cmdlist); + void add_command(const string &name, const shared_ptr &command); + int exec(const ArgList &args); + private: + Interp(const Interp &) = delete; + map > m_commands; + }; + +}; +#endif // CRUCIBLE_INTERP_H diff --git a/include/crucible/limits.h b/include/crucible/limits.h new file mode 100644 index 0000000..108fd78 --- /dev/null +++ b/include/crucible/limits.h @@ -0,0 +1,51 @@ +#ifndef CRUCIBLE_LIMITS_H +#define CRUCIBLE_LIMITS_H + +#include "crucible/error.h" + +#include +#include + +namespace crucible { + using 
namespace std; + + template + To + ranged_cast(From f) + { + if (typeid(From) == typeid(To)) { + return f; + } + + To t; + static string f_info = typeid(f).name(); + static string t_info = typeid(t).name(); + + if (numeric_limits::max() > numeric_limits::max() && numeric_limits::max() < numeric_limits::max()) { + THROW_ERROR(out_of_range, + "ranged_cast: can't compare limits of types " << f_info << " and " << t_info << ", template specialization required"); + } + + if (numeric_limits::max() > numeric_limits::max() && f > static_cast(numeric_limits::max())) { + THROW_ERROR(out_of_range, + "ranged_cast: " << f_info << "(" << f << ") out of range of target type " << t_info); + } + + if (!numeric_limits::is_signed && numeric_limits::is_signed && f < 0) { + THROW_ERROR(out_of_range, + "ranged_cast: " << f_info << "(" << f << ") out of range of unsigned target type " << t_info); + } + + t = static_cast(f); + + From f2 = static_cast(t); + if (f2 != f) { + THROW_ERROR(out_of_range, + "ranged_cast: " << f_info << "(" << f << ") -> " << t_info << " failed: result value " << f2); + } + + return t; + } +}; + +#endif // CRUCIBLE_LIMITS_H diff --git a/include/crucible/lockset.h b/include/crucible/lockset.h new file mode 100644 index 0000000..0524f7f --- /dev/null +++ b/include/crucible/lockset.h @@ -0,0 +1,210 @@ +#ifndef CRUCIBLE_LOCKSET_H +#define CRUCIBLE_LOCKSET_H + +#include + +#include + +#include +#include +#include +#include + +namespace crucible { + using namespace std; + + template + class LockSet { + + public: + using key_type = T; + using set_type = set; + + private: + + set_type m_set; + mutex m_mutex; + condition_variable m_condvar; + + public: + ~LockSet(); + LockSet() = default; + + void lock(const key_type &name); + void unlock(const key_type &name); + bool try_lock(const key_type &name); + size_t size(); + bool empty(); + set_type copy(); + void wait_unlock(double interval); + + class Lock { + LockSet &m_lockset; + key_type m_name; + bool m_locked; + + Lock() = 
delete; + Lock(const Lock &) = delete; + Lock& operator=(const Lock &) = delete; + public: + ~Lock(); + Lock(LockSet &lockset, const key_type &m_name, bool start_locked = true); + Lock(Lock &&that); + Lock& operator=(Lock &&that); + void lock(); + void unlock(); + bool try_lock(); + }; + + }; + + template + LockSet::~LockSet() + { + if (!m_set.empty()) { + cerr << "ERROR: " << m_set.size() << " locked items still in set at destruction" << endl; + } + // We will crash later. Might as well crash now. + assert(m_set.empty()); + } + + template + void + LockSet::lock(const key_type &name) + { + unique_lock lock(m_mutex); + while (m_set.count(name)) { + m_condvar.wait(lock); + } + auto rv = m_set.insert(name); + THROW_CHECK0(runtime_error, rv.second); + } + + template + bool + LockSet::try_lock(const key_type &name) + { + unique_lock lock(m_mutex); + if (m_set.count(name)) { + return false; + } + auto rv = m_set.insert(name); + THROW_CHECK1(runtime_error, name, rv.second); + return true; + } + + template + void + LockSet::unlock(const key_type &name) + { + unique_lock lock(m_mutex); + m_condvar.notify_all(); + auto erase_count = m_set.erase(name); + THROW_CHECK1(invalid_argument, erase_count, erase_count == 1); + } + + template + void + LockSet::wait_unlock(double interval) + { + unique_lock lock(m_mutex); + if (m_set.empty()) return; + m_condvar.wait_for(lock, chrono::duration(interval)); + } + + template + size_t + LockSet::size() + { + unique_lock lock(m_mutex); + return m_set.size(); + } + + template + bool + LockSet::empty() + { + unique_lock lock(m_mutex); + return m_set.empty(); + } + + template + typename LockSet::set_type + LockSet::copy() + { + unique_lock lock(m_mutex); + return m_set; + } + + template + void + LockSet::Lock::lock() + { + if (m_locked) return; + m_lockset.lock(m_name); + m_locked = true; + } + + template + bool + LockSet::Lock::try_lock() + { + if (m_locked) return true; + m_locked = m_lockset.try_lock(m_name); + return m_locked; + } + + 
template + void + LockSet::Lock::unlock() + { + if (!m_locked) return; + m_lockset.unlock(m_name); + m_locked = false; + } + + template + LockSet::Lock::~Lock() + { + if (m_locked) { + unlock(); + } + } + + template + LockSet::Lock::Lock(LockSet &lockset, const key_type &name, bool start_locked) : + m_lockset(lockset), + m_name(name), + m_locked(false) + { + if (start_locked) { + lock(); + } + } + + template + LockSet::Lock::Lock(Lock &&that) : + m_lockset(that.m_lockset), // the member is m_lockset; 'that.lockset' does not exist + m_name(that.m_name), + m_locked(that.m_locked) + { + that.m_locked = false; // ownership of the lock (if any) moved to this object + } + + template + typename LockSet::Lock & + LockSet::Lock::operator=(Lock &&that) + { + THROW_CHECK2(invalid_argument, &m_lockset, &that.m_lockset, &m_lockset == &that.m_lockset); + if (m_locked && (that.m_name != m_name || !that.m_locked)) { // also release when 'that' holds no lock, else ours would leak + unlock(); + } + m_name = that.m_name; + m_locked = that.m_locked; + if (this != &that) that.m_locked = false; // self-move must not drop the flag of a lock we still hold + return *this; + } + +} + +#endif // CRUCIBLE_LOCKSET_H diff --git a/include/crucible/ntoa.h b/include/crucible/ntoa.h new file mode 100644 index 0000000..24bc2b0 --- /dev/null +++ b/include/crucible/ntoa.h @@ -0,0 +1,28 @@ +#ifndef CRUCIBLE_NTOA_H +#define CRUCIBLE_NTOA_H + +#include + +namespace crucible { + using namespace std; + + struct bits_ntoa_table { + unsigned long n; + unsigned long mask; + const char *a; + }; + + string bits_ntoa(unsigned long n, const bits_ntoa_table *a); + +}; + +// Combinations of bits (list multiple-bit entries first) +#define NTOA_TABLE_ENTRY_BITS(x) { .n = (x), .mask = (x), .a = (#x) } + +// Enumerations (entire value matches all bits) +#define NTOA_TABLE_ENTRY_ENUM(x) { .n = (x), .mask = ~0UL, .a = (#x) } + +// End of table (sorry, gcc doesn't implement this) +#define NTOA_TABLE_ENTRY_END() { .n = 0, .mask = 0, .a = nullptr } + +#endif // CRUCIBLE_NTOA_H diff --git a/include/crucible/path.h b/include/crucible/path.h new file mode 100644 index 0000000..c5ecf9c --- /dev/null +++ b/include/crucible/path.h @@ -0,0 +1,13 @@ +#ifndef CRUCIBLE_PATH_H +#define CRUCIBLE_PATH_H + 
+#include + +namespace crucible { + using namespace std; + + string basename(string s); + string join(string dir, string base); +}; + +#endif // CRUCIBLE_PATH_H diff --git a/include/crucible/process.h b/include/crucible/process.h new file mode 100644 index 0000000..f986f43 --- /dev/null +++ b/include/crucible/process.h @@ -0,0 +1,78 @@ +#ifndef CRUCIBLE_PROCESS_H +#define CRUCIBLE_PROCESS_H + +#include "crucible/resource.h" + +#include +#include + +#include +#include +#include + +namespace crucible { + using namespace std; + + // Like thread, but for processes. + // TODO: thread has a few warts for this usage: + // - can't create one from its native_handle, + // - can't destroy one without joining/detaching it first + // - can't implement detach correctly without crossing threshold of insanity + // - WTF is native_handle() not const? + struct Process { + // These parts are for compatibility with std::thread + + using id = ::pid_t; + using native_handle_type = ::pid_t; + + ~Process(); + Process(); + + template + Process(Fn fn, Args... 
args) : + Process() + { + do_fork(function([&]() { return fn(args...); })); + } + + Process(const Process &) = delete; + Process(Process &&move_from); + + bool joinable(); + void detach(); + native_handle_type native_handle(); + id get_id(); + + // Modified thread members for Process + + // join() calls waitpid(), returns status or exception (std::thread returns void) + using status_type = int; + status_type join(); + + // New members for Process + + // kill() terminates a process in the usual Unix way + void kill(int sig = SIGTERM); + + // take over ownership of an already-forked native process handle + Process(id pid); + + private: + id m_pid; + + void do_fork(function); + }; + + template <> + struct ResourceTraits { + Process::id get_key(const Process &res) const { return (const_cast(res)).native_handle(); } + shared_ptr make_resource(const Process::id &id) const { return make_shared(id); } + bool is_null_key(const Process::id &key) const { return !key; } + Process::id get_null_key() const { return 0; } + }; + + typedef ResourceHandle Pid; + + pid_t gettid(); +} +#endif // CRUCIBLE_PROCESS_H diff --git a/include/crucible/resource.h b/include/crucible/resource.h new file mode 100644 index 0000000..7c2e8d5 --- /dev/null +++ b/include/crucible/resource.h @@ -0,0 +1,387 @@ +#ifndef CRUCIBLE_RESOURCE_H +#define CRUCIBLE_RESOURCE_H + +#include "crucible/error.h" + +#include +#include +#include +#include +#include + +namespace crucible { + using namespace std; + + // Template classes for non-copiable resource owner objects + // for objects with process-wide unique names. + + // Everything we need to know about Key and Resource. + // Specialize this template for your Resource class. + template + struct ResourceTraits { + // How to get the Key out of a Resource owner. + // If the owner owns no resource, returns "null" for "no Resource." + Key get_key(const Resource &res) const; + + // How to construct a new Resource owner given _only_ the key. 
+ // Usually just calls make_shared(key). + shared_ptr make_resource(const Key &key) const; + + // Test a Key value to see if it is null (no active Resource has this Key value). + // Usually an equality test with get_null_key(), but sometimes many Key values are equivalent to null. + bool is_null_key(const Key &key) const; + + // is_null_key(get_null_key()) == true + Key get_null_key() const; + }; + + template + class ResourceHandle { + public: + using key_type = Key; + using resource_type = Resource; + using resource_ptr_type = shared_ptr; + + private: + using traits_type = ResourceTraits; + + class ResourceHolder { + resource_ptr_type m_ptr; + public: + ~ResourceHolder(); + ResourceHolder(resource_ptr_type that); + ResourceHolder(const ResourceHolder &that) = default; + ResourceHolder(ResourceHolder &&that) = default; + ResourceHolder& operator=(ResourceHolder &&that) = default; + ResourceHolder& operator=(const ResourceHolder &that) = default; + resource_ptr_type get_resource_ptr() const; + }; + + using holder_ptr_type = shared_ptr; + using weak_holder_ptr_type = weak_ptr; + using map_type = map; + + // The only instance variable + holder_ptr_type m_ptr; + + // A bunch of static variables and functions + static mutex &s_mutex(); + static shared_ptr s_map(); + static holder_ptr_type insert(const key_type &key); + static holder_ptr_type insert(const resource_ptr_type &res); + static void erase(const key_type &key); + static ResourceTraits s_traits; + + public: + + // test for resource. A separate operator because key_type could be confused with bool. 
+ bool operator!() const; + + // get key_type for an active resource or null + key_type get_key() const; + + // conversion/assignment to and from key_type + operator key_type() const; + ResourceHandle(const key_type &key); + ResourceHandle& operator=(const key_type &key); + + // conversion to/from resource_ptr_type + ResourceHandle(const resource_ptr_type &res); + ResourceHandle& operator=(const resource_ptr_type &res); + + // default constructor is public + ResourceHandle() = default; + + // forward anything else to the Resource constructor + // if we can do so unambiguously + template + ResourceHandle(A1 a1, A2 a2, Args... args) : ResourceHandle( make_shared(a1, a2, args...) ) + { + } + + // forward anything else to a Resource factory method + template + static + ResourceHandle + make(Args... args) { + return ResourceHandle( make_shared(args...) ); + } + + // get pointer to Resource object (nothrow, result may be null) + resource_ptr_type get_resource_ptr() const; + // this version throws and is probably not thread safe + resource_ptr_type operator->() const; + + // dynamic casting of the resource (throws if cast fails) + template shared_ptr cast() const; + }; + + template + Key + ResourceTraits::get_key(const Resource &res) const + { + return res.get_key(); + } + + template + shared_ptr + ResourceTraits::make_resource(const Key &key) const + { + return make_shared(key); + } + + template + bool + ResourceTraits::is_null_key(const Key &key) const + { + return !key; + } + + template + Key + ResourceTraits::get_null_key() const + { + return NULL; + } + + template + ResourceHandle::ResourceHolder::ResourceHolder(resource_ptr_type that) : + m_ptr(that) + { + // Cannot insert ourselves here since our shared_ptr does not exist yet. 
+ } + + template + mutex & + ResourceHandle::s_mutex() + { + static mutex gcc_won_t_instantiate_this_either; + return gcc_won_t_instantiate_this_either; + } + + template + shared_ptr::map_type> + ResourceHandle::s_map() + { + static shared_ptr gcc_won_t_instantiate_the_damn_static_vars; + if (!gcc_won_t_instantiate_the_damn_static_vars) { + gcc_won_t_instantiate_the_damn_static_vars = make_shared(); + } + return gcc_won_t_instantiate_the_damn_static_vars; + } + + template + void + ResourceHandle::erase(const key_type &key) + { + unique_lock lock(s_mutex()); + // Resources are allowed to set their Keys to null. + if (s_traits.is_null_key(key)) { + // Clean out any dead weak_ptr objects. + for (auto i = s_map()->begin(); i != s_map()->end(); ) { + if (! (*i).second.lock()) { + i = s_map()->erase(i); + } else { + ++i; + } + } + return; + } + auto erased = s_map()->erase(key); + if (erased != 1) { + cerr << __PRETTY_FUNCTION__ << ": WARNING: s_map()->erase(" << key << ") returned " << erased << " != 1" << endl; + } + } + + template + ResourceHandle::ResourceHolder::~ResourceHolder() + { + if (!m_ptr) { + // Probably something harmless like a failed constructor. 
+ cerr << __PRETTY_FUNCTION__ << ": WARNING: destroying null m_ptr" << endl; + return; + } + Key key = s_traits.get_key(*m_ptr); + ResourceHandle::erase(key); + } + + template + typename ResourceHandle::holder_ptr_type + ResourceHandle::insert(const key_type &key) + { + // no Resources for null keys + if (s_traits.is_null_key(key)) { + return holder_ptr_type(); + } + unique_lock lock(s_mutex()); + // find ResourceHolder for non-null key + auto found = s_map()->find(key); + if (found != s_map()->end()) { + holder_ptr_type rv = (*found).second.lock(); + // a weak_ptr may have expired + if (rv) { + return rv; + } + } + // not found or expired, throw any existing ref away and make a new one + resource_ptr_type rpt = s_traits.make_resource(key); + holder_ptr_type hpt = make_shared(rpt); + // store weak_ptr in map + (*s_map())[key] = hpt; + // return shared_ptr + return hpt; + }; + + template + typename ResourceHandle::holder_ptr_type + ResourceHandle::insert(const resource_ptr_type &res) + { + // no Resource, no ResourceHolder. + if (!res) { + return holder_ptr_type(); + } + // no ResourceHolders for null keys either. + key_type key = s_traits.get_key(*res); + if (s_traits.is_null_key(key)) { + return holder_ptr_type(); + } + unique_lock lock(s_mutex()); + // find ResourceHolder for non-null key + auto found = s_map()->find(key); + if (found != s_map()->end()) { + holder_ptr_type rv = (*found).second.lock(); + // The map doesn't own the ResourceHolders, the ResourceHandles do. + // It's OK for the map to contain an expired weak_ptr to some dead ResourceHolder... + if (rv) { + // found ResourceHolder, look at pointer + resource_ptr_type rp = rv->get_resource_ptr(); + // We do not store references to null Resources. + assert(rp); + // Key retrieved for an existing object must match key searched or be null. 
+ key_type found_key = s_traits.get_key(*rp); + bool found_key_is_null = s_traits.is_null_key(found_key); + assert(found_key_is_null || found_key == key); + if (!found_key_is_null) { + // We do not store references to duplicate resources. + if (rp.owner_before(res) || res.owner_before(rp)) { + cerr << "inserting new Resource with existing Key " << key << " not allowed at " << __PRETTY_FUNCTION__ << endl; + abort(); + // THROW_ERROR(out_of_range, "inserting new Resource with existing Key " << key << " not allowed at " << __PRETTY_FUNCTION__); + } + // rv is good, return it + return rv; + } + } + } + // not found or expired, make a new one + holder_ptr_type rv = make_shared(res); + (*s_map())[key] = rv; // operator[] overwrites a stale entry; map::insert would silently keep the old weak_ptr + // no need to check s_map result, we are either replacing a dead weak_ptr or adding a new one + return rv; + }; + + template + ResourceHandle::ResourceHandle(const key_type &key) + { + m_ptr = insert(key); + } + + template + ResourceHandle& + ResourceHandle::operator=(const key_type &key) + { + m_ptr = insert(key); + return *this; + } + + template + ResourceHandle::ResourceHandle(const resource_ptr_type &res) + { + m_ptr = insert(res); + } + + template + ResourceHandle& + ResourceHandle::operator=(const resource_ptr_type &res) + { + m_ptr = insert(res); + return *this; + } + + template + typename ResourceHandle::resource_ptr_type + ResourceHandle::ResourceHolder::get_resource_ptr() const + { + return m_ptr; + } + + template + typename ResourceHandle::resource_ptr_type + ResourceHandle::get_resource_ptr() const + { + if (!m_ptr) { + return resource_ptr_type(); + } + return m_ptr->get_resource_ptr(); + } + + template + typename ResourceHandle::resource_ptr_type + ResourceHandle::operator->() const + { + resource_ptr_type rp = get_resource_ptr(); + if (!rp) { + THROW_ERROR(out_of_range, __PRETTY_FUNCTION__ << " called on null Resource"); + } + return rp; + } + + template + template + shared_ptr + ResourceHandle::cast() const + { + 
shared_ptr dp; + resource_ptr_type rp = get_resource_ptr(); + if (!rp) { + return dp; + } + dp = dynamic_pointer_cast(rp); + if (!dp) { + throw bad_cast(); + } + return dp; + } + + template + typename ResourceHandle::key_type + ResourceHandle::get_key() const + { + resource_ptr_type rp = get_resource_ptr(); + if (!rp) { + return s_traits.get_null_key(); + } else { + return s_traits.get_key(*rp); + } + } + + template + ResourceHandle::operator key_type() const + { + return get_key(); + } + + template + bool + ResourceHandle::operator!() const + { + return s_traits.is_null_key(operator key_type()); + } + + template + ResourceTraits ResourceHandle::s_traits; + + +} + +#endif // RESOURCE_H diff --git a/include/crucible/string.h b/include/crucible/string.h new file mode 100644 index 0000000..94fc724 --- /dev/null +++ b/include/crucible/string.h @@ -0,0 +1,67 @@ +#ifndef CRUCIBLE_STRING_H +#define CRUCIBLE_STRING_H + +#include "crucible/error.h" + +#include +#include +#include +#include + +namespace crucible { + using namespace std; + + // Zero-initialize a base class object (usually a C struct) + template + void + memset_zero(Base *that) + { + memset(that, 0, sizeof(Base)); + } + + // Copy a base class object (usually a C struct) into a vector + template + vector + vector_copy_struct(Base *that) + { + const char *begin_that = reinterpret_cast(static_cast(that)); + return vector(begin_that, begin_that + sizeof(Base)); + } + + // int->hex conversion with sprintf + string to_hex(uint64_t i); + + // hex->int conversion with stoull + uint64_t from_hex(const string &s); + + // asprintf with string output and exceptions + template + string + astringprintf(const char *fmt, Args... args) + { + char *rv = NULL; + DIE_IF_MINUS_ONE(asprintf(&rv, fmt, args...)); + string rv_string = rv; + free(rv); + return rv_string; + } + + template + string + astringprintf(const string &fmt, Args... 
args) + { + return astringprintf(fmt.c_str(), args...); + } + + vector split(string delim, string s); + + // Shut up and give me the difference between two pointers + template + ptrdiff_t + pointer_distance(const P1 *a, const P2 *b) + { + return reinterpret_cast(a) - reinterpret_cast(b); + } +}; + +#endif // CRUCIBLE_STRING_H diff --git a/include/crucible/time.h b/include/crucible/time.h new file mode 100644 index 0000000..d4c03ec --- /dev/null +++ b/include/crucible/time.h @@ -0,0 +1,49 @@ +#ifndef CRUCIBLE_TIME_H +#define CRUCIBLE_TIME_H + +#include "crucible/error.h" + +#include +#include +#include + +namespace crucible { + + double nanosleep(double secs); + + class Timer { + chrono::high_resolution_clock::time_point m_start; + + public: + Timer(); + double age() const; + double report(int precision = 1000) const; + void reset(); + void set(const chrono::high_resolution_clock::time_point &start); + void set(double delta); + double lap(); + bool operator<(double d) const; + bool operator>(double d) const; + }; + + ostream &operator<<(ostream &os, const Timer &t); + + class RateLimiter { + Timer m_timer; + double m_rate; + double m_burst; + double m_tokens; + mutex m_mutex; + + void update_tokens(); + public: + RateLimiter(double rate, double burst); + RateLimiter(double rate); + void sleep_for(double cost = 1.0); + bool is_ready(); + void borrow(double cost = 1.0); + }; + +} + +#endif // CRUCIBLE_TIME_H diff --git a/include/crucible/timequeue.h b/include/crucible/timequeue.h new file mode 100644 index 0000000..2f4349b --- /dev/null +++ b/include/crucible/timequeue.h @@ -0,0 +1,188 @@ +#ifndef CRUCIBLE_TIMEQUEUE_H +#define CRUCIBLE_TIMEQUEUE_H + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace crucible { + using namespace std; + + template + class TimeQueue { + + public: + using Timestamp = chrono::high_resolution_clock::time_point; + + private: + struct Item { + Timestamp m_time; + unsigned m_id; + Task m_task; + + 
bool operator<(const Item &that) const { + if (m_time < that.m_time) return true; + if (that.m_time < m_time) return false; + return m_id < that.m_id; + } + static unsigned s_id; + + Item(const Timestamp &time, const Task& task) : + m_time(time), + m_id(++s_id), + m_task(task) + { + } + + }; + + set m_set; + mutable mutex m_mutex; + condition_variable m_cond_full, m_cond_empty; + size_t m_max_queue_depth; + + public: + ~TimeQueue(); + TimeQueue(size_t max_queue_depth = numeric_limits::max()); + + void push(const Task &task, double delay = 0); + void push_nowait(const Task &task, double delay = 0); + Task pop(); + bool pop_nowait(Task &t); + double when() const; + + size_t size() const; + bool empty() const; + + list peek(size_t count) const; + }; + + template unsigned TimeQueue::Item::s_id = 0; + + template + TimeQueue::~TimeQueue() + { + if (!m_set.empty()) { + cerr << "ERROR: " << m_set.size() << " locked items still in TimeQueue at destruction" << endl; + } + } + + template + void + TimeQueue::push(const Task &task, double delay) + { + Timestamp time = chrono::high_resolution_clock::now() + + chrono::duration_cast(chrono::duration(delay)); + unique_lock lock(m_mutex); + while (m_set.size() > m_max_queue_depth) { + m_cond_full.wait(lock); + } + m_set.insert(Item(time, task)); + m_cond_empty.notify_all(); + } + + template + void + TimeQueue::push_nowait(const Task &task, double delay) + { + Timestamp time = chrono::high_resolution_clock::now() + + chrono::duration_cast(chrono::duration(delay)); + unique_lock lock(m_mutex); + m_set.insert(Item(time, task)); + m_cond_empty.notify_all(); + } + + template + Task + TimeQueue::pop() + { + unique_lock lock(m_mutex); + while (1) { + while (m_set.empty()) { + m_cond_empty.wait(lock); + } + Timestamp now = chrono::high_resolution_clock::now(); + if (now > m_set.begin()->m_time) { + Task rv = m_set.begin()->m_task; + m_set.erase(m_set.begin()); + m_cond_full.notify_all(); + return rv; + } + m_cond_empty.wait_until(lock, 
m_set.begin()->m_time); + } + } + + template + bool + TimeQueue::pop_nowait(Task &t) + { + unique_lock lock(m_mutex); + if (m_set.empty()) { + return false; + } + Timestamp now = chrono::high_resolution_clock::now(); + if (now <= m_set.begin()->m_time) { + return false; + } + t = m_set.begin()->m_task; + m_set.erase(m_set.begin()); + m_cond_full.notify_all(); + return true; + } + + template + double + TimeQueue::when() const + { + unique_lock lock(m_mutex); + if (m_set.empty()) { + return numeric_limits::infinity(); + } + return chrono::duration(m_set.begin()->m_time - chrono::high_resolution_clock::now()).count(); + } + + template + size_t + TimeQueue::size() const + { + unique_lock lock(m_mutex); + return m_set.size(); + } + + template + bool + TimeQueue::empty() const + { + unique_lock lock(m_mutex); + return m_set.empty(); + } + + template + list + TimeQueue::peek(size_t count) const + { + unique_lock lock(m_mutex); + list rv; + auto it = m_set.begin(); + while (count-- && it != m_set.end()) { + rv.push_back(it->m_task); + ++it; + } + return rv; + } + + template + TimeQueue::TimeQueue(size_t max_depth) : + m_max_queue_depth(max_depth) + { + } + +} + +#endif // CRUCIBLE_TIMEQUEUE_H diff --git a/include/crucible/uuid.h b/include/crucible/uuid.h new file mode 100644 index 0000000..7b1d545 --- /dev/null +++ b/include/crucible/uuid.h @@ -0,0 +1,14 @@ +#ifndef CRUCIBLE_UUID_H +#define CRUCIBLE_UUID_H + +#include + +#include + +namespace crucible { + using namespace std; + + string uuid_unparse(const unsigned char a[16]); +} + +#endif // CRUCIBLE_UUID_H diff --git a/include/crucible/workqueue.h b/include/crucible/workqueue.h new file mode 100644 index 0000000..2ba13ec --- /dev/null +++ b/include/crucible/workqueue.h @@ -0,0 +1,189 @@ +#ifndef CRUCIBLE_WORKQUEUE_H +#define CRUCIBLE_WORKQUEUE_H + +#include + +#include +#include +#include +#include +#include +#include + +namespace crucible { + using namespace std; + + template + class WorkQueue { + + public: + using 
set_type = set; + using key_type = Task; + + private: + + set_type m_set; + mutable mutex m_mutex; + condition_variable m_cond_full, m_cond_empty; + size_t m_max_queue_depth; + + public: + ~WorkQueue(); + template WorkQueue(size_t max_queue_depth, Args... args); + template WorkQueue(Args... args); + + void push(const key_type &name); + void push_wait(const key_type &name, size_t limit); + void push_nowait(const key_type &name); + + key_type pop(); + bool pop_nowait(key_type &rv); + key_type peek(); + + size_t size() const; + bool empty(); + set_type copy(); + list peek(size_t count) const; + + }; + + template + WorkQueue::~WorkQueue() + { + if (!m_set.empty()) { + cerr << "ERROR: " << m_set.size() << " locked items still in WorkQueue " << this << " at destruction" << endl; + } + } + + template + void + WorkQueue::push(const key_type &name) + { + unique_lock lock(m_mutex); + while (!m_set.count(name) && m_set.size() > m_max_queue_depth) { + m_cond_full.wait(lock); + } + m_set.insert(name); + m_cond_empty.notify_all(); + } + + template + void + WorkQueue::push_wait(const key_type &name, size_t limit) + { + unique_lock lock(m_mutex); + while (!m_set.count(name) && m_set.size() >= limit) { + m_cond_full.wait(lock); + } + m_set.insert(name); + m_cond_empty.notify_all(); + } + + template + void + WorkQueue::push_nowait(const key_type &name) + { + unique_lock lock(m_mutex); + m_set.insert(name); + m_cond_empty.notify_all(); + } + + template + typename WorkQueue::key_type + WorkQueue::pop() + { + unique_lock lock(m_mutex); + while (m_set.empty()) { + m_cond_empty.wait(lock); + } + key_type rv = *m_set.begin(); + m_set.erase(m_set.begin()); + m_cond_full.notify_all(); + return rv; + } + + template + bool + WorkQueue::pop_nowait(key_type &rv) + { + unique_lock lock(m_mutex); + if (m_set.empty()) { + return false; + } + rv = *m_set.begin(); + m_set.erase(m_set.begin()); + m_cond_full.notify_all(); + return true; + } + + template + typename WorkQueue::key_type + 
WorkQueue::peek() + { + unique_lock lock(m_mutex); + if (m_set.empty()) { + return key_type(); + } else { + return *m_set.begin(); + } + } + + template + size_t + WorkQueue::size() const + { + unique_lock lock(m_mutex); + return m_set.size(); + } + + template + bool + WorkQueue::empty() + { + unique_lock lock(m_mutex); + return m_set.empty(); + } + + template + typename WorkQueue::set_type + WorkQueue::copy() + { + unique_lock lock(m_mutex); + return m_set; + } + + template + list + WorkQueue::peek(size_t count) const + { + unique_lock lock(m_mutex); + list rv; + for (auto i : m_set) { + if (count--) { + rv.push_back(i); + } else { + break; + } + } + return rv; + } + + template + template + WorkQueue::WorkQueue(Args... args) : + m_set(args...), + m_max_queue_depth(numeric_limits::max()) + { + } + + template + template + WorkQueue::WorkQueue(size_t max_depth, Args... args) : + m_set(args...), + m_max_queue_depth(max_depth) + { + } + +} + +#endif // CRUCIBLE_WORKQUEUE_H diff --git a/lib/Makefile b/lib/Makefile new file mode 100644 index 0000000..5a93cff --- /dev/null +++ b/lib/Makefile @@ -0,0 +1,37 @@ +default: libcrucible.so + +OBJS = \ + crc64.o \ + chatter.o \ + error.o \ + execpipe.o \ + extentwalker.o \ + fd.o \ + fs.o \ + interp.o \ + ntoa.o \ + path.o \ + process.o \ + string.o \ + time.o \ + uuid.o \ + +include ../makeflags + +LDFLAGS = -shared -luuid + +depends.mk: *.c *.cc + for x in *.c; do $(CC) $(CFLAGS) -M "$$x"; done > depends.mk.new + for x in *.cc; do $(CXX) $(CXXFLAGS) -M "$$x"; done >> depends.mk.new + mv -fv depends.mk.new depends.mk + +-include depends.mk + +%.o: %.c + $(CC) $(CFLAGS) -o $@ -c $< + +%.o: %.cc ../include/crucible/%.h + $(CXX) $(CXXFLAGS) -o $@ -c $< + +libcrucible.so: $(OBJS) Makefile + $(CXX) $(LDFLAGS) -o $@ $(OBJS) diff --git a/lib/chatter.cc b/lib/chatter.cc new file mode 100644 index 0000000..2ebe841 --- /dev/null +++ b/lib/chatter.cc @@ -0,0 +1,140 @@ +#include "crucible/chatter.h" +#include "crucible/error.h" +#include 
"crucible/path.h" +#include "crucible/process.h" + +#include +#include +#include +#include +#include +#include + +#include + +namespace crucible { + using namespace std; + + static auto_ptr> chatter_names; + static const char *SPACETAB = " \t"; + + static + void + init_chatter_names() + { + if (!chatter_names.get()) { + chatter_names.reset(new set); + const char *sp = ::getenv("CRUCIBLE_CHATTER"); + if (sp) { + cerr << "CRUCIBLE_CHATTER = '" << sp << "'" << endl; + string s(sp); + while (!s.empty()) { + s.erase(0, s.find_first_not_of(SPACETAB)); + if (s.empty()) { + break; + } + size_t last = s.find_first_of(SPACETAB); + string first_word = s.substr(0, last); + cerr << "\t'" << first_word << "'" << endl; + chatter_names->insert(first_word); + s.erase(0, last); + } + } + } + } + + Chatter::Chatter(string name, ostream &os) + : m_name(name), m_os(os) + { + } + + Chatter::~Chatter() + { + ostringstream header_stream; + + time_t ltime; + DIE_IF_MINUS_ONE(time(<ime)); + struct tm ltm; + DIE_IF_ZERO(localtime_r(<ime, <m)); + + char buf[1024]; + DIE_IF_ZERO(strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", <m)); + + header_stream << buf; + header_stream << " " << getpid() << "." 
<< gettid(); + if (!m_name.empty()) { + header_stream << " " << m_name; + } + header_stream << ": "; + + string out = m_oss.str(); + string header = header_stream.str(); + + string::size_type start = 0; + while (start < out.size()) { + size_t end_line = out.find_first_of("\n", start); + if (end_line != string::npos) { + assert(out[end_line] == '\n'); + size_t end = end_line; + m_os << (header + out.substr(start, end - start) + "\n") << flush; + start = end_line + 1; + } else { + m_os << (header + out.substr(start) + "\n") << flush; + start = out.size(); + } + } + } + + Chatter::Chatter(Chatter &&c) + : m_name(c.m_name), m_os(c.m_os), m_oss(c.m_oss.str()) + { + c.m_oss.str(""); + } + + set ChatterBox::s_boxes; + + set& ChatterBox::all_boxes() + { + return s_boxes; + } + + ChatterBox::ChatterBox(string file, int line, string pretty_function, ostream &os) + : m_file(basename(file)), m_line(line), m_pretty_function(pretty_function), m_enabled(false), m_os(os) + { + s_boxes.insert(this); + init_chatter_names(); + if (chatter_names->find(m_file) != chatter_names->end()) { + m_enabled = true; + } else if (chatter_names->find(m_pretty_function) != chatter_names->end()) { + m_enabled = true; + } else if (!chatter_names->empty()) { + cerr << "CRUCIBLE_CHATTER does not list '" << m_file << "' or '" << m_pretty_function << "'" << endl; + } + // cerr << "ChatterBox " << reinterpret_cast(this) << " constructed" << endl; + } + + ChatterBox::~ChatterBox() + { + s_boxes.erase(this); + // cerr << "ChatterBox " << reinterpret_cast(this) << " destructed" << endl; + } + + void + ChatterBox::set_enable(bool en) + { + m_enabled = en; + } + + ChatterUnwinder::ChatterUnwinder(function f) : + m_func(f) + { + } + + ChatterUnwinder::~ChatterUnwinder() + { + if (uncaught_exception()) { + m_func(); + } + } + +}; diff --git a/lib/crc64.cc b/lib/crc64.cc new file mode 100644 index 0000000..90dc87d --- /dev/null +++ b/lib/crc64.cc @@ -0,0 +1,59 @@ +#include "crucible/crc64.h" + +#define POLY64REV 
0xd800000000000000ULL + +namespace crucible { + + static bool init = false; + static uint64_t CRCTable[256]; + + static void init_crc64_table() + { + if (!init) { + for (int i = 0; i <= 255; i++) { + uint64_t part = i; + for (int j = 0; j < 8; j++) { + if (part & 1) { + part = (part >> 1) ^ POLY64REV; + } else { + part >>= 1; + } + } + CRCTable[i] = part; + } + init = true; + } + } + + uint64_t + Digest::CRC::crc64(const char *s) + { + init_crc64_table(); + + uint64_t crc = 0; + for (; *s; s++) { + uint64_t temp1 = crc >> 8; + uint64_t temp2 = CRCTable[(crc ^ static_cast(*s)) & 0xff]; + crc = temp1 ^ temp2; + } + + return crc; + } + + uint64_t + Digest::CRC::crc64(const void *p, size_t len) + { + init_crc64_table(); + + uint64_t crc = 0; + for (const unsigned char *s = static_cast(p); len; --len) { + uint64_t temp1 = crc >> 8; + uint64_t temp2 = CRCTable[(crc ^ *s++) & 0xff]; + crc = temp1 ^ temp2; + } + + return crc; + } + + +}; diff --git a/lib/error.cc b/lib/error.cc new file mode 100644 index 0000000..f2a6db0 --- /dev/null +++ b/lib/error.cc @@ -0,0 +1,74 @@ +#include "crucible/error.h" + +#include +#include + +#include + +namespace crucible { + using namespace std; + + static + string + analyze_exception(const exception &e) + { + // Let's ignore all the potential memory allocation exceptions for now, K? + ostringstream oss; + + int status; + char *realname = abi::__cxa_demangle(typeid(e).name(), 0, 0, &status); + oss << "exception type "; + // This is questionable since anything that would cause + // cxa_demangle to fail will probably cause an exception anyway. 
+ if (realname) { + oss << realname; + free(realname); + } else { + oss << typeid(e).name(); + } + oss << ": " << e.what(); + return oss.str(); + } + + // FIXME: could probably avoid some of these levels of indirection + static + function current_catch_explainer = [&](string s) { + cerr << s << endl; + }; + + void + set_catch_explainer(function f) + { + current_catch_explainer = f; + } + + void + default_catch_explainer(string s) + { + current_catch_explainer(s); + } + + int + catch_all(const function &f, const function &explainer) + { + try { + f(); + return 0; + } catch (const exception &e) { + explainer(analyze_exception(e)); + return 1; + } + } + + void + catch_and_explain(const function &f, const function &explainer) + { + try { + f(); + } catch (const exception &e) { + explainer(analyze_exception(e)); + throw; + } + } + +}; diff --git a/lib/execpipe.cc b/lib/execpipe.cc new file mode 100644 index 0000000..57a150d --- /dev/null +++ b/lib/execpipe.cc @@ -0,0 +1,104 @@ +#include "crucible/execpipe.h" + +#include "crucible/chatter.h" +#include "crucible/error.h" +#include "crucible/process.h" + +#include +#include +#include +#include + +namespace crucible { + using namespace std; + + void + redirect_stdin(const Fd &child_fd) + { + dup2_or_die(child_fd, STDIN_FILENO); + } + + void + redirect_stdin_stdout(const Fd &child_fd) + { + dup2_or_die(child_fd, STDOUT_FILENO); + dup2_or_die(child_fd, STDIN_FILENO); + } + + void + redirect_stdin_stdout_stderr(const Fd &child_fd) + { + dup2_or_die(child_fd, STDERR_FILENO); + dup2_or_die(child_fd, STDOUT_FILENO); + dup2_or_die(child_fd, STDIN_FILENO); + } + + void + redirect_stdout_stderr(const Fd &child_fd) + { + dup2_or_die(child_fd, STDERR_FILENO); + dup2_or_die(child_fd, STDOUT_FILENO); + } + + void + redirect_stdout(const Fd &child_fd) + { + dup2_or_die(child_fd, STDOUT_FILENO); + } + + void + redirect_stderr(const Fd &child_fd) + { + dup2_or_die(child_fd, STDERR_FILENO); + } + + Fd popen(function f, function 
import_fd_fn) + { + Fd parent_fd, child_fd; + { + pair fd_pair = socketpair_or_die(); + parent_fd = fd_pair.first; + child_fd = fd_pair.second; + } + + pid_t fv; + DIE_IF_MINUS_ONE(fv = fork()); + + if (fv) { + child_fd->close(); + return parent_fd; + } else { + int rv = EXIT_FAILURE; + catch_all([&]() { + parent_fd->close(); + import_fd_fn(child_fd); + // system("ls -l /proc/$$/fd/ >&2"); + + rv = f(); + }); + _exit(rv); + cerr << "PID " << getpid() << " TID " << gettid() << "STILL ALIVE" << endl; + system("ls -l /proc/$$/task/ >&2"); + exit(EXIT_FAILURE); + } + } + + string + read_all(Fd fd, size_t max_bytes, size_t chunk_bytes) + { + char buf[chunk_bytes]; + string str; + size_t rv; + while (1) { + read_partial_or_die(fd, static_cast(buf), chunk_bytes, rv); + if (rv == 0) { + break; + } + if (max_bytes - str.size() < rv) { + THROW_ERROR(out_of_range, "Output size limit " << max_bytes << " exceeded by appending " << rv << " bytes read to " << str.size() << " already in string"); + } + str.append(buf, rv); + } + return str; + } +} diff --git a/lib/extentwalker.cc b/lib/extentwalker.cc new file mode 100644 index 0000000..dd7dacd --- /dev/null +++ b/lib/extentwalker.cc @@ -0,0 +1,630 @@ +#include "crucible/extentwalker.h" + +#include "crucible/chatter.h" +#include "crucible/error.h" +#include "crucible/fs.h" +#include "crucible/limits.h" +#include "crucible/string.h" + + +namespace crucible { + using namespace std; + + const off_t ExtentWalker::sc_step_size; + + // fm_start, fm_length, fm_flags, m_extents + // fe_logical, fe_physical, fe_length, fe_flags + + static const off_t MAX_OFFSET = numeric_limits::max(); + static const off_t FIEMAP_BLOCK_SIZE = 4096; + + static bool __ew_do_log = getenv("EXTENTWALKER_DEBUG"); + +#define EWLOG(x) do { \ + if (__ew_do_log) { \ + CHATTER(x); \ + } \ +} while (0) + + ostream & + operator<<(ostream &os, const Extent &e) + { + os << "Extent {" + << " begin = " << to_hex(e.m_begin) + << ", end = " << to_hex(e.m_end) + << ", 
physical = " << to_hex(e.m_physical) + << ", flags = "; + if (e.m_flags & Extent::HOLE) { + os << "Extent::HOLE|"; + } + if (e.m_flags & Extent::PREALLOC) { + os << "Extent::PREALLOC|"; + } + if (e.m_flags & Extent::OBSCURED) { + os << "Extent::OBSCURED|"; + } + if (e.m_flags & ~(Extent::HOLE | Extent::PREALLOC | Extent::OBSCURED)) { + os << fiemap_extent_flags_ntoa(e.m_flags & ~(Extent::HOLE | Extent::PREALLOC | Extent::OBSCURED)); + } + if (e.m_physical_len) { + os << ", physical_len = " << to_hex(e.m_physical_len); + } + if (e.m_logical_len) { + os << ", logical_len = " << to_hex(e.m_logical_len); + } + if (e.m_offset) { + os << ", offset = " << to_hex(e.m_offset); + } + return os << " }"; + } + + ostream & + operator<<(ostream &os, const ExtentWalker::Vec &v) + { + os << "ExtentWalker::Vec {"; + for (auto e : v) { + os << "\n\t" << e; + } + return os << "}"; + } + + ostream & + operator<<(ostream &os, const ExtentWalker &ew) + { + return os << "ExtentWalker {" + << " fd = " << name_fd(ew.m_fd) + << ", stat.st_size = " << to_hex(ew.m_stat.st_size) + << ", extents = " << ew.m_extents + << ", current = [" << ew.m_current - ew.m_extents.begin() + << "] }"; + } + + Extent::Extent() : + m_begin(0), + m_end(0), + m_physical(0), + m_flags(0), + m_physical_len(0), + m_logical_len(0), + m_offset(0) + { + } + + Extent::operator bool() const + { + THROW_CHECK2(invalid_argument, m_begin, m_end, m_end >= m_begin); + return m_end > m_begin; + } + + off_t + Extent::size() const + { + THROW_CHECK2(invalid_argument, m_begin, m_end, m_end >= m_begin); + return m_end - m_begin; + } + + bool + Extent::operator==(const Extent &that) const + { + return m_begin == that.m_begin && m_end == that.m_end && m_physical == that.m_physical && m_flags == that.m_flags; + } + + ExtentWalker::ExtentWalker(Fd fd) : + m_fd(fd), + m_current(m_extents.begin()) + { + } + + ExtentWalker::ExtentWalker(Fd fd, off_t initial_pos) : + m_fd(fd), + m_current(m_extents.begin()) + { + seek(initial_pos); + } + + 
ExtentWalker::Itr + ExtentWalker::find_in_cache(off_t pos) + { + EWLOG("find_in_cache " << to_hex(pos)); + // EOF is an annoying special case + if (pos >= m_stat.st_size) { + if (!m_extents.empty() && m_extents.rbegin()->m_end == m_stat.st_size) { + auto i = m_extents.end(); + return --i; + } + } + for (auto vi = m_extents.begin(); vi != m_extents.end(); ++vi) { + if (pos >= vi->m_begin && pos < vi->m_end) { + EWLOG("pos " << to_hex(pos) << " in " << *vi); + if (vi == m_extents.begin() && !(m_extents.begin()->m_begin == 0)) { + // Must have an extent before pos, unless + // there can be no extent before pos because pos == 0 + EWLOG("can't match first unless begin is BOF"); + break; + } + auto ni = vi; + ++ni; + if (ni == m_extents.end() && !(vi->m_end >= m_stat.st_size)) { + // Must have an extent after pos, unless + // there can be no extent after pos because pos >= EOF + EWLOG("can't match last unless end past EOF " << to_hex(m_stat.st_size)); + break; + } + // Extent surrounded on either side by other known extents + return vi; + } + } + EWLOG("find_in_cache failed: " << *this); + return m_extents.end(); + } + + void + ExtentWalker::run_fiemap(off_t pos) + { + ostringstream log; + CHATTER_UNWIND("Log of run_fiemap: " << log.str()); + + EWLOG("pos = " << to_hex(pos)); + + THROW_CHECK1(invalid_argument, pos, (pos & (FIEMAP_BLOCK_SIZE - 1)) == 0); + + Vec fm; + + off_t step_size = pos; + off_t begin = pos - min(pos, sc_step_size); + + // This loop should not run forever + int loop_count = 0; + int loop_limit = 99; + while (true) { + if (loop_count == 90) { + EWLOG(log.str()); + } + + THROW_CHECK1(runtime_error, loop_count, loop_count < loop_limit); + ++loop_count; + + // Get file size every time in case it changes under us + m_stat.fstat(m_fd); + + // Get fiemap begin..EOF + fm = get_extent_map(begin); + EWLOG("fiemap result loop count #" << loop_count << ":" << fm); + + // This algorithm seeks at least three extents: one before, + // one after, and one containing 
pos. Files which contain + // two or fewer extents will cause an obvious problem with that, + // so handle those cases separately. + + // FIEMAP lies, and we catch it in a lie about the size of the + // second extent. To work around this, try getting more than 3. + + // 0..2(ish) extents + if (fm.size() < sc_extent_fetch_min) { + // If we are not at beginning of file, move backward + if (begin > 0) { + step_size /= 2; + auto next_begin = (begin - min(step_size, begin)) & ~(FIEMAP_BLOCK_SIZE - 1); + EWLOG("step backward " << to_hex(begin) << " -> " << to_hex(next_begin) << " extents size " << fm.size()); + if (begin == next_begin) { + EWLOG("step backward stopped"); + break; + } + begin = next_begin; + continue; + } + + // We are at beginning of file and have too few extents. + + // Zero extents? Entire file is a hole. + if (fm.empty()) { + EWLOG("zero extents"); + break; + } + + // We know we have the beginning of the file and at least + // one extent. If the last extent is EOF then we have the + // whole file in the buffer. If the last extent is NOT + // EOF then fiemap did something we didn't expect. + THROW_CHECK1(runtime_error, fm.rbegin()->flags(), fm.rbegin()->flags() & FIEMAP_EXTENT_LAST); + break; + } + + // We have at least three extents, so there is now a first and last. + // We want pos to be between first and last. There doesn't have + // to be an extent between these (it could be a hole). + auto &first_extent = fm.at(sc_extent_fetch_min - 2); + auto &last_extent = *fm.rbegin(); + EWLOG("first_extent = " << first_extent); + EWLOG("last_extent = " << last_extent); + + // First extent must end on or before pos + if (first_extent.end() > pos) { + // Can we move backward? 
+ if (begin > 0) { + step_size /= 2; + auto next_begin = (begin - min(step_size, begin)) & ~(FIEMAP_BLOCK_SIZE - 1); + EWLOG("step backward " << to_hex(begin) << " -> " << to_hex(next_begin) << " extents size " << fm.size()); + if (begin == next_begin) { + EWLOG("step backward stopped"); + break; + } + begin = next_begin; + continue; + } + + // We are as far back as we can go, so there must be no + // extent before pos (i.e. file starts with a hole). + EWLOG("no extent before pos"); + break; + } + + // First extent ends on or before pos. + + // If last extent is EOF then we have the entire file in the buffer. + // pos could be in last extent, so skip the later checks that + // insist pos be located prior to the last extent. + if (last_extent.flags() & FIEMAP_EXTENT_LAST) { + break; + } + + // Don't have EOF, must have an extent after pos. + if (last_extent.begin() <= pos) { + step_size /= 2; + auto new_begin = (begin + step_size) & ~(FIEMAP_BLOCK_SIZE - 1); + EWLOG("step forward " << to_hex(begin) << " -> " << to_hex(new_begin)); + if (begin == new_begin) { + EWLOG("step forward stopped"); + break; + } + begin = new_begin; + continue; + } + + // Last extent begins after pos, first extent ends on or before pos. + // All other cases should have been handled before here. + THROW_CHECK2(runtime_error, pos, first_extent, first_extent.end() <= pos); + THROW_CHECK2(runtime_error, pos, last_extent, last_extent.begin() > pos); + + // We should probably stop now + break; + } + + // Fill in holes so there are Extent records over entire range + auto fmi = fm.begin(); + off_t ipos = begin; + Vec new_vec; + // If we mapped the entire file and there are no extents, + // the entire file is a hole. 
+		// Walk the extents returned by get_extent_map() in ascending order and
+		// copy them into new_vec, synthesizing an Extent::HOLE record for every
+		// gap so that new_vec covers a contiguous byte range starting at `begin`.
+		// `ipos` is the file offset just past the last record appended so far.
+		// `last_extent_is_last` tracks whether the most recent record carried
+		// FIEMAP_EXTENT_LAST; it starts out true only when the map began at
+		// offset 0 and came back empty.
+		bool last_extent_is_last = (begin == 0 && fm.empty());
+		while (fmi != fm.end()) {
+			Extent new_extent(*fmi);
+			// Extents must never start before the position already covered
+			THROW_CHECK2(runtime_error, ipos, new_extent.m_begin, ipos <= new_extent.m_begin);
+			if (new_extent.m_begin > ipos) {
+				// Gap between the covered range and this extent: record it as a hole
+				Extent hole_extent;
+				hole_extent.m_begin = ipos;
+				hole_extent.m_end = fmi->begin();
+				hole_extent.m_physical = 0;
+				hole_extent.m_flags = Extent::HOLE;
+				new_vec.push_back(hole_extent);
+				ipos += hole_extent.size();
+			}
+			// After any hole has been inserted the extent must start exactly at ipos
+			THROW_CHECK2(runtime_error, ipos, new_extent.m_begin, ipos == new_extent.m_begin);
+			new_vec.push_back(new_extent);
+			ipos += new_extent.size();
+			last_extent_is_last = fmi->flags() & FIEMAP_EXTENT_LAST;
+			++fmi;
+		}
+		// If we have run out of extents before EOF, insert a hole at the end
+		if (last_extent_is_last && ipos < m_stat.st_size) {
+			Extent hole_extent;
+			hole_extent.m_begin = ipos;
+			hole_extent.m_end = m_stat.st_size;
+			hole_extent.m_physical = 0;
+			hole_extent.m_flags = Extent::HOLE;
+			// The trailing hole becomes the final record, so the LAST flag
+			// moves from the previous final extent onto the hole.
+			if (!new_vec.empty() && new_vec.rbegin()->m_flags & FIEMAP_EXTENT_LAST) {
+				new_vec.rbegin()->m_flags &= ~(FIEMAP_EXTENT_LAST);
+				hole_extent.m_flags |= FIEMAP_EXTENT_LAST;
+			}
+			new_vec.push_back(hole_extent);
+			// Advance by the hole's length in bytes (not by the number of
+			// records in new_vec) so that ipos ends up exactly at EOF.
+			ipos += hole_extent.size();
+		}
+		THROW_CHECK1(runtime_error, new_vec.size(), !new_vec.empty());
+
+		// Allow last extent to extend beyond desired range (e.g. at EOF)
+		THROW_CHECK2(runtime_error, ipos, new_vec.rbegin()->m_end, ipos <= new_vec.rbegin()->m_end);
+		// If we have the last extent in the file, truncate it to the file size.
+ if (ipos >= m_stat.st_size) { + THROW_CHECK2(runtime_error, new_vec.rbegin()->m_begin, m_stat.st_size, m_stat.st_size > new_vec.rbegin()->m_begin); + THROW_CHECK2(runtime_error, new_vec.rbegin()->m_end, m_stat.st_size, m_stat.st_size <= new_vec.rbegin()->m_end); + new_vec.rbegin()->m_end = m_stat.st_size; + } + + // Verify contiguous, ascending order, at least one Extent + THROW_CHECK1(runtime_error, new_vec, !new_vec.empty()); + + ipos = new_vec.begin()->m_begin; + bool last_flag_last = false; + for (auto e : new_vec) { + THROW_CHECK1(runtime_error, new_vec, e.m_begin == ipos); + THROW_CHECK1(runtime_error, e, e.size() > 0); + THROW_CHECK1(runtime_error, new_vec, !last_flag_last); + ipos += e.size(); + last_flag_last = e.m_flags & FIEMAP_EXTENT_LAST; + } + THROW_CHECK1(runtime_error, new_vec, !last_extent_is_last || new_vec.rbegin()->m_end == ipos); + + m_extents = new_vec; + m_current = m_extents.begin(); + } + + void + ExtentWalker::reset() + { + m_extents.clear(); + m_current = m_extents.begin(); + } + + void + ExtentWalker::seek(off_t pos) + { + CHATTER_UNWIND("seek " << to_hex(pos)); + THROW_CHECK1(out_of_range, pos, pos >= 0); + Itr rv = find_in_cache(pos); + if (rv != m_extents.end()) { + m_current = rv; + return; + } + run_fiemap(pos); + m_current = find_in_cache(pos); + } + + Extent + ExtentWalker::current() + { + THROW_CHECK2(invalid_argument, *this, m_extents.size(), m_current != m_extents.end()); + CHATTER_UNWIND("current " << *m_current); + return *m_current; + } + + + bool + ExtentWalker::next() + { + CHATTER_UNWIND("next"); + THROW_CHECK1(invalid_argument, (m_current != m_extents.end()), m_current != m_extents.end()); + if (current().m_end >= m_stat.st_size) { + CHATTER_UNWIND("next EOF"); + return false; + } + auto next_pos = current().m_end; + if (next_pos >= m_stat.st_size) { + CHATTER_UNWIND("next next_pos = " << next_pos << " m_stat.st_size = " << m_stat.st_size); + return false; + } + seek(next_pos); + THROW_CHECK1(runtime_error, (m_current 
!= m_extents.end()), m_current != m_extents.end()); + + // FIEMAP is full of lies, so this check keeps failing + // THROW_CHECK2(runtime_error, current().m_begin, next_pos, current().m_begin == next_pos); + // Just ensure that pos is in the next extent somewhere. + THROW_CHECK2(runtime_error, current(), next_pos, current().m_begin <= next_pos); + THROW_CHECK2(runtime_error, current(), next_pos, current().m_end > next_pos); + + return true; + } + + bool + ExtentWalker::prev() + { + CHATTER_UNWIND("prev"); + THROW_CHECK1(invalid_argument, (m_current != m_extents.end()), m_current != m_extents.end()); + auto prev_iter = m_current; + if (prev_iter->m_begin == 0) { + CHATTER_UNWIND("prev BOF"); + return false; + } + THROW_CHECK1(invalid_argument, (prev_iter != m_extents.begin()), prev_iter != m_extents.begin()); + --prev_iter; + CHATTER_UNWIND("prev seeking to " << *prev_iter << "->m_begin"); + auto prev_end = current().m_begin; + seek(prev_iter->m_begin); + THROW_CHECK1(runtime_error, (m_current != m_extents.end()), m_current != m_extents.end()); + THROW_CHECK2(runtime_error, current().m_end, prev_end, current().m_end == prev_end); + return true; + } + + ExtentWalker::~ExtentWalker() + { + } + + BtrfsExtentWalker::BtrfsExtentWalker(Fd fd) : + ExtentWalker(fd), + m_tree_id(0) + { + } + + BtrfsExtentWalker::BtrfsExtentWalker(Fd fd, off_t initial_pos) : + ExtentWalker(fd), + m_tree_id(0) + { + seek(initial_pos); + } + + void + BtrfsExtentWalker::set_root_fd(Fd root_fd) + { + m_root_fd = root_fd; + } + + BtrfsExtentWalker::BtrfsExtentWalker(Fd fd, off_t initial_pos, Fd root_fd) : + ExtentWalker(fd), + m_tree_id(0) + { + set_root_fd(root_fd); + seek(initial_pos); + } + + BtrfsExtentWalker::Vec + BtrfsExtentWalker::get_extent_map(off_t pos) + { + BtrfsIoctlSearchKey sk; + if (!m_root_fd) { + m_root_fd = m_fd; + } + if (!m_tree_id) { + m_tree_id = btrfs_get_root_id(m_fd); + } + sk.tree_id = m_tree_id; + sk.min_objectid = m_stat.st_ino; + sk.max_objectid = 
numeric_limits::max(); + sk.min_offset = ranged_cast(pos); + sk.max_offset = numeric_limits::max(); + sk.min_transid = 0; + sk.max_transid = numeric_limits::max(); + sk.min_type = sk.max_type = BTRFS_EXTENT_DATA_KEY; + sk.nr_items = sc_extent_fetch_max; + + CHATTER_UNWIND("sk " << sk << " root_fd " << name_fd(m_root_fd)); + sk.do_ioctl(m_root_fd); + + Vec rv; + + bool past_eof = false; + for (auto i : sk.m_result) { + // If we're seeing extents from the next file then we're past EOF on this file + if (i.objectid > m_stat.st_ino) { + past_eof = true; + break; + } + + // Ignore things that aren't EXTENT_DATA_KEY + if (i.type != BTRFS_EXTENT_DATA_KEY) { + continue; + } + + // Hmmmkay we shouldn't be seeing these + if (i.objectid < m_stat.st_ino) { + THROW_ERROR(out_of_range, "objectid " << i.objectid << " < m_stat.st_ino " << m_stat.st_ino); + continue; + } + + Extent e; + e.m_begin = i.offset; + auto compressed = call_btrfs_get(btrfs_stack_file_extent_compression, i.m_data); + // FIEMAP told us about compressed extents and we can too + if (compressed) { + e.m_flags |= FIEMAP_EXTENT_ENCODED; + } + + auto type = call_btrfs_get(btrfs_stack_file_extent_type, i.m_data); + off_t len = -1; + switch (type) { + default: + cerr << "Unhandled file extent type " << type << " in root " << m_tree_id << " ino " << m_stat.st_ino << endl; + break; + case BTRFS_FILE_EXTENT_INLINE: + len = ranged_cast(call_btrfs_get(btrfs_stack_file_extent_ram_bytes, i.m_data)); + e.m_flags |= FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED; + // Inline extents are never obscured, so don't bother filling in m_physical_len, etc. 
+ break; + case BTRFS_FILE_EXTENT_PREALLOC: + e.m_flags |= Extent::PREALLOC; + case BTRFS_FILE_EXTENT_REG: { + e.m_physical = call_btrfs_get(btrfs_stack_file_extent_disk_bytenr, i.m_data); + + // This is the length of the full extent (decompressed) + off_t ram = ranged_cast(call_btrfs_get(btrfs_stack_file_extent_ram_bytes, i.m_data)); + + // This is the length of the part of the extent appearing in the file (decompressed) + len = ranged_cast(call_btrfs_get(btrfs_stack_file_extent_num_bytes, i.m_data)); + + // This is the offset from start of on-disk extent to the part we see in the file (decompressed) + // May be negative due to the kind of bug we're stuck with forever, so no cast range check + off_t offset = call_btrfs_get(btrfs_stack_file_extent_offset, i.m_data); + + // If there is a physical address there must be size too + if (e.m_physical) { + THROW_CHECK1(runtime_error, ram, ram > 0); + THROW_CHECK1(runtime_error, len, len > 0); + THROW_CHECK2(runtime_error, offset, ram, offset < ram); + } else { + // There are two kinds of hole in btrfs. This is the other one. + e.m_flags |= Extent::HOLE; + } + + // Partially obscured extent + // FIXME: sometimes this happens: + // i.type == BTRFS_EXTENT_DATA_KEY + // type = 0x1 + // compressed = 0x0 + // REG start 0x0 offset 0x0 num 0x20000 ram 0x21000 gen 1101121 + // btrfs_file_extent_item { + // generation = 1101121 + // ram_bytes = 135168 + // compression = 0x0 + // encryption = 0x0 + // other_encoding = 0x0 + // type = 0x1 + // disk_bytenr = 0x0 + // disk_num_bytes = 0x0 + // offset = 0x0 + // num_bytes = 0x20000 + // } + if (ram != len || offset != 0) { + e.m_flags |= Extent::OBSCURED; + // cerr << e << "\nram = " << ram << ", len = " << len << ", offset = " << offset << endl; + } + e.m_physical_len = ram; + e.m_logical_len = len; + e.m_offset = offset; + + // To maintain compatibility with FIEMAP we ignore the offset for compressed extents. + // At some point we'll grow out of this. 
+ if (!compressed) { + e.m_physical += offset; + } + + break; + } + } + if (len > 0) { + e.m_end = e.m_begin + len; + if (e.m_end >= m_stat.st_size) { + e.m_flags |= FIEMAP_EXTENT_LAST; + } + // FIXME: no FIEMAP_EXTENT_SHARED + // WONTFIX: non-trivial to replicate LOGIAL_INO + rv.push_back(e); + } + } + + // Plug a hole at EOF + if (past_eof && !rv.empty()) { + rv.rbegin()->m_flags |= FIEMAP_EXTENT_LAST; + } + + return rv; + } + + ExtentWalker::Vec + ExtentWalker::get_extent_map(off_t pos) + { + Fiemap fm; + fm.fm_start = ranged_cast(pos); + fm.fm_length = ranged_cast(numeric_limits::max() - pos); + fm.m_max_count = fm.m_min_count = sc_extent_fetch_max; + fm.do_ioctl(m_fd); + Vec rv; + for (auto i : fm.m_extents) { + Extent e; + e.m_begin = ranged_cast(i.fe_logical); + e.m_end = ranged_cast(i.fe_logical + i.fe_length); + e.m_physical = i.fe_physical; + e.m_flags = i.fe_flags; + rv.push_back(e); + } + return rv; + } + +}; diff --git a/lib/fd.cc b/lib/fd.cc new file mode 100644 index 0000000..4216b4f --- /dev/null +++ b/lib/fd.cc @@ -0,0 +1,575 @@ +#include "crucible/chatter.h" +#include "crucible/error.h" +#include "crucible/fd.h" +#include "crucible/ntoa.h" +#include "crucible/string.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace crucible { + using namespace std; + + static const struct bits_ntoa_table o_flags_table[] = { + NTOA_TABLE_ENTRY_BITS(O_APPEND), + NTOA_TABLE_ENTRY_BITS(O_ASYNC), + NTOA_TABLE_ENTRY_BITS(O_CLOEXEC), + NTOA_TABLE_ENTRY_BITS(O_CREAT), + NTOA_TABLE_ENTRY_BITS(O_DIRECT), + NTOA_TABLE_ENTRY_BITS(O_DIRECTORY), + NTOA_TABLE_ENTRY_BITS(O_EXCL), + NTOA_TABLE_ENTRY_BITS(O_LARGEFILE), + NTOA_TABLE_ENTRY_BITS(O_NOATIME), + NTOA_TABLE_ENTRY_BITS(O_NOCTTY), + NTOA_TABLE_ENTRY_BITS(O_NOFOLLOW), + NTOA_TABLE_ENTRY_BITS(O_NONBLOCK), + NTOA_TABLE_ENTRY_BITS(O_NDELAY), // NONBLOCK will prevent this + NTOA_TABLE_ENTRY_BITS(O_SYNC), + 
NTOA_TABLE_ENTRY_BITS(O_TRUNC), + + // These aren't really bit values + NTOA_TABLE_ENTRY_BITS(O_RDWR), + NTOA_TABLE_ENTRY_BITS(O_WRONLY), + NTOA_TABLE_ENTRY_BITS(O_RDONLY), + + NTOA_TABLE_ENTRY_END(), + }; + + static const struct bits_ntoa_table o_mode_table[] = { + NTOA_TABLE_ENTRY_BITS(S_IFMT), + NTOA_TABLE_ENTRY_BITS(S_IFSOCK), + NTOA_TABLE_ENTRY_BITS(S_IFLNK), + NTOA_TABLE_ENTRY_BITS(S_IFREG), + NTOA_TABLE_ENTRY_BITS(S_IFBLK), + NTOA_TABLE_ENTRY_BITS(S_IFDIR), + NTOA_TABLE_ENTRY_BITS(S_IFCHR), + NTOA_TABLE_ENTRY_BITS(S_IFIFO), + NTOA_TABLE_ENTRY_BITS(S_ISUID), + NTOA_TABLE_ENTRY_BITS(S_ISGID), + NTOA_TABLE_ENTRY_BITS(S_ISVTX), + NTOA_TABLE_ENTRY_BITS(S_IRWXU), + NTOA_TABLE_ENTRY_BITS(S_IRUSR), + NTOA_TABLE_ENTRY_BITS(S_IWUSR), + NTOA_TABLE_ENTRY_BITS(S_IXUSR), + NTOA_TABLE_ENTRY_BITS(S_IRWXG), + NTOA_TABLE_ENTRY_BITS(S_IRGRP), + NTOA_TABLE_ENTRY_BITS(S_IWGRP), + NTOA_TABLE_ENTRY_BITS(S_IXGRP), + NTOA_TABLE_ENTRY_BITS(S_IRWXO), + NTOA_TABLE_ENTRY_BITS(S_IROTH), + NTOA_TABLE_ENTRY_BITS(S_IWOTH), + NTOA_TABLE_ENTRY_BITS(S_IXOTH), + NTOA_TABLE_ENTRY_END(), + }; + + string o_flags_ntoa(int flags) + { + return bits_ntoa(flags, o_flags_table); + } + + string o_mode_ntoa(mode_t mode) + { + return bits_ntoa(mode, o_mode_table); + } + + void + IOHandle::close() + { + CHATTER_TRACE("close fd " << m_fd << " in " << this); + if (m_fd >= 0) { + // Assume that ::close always destroys the FD, even if errors are encountered; + int closing_fd = m_fd; + m_fd = -1; + CHATTER_UNWIND("closing fd " << closing_fd << " in " << this); + DIE_IF_MINUS_ONE(::close(closing_fd)); + } + } + + IOHandle::~IOHandle() + { + CHATTER_TRACE("destroy fd " << m_fd << " in " << this); + if (m_fd >= 0) { + catch_all([&](){ + close(); + }); + } + } + + IOHandle::IOHandle() : + m_fd(-1) + { + CHATTER_TRACE("open fd " << m_fd << " in " << this); + } + + IOHandle::IOHandle(int fd) : + m_fd(fd) + { + CHATTER_TRACE("open fd " << m_fd << " in " << this); + } + + int + IOHandle::release_fd() + { + 
CHATTER_TRACE("release fd " << m_fd << " in " << this); + int rv = m_fd; + m_fd = -1; + return rv; + } + + // XXX: necessary? useful? + template <> + struct ChatterTraits { + Chatter &operator()(Chatter &c, const Fd &fd) const + { + c << "Fd {this=" << &fd << " fd=" << static_cast(fd) << "}"; + return c; + } + }; + + int + open_or_die(const string &file, int flags, mode_t mode) + { + int fd(::open(file.c_str(), flags, mode)); + if (fd < 0) { + THROW_ERRNO("open: name '" << file << "' mode " << oct << setfill('0') << setw(3) << mode << " flags " << o_flags_ntoa(flags)); + } + return fd; + } + + int + openat_or_die(int dir_fd, const string &file, int flags, mode_t mode) + { + int fd(::openat(dir_fd, file.c_str(), flags, mode)); + if (fd < 0) { + THROW_ERRNO("openat: dir_fd " << dir_fd << " " << name_fd(dir_fd) << " name '" << file << "' mode " << oct << setfill('0') << setw(3) << mode << " flags " << o_flags_ntoa(flags)); + } + return fd; + } + + static const struct bits_ntoa_table mmap_prot_table[] = { + NTOA_TABLE_ENTRY_BITS(PROT_EXEC), + NTOA_TABLE_ENTRY_BITS(PROT_READ), + NTOA_TABLE_ENTRY_BITS(PROT_WRITE), + NTOA_TABLE_ENTRY_BITS(PROT_NONE), + NTOA_TABLE_ENTRY_END(), + }; + + string mmap_prot_ntoa(int prot) + { + return bits_ntoa(prot, mmap_prot_table); + } + + static const struct bits_ntoa_table mmap_flags_table[] = { + NTOA_TABLE_ENTRY_BITS(MAP_SHARED), + NTOA_TABLE_ENTRY_BITS(MAP_PRIVATE), + NTOA_TABLE_ENTRY_BITS(MAP_32BIT), + NTOA_TABLE_ENTRY_BITS(MAP_ANONYMOUS), + NTOA_TABLE_ENTRY_BITS(MAP_DENYWRITE), + NTOA_TABLE_ENTRY_BITS(MAP_EXECUTABLE), +#if MAP_FILE + NTOA_TABLE_ENTRY_BITS(MAP_FILE), +#endif + NTOA_TABLE_ENTRY_BITS(MAP_FIXED), + NTOA_TABLE_ENTRY_BITS(MAP_GROWSDOWN), + NTOA_TABLE_ENTRY_BITS(MAP_HUGETLB), + NTOA_TABLE_ENTRY_BITS(MAP_LOCKED), + NTOA_TABLE_ENTRY_BITS(MAP_NONBLOCK), + NTOA_TABLE_ENTRY_BITS(MAP_NORESERVE), + NTOA_TABLE_ENTRY_BITS(MAP_POPULATE), + NTOA_TABLE_ENTRY_BITS(MAP_STACK), +#ifdef MAP_UNINITIALIZED + 
NTOA_TABLE_ENTRY_BITS(MAP_UNINITIALIZED), +#endif + NTOA_TABLE_ENTRY_END(), + }; + + string mmap_flags_ntoa(int flags) + { + return bits_ntoa(flags, mmap_flags_table); + } + + void * + mmap_or_die(void *addr, size_t length, int prot, int flags, int fd, off_t offset) + { + void *rv = mmap(addr, length, prot, flags, fd, offset); + if (rv == MAP_FAILED) { + THROW_ERRNO("mmap: addr " << addr << " length " << length + << " prot " << mmap_prot_ntoa(prot) + << " flags " << mmap_flags_ntoa(flags) + << " fd " << fd << " offset " << offset); + } + return rv; + } + + void + rename_or_die(const string &from, const string &to) + { + if (::rename(from.c_str(), to.c_str())) { + THROW_ERRNO("rename: " << from << " -> " << to); + } + } + + void + renameat_or_die(int fromfd, const string &frompath, int tofd, const string &topath) + { + if (::renameat(fromfd, frompath.c_str(), tofd, topath.c_str())) { + THROW_ERRNO("renameat: " << name_fd(fromfd) << "/" << frompath + << " -> " << name_fd(tofd) << "/" << topath); + } + } + + string + socket_domain_ntoa(int domain) + { + static const bits_ntoa_table table[] = { + NTOA_TABLE_ENTRY_ENUM(AF_UNIX), + NTOA_TABLE_ENTRY_ENUM(AF_LOCAL), // probably the same as AF_UNIX + NTOA_TABLE_ENTRY_ENUM(AF_INET), + NTOA_TABLE_ENTRY_ENUM(AF_INET6), + NTOA_TABLE_ENTRY_ENUM(AF_PACKET), + NTOA_TABLE_ENTRY_END() + }; + return bits_ntoa(domain, table); + } + + string + socket_type_ntoa(int type) + { + static const bits_ntoa_table table[] = { + NTOA_TABLE_ENTRY_BITS(SOCK_CLOEXEC), + NTOA_TABLE_ENTRY_BITS(SOCK_NONBLOCK), + NTOA_TABLE_ENTRY_ENUM(SOCK_STREAM), + NTOA_TABLE_ENTRY_ENUM(SOCK_DGRAM), + NTOA_TABLE_ENTRY_ENUM(SOCK_RAW), + NTOA_TABLE_ENTRY_ENUM(SOCK_PACKET), + NTOA_TABLE_ENTRY_END() + }; + return bits_ntoa(type, table); + } + + string + socket_protocol_ntoa(int protocol) + { + static const bits_ntoa_table table[] = { + // an empty table just prints the number + NTOA_TABLE_ENTRY_END() + }; + return bits_ntoa(protocol, table); + } + + Fd + socket_or_die(int 
domain, int type, int protocol) + { + Fd fd(::socket(domain, type, protocol)); + if (fd < 0) { + THROW_ERRNO("socket: domain " << socket_domain_ntoa(domain) + << " type " << socket_type_ntoa(type) + << " protocol " << socket_protocol_ntoa(protocol)); + } + return fd; + } + + void + write_or_die_partial(int fd, const void *buf, size_t size_wanted, size_t &size_written) + { + if (size_wanted > (static_cast(~0) >> 1)) { + THROW_ERROR(invalid_argument, "cannot read " << size_wanted << ", more than signed size allows"); + } + if (fd < 0) { + THROW_ERROR(invalid_argument, "write: trying to write on a closed file descriptor"); + } + int rv = write(fd, buf, size_wanted); + if (rv < 0) { + THROW_ERRNO("write: " << size_wanted << " bytes returned " << rv); + } + size_written = rv; + } + + void + write_or_die(int fd, const void *buf, size_t size) + { + size_t size_written = 0; + write_or_die_partial(fd, buf, size, size_written); + if (size_written != size) { + THROW_ERROR(runtime_error, "write: only " << size_written << " of " << size << " bytes written"); + } + } + + void + pwrite_or_die(int fd, const void *buf, size_t size, off_t offset) + { + if (size > (static_cast(~0) >> 1)) { + THROW_ERROR(invalid_argument, "pwrite: cannot write " << size << ", more than signed size allows"); + } + if (fd < 0) { + THROW_ERROR(invalid_argument, "pwrite: trying to write on a closed file descriptor"); + } + int rv = ::pwrite(fd, buf, size, offset); + if (rv != static_cast(size)) { + THROW_ERROR(runtime_error, "pwrite: only " << rv << " of " << size << " bytes written at offset " << offset); + } + } + + template<> + void + write_or_die(int fd, const string &text) + { + return write_or_die(fd, text.data(), text.size()); + } + + void + read_partial_or_die(int fd, void *buf, size_t size, size_t &size_read) + { + if (size > (static_cast(~0) >> 1)) { + THROW_ERROR(invalid_argument, "cannot read " << size << ", more than signed size allows"); + } + if (fd < 0) { + THROW_ERROR(runtime_error, 
"read: trying to read on a closed file descriptor"); + } + size_read = 0; + while (size) { + int rv = read(fd, buf, size); + if (rv < 0) { + if (errno == EINTR) { + CHATTER_TRACE("resuming after EINTR"); + continue; + } + THROW_ERRNO("read: " << size << " bytes"); + } + if (rv > static_cast(size)) { + THROW_ERROR(runtime_error, "read: somehow read more bytes (" << rv << ") than requested (" << size << ")"); + } + if (rv == 0) break; + size_read += rv; + size -= rv; + // CHATTER("read " << rv << " bytes from fd " << fd); + } + } + + string + read_string(int fd, size_t size) + { + string rv(size, '\0'); + size_t size_read = 0; + void *rvp = const_cast(rv.data()); + read_partial_or_die(fd, rvp, size, size_read); + rv.resize(size_read); + return rv; + } + + void + read_or_die(int fd, void *buf, size_t size) + { + size_t size_read = 0; + read_partial_or_die(fd, buf, size, size_read); + if (size_read != size) { + THROW_ERROR(runtime_error, "read: " << size_read << " of " << size << " bytes"); + } + } + + void + pread_or_die(int fd, void *buf, size_t size, off_t offset) + { + if (size > (static_cast(~0) >> 1)) { + THROW_ERROR(invalid_argument, "cannot read " << size << ", more than signed size allows"); + } + if (fd < 0) { + throw runtime_error("read: trying to read on a closed file descriptor"); + } else { + while (size) { + int rv = pread(fd, buf, size, offset); + if (rv < 0) { + if (errno == EINTR) { + CHATTER(__func__ << "resuming after EINTR"); + continue; + } + THROW_ERRNO("pread: " << size << " bytes"); + } + if (rv != static_cast(size)) { + THROW_ERROR(runtime_error, "pread: " << size << " bytes at offset " << offset << " returned " << rv); + } + break; + } + } + } + + template<> + void + pread_or_die(int fd, string &text, off_t offset) + { + return pread_or_die(fd, const_cast(text.data()), text.size(), offset); + } + + template<> + void + pread_or_die>(int fd, vector &text, off_t offset) + { + return pread_or_die(fd, text.data(), text.size(), offset); + } + + 
template<> + void + pread_or_die>(int fd, vector &text, off_t offset) + { + return pread_or_die(fd, text.data(), text.size(), offset); + } + + Stat::Stat() + { + memset_zero(this); + } + + Stat & + Stat::lstat(const string &filename) + { + CHATTER_UNWIND("lstat " << filename); + DIE_IF_MINUS_ONE(::lstat(filename.c_str(), this)); + return *this; + } + + Stat & + Stat::fstat(int fd) + { + CHATTER_UNWIND("fstat " << fd); + DIE_IF_MINUS_ONE(::fstat(fd, this)); + return *this; + } + + Stat::Stat(int fd) + { + memset_zero(this); + fstat(fd); + } + + Stat::Stat(const string &filename) + { + memset_zero(this); + lstat(filename); + } + + string + readlink_or_die(const string &path) + { + // Start with a reasonable guess since it will usually work + off_t size = 4096; + while (size < 1048576) { + char buf[size + 1]; + int rv; + DIE_IF_MINUS_ONE(rv = readlink(path.c_str(), buf, size + 1)); + // No negative values allowed except -1 + THROW_CHECK1(runtime_error, rv, rv >= 0); + if (rv <= size) { + buf[rv] = 0; + return buf; + } + // cerr << "Retrying readlink(" << path << ", buf, " << size + 1 << ")" << endl; + // This is from the Linux readlink(2) man page (release 3.44). + // It only works when the filesystem reports st_size accurately for symlinks, + // and at least one doesn't, so we can't rely on it at all. + // size = lstat_or_die(path).st_size; + size *= 2; + } + THROW_ERROR(runtime_error, "readlink: maximum buffer size exceeded"); + } + + // Turn a FD into a human-recognizable filename OR an error message. + string + name_fd(int fd) + { + try { + ostringstream oss; + oss << "/proc/self/fd/" << fd; + return readlink_or_die(oss.str()); + } catch (exception &e) { + return string(e.what()); + } + } + + bool + assert_no_leaked_fds() + { + struct rlimit rlim; + int rv = getrlimit(RLIMIT_NOFILE, &rlim); + if (rv) { + perror("getrlimit(RLIMIT_NOFILE)"); + // Well, that sucked. Guess. + rlim.rlim_cur = 1024; + } + CHATTER("Checking for leaked FDs in range 3.." 
<< rlim.rlim_cur); + int leaked_fds = 0; + for (unsigned i = 3; i < rlim.rlim_cur; ++i) { + struct stat buf; + if (! fstat(i, &buf)) { + CHATTER("WARNING: fd " << i << " open at exit"); + ++leaked_fds; + } + } + CHATTER(leaked_fds << " leaked FD(s) found"); + return leaked_fds == 0; + } + + pair + socketpair_or_die(int domain, int type, int protocol) + { + pair rv; + int sv[2]; + DIE_IF_MINUS_ONE(socketpair(domain, type, protocol, sv)); + rv.first = sv[0]; + rv.second = sv[1]; + return rv; + } + + void + dup2_or_die(int fd_in, int fd_out) + { + DIE_IF_MINUS_ONE(dup2(fd_in, fd_out)); + } + + string + st_mode_ntoa(mode_t mode) + { + static const bits_ntoa_table table[] = { + NTOA_TABLE_ENTRY_BITS(S_IFMT), + NTOA_TABLE_ENTRY_BITS(S_IFSOCK), + NTOA_TABLE_ENTRY_BITS(S_IFLNK), + NTOA_TABLE_ENTRY_BITS(S_IFMT), + NTOA_TABLE_ENTRY_BITS(S_IFSOCK), + NTOA_TABLE_ENTRY_BITS(S_IFLNK), + NTOA_TABLE_ENTRY_BITS(S_IFREG), + NTOA_TABLE_ENTRY_BITS(S_IFBLK), + NTOA_TABLE_ENTRY_BITS(S_IFDIR), + NTOA_TABLE_ENTRY_BITS(S_IFCHR), + NTOA_TABLE_ENTRY_BITS(S_IFIFO), + NTOA_TABLE_ENTRY_BITS(S_ISUID), + NTOA_TABLE_ENTRY_BITS(S_ISGID), + NTOA_TABLE_ENTRY_BITS(S_ISVTX), + NTOA_TABLE_ENTRY_BITS(S_IRWXU), + NTOA_TABLE_ENTRY_BITS(S_IRUSR), + NTOA_TABLE_ENTRY_BITS(S_IWUSR), + NTOA_TABLE_ENTRY_BITS(S_IXUSR), + NTOA_TABLE_ENTRY_BITS(S_IRWXG), + NTOA_TABLE_ENTRY_BITS(S_IRGRP), + NTOA_TABLE_ENTRY_BITS(S_IWGRP), + NTOA_TABLE_ENTRY_BITS(S_IXGRP), + NTOA_TABLE_ENTRY_BITS(S_IRWXO), + NTOA_TABLE_ENTRY_BITS(S_IROTH), + NTOA_TABLE_ENTRY_BITS(S_IWOTH), + NTOA_TABLE_ENTRY_BITS(S_IXOTH), + NTOA_TABLE_ENTRY_END() + }; + return bits_ntoa(mode, table); + } + +}; diff --git a/lib/fs.cc b/lib/fs.cc new file mode 100644 index 0000000..9fceea9 --- /dev/null +++ b/lib/fs.cc @@ -0,0 +1,1050 @@ +#include "crucible/fs.h" + +#include "crucible/error.h" +#include "crucible/fd.h" +#include "crucible/limits.h" +#include "crucible/ntoa.h" +#include "crucible/string.h" +#include "crucible/uuid.h" + +// FS_IOC_FIEMAP +#include + 
+#include +#include +#include +#include + +#include + +namespace crucible { + + void + punch_hole(int fd, off_t offset, off_t len) + { +#ifdef FALLOC_FL_PUNCH_HOLE + DIE_IF_MINUS_ONE(::fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + offset, len)); +#else + (void)fd; + (void)offset; + (void)len; + throw runtime_error("FALLOC_FL_PUNCH_HOLE not implemented"); +#endif + } + + BtrfsExtentInfo::BtrfsExtentInfo(int dst_fd, off_t dst_offset) + { + memset_zero(this); + fd = dst_fd; + logical_offset = dst_offset; + } + + BtrfsExtentSame::BtrfsExtentSame(int src_fd, off_t src_offset, off_t src_length) : + m_fd(src_fd) + { + memset_zero(this); + logical_offset = src_offset; + length = src_length; + } + + BtrfsExtentSame::~BtrfsExtentSame() + { + } + + void + BtrfsExtentSame::add(int fd, off_t offset) + { + m_info.push_back(BtrfsExtentInfo(fd, offset)); + } + + ostream & + operator<<(ostream &os, const btrfs_ioctl_same_extent_info *info) + { + if (!info) { + return os << "btrfs_ioctl_same_extent_info NULL"; + } + os << "btrfs_ioctl_same_extent_info {"; + os << " .fd = " << info->fd; + if (info->fd >= 0) { + catch_all([&](){ + string fd_name = name_fd(info->fd); + os << " '" << fd_name << "'"; + }); + } + os << ", .logical_offset = " << to_hex(info->logical_offset); + os << ", .bytes_deduped = " << to_hex(info->bytes_deduped); + os << ", .status = " << info->status; + if (info->status < 0) { + os << " (" << strerror(-info->status) << ")"; + } + os << ", .reserved = " << info->reserved; + return os << " }"; + } + + ostream & + operator<<(ostream &os, const btrfs_ioctl_same_args *args) + { + if (!args) { + return os << "btrfs_ioctl_same_args NULL"; + } + os << "btrfs_ioctl_same_args {"; + os << " .logical_offset = " << to_hex(args->logical_offset); + os << ", .length = " << to_hex(args->length); + os << ", .dest_count = " << args->dest_count; + os << ", .reserved1 = " << args->reserved1; + os << ", .reserved2 = " << args->reserved2; + os << ", .info[] = {"; + for (int 
i = 0; i < args->dest_count; ++i) { + os << " [" << i << "] = " << &(args->info[i]) << ","; + } + return os << " }"; + } + + ostream & + operator<<(ostream &os, const BtrfsExtentSame &bes) + { + os << "BtrfsExtentSame {"; + os << " .m_fd = " << bes.m_fd; + if (bes.m_fd >= 0) { + catch_all([&](){ + string fd_name = name_fd(bes.m_fd); + os << " '" << fd_name << "'"; + }); + } + os << ", .logical_offset = " << to_hex(bes.logical_offset); + os << ", .length = " << to_hex(bes.length); + os << ", .dest_count = " << bes.dest_count; + os << ", .reserved1 = " << bes.reserved1; + os << ", .reserved2 = " << bes.reserved2; + os << ", .info[] = {"; + for (size_t i = 0; i < bes.m_info.size(); ++i) { + os << " [" << i << "] = " << &(bes.m_info[i]) << ","; + } + return os << " }"; + } + + void + btrfs_clone_range(int src_fd, off_t src_offset, off_t src_length, int dst_fd, off_t dst_offset) + { + struct btrfs_ioctl_clone_range_args args; + memset_zero(&args); + args.src_fd = src_fd; + args.src_offset = src_offset; + args.src_length = src_length; + args.dest_offset = dst_offset; + DIE_IF_MINUS_ONE(ioctl(dst_fd, BTRFS_IOC_CLONE_RANGE, &args)); + } + + // Userspace emulation of extent-same ioctl to work around kernel bugs + // (a memory leak, a deadlock, inability to cope with unaligned EOF, and a length limit) + // The emulation is incomplete: no locking, and we always change ctime + void + BtrfsExtentSameByClone::do_ioctl() + { + if (length <= 0) { + throw out_of_range(string("length = 0 in ") + __PRETTY_FUNCTION__); + } + vector cmp_buf_common(length); + vector cmp_buf_iter(length); + pread_or_die(m_fd, cmp_buf_common.data(), length, logical_offset); + for (auto i = m_info.begin(); i != m_info.end(); ++i) { + i->status = -EIO; + i->bytes_deduped = 0; + + // save atime/mtime so they can be restored after the clone (ctime still changes) + Stat target_stat(i->fd); + + pread_or_die(i->fd, cmp_buf_iter.data(), length, i->logical_offset); + if (cmp_buf_common == cmp_buf_iter) { + + // This never happens, so stop checking.
+ // assert(!memcmp(cmp_buf_common.data(), cmp_buf_iter.data(), length)); + + btrfs_clone_range(m_fd, logical_offset, length, i->fd, i->logical_offset); + i->status = 0; + i->bytes_deduped = length; + + // The extent-same ioctl does not change mtime (as of patch v4) + struct timespec restore_ts[2] = { + target_stat.st_atim, + target_stat.st_mtim + }; + + // Ignore futimens failure as the real extent-same ioctl would never raise it + futimens(i->fd, restore_ts); + + } else { + assert(memcmp(cmp_buf_common.data(), cmp_buf_iter.data(), length)); + i->status = BTRFS_SAME_DATA_DIFFERS; + } + } + } + + void + BtrfsExtentSame::do_ioctl() + { + dest_count = m_info.size(); + vector ioctl_arg = vector_copy_struct(this); + ioctl_arg.resize(sizeof(btrfs_ioctl_same_args) + dest_count * sizeof(btrfs_ioctl_same_extent_info), 0); + btrfs_ioctl_same_args *ioctl_ptr = reinterpret_cast(ioctl_arg.data()); + size_t count = 0; + for (auto i = m_info.cbegin(); i != m_info.cend(); ++i) { + ioctl_ptr->info[count] = static_cast(m_info[count]); + ++count; + } + int rv = ioctl(m_fd, BTRFS_IOC_FILE_EXTENT_SAME, ioctl_ptr); + if (rv) { + THROW_ERRNO("After FILE_EXTENT_SAME (fd = " << m_fd << " '" << name_fd(m_fd) << "') : " << ioctl_ptr); + } + count = 0; + for (auto i = m_info.cbegin(); i != m_info.cend(); ++i) { + static_cast(m_info[count]) = ioctl_ptr->info[count]; + ++count; + } + } + + bool + btrfs_extent_same(int src_fd, off_t src_offset, off_t src_length, int dst_fd, off_t dst_offset) + { + THROW_CHECK1(invalid_argument, src_length, src_length > 0); + while (src_length > 0) { + off_t length = min(off_t(BTRFS_MAX_DEDUPE_LEN), src_length); + BtrfsExtentSame bes(src_fd, src_offset, length); + bes.add(dst_fd, dst_offset); + bes.do_ioctl(); + auto status = bes.m_info.at(0).status; + if (status == 0) { + src_offset += length; + dst_offset += length; + src_length -= length; + continue; + } + if (status == BTRFS_SAME_DATA_DIFFERS) { + return false; + } + if (status < 0) { + 
THROW_ERRNO_VALUE(-status, "btrfs-extent-same: " << bes); + } + // THROW_ERROR(runtime_error, "btrfs-extent-same src_fd " << name_fd(src_fd) << " src_offset " << src_offset << " length " << length << " dst_fd " << name_fd(dst_fd) << " dst_offset " << dst_offset << " status " << status); + THROW_ERROR(runtime_error, "btrfs-extent-same unknown status " << status << ": " << bes); + } + return true; + } + + BtrfsDataContainer::BtrfsDataContainer(size_t buf_size) : + m_data(buf_size, 0) + { + } + + void * + BtrfsDataContainer::prepare() + { + btrfs_data_container *p = reinterpret_cast(m_data.data()); + size_t min_size = offsetof(btrfs_data_container, val); + size_t container_size = m_data.size(); + if (container_size < min_size) { + THROW_ERROR(out_of_range, "container size " << container_size << " smaller than minimum " << min_size); + } + p->bytes_left = 0; + p->bytes_missing = 0; + p->elem_cnt = 0; + p->elem_missed = 0; + return p; + } + + size_t + BtrfsDataContainer::get_size() const + { + return m_data.size(); + } + + decltype(btrfs_data_container::bytes_left) + BtrfsDataContainer::get_bytes_left() const + { + return bytes_left; + } + + decltype(btrfs_data_container::bytes_missing) + BtrfsDataContainer::get_bytes_missing() const + { + return bytes_missing; + } + + decltype(btrfs_data_container::elem_cnt) + BtrfsDataContainer::get_elem_cnt() const + { + return elem_cnt; + } + + decltype(btrfs_data_container::elem_missed) + BtrfsDataContainer::get_elem_missed() const + { + return elem_missed; + } + + ostream & + operator<<(ostream &os, const BtrfsIoctlLogicalInoArgs *p) + { + if (!p) { + return os << "BtrfsIoctlLogicalInoArgs NULL"; + } + os << "BtrfsIoctlLogicalInoArgs {"; + os << " .logical = " << to_hex(p->logical); + os << " .inodes[] = {\n"; + unsigned count = 0; + for (auto i = p->m_iors.cbegin(); i != p->m_iors.cend(); ++i) { + os << "\t\t[" << count++ << "] = " << *i << ",\n"; + } + os << "}\n"; + return os; + } + + 
BtrfsIoctlLogicalInoArgs::BtrfsIoctlLogicalInoArgs(uint64_t new_logical, size_t new_size) : + m_container(new_size) + { + memset_zero(this); + logical = new_logical; + } + + bool + BtrfsIoctlLogicalInoArgs::do_ioctl_nothrow(int fd) + { + btrfs_ioctl_logical_ino_args *p = static_cast(this); + inodes = reinterpret_cast(m_container.prepare()); + size = m_container.get_size(); + + m_iors.clear(); + + if (ioctl(fd, BTRFS_IOC_LOGICAL_INO, p)) { + return false; + } + + btrfs_data_container *bdc = reinterpret_cast(p->inodes); + BtrfsInodeOffsetRoot *input_iter = reinterpret_cast(bdc->val); + m_iors.reserve(bdc->elem_cnt); + + for (auto count = bdc->elem_cnt; count > 2; count -= 3) { + m_iors.push_back(*input_iter++); + } + + return true; + } + + void + BtrfsIoctlLogicalInoArgs::do_ioctl(int fd) { + if (!do_ioctl_nothrow(fd)) { + THROW_ERRNO("BTRFS_IOC_LOGICAL_INO: " << name_fd(fd) << ", " << this); + } + } + + ostream & + operator<<(ostream &os, const BtrfsInodeOffsetRoot &ior) + { + os << "BtrfsInodeOffsetRoot {"; + os << " .m_inum = " << ior.m_inum << ","; + os << " .m_offset = " << to_hex(ior.m_offset) << ","; + os << " .m_root = " << ior.m_root; + os << " }"; + return os; + } + + BtrfsIoctlInoPathArgs::BtrfsIoctlInoPathArgs(uint64_t inode, size_t new_size) : + m_container(new_size) + { + memset_zero(this); + inum = inode; + } + + bool + BtrfsIoctlInoPathArgs::do_ioctl_nothrow(int fd) + { + btrfs_ioctl_ino_path_args *p = static_cast(this); + fspath = reinterpret_cast(m_container.prepare()); + size = m_container.get_size(); + + m_paths.clear(); + + if (ioctl(fd, BTRFS_IOC_INO_PATHS, p) < 0) { + return false; + } + + btrfs_data_container *bdc = reinterpret_cast(p->fspath); + m_paths.reserve(bdc->elem_cnt); + + const uint64_t *up = reinterpret_cast(bdc->val); + const char *cp = reinterpret_cast(bdc->val); + + for (auto count = bdc->elem_cnt; count > 0; --count) { + const char *path = cp + *up++; + if (static_cast(path - cp) > m_container.get_size()) { + 
THROW_ERROR(out_of_range, "offset " << (path - cp) << " > size " << m_container.get_size() << " in " << __PRETTY_FUNCTION__); + } + m_paths.push_back(string(path)); + } + + return true; + } + + void + BtrfsIoctlInoPathArgs::do_ioctl(int fd) { + if (!do_ioctl_nothrow(fd)) { + THROW_ERRNO("BTRFS_IOC_INO_PATHS: " << name_fd(fd)); + } + } + + // Print the inode number and every path currently held in m_paths. + ostream & + operator<<(ostream &os, const BtrfsIoctlInoPathArgs &ipa) + { + // ipa is a reference and can never be null, so there is no NULL case to print + os << "BtrfsIoctlInoPathArgs {"; + os << " .inum = " << ipa.inum; + os << " .paths[] = {\n"; + unsigned count = 0; + for (const auto &path : ipa.m_paths) { + os << "\t\t[" << count++ << "] = \"" << path << "\",\n"; + } + os << "\t}\n"; + return os; + } + + BtrfsIoctlInoLookupArgs::BtrfsIoctlInoLookupArgs(uint64_t new_objectid) + { + memset_zero(this); + objectid = new_objectid; + } + + bool + BtrfsIoctlInoLookupArgs::do_ioctl_nothrow(int fd) + { + btrfs_ioctl_ino_lookup_args *ioctl_ptr = static_cast(this); + return ioctl(fd, BTRFS_IOC_INO_LOOKUP, ioctl_ptr) == 0; + } + + void + BtrfsIoctlInoLookupArgs::do_ioctl(int fd) { + if (!do_ioctl_nothrow(fd)) { + THROW_ERRNO("BTRFS_IOC_INO_LOOKUP: " << name_fd(fd)); + } + } + + BtrfsIoctlDefragRangeArgs::BtrfsIoctlDefragRangeArgs() + { + memset_zero(this); + } + + bool + BtrfsIoctlDefragRangeArgs::do_ioctl_nothrow(int fd) + { + btrfs_ioctl_defrag_range_args *ioctl_ptr = static_cast(this); + return 0 == ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, ioctl_ptr); + } + + void + BtrfsIoctlDefragRangeArgs::do_ioctl(int fd) + { + if (!do_ioctl_nothrow(fd)) { + THROW_ERRNO("BTRFS_IOC_DEFRAG_RANGE: " << name_fd(fd)); + } + } + + string + btrfs_ioctl_defrag_range_flags_ntoa(uint64_t flags) + { + static const bits_ntoa_table table[] = { + NTOA_TABLE_ENTRY_BITS(BTRFS_DEFRAG_RANGE_COMPRESS), + NTOA_TABLE_ENTRY_BITS(BTRFS_DEFRAG_RANGE_START_IO), + NTOA_TABLE_ENTRY_END() + }; + return bits_ntoa(flags, table); + } + +
string + btrfs_ioctl_defrag_range_compress_type_ntoa(uint32_t compress_type) + { + static const bits_ntoa_table table[] = { + NTOA_TABLE_ENTRY_ENUM(BTRFS_COMPRESS_ZLIB), + NTOA_TABLE_ENTRY_ENUM(BTRFS_COMPRESS_LZO), + NTOA_TABLE_ENTRY_END() + }; + return bits_ntoa(compress_type, table); + } + + ostream & + operator<<(ostream &os, const BtrfsIoctlDefragRangeArgs *p) + { + if (!p) { + return os << "BtrfsIoctlDefragRangeArgs NULL"; + } + os << "BtrfsIoctlDefragRangeArgs {"; + os << " .start = " << p->start; + os << " .len = " << p->len; + os << " .flags = " << btrfs_ioctl_defrag_range_flags_ntoa(p->flags); + os << " .extent_thresh = " << p->extent_thresh; + os << " .compress_type = " << btrfs_ioctl_defrag_range_compress_type_ntoa(p->compress_type); + os << " .unused[4] = { " << p->unused[0] << ", " << p->unused[1] << ", " << p->unused[2] << ", " << p->unused[3] << "} }"; + return os; + } + + FiemapExtent::FiemapExtent() + { + memset_zero(this); + } + + FiemapExtent::FiemapExtent(const fiemap_extent &that) + { + static_cast(*this) = that; + } + + FiemapExtent::operator bool() const + { + return fe_length; + } + + off_t + FiemapExtent::begin() const + { + return ranged_cast(fe_logical); + } + + off_t + FiemapExtent::end() const + { + return ranged_cast(fe_logical + fe_length); + } + + string + fiemap_extent_flags_ntoa(unsigned long flags) + { + static const bits_ntoa_table table[] = { + NTOA_TABLE_ENTRY_BITS(FIEMAP_EXTENT_LAST), + NTOA_TABLE_ENTRY_BITS(FIEMAP_EXTENT_UNKNOWN), + NTOA_TABLE_ENTRY_BITS(FIEMAP_EXTENT_DELALLOC), + NTOA_TABLE_ENTRY_BITS(FIEMAP_EXTENT_ENCODED), + NTOA_TABLE_ENTRY_BITS(FIEMAP_EXTENT_DATA_ENCRYPTED), + NTOA_TABLE_ENTRY_BITS(FIEMAP_EXTENT_NOT_ALIGNED), + NTOA_TABLE_ENTRY_BITS(FIEMAP_EXTENT_DATA_INLINE), + NTOA_TABLE_ENTRY_BITS(FIEMAP_EXTENT_DATA_TAIL), + NTOA_TABLE_ENTRY_BITS(FIEMAP_EXTENT_UNWRITTEN), + NTOA_TABLE_ENTRY_BITS(FIEMAP_EXTENT_MERGED), + NTOA_TABLE_ENTRY_BITS(FIEMAP_EXTENT_SHARED), + NTOA_TABLE_ENTRY_END() + }; + return 
bits_ntoa(flags, table); + } + + ostream & + operator<<(ostream &os, const fiemap_extent *args) + { + if (!args) { + return os << "fiemap_extent NULL"; + } + os << "fiemap_extent {"; + os << " .fe_logical = " << to_hex(args->fe_logical) << ".." << to_hex(args->fe_logical + args->fe_length); + os << ", .fe_physical = " << to_hex(args->fe_physical) << ".." << to_hex(args->fe_physical + args->fe_length); + os << ", .fe_length = " << to_hex(args->fe_length); + if (args->fe_reserved64[0]) os << ", .fe_reserved64[0] = " << args->fe_reserved64[0]; + if (args->fe_reserved64[1]) os << ", .fe_reserved64[1] = " << args->fe_reserved64[1]; + if (args->fe_flags) os << ", .fe_flags = " << fiemap_extent_flags_ntoa(args->fe_flags); + if (args->fe_reserved[0]) os << ", .fe_reserved[0] = " << args->fe_reserved[0]; + if (args->fe_reserved[1]) os << ", .fe_reserved[1] = " << args->fe_reserved[1]; + if (args->fe_reserved[2]) os << ", .fe_reserved[2] = " << args->fe_reserved[2]; + return os << " }"; + } + + ostream & + operator<<(ostream &os, const FiemapExtent &args) + { + return os << static_cast(&args); + } + + string + fiemap_flags_ntoa(unsigned long flags) + { + static const bits_ntoa_table table[] = { + NTOA_TABLE_ENTRY_BITS(FIEMAP_FLAGS_COMPAT), + NTOA_TABLE_ENTRY_BITS(FIEMAP_FLAG_SYNC), + NTOA_TABLE_ENTRY_BITS(FIEMAP_FLAG_XATTR), + NTOA_TABLE_ENTRY_BITS(FIEMAP_FLAG_CACHE), + NTOA_TABLE_ENTRY_END() + }; + return bits_ntoa(flags, table); + } + + ostream & + operator<<(ostream &os, const fiemap *args) + { + if (!args) { + return os << "fiemap NULL"; + } + os << "fiemap {"; + os << " .fm_start = " << to_hex(args->fm_start) << ".." 
<< to_hex(args->fm_start + args->fm_length); + os << ", .fm_length = " << to_hex(args->fm_length); + if (args->fm_flags) os << ", .fm_flags = " << fiemap_flags_ntoa(args->fm_flags); + os << ", .fm_mapped_extents = " << args->fm_mapped_extents; + os << ", .fm_extent_count = " << args->fm_extent_count; + if (args->fm_reserved) os << ", .fm_reserved = " << args->fm_reserved; + os << ", .fm_extents[] = {"; + for (uint32_t i = 0; i < args->fm_mapped_extents; ++i) { + os << "\n\t[" << i << "] = " << &(args->fm_extents[i]) << ","; + } + return os << "\n}"; + } + + ostream & + operator<<(ostream &os, const Fiemap &args) + { + os << "Fiemap {"; + os << " .fm_start = " << to_hex(args.fm_start) << ".." << to_hex(args.fm_start + args.fm_length); + os << ", .fm_length = " << to_hex(args.fm_length); + if (args.fm_flags) os << ", .fm_flags = " << fiemap_flags_ntoa(args.fm_flags); + os << ", .fm_mapped_extents = " << args.fm_mapped_extents; + os << ", .fm_extent_count = " << args.fm_extent_count; + if (args.fm_reserved) os << ", .fm_reserved = " << args.fm_reserved; + os << ", .fm_extents[] = {"; + size_t count = 0; + for (auto i = args.m_extents.cbegin(); i != args.m_extents.cend(); ++i) { + os << "\n\t[" << count++ << "] = " << &(*i) << ","; + } + return os << "\n}"; + } + + Fiemap::Fiemap(uint64_t start, uint64_t length) + { + memset_zero(this); + fm_start = start; + fm_length = length; + // FIEMAP is slow and full of lies. + // This makes FIEMAP even slower, but reduces the lies a little.
+ fm_flags = FIEMAP_FLAG_SYNC; + } + + void + Fiemap::do_ioctl(int fd) + { + CHECK_CONSTRAINT(m_min_count, m_min_count <= m_max_count); + + auto extent_count = m_min_count; + vector ioctl_arg = vector_copy_struct(this); + + ioctl_arg.resize(sizeof(fiemap) + extent_count * sizeof(fiemap_extent), 0); + + fiemap *ioctl_ptr = reinterpret_cast(ioctl_arg.data()); + + auto start = fm_start; + auto end = fm_start + fm_length; + + auto orig_start = fm_start; + auto orig_length = fm_length; + + vector extents; + + while (start < end && extents.size() < m_max_count) { + ioctl_ptr->fm_start = start; + ioctl_ptr->fm_length = end - start; + ioctl_ptr->fm_extent_count = extent_count; + ioctl_ptr->fm_mapped_extents = 0; + + // cerr << "Before (fd = " << fd << ") : " << ioctl_ptr << endl; + DIE_IF_MINUS_ONE(ioctl(fd, FS_IOC_FIEMAP, ioctl_ptr)); + // cerr << " After (fd = " << fd << ") : " << ioctl_ptr << endl; + + auto extents_left = ioctl_ptr->fm_mapped_extents; + if (extents_left == 0) { + start = end; + break; + } + + fiemap_extent *fep = ioctl_ptr->fm_extents; + while (extents_left-- && extents.size() < m_max_count) { + extents.push_back(FiemapExtent(*fep)); + if (fep->fe_flags & FIEMAP_EXTENT_LAST) { + assert(extents_left == 0); + start = end; + break; + } else { + start = fep->fe_logical + fep->fe_length; + } + ++fep; + } + } + + fiemap *this_ptr = static_cast(this); + *this_ptr = *ioctl_ptr; + fm_start = orig_start; + fm_length = orig_length; + fm_extent_count = extents.size(); + m_extents = extents; + } + + BtrfsIoctlSearchKey::BtrfsIoctlSearchKey(size_t buf_size) : + m_buf_size(buf_size) + { + memset_zero(this); + max_objectid = numeric_limits::max(); + max_offset = numeric_limits::max(); + max_transid = numeric_limits::max(); + max_type = numeric_limits::max(); + nr_items = numeric_limits::max(); + } + + BtrfsIoctlSearchHeader::BtrfsIoctlSearchHeader() + { + memset_zero(this); + } + + size_t + BtrfsIoctlSearchHeader::set_data(const vector &v, size_t offset) + { + 
THROW_CHECK2(invalid_argument, offset, v.size(), offset + sizeof(btrfs_ioctl_search_header) <= v.size()); + memcpy(this, &v[offset], sizeof(btrfs_ioctl_search_header)); + offset += sizeof(btrfs_ioctl_search_header); + THROW_CHECK2(invalid_argument, offset + len, v.size(), offset + len <= v.size()); + m_data = vector(&v[offset], &v[offset + len]); + return offset + len; + } + + bool + BtrfsIoctlSearchKey::do_ioctl_nothrow(int fd) + { + vector ioctl_arg = vector_copy_struct(this); + ioctl_arg.resize(sizeof(btrfs_ioctl_search_args_v2) + m_buf_size, 0); + btrfs_ioctl_search_args_v2 *ioctl_ptr = reinterpret_cast(ioctl_arg.data()); + + ioctl_ptr->buf_size = m_buf_size; + + // Don't bother supporting V1. Kernels that old have other problems. + int rv = ioctl(fd, BTRFS_IOC_TREE_SEARCH_V2, ioctl_ptr); + if (rv != 0) { + return false; + } + + static_cast(*this) = ioctl_ptr->key; + + m_result.clear(); + m_result.reserve(nr_items); + + size_t offset = pointer_distance(ioctl_ptr->buf, ioctl_ptr); + for (decltype(nr_items) i = 0; i < nr_items; ++i) { + BtrfsIoctlSearchHeader item; + offset = item.set_data(ioctl_arg, offset); + m_result.push_back(item); + } + + return true; + } + + void + BtrfsIoctlSearchKey::do_ioctl(int fd) + { + if (!do_ioctl_nothrow(fd)) { + THROW_ERRNO("BTRFS_IOC_TREE_SEARCH_V2: " << name_fd(fd)); + } + } + + void + BtrfsIoctlSearchKey::next_min(const BtrfsIoctlSearchHeader &ref) + { + min_objectid = ref.objectid; + min_type = ref.type; + min_offset = ref.offset + 1; + if (min_offset < ref.offset) { + // We wrapped, try the next objectid + ++min_objectid; + } + } + + ostream &hexdump(ostream &os, const vector &v) + { + os << "vector { size = " << v.size() << ", data:\n"; + for (size_t i = 0; i < v.size(); i += 8) { + string hex, ascii; + for (size_t j = i; j < i + 8; ++j) { + if (j < v.size()) { + unsigned char c = v[j]; + char buf[8]; + sprintf(buf, "%02x ", c); + hex += buf; + ascii += (c < 32 || c > 126) ? '.' 
: c; + } else { + hex += " "; + ascii += ' '; + } + } + os << astringprintf("\t%08x %s %s\n", i, hex.c_str(), ascii.c_str()); + } + return os << "}"; + } + + string + btrfs_search_type_ntoa(unsigned type) + { + static const bits_ntoa_table table[] = { + NTOA_TABLE_ENTRY_ENUM(BTRFS_INODE_ITEM_KEY), + NTOA_TABLE_ENTRY_ENUM(BTRFS_INODE_REF_KEY), + NTOA_TABLE_ENTRY_ENUM(BTRFS_INODE_EXTREF_KEY), + NTOA_TABLE_ENTRY_ENUM(BTRFS_XATTR_ITEM_KEY), + NTOA_TABLE_ENTRY_ENUM(BTRFS_ORPHAN_ITEM_KEY), + NTOA_TABLE_ENTRY_ENUM(BTRFS_DIR_LOG_ITEM_KEY), + NTOA_TABLE_ENTRY_ENUM(BTRFS_DIR_LOG_INDEX_KEY), + NTOA_TABLE_ENTRY_ENUM(BTRFS_DIR_ITEM_KEY), + NTOA_TABLE_ENTRY_ENUM(BTRFS_DIR_INDEX_KEY), + NTOA_TABLE_ENTRY_ENUM(BTRFS_EXTENT_DATA_KEY), + NTOA_TABLE_ENTRY_ENUM(BTRFS_CSUM_ITEM_KEY), + NTOA_TABLE_ENTRY_ENUM(BTRFS_EXTENT_CSUM_KEY), + NTOA_TABLE_ENTRY_ENUM(BTRFS_ROOT_ITEM_KEY), + NTOA_TABLE_ENTRY_ENUM(BTRFS_ROOT_BACKREF_KEY), + NTOA_TABLE_ENTRY_ENUM(BTRFS_ROOT_REF_KEY), + NTOA_TABLE_ENTRY_ENUM(BTRFS_EXTENT_ITEM_KEY), + NTOA_TABLE_ENTRY_ENUM(BTRFS_METADATA_ITEM_KEY), + NTOA_TABLE_ENTRY_ENUM(BTRFS_TREE_BLOCK_REF_KEY), + NTOA_TABLE_ENTRY_ENUM(BTRFS_EXTENT_DATA_REF_KEY), + NTOA_TABLE_ENTRY_ENUM(BTRFS_EXTENT_REF_V0_KEY), + NTOA_TABLE_ENTRY_ENUM(BTRFS_SHARED_BLOCK_REF_KEY), + NTOA_TABLE_ENTRY_ENUM(BTRFS_SHARED_DATA_REF_KEY), + NTOA_TABLE_ENTRY_ENUM(BTRFS_BLOCK_GROUP_ITEM_KEY), +#ifdef BTRFS_FREE_SPACE_INFO_KEY + NTOA_TABLE_ENTRY_ENUM(BTRFS_FREE_SPACE_INFO_KEY), +#endif +#ifdef BTRFS_FREE_SPACE_EXTENT_KEY + NTOA_TABLE_ENTRY_ENUM(BTRFS_FREE_SPACE_EXTENT_KEY), +#endif +#ifdef BTRFS_FREE_SPACE_BITMAP_KEY + NTOA_TABLE_ENTRY_ENUM(BTRFS_FREE_SPACE_BITMAP_KEY), +#endif + NTOA_TABLE_ENTRY_ENUM(BTRFS_DEV_EXTENT_KEY), + NTOA_TABLE_ENTRY_ENUM(BTRFS_DEV_ITEM_KEY), + NTOA_TABLE_ENTRY_ENUM(BTRFS_CHUNK_ITEM_KEY), + NTOA_TABLE_ENTRY_ENUM(BTRFS_BALANCE_ITEM_KEY), + NTOA_TABLE_ENTRY_ENUM(BTRFS_QGROUP_STATUS_KEY), + NTOA_TABLE_ENTRY_ENUM(BTRFS_QGROUP_INFO_KEY), + NTOA_TABLE_ENTRY_ENUM(BTRFS_QGROUP_LIMIT_KEY), + 
NTOA_TABLE_ENTRY_ENUM(BTRFS_QGROUP_RELATION_KEY), + NTOA_TABLE_ENTRY_ENUM(BTRFS_DEV_STATS_KEY), + NTOA_TABLE_ENTRY_ENUM(BTRFS_DEV_REPLACE_KEY), + NTOA_TABLE_ENTRY_ENUM(BTRFS_UUID_KEY_SUBVOL), + NTOA_TABLE_ENTRY_ENUM(BTRFS_UUID_KEY_RECEIVED_SUBVOL), + NTOA_TABLE_ENTRY_ENUM(BTRFS_STRING_ITEM_KEY), + NTOA_TABLE_ENTRY_END() + }; + return bits_ntoa(type, table); + } + + string + btrfs_search_objectid_ntoa(unsigned objectid) + { + static const bits_ntoa_table table[] = { + NTOA_TABLE_ENTRY_ENUM(BTRFS_ROOT_TREE_OBJECTID), + NTOA_TABLE_ENTRY_ENUM(BTRFS_EXTENT_TREE_OBJECTID), + NTOA_TABLE_ENTRY_ENUM(BTRFS_CHUNK_TREE_OBJECTID), + NTOA_TABLE_ENTRY_ENUM(BTRFS_DEV_TREE_OBJECTID), + NTOA_TABLE_ENTRY_ENUM(BTRFS_FS_TREE_OBJECTID), + NTOA_TABLE_ENTRY_ENUM(BTRFS_ROOT_TREE_DIR_OBJECTID), + NTOA_TABLE_ENTRY_ENUM(BTRFS_CSUM_TREE_OBJECTID), + NTOA_TABLE_ENTRY_ENUM(BTRFS_QUOTA_TREE_OBJECTID), + NTOA_TABLE_ENTRY_ENUM(BTRFS_UUID_TREE_OBJECTID), +#ifdef BTRFS_FREE_SPACE_TREE_OBJECTID + NTOA_TABLE_ENTRY_ENUM(BTRFS_FREE_SPACE_TREE_OBJECTID), +#endif + NTOA_TABLE_ENTRY_ENUM(BTRFS_BALANCE_OBJECTID), + NTOA_TABLE_ENTRY_ENUM(BTRFS_ORPHAN_OBJECTID), + NTOA_TABLE_ENTRY_ENUM(BTRFS_TREE_LOG_OBJECTID), + NTOA_TABLE_ENTRY_ENUM(BTRFS_TREE_LOG_FIXUP_OBJECTID), + NTOA_TABLE_ENTRY_ENUM(BTRFS_TREE_RELOC_OBJECTID), + NTOA_TABLE_ENTRY_ENUM(BTRFS_DATA_RELOC_TREE_OBJECTID), + NTOA_TABLE_ENTRY_ENUM(BTRFS_EXTENT_CSUM_OBJECTID), + NTOA_TABLE_ENTRY_ENUM(BTRFS_FREE_SPACE_OBJECTID), + NTOA_TABLE_ENTRY_ENUM(BTRFS_FREE_INO_OBJECTID), + // One of these is not an objectid + NTOA_TABLE_ENTRY_ENUM(BTRFS_MULTIPLE_OBJECTIDS), + NTOA_TABLE_ENTRY_ENUM(BTRFS_FIRST_FREE_OBJECTID), + NTOA_TABLE_ENTRY_ENUM(BTRFS_LAST_FREE_OBJECTID), + NTOA_TABLE_ENTRY_ENUM(BTRFS_FIRST_CHUNK_TREE_OBJECTID), + NTOA_TABLE_ENTRY_ENUM(BTRFS_DEV_ITEMS_OBJECTID), + NTOA_TABLE_ENTRY_END() + }; + return bits_ntoa(objectid, table); + } + + ostream & + operator<<(ostream &os, const btrfs_ioctl_search_key &key) + { + return os << "btrfs_ioctl_search_key {" + 
<< " tree_id = " << key.tree_id + << ", min_objectid = " << key.min_objectid + << ", max_objectid = " << key.max_objectid + << ", min_offset = " << key.min_offset + << ", max_offset = " << key.max_offset + << ", min_transid = " << key.min_transid + << ", max_transid = " << key.max_transid + << ", min_type = " << key.min_type + << ", max_type = " << key.max_type + << ", nr_items = " << key.nr_items + << ", unused = " << key.unused + << ", unused1 = " << key.unused1 + << ", unused2 = " << key.unused2 + << ", unused3 = " << key.unused3 + << ", unused4 = " << key.unused4 + << " }"; + } + + ostream & + operator<<(ostream &os, const btrfs_ioctl_search_header &hdr) + { + return os << "btrfs_ioctl_search_header {" + << " transid = " << hdr.transid + << ", objectid = " << btrfs_search_objectid_ntoa(hdr.objectid) << " (" << hdr.objectid << ")" + << ", offset = " << hdr.offset + << ", type = " << btrfs_search_type_ntoa(hdr.type) << " (" << hdr.type << ")" + << ", len = " << hdr.len + << " }"; + } + + ostream & + operator<<(ostream &os, const BtrfsIoctlSearchHeader &hdr) + { + os << "BtrfsIoctlSearchHeader { " + << static_cast(hdr) + << ", data = "; + hexdump(os, hdr.m_data); + return os << "}"; + } + + ostream & + operator<<(ostream &os, const BtrfsIoctlSearchKey &key) + { + os << "BtrfsIoctlSearchKey { " + << static_cast(key) + << ", buf_size = " << key.m_buf_size + << ", buf[" << key.m_result.size() << "] = {"; + for (auto e : key.m_result) { + os << "\n\t" << e; + } + return os << "}}"; + } + + uint64_t + btrfs_get_root_id(int fd) + { + BtrfsIoctlInoLookupArgs biila(BTRFS_FIRST_FREE_OBJECTID); + biila.do_ioctl(fd); + return biila.treeid; + } + + uint64_t + btrfs_get_root_transid(int fd) + { + BtrfsIoctlSearchKey sk; + auto root_id = btrfs_get_root_id(fd); + sk.tree_id = BTRFS_ROOT_TREE_OBJECTID; + sk.min_objectid = root_id; + sk.max_objectid = root_id; + sk.max_type = BTRFS_ROOT_ITEM_KEY; + sk.min_type = BTRFS_ROOT_ITEM_KEY; + sk.nr_items = 4096; + uint64_t rv = 0; + do { 
+ sk.do_ioctl(fd); + if (sk.nr_items == 0) { + break; + } + for (auto i : sk.m_result) { + sk.min_objectid = i.objectid; + sk.min_type = i.type; + sk.min_offset = i.offset; + + if (i.objectid > root_id) { + break; + } + + if (i.objectid == root_id && i.type == BTRFS_ROOT_ITEM_KEY) { + rv = max(rv, uint64_t(call_btrfs_get(btrfs_root_generation, i.m_data))); + } + } + if (sk.min_offset < numeric_limits::max()) { + ++sk.min_offset; + } else { + break; + } + } while (sk.min_type == BTRFS_ROOT_ITEM_KEY && sk.min_objectid == sk.tree_id); + return rv; + } + + Statvfs::Statvfs() + { + memset_zero(this); + } + + Statvfs::Statvfs(int fd) : + Statvfs() + { + DIE_IF_NON_ZERO(::fstatvfs(fd, this)); + } + + Statvfs::Statvfs(string path) : + Statvfs() + { + DIE_IF_NON_ZERO(::statvfs(path.c_str(), this)); + } + + unsigned long + Statvfs::size() const + { + return f_frsize * f_blocks; + } + + unsigned long + Statvfs::free() const + { + return f_frsize * f_bfree; + } + + unsigned long + Statvfs::available() const + { + return f_frsize * f_bavail; + } + + ostream & + operator<<(ostream &os, const BtrfsIoctlFsInfoArgs &a) + { + os << "BtrfsIoctlFsInfoArgs {" + << " max_id = " << a.max_id << "," + << " num_devices = " << a.num_devices << "," + << " fsid = " << a.uuid() << "," +#if 0 + << " nodesize = " << a.nodesize << "," + << " sectorsize = " << a.sectorsize << "," + << " clone_alignment = " << a.clone_alignment << "," + << " reserved32 = " << a.reserved32; +#else + ; +#endif + // probably don't need to bother with the other 122 reserved fields + return os << " }"; + }; + + BtrfsIoctlFsInfoArgs::BtrfsIoctlFsInfoArgs() + { + memset_zero(this); + } + + void + BtrfsIoctlFsInfoArgs::do_ioctl(int fd) + { + btrfs_ioctl_fs_info_args *p = static_cast(this); + if (ioctl(fd, BTRFS_IOC_FS_INFO, p)) { + THROW_ERRNO("BTRFS_IOC_FS_INFO: fd " << fd); + } + } + + string + BtrfsIoctlFsInfoArgs::uuid() const + { + return uuid_unparse(fsid); + } + +}; diff --git a/lib/interp.cc b/lib/interp.cc new file 
mode 100644 index 0000000..5415e32 --- /dev/null +++ b/lib/interp.cc @@ -0,0 +1,96 @@ +#include "crucible/interp.h" + +#include "crucible/chatter.h" + +namespace crucible { + using namespace std; + + int + Proc::exec(const ArgList &args) + { + return m_cmd(args); + } + + Proc::Proc(const function &f) : + m_cmd(f) + { + } + + Command::~Command() + { + } + + ArgList::ArgList(const char **argv) + { + while (argv && *argv) { + push_back(*argv++); + } + } + + ArgList::ArgList(const vector &&that) : + vector(that) + { + } + + Interp::~Interp() + { + } + + Interp::Interp(const map > &cmdlist) : + m_commands(cmdlist) + { + } + + void + Interp::add_command(const string &name, const shared_ptr &command) + { + m_commands[name] = command; + } + + int + Interp::exec(const ArgList &args) + { + auto next_arg = args.begin(); + ++next_arg; + return m_commands.at(args[0])->exec(vector(next_arg, args.end())); + } + + ArgParser::~ArgParser() + { + } + + ArgParser::ArgParser() + { + } + + void + ArgParser::add_opt(string opt, ArgActor actor) + { + m_string_opts[opt] = actor; + } + + void + ArgParser::parse_backend(void *t, const ArgList &args) + { + bool quote_args = false; + for (string arg : args) { + if (quote_args) { + cerr << "arg: '" << arg << "'" << endl; + continue; + } + if (arg == "--") { + quote_args = true; + continue; + } + if (arg.compare(0, 2, "--") == 0) { + auto found = m_string_opts.find(arg.substr(2, string::npos)); + if (found != m_string_opts.end()) { + found->second.predicate(t, "foo"); + } + (void)t; + } + } + } + + +}; diff --git a/lib/ntoa.cc b/lib/ntoa.cc new file mode 100644 index 0000000..af3292f --- /dev/null +++ b/lib/ntoa.cc @@ -0,0 +1,40 @@ +#include "crucible/ntoa.h" + +#include +#include +#include + +namespace crucible { + using namespace std; + + string bits_ntoa(unsigned long n, const bits_ntoa_table *table) + { + string out; + while (n && table->a) { + // No bits in n outside of mask + assert( ((~table->mask) & table->n) == 0); + if ( (n & 
table->mask) == table->n) { + if (!out.empty()) { + out += "|"; + } + out += table->a; + n &= ~(table->mask); + } + ++table; + } + if (n) { + ostringstream oss; + oss << "0x" << hex << n; + if (!out.empty()) { + out += "|"; + } + out += oss.str(); + } + if (out.empty()) { + out = "0"; + } + return out; + } + + +}; diff --git a/lib/path.cc b/lib/path.cc new file mode 100644 index 0000000..c713e45 --- /dev/null +++ b/lib/path.cc @@ -0,0 +1,26 @@ +#include "crucible/path.h" + +#include "crucible/error.h" + +namespace crucible { + using namespace std; + + string + basename(string s) + { + size_t left = s.find_last_of("/"); + size_t right = s.find_last_not_of("/"); + if (left == string::npos) { + return s; + } + return s.substr(left + 1, right); + } + + string + join(string dir, string base) + { + // TODO: a lot of sanity checking, maybe canonicalization + return dir + "/" + base; + } + +}; diff --git a/lib/process.cc b/lib/process.cc new file mode 100644 index 0000000..a9ce153 --- /dev/null +++ b/lib/process.cc @@ -0,0 +1,121 @@ +#include "crucible/process.h" + +#include "crucible/chatter.h" +#include "crucible/error.h" + +#include + +// for gettid() +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include + +namespace crucible { + using namespace std; + + bool + Process::joinable() + { + return !!m_pid; + } + + Process::~Process() + { + if (joinable()) { + // because it's just not the same without the word "zombie"... 
+ CHATTER("ZOMBIE WARNING: joinable Process pid " << m_pid << " abandoned"); + } + } + + Process::Process() : + m_pid(0) + { + } + + Process::Process(Process &&move_from) : + m_pid(0) + { + swap(m_pid, move_from.m_pid); + } + + void + Process::do_fork(function child_func) + { + int rv = fork(); + if (rv < 0) { + THROW_ERRNO("fork failed"); + } + m_pid = rv; + + if (rv == 0) { + // child + catch_all([&]() { + int rv = child_func(); + exit(rv); + }); + terminate(); + } + } + + Process::status_type + Process::join() + { + if (m_pid == 0) { + THROW_ERROR(invalid_argument, "Process not created"); + } + + int status = 0; + pid_t rv = waitpid(m_pid, &status, 0); + if (rv == -1) { + THROW_ERRNO("waitpid failed, pid = " << m_pid); + } + if (rv != m_pid) { + THROW_ERROR(runtime_error, "waitpid failed, wanted pid = " << m_pid << ", got rv = " << rv << ", status = " << status); + } + m_pid = 0; + return status; + } + + void + Process::detach() + { + m_pid = 0; + } + + Process::native_handle_type + Process::native_handle() + { + return m_pid; + } + + Process::id + Process::get_id() + { + return m_pid; + } + + void + Process::kill(int sig) + { + if (!m_pid) { + THROW_ERROR(invalid_argument, "Process not created"); + } + + int rv = ::kill(m_pid, sig); + if (rv) { + THROW_ERRNO("killing process " << m_pid << " with signal " << sig); + } + } + + template<> + struct ResourceHandle; + + pid_t + gettid() + { + return syscall(SYS_gettid); + } + +} diff --git a/lib/string.cc b/lib/string.cc new file mode 100644 index 0000000..3285fa1 --- /dev/null +++ b/lib/string.cc @@ -0,0 +1,43 @@ +#include "crucible/string.h" + +#include "crucible/error.h" + +#include + +namespace crucible { + using namespace std; + + string + to_hex(uint64_t i) + { + return astringprintf("0x%" PRIx64, i); + } + + uint64_t + from_hex(const string &s) + { + return stoull(s, 0, 0); + } + + vector + split(string delim, string s) + { + if (delim.empty()) { + THROW_ERROR(invalid_argument, "delimiter empty when splitting 
'" << s << "'"); + } + vector rv; + size_t n = 0; + while (n < s.length()) { + size_t f = s.find(delim, n); + if (f == string::npos) { + rv.push_back(s.substr(n)); + break; + } + if (f > n) { + rv.push_back(s.substr(n, f - n)); + } + n = f + delim.length(); + } + return rv; + } +}; diff --git a/lib/time.cc b/lib/time.cc new file mode 100644 index 0000000..7e19343 --- /dev/null +++ b/lib/time.cc @@ -0,0 +1,158 @@ +#include "crucible/time.h" + +#include "crucible/error.h" + +#include +#include +#include +#include + +namespace crucible { + + double + nanosleep(double secs) + { + if (secs <= 0) return secs; + + struct timespec req; + req.tv_sec = time_t(floor(secs)); + req.tv_nsec = long((secs - floor(secs)) * 1000000000); + + // Just silently ignore weirdo values for now + if (req.tv_sec < 0) return secs; + if (req.tv_sec > 1000000000) return secs; + if (req.tv_nsec < 0) return secs; + if (req.tv_nsec > 1000000000) return secs; + + struct timespec rem; + rem.tv_sec = 0; + rem.tv_nsec = 0; + + int nanosleep_rv = ::nanosleep(&req, &rem); + if (nanosleep_rv) { + THROW_ERRNO("nanosleep (" << secs << ") { tv_sec = " << req.tv_sec << ", tv_nsec = " << req.tv_nsec << " }"); + } + return rem.tv_sec + (double(rem.tv_nsec) / 1000000000.0); + } + + Timer::Timer() : + m_start(chrono::high_resolution_clock::now()) + { + } + + double + Timer::age() const + { + chrono::high_resolution_clock::time_point end = chrono::high_resolution_clock::now(); + return chrono::duration(end - m_start).count(); + } + + double + Timer::report(int precision) const + { + return ceil(age() * precision) / precision; + } + + void + Timer::reset() + { + m_start = chrono::high_resolution_clock::now(); + } + + void + Timer::set(const chrono::high_resolution_clock::time_point &start) + { + m_start = start; + } + + void + Timer::set(double delta) + { + m_start += chrono::duration_cast(chrono::duration(delta)); + } + + double + Timer::lap() + { + auto end = chrono::high_resolution_clock::now(); + double rv = 
chrono::duration(end - m_start).count(); + m_start = end; + return rv; + } + + ostream & + operator<<(ostream &os, const Timer &t) + { + return os << t.report(); + } + + bool + Timer::operator<(double d) const + { + return age() < d; + } + + bool + Timer::operator>(double d) const + { + return age() > d; + } + + RateLimiter::RateLimiter(double rate, double burst) : + m_rate(rate), + m_burst(burst) + { + } + + RateLimiter::RateLimiter(double rate) : + m_rate(rate), + m_burst(rate) + { + } + + void + RateLimiter::update_tokens() + { + double delta = m_timer.lap(); + m_tokens += delta * m_rate; + if (m_tokens > m_burst) { + m_tokens = m_burst; + } + } + + void + RateLimiter::sleep_for(double cost) + { + borrow(cost); + while (1) { + unique_lock lock(m_mutex); + update_tokens(); + if (m_tokens >= 0) { + return; + } + double sleep_time(-m_tokens / m_rate); + lock.unlock(); + if (sleep_time > 0.0) { + nanosleep(sleep_time); + } else { + return; + } + } + } + + bool + RateLimiter::is_ready() + { + unique_lock lock(m_mutex); + update_tokens(); + return m_tokens >= 0; + } + + void + RateLimiter::borrow(double cost) + { + unique_lock lock(m_mutex); + m_tokens -= cost; + } + +} diff --git a/lib/uuid.cc b/lib/uuid.cc new file mode 100644 index 0000000..32c2958 --- /dev/null +++ b/lib/uuid.cc @@ -0,0 +1,16 @@ +#include "crucible/uuid.h" + +namespace crucible { + using namespace std; + + const size_t uuid_unparsed_size = 37; // "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx\0" + + string + uuid_unparse(const unsigned char in[16]) + { + char out[uuid_unparsed_size]; + ::uuid_unparse(in, out); + return string(out); + } + +} diff --git a/makeflags b/makeflags new file mode 100644 index 0000000..f3b0b6f --- /dev/null +++ b/makeflags @@ -0,0 +1,4 @@ +CCFLAGS = -Wall -Wextra -Werror -O3 -I../include -ggdb -fpic +# CCFLAGS = -Wall -Wextra -Werror -O0 -I../include -ggdb -fpic +CFLAGS = $(CCFLAGS) -std=c99 +CXXFLAGS = $(CCFLAGS) -std=c++11 -Wold-style-cast diff --git a/src/Makefile b/src/Makefile 
new file mode 100644 index 0000000..1db2927 --- /dev/null +++ b/src/Makefile @@ -0,0 +1,39 @@ +PROGRAMS = \ + ../bin/bees \ + ../bin/fiemap \ + ../bin/fiewalk \ + +all: $(PROGRAMS) depends.mk + +include ../makeflags + +LIBS = -lcrucible -lpthread +LDFLAGS = -L../lib -Wl,-rpath=$(shell realpath ../lib) + +depends.mk: Makefile *.cc + for x in *.cc; do $(CXX) $(CXXFLAGS) -M "$$x"; done > depends.mk.new + mv -fv depends.mk.new depends.mk + +-include depends.mk + +%.o: %.cc %.h + $(CXX) $(CXXFLAGS) -o "$@" -c "$<" + +../bin/%: %.o + @echo Implicit bin rule "$<" '->' "$@" + $(CXX) $(CXXFLAGS) -o "$@" "$<" $(LDFLAGS) $(LIBS) + +BEES_OBJS = \ + bees.o \ + bees-context.o \ + bees-hash.o \ + bees-resolve.o \ + bees-roots.o \ + bees-thread.o \ + bees-types.o \ + +../bin/bees: $(BEES_OBJS) + $(CXX) $(CXXFLAGS) -o "$@" $(BEES_OBJS) $(LDFLAGS) $(LIBS) + +clean: + -rm -fv *.o diff --git a/src/bees-context.cc b/src/bees-context.cc new file mode 100644 index 0000000..d4a78c7 --- /dev/null +++ b/src/bees-context.cc @@ -0,0 +1,1009 @@ +#include "bees.h" + +#include "crucible/limits.h" +#include "crucible/string.h" + +#include +#include + +using namespace crucible; +using namespace std; + +static inline +const char * +getenv_or_die(const char *name) +{ + const char *rv = getenv(name); + if (!rv) { + THROW_ERROR(runtime_error, "Environment variable " << name << " not defined"); + } + return rv; +} + +BeesFdCache::BeesFdCache() +{ + m_root_cache.func([&](shared_ptr ctx, uint64_t root) -> Fd { + return ctx->roots()->open_root_nocache(root); + }); + m_file_cache.func([&](shared_ptr ctx, uint64_t root, uint64_t ino) -> Fd { + return ctx->roots()->open_root_ino_nocache(root, ino); + }); +} + +Fd +BeesFdCache::open_root(shared_ptr ctx, uint64_t root) +{ + // Don't hold root FDs open too long. + // The open FDs prevent snapshots from being deleted. + // cleaner_kthread just keeps skipping over the open dir and all its children. 
+ if (m_root_cache_timer.age() > BEES_COMMIT_INTERVAL) { + BEESINFO("Clearing root FD cache to enable subvol delete"); + m_root_cache.clear(); + m_root_cache_timer.reset(); + BEESCOUNT(root_clear); + } + return m_root_cache(ctx, root); +} + +Fd +BeesFdCache::open_root_ino(shared_ptr ctx, uint64_t root, uint64_t ino) +{ + return m_file_cache(ctx, root, ino); +} + +void +BeesFdCache::insert_root_ino(shared_ptr ctx, Fd fd) +{ + BeesFileId fid(fd); + return m_file_cache.insert(fd, ctx, fid.root(), fid.ino()); +} + +mutex BeesWorkQueueBase::s_mutex; +set BeesWorkQueueBase::s_all_workers; + +BeesWorkQueueBase::BeesWorkQueueBase(const string &name) : + m_name(name) +{ +} + +BeesWorkQueueBase::~BeesWorkQueueBase() +{ + unique_lock lock(s_mutex); + s_all_workers.erase(this); +} + +void +BeesWorkQueueBase::for_each_work_queue(std::function f) +{ + unique_lock lock(s_mutex); + for (auto i : s_all_workers) { + f(i); + } +} + +string +BeesWorkQueueBase::name() const +{ + return m_name; +} + +void +BeesWorkQueueBase::name(const string &new_name) +{ + m_name = new_name; +} + +template +BeesWorkQueue::~BeesWorkQueue() +{ +} + +template +BeesWorkQueue::BeesWorkQueue(const string &name) : + BeesWorkQueueBase(name) +{ + unique_lock lock(s_mutex); + s_all_workers.insert(this); +} + +template +void +BeesWorkQueue::push_active(const Task &t) +{ + BEESNOTE("pushing task " << t); + m_active_queue.push(t); +} + +template +void +BeesWorkQueue::push_active(const Task &t, size_t limit) +{ + // BEESNOTE("pushing limit " << limit << " task " << t); + m_active_queue.push_wait(t, limit); +} + +template +size_t +BeesWorkQueue::active_size() const +{ + return m_active_queue.size(); +} + +template +list +BeesWorkQueue::peek_active(size_t count) const +{ + list rv; + for (auto i : m_active_queue.peek(count)) { + ostringstream oss; + oss << i; + rv.push_back(oss.str()); + } + return rv; +} + +template +Task +BeesWorkQueue::pop() +{ + return m_active_queue.pop(); +} + +void +BeesContext::dump_status() 
+{ + auto status_charp = getenv("BEESSTATUS"); + if (!status_charp) return; + string status_file(status_charp); + BEESLOG("Writing status to file '" << status_file << "' every " << BEES_STATUS_INTERVAL << " sec"); + while (1) { + BEESNOTE("waiting " << BEES_STATUS_INTERVAL); + sleep(BEES_STATUS_INTERVAL); + + BEESNOTE("writing status to file '" << status_file << "'"); + ofstream ofs(status_file + ".tmp"); + + auto thisStats = BeesStats::s_global; + ofs << "TOTAL:\n"; + ofs << "\t" << thisStats << "\n"; + auto avg_rates = thisStats / m_total_timer.age(); + ofs << "RATES:\n"; + ofs << "\t" << avg_rates << "\n"; + + ofs << "THREADS:\n"; + for (auto t : BeesNote::get_status()) { + ofs << "\ttid " << t.first << ": " << t.second << "\n"; + } + + BeesWorkQueueBase::for_each_work_queue([&](BeesWorkQueueBase *worker) { + ofs << "QUEUE: " << worker->name() << " active: " << worker->active_size() << "\n"; + for (auto t : worker->peek_active(10)) { + ofs << "\t" << t << "\n"; + } + }); + ofs.close(); + + BEESNOTE("renaming status file '" << status_file << "'"); + rename((status_file + ".tmp").c_str(), status_file.c_str()); + } +} + +void +BeesContext::show_progress() +{ + auto lastProgressStats = BeesStats::s_global; + auto lastStats = lastProgressStats; + Timer stats_timer; + while (1) { + sleep(BEES_PROGRESS_INTERVAL); + + if (stats_timer.age() > BEES_STATS_INTERVAL) { + stats_timer.lap(); + + auto thisStats = BeesStats::s_global; + auto avg_rates = lastStats / BEES_STATS_INTERVAL; + BEESLOG("TOTAL: " << thisStats); + BEESLOG("RATES: " << avg_rates); + lastStats = thisStats; + } + + BEESLOG("ACTIVITY:"); + + auto thisStats = BeesStats::s_global; + auto deltaStats = thisStats - lastProgressStats; + if (deltaStats) { + BEESLOG("\t" << deltaStats / BEES_PROGRESS_INTERVAL); + }; + lastProgressStats = thisStats; + + BeesWorkQueueBase::for_each_work_queue([&](BeesWorkQueueBase *worker) { + BEESLOG("QUEUE: " << worker->name() << " active: " << worker->active_size()); + }); + + 
BEESLOG("THREADS:"); + + for (auto t : BeesNote::get_status()) { + BEESLOG("\ttid " << t.first << ": " << t.second); + } + } +} + +BeesContext::BeesContext(shared_ptr parent) : + m_parent_ctx(parent) +{ + auto base_dir = getenv_or_die("BEESHOME"); + BEESLOG("BEESHOME = " << base_dir); + m_home_fd = open_or_die(base_dir, FLAGS_OPEN_DIR); + if (m_parent_ctx) { + m_hash_table = m_parent_ctx->hash_table(); + m_hash_table->set_shared(true); + m_fd_cache = m_parent_ctx->fd_cache(); + } +} + +bool +BeesContext::dedup(const BeesRangePair &brp) +{ + // TOOLONG and NOTE can retroactively fill in the filename details, but LOG can't + BEESNOTE("dedup " << brp); + + brp.first.fd(shared_from_this()); + brp.second.fd(shared_from_this()); + +#if 0 + // This avoids some sort of kernel race condition; + // however, it also doubles our dedup times. + // Is avoiding a crash every few weeks worth it? + bees_sync(brp.first.fd()); +#endif + + BEESTOOLONG("dedup " << brp); + + thread_local BeesFileId tl_first_fid, tl_second_fid; + if (tl_first_fid != brp.first.fid()) { + BEESLOG("dedup: src " << name_fd(brp.first.fd())); + tl_first_fid = brp.first.fid(); + tl_second_fid = BeesFileId(); + } + ostringstream dst_line; + dst_line << " dst " << pretty(brp.first.size()) << " [" << to_hex(brp.first.begin()) << ".." << to_hex(brp.first.end()) << "]"; + if (brp.first.begin() != brp.second.begin()) { + dst_line << " [" << to_hex(brp.second.begin()) << ".." 
<< to_hex(brp.second.end()) << "]"; + } + BeesAddress first_addr(brp.first.fd(), brp.first.begin()); + BeesAddress second_addr(brp.second.fd(), brp.second.begin()); + dst_line << " (" << first_addr << "->" << second_addr << ")"; + if (first_addr.get_physical_or_zero() == second_addr.get_physical_or_zero()) { + BEESLOGTRACE("equal physical addresses in dedup"); + BEESCOUNT(bug_dedup_same_physical); + } + if (tl_second_fid != brp.second.fid()) { + dst_line << " " << name_fd(brp.second.fd()); + tl_second_fid = brp.second.fid(); + } + BEESLOG(dst_line.str()); + + THROW_CHECK1(invalid_argument, brp, !brp.first.overlaps(brp.second)); + THROW_CHECK1(invalid_argument, brp, brp.first.size() == brp.second.size()); + + BEESCOUNT(dedup_try); + Timer dedup_timer; + bool rv = btrfs_extent_same(brp.first.fd(), brp.first.begin(), brp.first.size(), brp.second.fd(), brp.second.begin()); + BEESCOUNTADD(dedup_ms, dedup_timer.age() * 1000); + + if (rv) { + BEESCOUNT(dedup_hit); + BEESCOUNTADD(dedup_bytes, brp.first.size()); + thread_local BeesFileRange last_src_bfr; + if (!last_src_bfr.overlaps(brp.first)) { + BEESCOUNTADD(dedup_unique_bytes, brp.first.size()); + last_src_bfr = brp.first; + } + } else { + BEESCOUNT(dedup_miss); + BEESLOG("NO Dedup! 
" << brp); + } + + return rv; +} + +BeesRangePair +BeesContext::dup_extent(const BeesFileRange &src) +{ + BEESTRACE("dup_extent " << src); + BEESCOUNTADD(dedup_copy, src.size()); + return BeesRangePair(tmpfile()->make_copy(src), src); +} + +void +BeesContext::rewrite_file_range(const BeesFileRange &bfr) +{ + auto m_ctx = shared_from_this(); + BEESNOTE("Rewriting bfr " << bfr); + BeesRangePair dup_brp(dup_extent(BeesFileRange(bfr.fd(), bfr.begin(), min(bfr.file_size(), bfr.end())))); + // BEESLOG("\tdup_brp " << dup_brp); + BeesBlockData orig_bbd(bfr.fd(), bfr.begin(), min(BLOCK_SIZE_SUMS, bfr.size())); + // BEESLOG("\torig_bbd " << orig_bbd); + BeesBlockData dup_bbd(dup_brp.first.fd(), dup_brp.first.begin(), min(BLOCK_SIZE_SUMS, dup_brp.first.size())); + // BEESLOG("BeesResolver br(..., " << bfr << ")"); + BeesResolver br(m_ctx, BeesAddress(bfr.fd(), bfr.begin())); + // BEESLOG("\treplace_src " << dup_bbd); + br.replace_src(dup_bbd); + BEESCOUNT(scan_rewrite); + + // All the blocks are now somewhere else so scan again. + // We do this immediately instead of waiting for a later generation scan + // because the blocks we rewrote are likely duplicates of blocks from this + // generation that we are about to scan. Pretty ugly but effective as an + // interim solution while we wait for tree-2 extent scanning. 
+ auto hash_table = m_ctx->hash_table(); + BtrfsExtentWalker ew(bfr.fd(), bfr.begin(), root_fd()); + for (off_t next_p = bfr.begin(); next_p < bfr.end(); ) { + off_t p = next_p; + next_p += BLOCK_SIZE_SUMS; + ew.seek(p); + Extent e = ew.current(); + BEESTRACE("next_p " << to_hex(next_p) << " p " << to_hex(p) << " e " << e); + BeesBlockData bbd(bfr.fd(), p, min(BLOCK_SIZE_SUMS, e.end() - p)); + BeesAddress addr(e, p); + bbd.addr(addr); + if (!addr.is_magic() && !bbd.is_data_zero()) { + hash_table->push_random_hash_addr(bbd.hash(), bbd.addr()); + BEESCOUNT(scan_reinsert); + } + } +} + +BeesFileRange +BeesContext::scan_one_extent(const BeesFileRange &bfr, const Extent &e) +{ + BEESNOTE("Scanning " << pretty(e.size()) << " " + << to_hex(e.begin()) << ".." << to_hex(e.end()) + << " " << name_fd(bfr.fd()) ); + BEESTRACE("scan extent " << e); + BEESCOUNT(scan_extent); + + // We keep moving this method around + auto m_ctx = shared_from_this(); + + shared_ptr hash_table = m_ctx->hash_table(); + + if (e.flags() & ~( + FIEMAP_EXTENT_ENCODED | FIEMAP_EXTENT_UNKNOWN | + FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_LAST | + FIEMAP_EXTENT_SHARED | FIEMAP_EXTENT_NOT_ALIGNED | + FIEMAP_EXTENT_DATA_INLINE | Extent::HOLE | + Extent::OBSCURED | Extent::PREALLOC + )) { + BEESCOUNT(scan_interesting); + BEESLOG("Interesting extent flags " << e << " from fd " << name_fd(bfr.fd())); + } + + if (e.flags() & Extent::HOLE) { + // Nothing here, dispose of this early + BEESCOUNT(scan_hole); + return bfr; + } + + if (e.flags() & Extent::PREALLOC) { + // Prealloc is all zero and we replace it with a hole. + // No special handling is required here. Nuke it and move on. 
+ BEESLOG("prealloc extent " << e); + BeesFileRange prealloc_bfr(m_ctx->tmpfile()->make_hole(e.size())); + BeesRangePair brp(prealloc_bfr, bfr); + // Raw dedup here - nothing else to do with this extent, nothing to merge with + if (m_ctx->dedup(brp)) { + BEESCOUNT(dedup_prealloc_hit); + BEESCOUNTADD(dedup_prealloc_bytes, e.size()); + return bfr; + } else { + BEESCOUNT(dedup_prealloc_miss); + } + } + + // OK we need to read extent now + posix_fadvise(bfr.fd(), bfr.begin(), bfr.size(), POSIX_FADV_WILLNEED); + + map> insert_map; + set noinsert_set; + + // Hole handling + bool extent_compressed = e.flags() & FIEMAP_EXTENT_ENCODED; + bool extent_contains_zero = false; + bool extent_contains_nonzero = false; + + // Need to replace extent + bool rewrite_extent = false; + + // Pretty graphs + off_t block_count = ((e.size() + BLOCK_MASK_SUMS) & ~BLOCK_MASK_SUMS) / BLOCK_SIZE_SUMS; + BEESTRACE(e << " block_count " << block_count); + string bar(block_count, '#'); + + for (off_t next_p = e.begin(); next_p < e.end(); ) { + + // Guarantee forward progress + off_t p = next_p; + next_p += BLOCK_SIZE_SUMS; + + off_t bar_p = (p - e.begin()) / BLOCK_SIZE_SUMS; + BeesAddress addr(e, p); + + // This extent should consist entirely of non-magic blocks + THROW_CHECK1(invalid_argument, addr, !addr.is_magic()); + + // Get block data + BeesBlockData bbd(bfr.fd(), p, min(BLOCK_SIZE_SUMS, e.end() - p)); + bbd.addr(addr); + BEESCOUNT(scan_block); + + BEESTRACE("scan bbd " << bbd); + + // Calculate the hash first because it lets us shortcut on is_data_zero + BEESNOTE("scan hash " << bbd); + BeesHash hash = bbd.hash(); + + // Schedule this block for insertion if we decide to keep this extent. 
+ BEESCOUNT(scan_hash_preinsert); + BEESTRACE("Pushing hash " << hash << " addr " << addr << " bbd " << bbd); + insert_map.insert(make_pair(p, make_pair(hash, addr))); + bar.at(bar_p) = 'R'; + + // Weed out zero blocks + BEESNOTE("is_data_zero " << bbd); + bool extent_is_zero = bbd.is_data_zero(); + if (extent_is_zero) { + bar.at(bar_p) = '0'; + if (extent_compressed) { + if (!extent_contains_zero) { + // BEESLOG("compressed zero bbd " << bbd << "\n\tin extent " << e); + } + extent_contains_zero = true; + // Do not attempt to lookup hash of zero block + continue; + } else { + BEESLOG("zero bbd " << bbd << "\n\tin extent " << e); + BEESCOUNT(scan_zero_uncompressed); + rewrite_extent = true; + break; + } + } else { + if (extent_contains_zero && !extent_contains_nonzero) { + // BEESLOG("compressed nonzero bbd " << bbd << "\n\tin extent " << e); + } + extent_contains_nonzero = true; + } + + BEESNOTE("lookup hash " << bbd); + auto found = hash_table->find_cell(hash); + BEESCOUNT(scan_lookup); + + set resolved_addrs; + set found_addrs; + + // We know that there is at least one copy of the data and where it is, + // but we don't want to do expensive LOGICAL_INO operations unless there + // are at least two distinct addresses to look at. + found_addrs.insert(addr); + + for (auto i : found) { + BEESTRACE("found (hash, address): " << i); + BEESCOUNT(scan_found); + + // Hash has to match + THROW_CHECK2(runtime_error, i.e_hash, hash, i.e_hash == hash); + + BeesAddress found_addr(i.e_addr); + +#if 0 + // If address already in hash table, move on to next extent. + // We've already seen this block and may have made additional references to it. + // The current extent is effectively "pinned" and can't be modified any more. 
+ if (found_addr.get_physical_or_zero() == addr.get_physical_or_zero()) { + BEESCOUNT(scan_already); + return bfr; + } +#endif + + // Block must have matching EOF alignment + if (found_addr.is_unaligned_eof() != addr.is_unaligned_eof()) { + BEESCOUNT(scan_malign); + continue; + } + + // Address is a duplicate + if (!found_addrs.insert(found_addr).second) { + BEESCOUNT(scan_twice); + continue; + } + + // Hash is toxic + if (found_addr.is_toxic()) { + BEESINFO("WORKAROUND: abandoned toxic match for hash " << hash << " addr " << found_addr); + // Don't push these back in because we'll never delete them. + // hash_table->push_front_hash_addr(hash, found_addr); + BEESCOUNT(scan_toxic_hash); + return bfr; + } + + // Distinct address, go resolve it + bool abandon_extent = false; + catch_all([&]() { + BEESNOTE("resolving " << found_addr << " matched " << bbd); + BEESTRACE("resolving " << found_addr << " matched " << bbd); + BeesResolver resolved(m_ctx, found_addr); + // Toxic extents are really toxic + if (resolved.is_toxic()) { + BEESINFO("WORKAROUND: abandoned toxic match at found_addr " << found_addr << " matching bbd " << bbd); + BEESCOUNT(scan_toxic_match); +#if 0 + // Don't push these back in because we'll never delete them. 
+ // Make sure we never see this hash again + found_addr.set_toxic(); + hash_table->push_front_hash_addr(hash, found_addr); +#endif + abandon_extent = true; + } else if (!resolved.count()) { + BEESCOUNT(scan_resolve_zero); + // Didn't find anything, address is dead + BEESTRACE("matched hash " << hash << " addr " << addr << " count zero"); + hash_table->erase_hash_addr(hash, found_addr); + } else { + resolved_addrs.insert(resolved); + BEESCOUNT(scan_resolve_hit); + } + }); + + if (abandon_extent) { + return bfr; + } + } + + // This shouldn't happen (often), so let's count it separately + if (resolved_addrs.size() > 2) { + BEESCOUNT(matched_3_or_more); + } + if (resolved_addrs.size() > 1) { + BEESCOUNT(matched_2_or_more); + } + + // No need to do all this unless there are two or more distinct matches + if (!resolved_addrs.empty()) { + bar.at(bar_p) = 'M'; + BEESCOUNT(matched_1_or_more); + BEESTRACE("resolved_addrs.size() = " << resolved_addrs.size()); + BEESNOTE("resolving " << resolved_addrs.size() << " matches for hash " << hash); + + BeesFileRange replaced_bfr; + + BeesAddress last_replaced_addr; + for (auto it = resolved_addrs.begin(); it != resolved_addrs.end(); ++it) { + catch_all([&]() { + auto it_copy = *it; + BEESNOTE("finding one match (out of " << it_copy.count() << ") at " << it_copy.addr() << " for " << bbd); + BEESTRACE("finding one match (out of " << it_copy.count() << ") at " << it_copy.addr() << " for " << bbd); + replaced_bfr = it_copy.replace_dst(bbd); + BEESTRACE("next_p " << to_hex(next_p) << " -> replaced_bfr " << replaced_bfr); + + // If we didn't find this hash where the hash table said it would be, + // correct the hash table. 
+ if (it_copy.found_hash()) { + BEESCOUNT(scan_hash_hit); + } else { + // BEESINFO("erase src hash " << hash << " addr " << it_copy.addr()); + BEESCOUNT(scan_hash_miss); + hash_table->erase_hash_addr(hash, it_copy.addr()); + } + + if (it_copy.found_dup()) { + BEESCOUNT(scan_dup_hit); + + // FIXME: we will thrash if we let multiple references to identical blocks + // exist in the hash table. Erase all but the last one. + if (last_replaced_addr) { + BEESLOG("Erasing redundant hash " << hash << " addr " << last_replaced_addr); + hash_table->erase_hash_addr(hash, last_replaced_addr); + BEESCOUNT(scan_erase_redundant); + } + last_replaced_addr = it_copy.addr(); + + // Invalidate resolve cache so we can count refs correctly + m_ctx->invalidate_addr(it_copy.addr()); + m_ctx->invalidate_addr(bbd.addr()); + + // Remove deduped blocks from insert map + THROW_CHECK0(runtime_error, replaced_bfr); + for (off_t ip = replaced_bfr.begin(); ip < replaced_bfr.end(); ip += BLOCK_SIZE_SUMS) { + BEESCOUNT(scan_dup_block); + noinsert_set.insert(ip); + if (ip >= e.begin() && ip < e.end()) { + off_t bar_p = (ip - e.begin()) / BLOCK_SIZE_SUMS; + bar.at(bar_p) = 'd'; + } + } + + // next_p may be past EOF so check p only + THROW_CHECK2(runtime_error, p, replaced_bfr, p < replaced_bfr.end()); + + BEESCOUNT(scan_bump); + next_p = replaced_bfr.end(); + } else { + BEESCOUNT(scan_dup_miss); + } + }); + } + if (last_replaced_addr) { + // If we replaced extents containing the incoming addr, + // push the addr we kept to the front of the hash LRU. 
+ hash_table->push_front_hash_addr(hash, last_replaced_addr); + BEESCOUNT(scan_push_front); + } + } else { + BEESCOUNT(matched_0); + } + } + + // If the extent was compressed and all zeros, nuke entire thing + if (!rewrite_extent && (extent_contains_zero && !extent_contains_nonzero)) { + rewrite_extent = true; + BEESCOUNT(scan_zero_compressed); + } + + // Turning this off because it's a waste of time on small extents + // and it's incorrect for large extents. +#if 0 + // If the extent contains obscured blocks, and we can find no + // other refs to the extent that reveal those blocks, nuke the incoming extent. + // Don't rewrite extents that are bigger than the maximum FILE_EXTENT_SAME size + // because we can't make extents that large with dedup. + // Don't rewrite small extents because it is a waste of time without being + // able to combine them into bigger extents. + if (!rewrite_extent && (e.flags() & Extent::OBSCURED) && (e.physical_len() > BLOCK_SIZE_MAX_COMPRESSED_EXTENT) && (e.physical_len() < BLOCK_SIZE_MAX_EXTENT_SAME)) { + BEESCOUNT(scan_obscured); + BEESNOTE("obscured extent " << e); + // We have to map all the source blocks to see if any of them + // (or all of them aggregated) provide a path through the FS to the blocks + BeesResolver br(m_ctx, BeesAddress(e, e.begin())); + BeesBlockData ref_bbd(bfr.fd(), bfr.begin(), min(BLOCK_SIZE_SUMS, bfr.size())); + // BEESLOG("ref_bbd " << ref_bbd); + auto bfr_set = br.find_all_matches(ref_bbd); + bool non_obscured_extent_found = false; + set blocks_to_find; + for (off_t j = 0; j < e.physical_len(); j += BLOCK_SIZE_CLONE) { + blocks_to_find.insert(j); + } + // Don't bother if saving less than 1% + auto maximum_hidden_count = blocks_to_find.size() / 100; + for (auto i : bfr_set) { + BtrfsExtentWalker ref_ew(bfr.fd(), bfr.begin(), m_ctx->root_fd()); + Extent ref_e = ref_ew.current(); + // BEESLOG("\tref_e " << ref_e); + THROW_CHECK2(out_of_range, ref_e, e, ref_e.offset() + ref_e.logical_len() <= e.physical_len()); 
+ for (off_t j = ref_e.offset(); j < ref_e.offset() + ref_e.logical_len(); j += BLOCK_SIZE_CLONE) { + blocks_to_find.erase(j); + } + if (blocks_to_find.size() <= maximum_hidden_count) { + BEESCOUNT(scan_obscured_miss); + BEESLOG("Found references to all but " << blocks_to_find.size() << " blocks"); + non_obscured_extent_found = true; + break; + } else { + BEESCOUNT(scan_obscured_hit); + // BEESLOG("blocks_to_find: " << blocks_to_find.size() << " from " << *blocks_to_find.begin() << ".." << *blocks_to_find.rbegin()); + } + } + if (!non_obscured_extent_found) { + // BEESLOG("No non-obscured extents found"); + rewrite_extent = true; + BEESCOUNT(scan_obscured_rewrite); + } + } +#endif + + // If we deduped any blocks then we must rewrite the remainder of the extent + if (!noinsert_set.empty()) { + rewrite_extent = true; + } + + // If we need to replace part of the extent, rewrite all instances of it + if (rewrite_extent) { + bool blocks_rewritten = false; + BEESTRACE("Rewriting extent " << e); + off_t last_p = e.begin(); + off_t p = last_p; + off_t next_p; + BEESTRACE("next_p " << to_hex(next_p) << " p " << to_hex(p) << " last_p " << to_hex(last_p)); + for (next_p = e.begin(); next_p < e.end(); ) { + p = next_p; + next_p += BLOCK_SIZE_SUMS; + + // BEESLOG("noinsert_set.count(" << to_hex(p) << ") " << noinsert_set.count(p)); + if (noinsert_set.count(p)) { + if (p - last_p > 0) { + rewrite_file_range(BeesFileRange(bfr.fd(), last_p, p)); + blocks_rewritten = true; + } + last_p = next_p; + } else { + off_t bar_p = (p - e.begin()) / BLOCK_SIZE_SUMS; + bar.at(bar_p) = '+'; + } + } + BEESTRACE("last"); + if (next_p - last_p > 0) { + rewrite_file_range(BeesFileRange(bfr.fd(), last_p, next_p)); + blocks_rewritten = true; + } + if (blocks_rewritten) { + // Nothing left to insert, all blocks clobbered + insert_map.clear(); + } else { + // BEESLOG("No blocks rewritten"); + BEESCOUNT(scan_no_rewrite); + } + } + + // We did not rewrite the extent and it contained data, so insert it. 
+ for (auto i : insert_map) { + off_t bar_p = (i.first - e.begin()) / BLOCK_SIZE_SUMS; + BEESTRACE("e " << e << "bar_p = " << bar_p << " i.first-e.begin() " << i.first - e.begin() << " i.second " << i.second.first << ", " << i.second.second); + if (noinsert_set.count(i.first)) { + // FIXME: we removed one reference to this copy. Avoid thrashing? + hash_table->erase_hash_addr(i.second.first, i.second.second); + // Block was clobbered, do not insert + // Will look like 'Ddddd' because we skip deduped blocks + bar.at(bar_p) = 'D'; + BEESCOUNT(inserted_clobbered); + } else { + hash_table->push_random_hash_addr(i.second.first, i.second.second); + bar.at(bar_p) = '.'; + BEESCOUNT(inserted_block); + } + } + + // Visualize + if (bar != string(block_count, '.')) { + thread_local BeesFileId last_fid; + string file_name; + if (bfr.fid() != last_fid) { + last_fid = bfr.fid(); + file_name = " " + name_fd(bfr.fd()); + } + BEESLOG("scan: " << pretty(e.size()) << " " << to_hex(e.begin()) << " [" << bar << "] " << to_hex(e.end()) << file_name); + } + + return bfr; +} + +BeesFileRange +BeesContext::scan_forward(const BeesFileRange &bfr) +{ + // What are we doing here? + BEESTRACE("scan_forward " << bfr); + BEESCOUNT(scan_forward); + + Timer scan_timer; + + // Silently filter out blacklisted files + if (is_blacklisted(bfr.fid())) { + BEESCOUNT(scan_blacklisted); + return bfr; + } + + BEESNOTE("scan open " << bfr); + + // Reconstitute FD + bfr.fd(shared_from_this()); + + BEESNOTE("scan extent " << bfr); + + // No FD? Well, that was quick. 
+ if (!bfr.fd()) { + BEESINFO("No FD in " << root_path() << " for " << bfr); + BEESCOUNT(scan_no_fd); + return bfr; + } + + // Sanity check + if (bfr.begin() >= bfr.file_size()) { + BEESLOG("past EOF: " << bfr); + BEESCOUNT(scan_eof); + return bfr; + } + + BtrfsExtentWalker ew(bfr.fd(), bfr.begin(), root_fd()); + + BeesFileRange return_bfr(bfr); + + Extent e; + catch_all([&]() { + while (true) { + e = ew.current(); + + catch_all([&]() { + Timer one_extent_timer; + return_bfr = scan_one_extent(bfr, e); + BEESCOUNTADD(scanf_extent_ms, one_extent_timer.age() * 1000); + BEESCOUNT(scanf_extent); + }); + + if (e.end() >= bfr.end()) { + break; + } + + if (!ew.next()) { + break; + } + } + }); + + BEESCOUNTADD(scanf_total_ms, scan_timer.age() * 1000); + BEESCOUNT(scanf_total); + + return return_bfr; +} + +BeesResolveAddrResult::BeesResolveAddrResult() +{ +} + +BeesResolveAddrResult +BeesContext::resolve_addr_uncached(BeesAddress addr) +{ + THROW_CHECK1(invalid_argument, addr, !addr.is_magic()); + THROW_CHECK0(invalid_argument, !!root_fd()); + Timer resolve_timer; + + // There is no performance benefit if we restrict the buffer size. 
+ BtrfsIoctlLogicalInoArgs log_ino(addr.get_physical_or_zero()); + + { + BEESTOOLONG("Resolving addr " << addr << " in " << root_path() << " refs " << log_ino.m_iors.size()); + if (log_ino.do_ioctl_nothrow(root_fd())) { + BEESCOUNT(resolve_ok); + } else { + BEESCOUNT(resolve_fail); + } + BEESCOUNTADD(resolve_ms, resolve_timer.age() * 1000); + } + + // Prevent unavoidable performance bug from crippling the rest of the system + auto rt_age = resolve_timer.age(); + + // Avoid performance bug + BeesResolveAddrResult rv; + rv.m_biors = log_ino.m_iors; + if (rt_age < BEES_TOXIC_DURATION && log_ino.m_iors.size() < BEES_MAX_EXTENT_REF_COUNT) { + rv.m_is_toxic = false; + } else { + BEESLOG("WORKAROUND: toxic address " << addr << " in " << root_path() << " with " << log_ino.m_iors.size() << " refs took " << rt_age << "s in LOGICAL_INO"); + BEESCOUNT(resolve_toxic); + rv.m_is_toxic = true; + } + + return rv; +} + +BeesResolveAddrResult +BeesContext::resolve_addr(BeesAddress addr) +{ + // All compressed offset addresses resolve to the same physical addr, so use that value for the cache + return m_resolve_cache(addr.get_physical_or_zero()); +} + +void +BeesContext::invalidate_addr(BeesAddress addr) +{ + return m_resolve_cache.expire(addr.get_physical_or_zero()); +} + +void +BeesContext::set_root_fd(Fd fd) +{ + uint64_t root_fd_treeid = btrfs_get_root_id(fd); + BEESLOG("set_root_fd " << name_fd(fd)); + BEESTRACE("set_root_fd " << name_fd(fd)); + THROW_CHECK1(invalid_argument, root_fd_treeid, root_fd_treeid == BTRFS_FS_TREE_OBJECTID); + Stat st(fd); + THROW_CHECK1(invalid_argument, st.st_ino, st.st_ino == BTRFS_FIRST_FREE_OBJECTID); + m_root_fd = fd; + BtrfsIoctlFsInfoArgs fsinfo; + fsinfo.do_ioctl(fd); + m_root_uuid = fsinfo.uuid(); + BEESLOG("Filesystem UUID is " << m_root_uuid); + + // 65536 is big enough for two max-sized extents + m_resolve_cache.max_size(65536); + m_resolve_cache.func([&](BeesAddress addr) -> BeesResolveAddrResult { + return resolve_addr_uncached(addr); + 
}); + + // Start queue producers + roots(); + + BEESLOG("returning from set_root_fd in " << name_fd(fd)); +} + +void +BeesContext::blacklist_add(const BeesFileId &fid) +{ + BEESLOG("Adding " << fid << " to blacklist"); + unique_lock lock(m_blacklist_mutex); + m_blacklist.insert(fid); +} + +bool +BeesContext::is_blacklisted(const BeesFileId &fid) const +{ + // Everything on root 1 is blacklisted, no locks necessary. + if (fid.root() == 1) { + return true; + } + unique_lock lock(m_blacklist_mutex); + return m_blacklist.count(fid); +} + +shared_ptr +BeesContext::tmpfile() +{ + // There need be only one, this is not a high-contention path + static mutex s_mutex; + unique_lock lock(s_mutex); + + if (!m_tmpfiles[this_thread::get_id()]) { + m_tmpfiles[this_thread::get_id()] = make_shared(shared_from_this()); + } + return m_tmpfiles[this_thread::get_id()]; +} + +shared_ptr +BeesContext::fd_cache() +{ + static mutex s_mutex; + unique_lock lock(s_mutex); + if (!m_fd_cache) { + m_fd_cache = make_shared(); + } + return m_fd_cache; +} + +shared_ptr +BeesContext::roots() +{ + static mutex s_mutex; + unique_lock lock(s_mutex); + if (!m_roots) { + m_roots = make_shared(shared_from_this()); + } + return m_roots; +} + +shared_ptr +BeesContext::hash_table() +{ + static mutex s_mutex; + unique_lock lock(s_mutex); + if (!m_hash_table) { + m_hash_table = make_shared(shared_from_this(), "beeshash.dat"); + } + return m_hash_table; +} + +void +BeesContext::set_root_path(string path) +{ + BEESLOG("set_root_path " << path); + m_root_path = path; + set_root_fd(open_or_die(m_root_path, FLAGS_OPEN_DIR)); +} + +void +BeesContext::insert_root_ino(Fd fd) +{ + fd_cache()->insert_root_ino(shared_from_this(), fd); +} + +// instantiate templates for linkage ---------------------------------------- + +template class BeesWorkQueue; +template class BeesWorkQueue; diff --git a/src/bees-hash.cc b/src/bees-hash.cc new file mode 100644 index 0000000..2fa302b --- /dev/null +++ b/src/bees-hash.cc @@ -0,0 
+1,682 @@ +#include "bees.h" + +#include "crucible/crc64.h" +#include "crucible/string.h" + +#include +#include + +#include + +using namespace crucible; +using namespace std; + +static inline +bool +using_any_madvise() +{ + return true; +} + +ostream & +operator<<(ostream &os, const BeesHash &bh) +{ + return os << to_hex(BeesHash::Type(bh)); +} + +ostream & +operator<<(ostream &os, const BeesHashTable::Cell &bhte) +{ + return os << "BeesHashTable::Cell { hash = " << BeesHash(bhte.e_hash) << ", addr = " + << BeesAddress(bhte.e_addr) << " }"; +} + +void +dump_bucket(BeesHashTable::Cell *p, BeesHashTable::Cell *q) +{ + // Must be called while holding m_bucket_mutex + for (auto i = p; i < q; ++i) { + BEESLOG("Entry " << i - p << " " << *i); + } +} + +const bool VERIFY_CLEARS_BUGS = false; + +bool +verify_cell_range(BeesHashTable::Cell *p, BeesHashTable::Cell *q, bool clear_bugs = VERIFY_CLEARS_BUGS) +{ + // Must be called while holding m_bucket_mutex + bool bugs_found = false; + set seen_it; + for (BeesHashTable::Cell *cell = p; cell < q; ++cell) { + if (cell->e_addr && cell->e_addr < 0x1000) { + BEESCOUNT(bug_hash_magic_addr); + BEESINFO("Bad hash table address hash " << to_hex(cell->e_hash) << " addr " << to_hex(cell->e_addr)); + if (clear_bugs) { + cell->e_addr = 0; + cell->e_hash = 0; + } + bugs_found = true; + } + if (cell->e_addr && !seen_it.insert(*cell).second) { + BEESCOUNT(bug_hash_duplicate_cell); + // BEESLOG("Duplicate hash table entry:\nthis = " << *cell << "\nold = " << *seen_it.find(*cell)); + BEESINFO("Duplicate hash table entry: " << *cell); + if (clear_bugs) { + cell->e_addr = 0; + cell->e_hash = 0; + } + bugs_found = true; + } + } + return bugs_found; +} + +pair +BeesHashTable::get_cell_range(HashType hash) +{ + THROW_CHECK1(runtime_error, m_buckets, m_buckets > 0); + THROW_CHECK1(runtime_error, m_bucket_ptr, m_bucket_ptr != nullptr); + Bucket *pp = &m_bucket_ptr[hash % m_buckets]; + Cell *bp = pp[0].p_cells; + Cell *ep = pp[1].p_cells; + 
THROW_CHECK2(out_of_range, m_cell_ptr, bp, bp >= m_cell_ptr); + THROW_CHECK2(out_of_range, m_cell_ptr_end, ep, ep <= m_cell_ptr_end); + return make_pair(bp, ep); +} + +pair +BeesHashTable::get_extent_range(HashType hash) +{ + THROW_CHECK1(runtime_error, m_buckets, m_buckets > 0); + THROW_CHECK1(runtime_error, m_bucket_ptr, m_bucket_ptr != nullptr); + Extent *iop = &m_extent_ptr[ (hash % m_buckets) / c_buckets_per_extent ]; + uint8_t *bp = iop[0].p_byte; + uint8_t *ep = iop[1].p_byte; + THROW_CHECK2(out_of_range, m_byte_ptr, bp, bp >= m_byte_ptr); + THROW_CHECK2(out_of_range, m_byte_ptr_end, ep, ep <= m_byte_ptr_end); + return make_pair(bp, ep); +} + +void +BeesHashTable::flush_dirty_extents() +{ + if (using_shared_map()) return; + + THROW_CHECK1(runtime_error, m_buckets, m_buckets > 0); + + unique_lock lock(m_extent_mutex); + auto dirty_extent_copy = m_buckets_dirty; + m_buckets_dirty.clear(); + if (dirty_extent_copy.empty()) { + BEESNOTE("idle"); + m_condvar.wait(lock); + return; // please call later, i.e. 
immediately + } + lock.unlock(); + + size_t extent_counter = 0; + for (auto extent_number : dirty_extent_copy) { + ++extent_counter; + BEESNOTE("flush extent #" << extent_number << " (" << extent_counter << " of " << dirty_extent_copy.size() << ")"); + catch_all([&]() { + uint8_t *dirty_extent = m_extent_ptr[extent_number].p_byte; + uint8_t *dirty_extent_end = m_extent_ptr[extent_number + 1].p_byte; + THROW_CHECK1(out_of_range, dirty_extent, dirty_extent >= m_byte_ptr); + THROW_CHECK1(out_of_range, dirty_extent_end, dirty_extent_end <= m_byte_ptr_end); + if (using_shared_map()) { + BEESTOOLONG("flush extent " << extent_number); + copy(dirty_extent, dirty_extent_end, dirty_extent); + } else { + BEESTOOLONG("pwrite(fd " << m_fd << " '" << name_fd(m_fd)<< "', length " << to_hex(dirty_extent_end - dirty_extent) << ", offset " << to_hex(dirty_extent - m_byte_ptr) << ")"); + // Page locks slow us down more than copying the data does + vector extent_copy(dirty_extent, dirty_extent_end); + pwrite_or_die(m_fd, extent_copy, dirty_extent - m_byte_ptr); + BEESCOUNT(hash_extent_out); + } + }); + BEESNOTE("flush rate limited at extent #" << extent_number << " (" << extent_counter << " of " << dirty_extent_copy.size() << ")"); + m_flush_rate_limit.sleep_for(BLOCK_SIZE_HASHTAB_EXTENT); + } +} + +void +BeesHashTable::set_extent_dirty(HashType hash) +{ + if (using_shared_map()) return; + THROW_CHECK1(runtime_error, m_buckets, m_buckets > 0); + auto pr = get_extent_range(hash); + uint64_t extent_number = reinterpret_cast(pr.first) - m_extent_ptr; + THROW_CHECK1(runtime_error, extent_number, extent_number < m_extents); + unique_lock lock(m_extent_mutex); + m_buckets_dirty.insert(extent_number); + m_condvar.notify_one(); +} + +void +BeesHashTable::writeback_loop() +{ + if (!using_shared_map()) { + while (1) { + flush_dirty_extents(); + } + } +} + +static +string +percent(size_t num, size_t den) +{ + if (den) { + return astringprintf("%u%%", num * 100 / den); + } else { + return "--%"; 
+ } +} + +void +BeesHashTable::prefetch_loop() +{ + // Always do the mlock, whether shared or not + THROW_CHECK1(runtime_error, m_size, m_size > 0); + catch_all([&]() { + BEESNOTE("mlock " << pretty(m_size)); + DIE_IF_NON_ZERO(mlock(m_byte_ptr, m_size)); + }); + + while (1) { + size_t width = 64; + vector occupancy(width, 0); + size_t occupied_count = 0; + size_t total_count = 0; + size_t compressed_count = 0; + size_t compressed_offset_count = 0; + size_t toxic_count = 0; + size_t unaligned_eof_count = 0; + + for (uint64_t ext = 0; ext < m_extents; ++ext) { + BEESNOTE("prefetching hash table extent " << ext << " of " << m_extent_ptr_end - m_extent_ptr); + catch_all([&]() { + fetch_missing_extent(ext * c_buckets_per_extent); + + BEESNOTE("analyzing hash table extent " << ext << " of " << m_extent_ptr_end - m_extent_ptr); + bool duplicate_bugs_found = false; + unique_lock lock(m_bucket_mutex); + for (Bucket *bucket = m_extent_ptr[ext].p_buckets; bucket < m_extent_ptr[ext + 1].p_buckets; ++bucket) { + if (verify_cell_range(bucket[0].p_cells, bucket[1].p_cells)) { + duplicate_bugs_found = true; + } + size_t this_bucket_occupied_count = 0; + for (Cell *cell = bucket[0].p_cells; cell < bucket[1].p_cells; ++cell) { + if (cell->e_addr) { + ++this_bucket_occupied_count; + BeesAddress a(cell->e_addr); + if (a.is_compressed()) { + ++compressed_count; + if (a.has_compressed_offset()) { + ++compressed_offset_count; + } + } + if (a.is_toxic()) { + ++toxic_count; + } + if (a.is_unaligned_eof()) { + ++unaligned_eof_count; + } + } + ++total_count; + } + ++occupancy.at(this_bucket_occupied_count * width / (1 + c_cells_per_bucket) ); + // Count these instead of calculating the number so we get better stats in case of exceptions + occupied_count += this_bucket_occupied_count; + } + lock.unlock(); + if (duplicate_bugs_found) { + set_extent_dirty(ext); + } + }); + } + + BEESNOTE("calculating hash table statistics"); + + vector histogram; + vector thresholds; + size_t threshold = 1; + 
bool threshold_exceeded = false; + do { + threshold_exceeded = false; + histogram.push_back(string(width, ' ')); + thresholds.push_back(threshold); + for (size_t x = 0; x < width; ++x) { + if (occupancy.at(x) >= threshold) { + histogram.back().at(x) = '#'; + threshold_exceeded = true; + } + } + threshold *= 2; + } while (threshold_exceeded); + + ostringstream out; + size_t count = histogram.size(); + bool first_line = true; + for (auto it = histogram.rbegin(); it != histogram.rend(); ++it) { + out << *it << " " << thresholds.at(--count); + if (first_line) { + first_line = false; + out << " pages"; + } + out << "\n"; + } + + size_t uncompressed_count = occupied_count - compressed_count; + size_t legacy_count = compressed_count - compressed_offset_count; + + ostringstream graph_blob; + + graph_blob << "Now: " << format_time(time(NULL)) << "\n"; + graph_blob << "Uptime: " << m_ctx->total_timer().age() << " seconds\n"; + + graph_blob + << "\nHash table page occupancy histogram (" << occupied_count << "/" << total_count << " cells occupied, " << (occupied_count * 100 / total_count) << "%)\n" + << out.str() << "0% | 25% | 50% | 75% | 100% page fill\n" + << "compressed " << compressed_count << " (" << percent(compressed_count, occupied_count) << ")" + << " new-style " << compressed_offset_count << " (" << percent(compressed_offset_count, occupied_count) << ")" + << " old-style " << legacy_count << " (" << percent(legacy_count, occupied_count) << ")\n" + << "uncompressed " << uncompressed_count << " (" << percent(uncompressed_count, occupied_count) << ")" + << " unaligned_eof " << unaligned_eof_count << " (" << percent(unaligned_eof_count, occupied_count) << ")" + << " toxic " << toxic_count << " (" << percent(toxic_count, occupied_count) << ")"; + + graph_blob << "\n\n"; + + graph_blob << "TOTAL:\n"; + auto thisStats = BeesStats::s_global; + graph_blob << "\t" << thisStats << "\n"; + + graph_blob << "\nRATES:\n"; + auto avg_rates = thisStats / m_ctx->total_timer().age(); 
+ graph_blob << "\t" << avg_rates << "\n"; + + BEESLOG(graph_blob.str()); + catch_all([&]() { + m_stats_file.write(graph_blob.str()); + }); + + BEESNOTE("idle " << BEES_HASH_TABLE_ANALYZE_INTERVAL << "s"); + nanosleep(BEES_HASH_TABLE_ANALYZE_INTERVAL); + } +} + +void +BeesHashTable::fetch_missing_extent(HashType hash) +{ + BEESTOOLONG("fetch_missing_extent for hash " << to_hex(hash)); + if (using_shared_map()) return; + THROW_CHECK1(runtime_error, m_buckets, m_buckets > 0); + auto pr = get_extent_range(hash); + uint64_t extent_number = reinterpret_cast(pr.first) - m_extent_ptr; + THROW_CHECK1(runtime_error, extent_number, extent_number < m_extents); + + unique_lock lock(m_extent_mutex); + if (!m_buckets_missing.count(extent_number)) { + return; + } + + size_t missing_buckets = m_buckets_missing.size(); + lock.unlock(); + + BEESNOTE("fetch waiting for hash extent #" << extent_number << ", " << missing_buckets << " left to fetch"); + + // Acquire blocking lock on this extent only + LockSet::Lock extent_lock(m_extent_lock_set, extent_number); + + // Check missing again because someone else might have fetched this + // extent for us while we didn't hold any locks + lock.lock(); + if (!m_buckets_missing.count(extent_number)) { + BEESCOUNT(hash_extent_in_twice); + return; + } + lock.unlock(); + + // OK we have to read this extent + BEESNOTE("fetching hash extent #" << extent_number << ", " << missing_buckets << " left to fetch"); + + BEESTRACE("Fetching missing hash extent " << extent_number); + uint8_t *dirty_extent = m_extent_ptr[extent_number].p_byte; + uint8_t *dirty_extent_end = m_extent_ptr[extent_number + 1].p_byte; + + { + BEESTOOLONG("pread(fd " << m_fd << " '" << name_fd(m_fd)<< "', length " << to_hex(dirty_extent_end - dirty_extent) << ", offset " << to_hex(dirty_extent - m_byte_ptr) << ")"); + pread_or_die(m_fd, dirty_extent, dirty_extent_end - dirty_extent, dirty_extent - m_byte_ptr); + } + + BEESCOUNT(hash_extent_in); + // We don't block when fetching an 
extent but we do slow down the + // prefetch thread. + m_prefetch_rate_limit.borrow(BLOCK_SIZE_HASHTAB_EXTENT); + lock.lock(); + m_buckets_missing.erase(extent_number); +} + +bool +BeesHashTable::is_toxic_hash(BeesHashTable::HashType hash) const +{ + return m_toxic_hashes.find(hash) != m_toxic_hashes.end(); +} + +vector +BeesHashTable::find_cell(HashType hash) +{ + // This saves a lot of time prefilling the hash table, and there's no risk of eviction + if (is_toxic_hash(hash)) { + BEESCOUNT(hash_toxic); + BeesAddress toxic_addr(0x1000); + toxic_addr.set_toxic(); + Cell toxic_cell(hash, toxic_addr); + vector rv; + rv.push_back(toxic_cell); + return rv; + } + fetch_missing_extent(hash); + BEESTOOLONG("find_cell hash " << BeesHash(hash)); + vector rv; + unique_lock lock(m_bucket_mutex); + auto er = get_cell_range(hash); + // FIXME: Weed out zero addresses in the table due to earlier bugs + copy_if(er.first, er.second, back_inserter(rv), [=](const Cell &ip) { return ip.e_hash == hash && ip.e_addr >= 0x1000; }); + BEESCOUNT(hash_lookup); + return rv; +} + +// Move an entry to the end of the list. Used after an attempt to resolve +// an address in the hash table fails. Probably more correctly called +// push_back_hash_addr, except it never inserts. Shared hash tables +// never erase anything, since there is no way to tell if an entry is +// out of date or just belonging to the wrong filesystem. +void +BeesHashTable::erase_hash_addr(HashType hash, AddrType addr) +{ + // if (m_shared) return; + fetch_missing_extent(hash); + BEESTOOLONG("erase hash " << to_hex(hash) << " addr " << addr); + unique_lock lock(m_bucket_mutex); + auto er = get_cell_range(hash); + Cell mv(hash, addr); + Cell *ip = find(er.first, er.second, mv); + bool found = (ip < er.second); + if (found) { + // Lookups on invalid addresses really hurt us. Kill it with fire! 
+ *ip = Cell(0, 0); + set_extent_dirty(hash); + BEESCOUNT(hash_erase); +#if 0 + if (verify_cell_range(er.first, er.second)) { + BEESINFO("while erasing hash " << hash << " addr " << addr); + } +#endif + } +} + +// If entry is already present in list, move it to the front of the +// list without dropping any entries, and return true. If entry is not +// present in list, insert it at the front of the list, possibly dropping +// the last entry in the list, and return false. Used to move duplicate +// hash blocks to the front of the list. +bool +BeesHashTable::push_front_hash_addr(HashType hash, AddrType addr) +{ + fetch_missing_extent(hash); + BEESTOOLONG("push_front_hash_addr hash " << BeesHash(hash) <<" addr " << BeesAddress(addr)); + unique_lock lock(m_bucket_mutex); + auto er = get_cell_range(hash); + Cell mv(hash, addr); + Cell *ip = find(er.first, er.second, mv); + bool found = (ip < er.second); + if (!found) { + // If no match found, get rid of an empty space instead + // If no empty spaces, ip will point to end + ip = find(er.first, er.second, Cell(0, 0)); + } + if (ip > er.first) { + // Delete matching entry, first empty entry, + // or last entry whether empty or not + // move_backward(er.first, ip - 1, ip); + auto sp = ip; + auto dp = ip; + --sp; + // If we are deleting the last entry then don't copy it + if (ip == er.second) { + --sp; + --dp; + BEESCOUNT(hash_evict); + } + while (dp > er.first) { + *dp-- = *sp--; + } + } + // There is now a space at the front, insert there if different + if (er.first[0] != mv) { + er.first[0] = mv; + set_extent_dirty(hash); + BEESCOUNT(hash_front); + } +#if 0 + if (verify_cell_range(er.first, er.second)) { + BEESINFO("while push_fronting hash " << hash << " addr " << addr); + } +#endif + return found; +} + +// If entry is already present in list, returns true and does not +// modify list. 
If entry is not present in list, returns false and +// inserts at a random position in the list, possibly evicting the entry +// at the end of the list. Used to insert new unique (not-yet-duplicate) +// blocks in random order. +bool +BeesHashTable::push_random_hash_addr(HashType hash, AddrType addr) +{ + fetch_missing_extent(hash); + BEESTOOLONG("push_random_hash_addr hash " << BeesHash(hash) << " addr " << BeesAddress(addr)); + unique_lock lock(m_bucket_mutex); + auto er = get_cell_range(hash); + Cell mv(hash, addr); + Cell *ip = find(er.first, er.second, mv); + bool found = (ip < er.second); + + thread_local default_random_engine generator; + thread_local uniform_int_distribution distribution(0, c_cells_per_bucket - 1); + auto pos = distribution(generator); + + int case_cond = 0; + vector saved(er.first, er.second); + + if (found) { + // If hash already exists after pos, swap with pos + if (ip > er.first + pos) { + + // move_backward(er.first + pos, ip - 1, ip); + auto sp = ip; + auto dp = ip; + --sp; + while (dp > er.first + pos) { + *dp-- = *sp--; + } + *dp = mv; + BEESCOUNT(hash_bump); + case_cond = 1; + goto ret_dirty; + } + // Hash already exists before (or at) pos, leave it there + BEESCOUNT(hash_already); + case_cond = 2; + goto ret; + } + + // Find an empty space to back of pos + for (ip = er.first + pos; ip < er.second; ++ip) { + if (*ip == Cell(0, 0)) { + *ip = mv; + case_cond = 3; + goto ret_dirty; + } + } + + // Find an empty space to front of pos + // if there is anything to front of pos + if (pos > 0) { + for (ip = er.first + pos - 1; ip >= er.first; --ip) { + if (*ip == Cell(0, 0)) { + *ip = mv; + case_cond = 4; + goto ret_dirty; + } + } + } + + // Evict something and insert at pos + move_backward(er.first + pos, er.second - 1, er.second); + er.first[pos] = mv; + BEESCOUNT(hash_evict); + case_cond = 5; +ret_dirty: + BEESCOUNT(hash_insert); + set_extent_dirty(hash); +ret: +#if 0 + if (verify_cell_range(er.first, er.second, false)) { + BEESLOG("while 
push_randoming (case " << case_cond << ") pos " << pos + << " ip " << (ip - er.first) << " " << mv); + // dump_bucket(saved.data(), saved.data() + saved.size()); + // dump_bucket(er.first, er.second); + } +#else + (void)case_cond; +#endif + return found; +} + +void +BeesHashTable::try_mmap_flags(int flags) +{ + if (!m_cell_ptr) { + THROW_CHECK1(out_of_range, m_size, m_size > 0); + Timer map_time; + catch_all([&]() { + BEESLOG("mapping hash table size " << m_size << " with flags " << mmap_flags_ntoa(flags)); + void *ptr = mmap_or_die(nullptr, m_size, PROT_READ | PROT_WRITE, flags, flags & MAP_ANONYMOUS ? -1 : int(m_fd), 0); + BEESLOG("mmap done in " << map_time << " sec"); + m_cell_ptr = static_cast(ptr); + void *ptr_end = static_cast(ptr) + m_size; + m_cell_ptr_end = static_cast(ptr_end); + }); + } +} + +void +BeesHashTable::set_shared(bool shared) +{ + m_shared = shared; +} + +BeesHashTable::BeesHashTable(shared_ptr ctx, string filename) : + m_ctx(ctx), + m_size(0), + m_void_ptr(nullptr), + m_void_ptr_end(nullptr), + m_buckets(0), + m_cells(0), + m_writeback_thread("hash_writeback"), + m_prefetch_thread("hash_prefetch " + m_ctx->root_path()), + m_flush_rate_limit(BEES_FLUSH_RATE), + m_prefetch_rate_limit(BEES_FLUSH_RATE), + m_stats_file(m_ctx->home_fd(), "beesstats.txt") +{ + BEESNOTE("opening hash table " << filename); + + m_fd = openat_or_die(m_ctx->home_fd(), filename, FLAGS_OPEN_FILE_RW, 0700); + Stat st(m_fd); + m_size = st.st_size; + + BEESTRACE("hash table size " << m_size); + BEESTRACE("hash table bucket size " << BLOCK_SIZE_HASHTAB_BUCKET); + BEESTRACE("hash table extent size " << BLOCK_SIZE_HASHTAB_EXTENT); + + THROW_CHECK2(invalid_argument, BLOCK_SIZE_HASHTAB_BUCKET, BLOCK_SIZE_HASHTAB_EXTENT, (BLOCK_SIZE_HASHTAB_EXTENT % BLOCK_SIZE_HASHTAB_BUCKET) == 0); + + // Does the union work? 
+ THROW_CHECK2(runtime_error, m_void_ptr, m_cell_ptr, m_void_ptr == m_cell_ptr); + THROW_CHECK2(runtime_error, m_void_ptr, m_byte_ptr, m_void_ptr == m_byte_ptr); + THROW_CHECK2(runtime_error, m_void_ptr, m_bucket_ptr, m_void_ptr == m_bucket_ptr); + THROW_CHECK2(runtime_error, m_void_ptr, m_extent_ptr, m_void_ptr == m_extent_ptr); + + // There's more than one union + THROW_CHECK2(runtime_error, sizeof(Bucket), BLOCK_SIZE_HASHTAB_BUCKET, BLOCK_SIZE_HASHTAB_BUCKET == sizeof(Bucket)); + THROW_CHECK2(runtime_error, sizeof(Bucket::p_byte), BLOCK_SIZE_HASHTAB_BUCKET, BLOCK_SIZE_HASHTAB_BUCKET == sizeof(Bucket::p_byte)); + THROW_CHECK2(runtime_error, sizeof(Extent), BLOCK_SIZE_HASHTAB_EXTENT, BLOCK_SIZE_HASHTAB_EXTENT == sizeof(Extent)); + THROW_CHECK2(runtime_error, sizeof(Extent::p_byte), BLOCK_SIZE_HASHTAB_EXTENT, BLOCK_SIZE_HASHTAB_EXTENT == sizeof(Extent::p_byte)); + + BEESLOG("opened hash table filename '" << filename << "' length " << m_size); + m_buckets = m_size / BLOCK_SIZE_HASHTAB_BUCKET; + m_cells = m_buckets * c_cells_per_bucket; + m_extents = (m_size + BLOCK_SIZE_HASHTAB_EXTENT - 1) / BLOCK_SIZE_HASHTAB_EXTENT; + BEESLOG("\tcells " << m_cells << ", buckets " << m_buckets << ", extents " << m_extents); + + BEESLOG("\tflush rate limit " << BEES_FLUSH_RATE); + + if (using_shared_map()) { + try_mmap_flags(MAP_SHARED); + } else { + try_mmap_flags(MAP_PRIVATE | MAP_ANONYMOUS); + } + + if (!m_cell_ptr) { + THROW_ERROR(runtime_error, "unable to mmap " << filename); + } + + if (!using_shared_map()) { + // madvise fails if MAP_SHARED + if (using_any_madvise()) { + // DONTFORK because we sometimes do fork, + // but the child doesn't touch any of the many, many pages + BEESTOOLONG("madvise(MADV_HUGEPAGE | MADV_DONTFORK)"); + DIE_IF_NON_ZERO(madvise(m_byte_ptr, m_size, MADV_HUGEPAGE | MADV_DONTFORK)); + } + for (uint64_t i = 0; i < m_size / sizeof(Extent); ++i) { + m_buckets_missing.insert(i); + } + } + + m_writeback_thread.exec([&]() { + writeback_loop(); + }); + + 
m_prefetch_thread.exec([&]() { + prefetch_loop(); + }); + + // Blacklist might fail if the hash table is not stored on a btrfs + catch_all([&]() { + m_ctx->blacklist_add(BeesFileId(m_fd)); + }); + + // Skip zero because we already weed that out before it gets near a hash function + for (unsigned i = 1; i < 256; ++i) { + vector v(BLOCK_SIZE_SUMS, i); + HashType hash = Digest::CRC::crc64(v.data(), v.size()); + m_toxic_hashes.insert(hash); + } +} + +BeesHashTable::~BeesHashTable() +{ + if (m_cell_ptr && m_size) { + flush_dirty_extents(); + catch_all([&]() { + DIE_IF_NON_ZERO(munmap(m_cell_ptr, m_size)); + m_cell_ptr = nullptr; + m_size = 0; + }); + } +} + diff --git a/src/bees-resolve.cc b/src/bees-resolve.cc new file mode 100644 index 0000000..b5dd03d --- /dev/null +++ b/src/bees-resolve.cc @@ -0,0 +1,487 @@ +#include "bees.h" + +#include "crucible/limits.h" +#include "crucible/string.h" + +using namespace crucible; +using namespace std; + +BeesAddress +BeesResolver::addr(BeesAddress new_addr) +{ + THROW_CHECK1(invalid_argument, new_addr, !new_addr.is_magic()); + + m_found_data = false; + m_found_dup = false; + m_found_hash = false; + m_wrong_data = false; + m_biors.clear(); + m_ranges.clear(); + m_addr = new_addr; + m_bior_count = 0; + + auto rv = m_ctx->resolve_addr(m_addr); + m_biors = rv.m_biors; + m_is_toxic = rv.m_is_toxic; + m_bior_count = m_biors.size(); + + return m_addr; +} + +BeesResolver::BeesResolver(shared_ptr ctx, BeesAddress new_addr) : + m_ctx(ctx), + m_bior_count(0) +{ + addr(new_addr); +} + +BeesBlockData +BeesResolver::adjust_offset(const BeesFileRange &haystack, const BeesBlockData &needle) +{ + BEESTRACE("Searching for needle " << needle << "\n\tin haystack " << haystack); + + BEESCOUNT(adjust_try); + + // Constraint checks + THROW_CHECK1(invalid_argument, needle.begin(), (needle.begin() & BLOCK_MASK_CLONE) == 0); + THROW_CHECK1(invalid_argument, haystack.begin(), (haystack.begin() & BLOCK_MASK_CLONE) == 0); + + // Need to know the precise 
dimensions of the haystack and needle + off_t haystack_size = haystack.file_size(); + + // If the needle is not a full block then it can only match at EOF + off_t needle_len = needle.size(); + bool is_unaligned_eof = needle_len & BLOCK_MASK_CLONE; + BEESTRACE("is_unaligned_eof = " << is_unaligned_eof << ", needle_len = " << to_hex(needle_len) << ", haystack_size = " << to_hex(haystack_size)); + + // Unaligned EOF can only match at EOF, so only check there + if (is_unaligned_eof) { + BEESTRACE("Construct needle_bfr from " << needle); + BeesFileRange needle_bfr(needle); + + // Census + if (haystack_size & BLOCK_MASK_CLONE) { + BEESCOUNT(adjust_eof_haystack); + } + if (needle_bfr.end() & BLOCK_MASK_CLONE) { + BEESCOUNT(adjust_eof_needle); + } + + // Non-aligned part of the lengths must be the same + if ( (haystack_size & BLOCK_MASK_CLONE) != (needle_bfr.end() & BLOCK_MASK_CLONE) ) { + BEESCOUNT(adjust_eof_fail); + return BeesBlockData(); + } + + // Read the haystack block + BEESTRACE("Reading haystack (haystack_size = " << to_hex(haystack_size) << ")"); + BeesBlockData straw(haystack.fd(), haystack_size & ~BLOCK_MASK_CLONE, haystack_size & BLOCK_MASK_CLONE); + + // It either matches or it doesn't + BEESTRACE("Verifying haystack " << straw); + if (straw.is_data_equal(needle)) { + BEESCOUNT(adjust_eof_hit); + m_found_data = true; + m_found_hash = true; + return straw; + } + + // Check for matching hash + BEESTRACE("Verifying haystack hash"); + if (straw.hash() == needle.hash()) { + // OK at least the hash is still valid + m_found_hash = true; + } + + BEESCOUNT(adjust_eof_miss); + // BEESLOG("adjust_eof_miss " << straw); + return BeesBlockData(); + } + + off_t lower_offset = haystack.begin(); + off_t upper_offset = haystack.end(); + bool is_compressed_offset = false; + bool is_exact = false; + bool is_legacy = false; + if (m_addr.is_compressed()) { + BtrfsExtentWalker ew(haystack.fd(), haystack.begin(), m_ctx->root_fd()); + BEESTRACE("haystack extent data " << ew); + 
Extent e = ew.current(); + if (m_addr.has_compressed_offset()) { + off_t coff = m_addr.get_compressed_offset(); + if (e.offset() > coff) { + // this extent begins after the target block + BEESCOUNT(adjust_offset_low); + return BeesBlockData(); + } + coff -= e.offset(); + if (e.size() <= coff) { + // this extent ends before the target block + BEESCOUNT(adjust_offset_high); + return BeesBlockData(); + } + lower_offset = e.begin() + coff; + upper_offset = lower_offset + BLOCK_SIZE_CLONE; + BEESCOUNT(adjust_offset_hit); + is_compressed_offset = true; + } else { + lower_offset = e.begin(); + upper_offset = e.end(); + BEESCOUNT(adjust_legacy); + is_legacy = true; + } + } else { + BEESCOUNT(adjust_exact); + is_exact = true; + } + + BEESTRACE("Checking haystack " << haystack << " offsets " << to_hex(lower_offset) << ".." << to_hex(upper_offset)); + + // Check all the blocks in the list + for (off_t haystack_offset = lower_offset; haystack_offset < upper_offset; haystack_offset += BLOCK_SIZE_CLONE) { + THROW_CHECK1(out_of_range, haystack_offset, (haystack_offset & BLOCK_MASK_CLONE) == 0); + + // Straw cannot extend beyond end of haystack + if (haystack_offset + needle.size() > haystack_size) { + BEESCOUNT(adjust_needle_too_long); + break; + } + + // Read the haystack + BEESTRACE("straw " << name_fd(haystack.fd()) << ", offset " << to_hex(haystack_offset) << ", length " << needle.size()); + BeesBlockData straw(haystack.fd(), haystack_offset, needle.size()); + + BEESTRACE("straw = " << straw); + + // Stop if we find a match + if (straw.is_data_equal(needle)) { + BEESCOUNT(adjust_hit); + m_found_data = true; + m_found_hash = true; + if (is_compressed_offset) BEESCOUNT(adjust_compressed_offset_correct); + if (is_legacy) BEESCOUNT(adjust_legacy_correct); + if (is_exact) BEESCOUNT(adjust_exact_correct); + return straw; + } + + if (straw.hash() != needle.hash()) { + // Not the same hash or data, try next block + BEESCOUNT(adjust_miss); + continue; + } + + // Found the hash but not 
the data. Yay! + m_found_hash = true; + BEESLOG("HASH COLLISION\n" + << "\tneedle " << needle << "\n" + << "\tstraw " << straw); + BEESCOUNT(hash_collision); + } + + // Ran out of offsets to try + BEESCOUNT(adjust_no_match); + if (is_compressed_offset) BEESCOUNT(adjust_compressed_offset_wrong); + if (is_legacy) BEESCOUNT(adjust_legacy_wrong); + if (is_exact) BEESCOUNT(adjust_exact_wrong); + m_wrong_data = true; + return BeesBlockData(); +} + +BeesFileRange +BeesResolver::chase_extent_ref(const BtrfsInodeOffsetRoot &bior, BeesBlockData &needle_bbd) +{ + BEESTRACE("chase_extent_ref bior " << bior << " needle_bbd " << needle_bbd); + BEESNOTE("chase_extent_ref bior " << bior << " needle_bbd " << needle_bbd); + BEESCOUNT(chase_try); + + Fd file_fd = m_ctx->roots()->open_root_ino(bior.m_root, bior.m_inum); + if (!file_fd) { + // Delete snapshots generate craptons of these + // BEESINFO("No FD in chase_extent_ref " << bior); + BEESCOUNT(chase_no_fd); + return BeesFileRange(); + } + + BEESNOTE("searching at offset " << to_hex(bior.m_offset) << " in file " << name_fd(file_fd) << "\n\tfor " << needle_bbd); + + BEESTRACE("bior file " << name_fd(file_fd)); + BEESTRACE("get file_addr " << bior); + BeesAddress file_addr(file_fd, bior.m_offset, m_ctx); + BEESTRACE("file_addr " << file_addr); + + // ...or are we? + if (file_addr.is_magic()) { + BEESINFO("file_addr is magic: file_addr = " << file_addr << " bior = " << bior << " needle_bbd = " << needle_bbd); + BEESCOUNT(chase_wrong_magic); + return BeesFileRange(); + } + THROW_CHECK1(invalid_argument, m_addr, !m_addr.is_magic()); + + // Did we get the physical block we asked for? The magic bits have to match too, + // but the compressed offset bits do not. + if (file_addr.get_physical_or_zero() != m_addr.get_physical_or_zero()) { + // BEESINFO("found addr " << file_addr << " at " << name_fd(file_fd) << " offset " << to_hex(bior.m_offset) << " but looking for " << m_addr); + // FIEMAP/resolve are working, but the data is old. 
+ BEESCOUNT(chase_wrong_addr); + return BeesFileRange(); + } + + // Calculate end of range, which is a sum block or less + // It's a sum block because we have to compare content now + off_t file_size = Stat(file_fd).st_size; + off_t bior_offset = ranged_cast(bior.m_offset); + off_t end_offset = min(file_size, bior_offset + needle_bbd.size()); + BeesBlockData haystack_bbd(file_fd, bior_offset, end_offset - bior_offset); + + BEESTRACE("matched haystack_bbd " << haystack_bbd << " file_addr " << file_addr); + + // If the data was compressed and no offset was captured then + // we won't get an exact address from resolve. + // Search near the resolved address for a matching data block. + // ...even if it's not compressed, we should do this sanity + // check before considering the block as a duplicate candidate. + auto new_bbd = adjust_offset(haystack_bbd, needle_bbd); + if (new_bbd.empty()) { + // matching offset search failed + BEESCOUNT(chase_wrong_data); + return BeesFileRange(); + } + if (new_bbd.begin() == haystack_bbd.begin()) { + BEESCOUNT(chase_uncorrected); + } else { + // corrected the bfr + BEESCOUNT(chase_corrected); + haystack_bbd = new_bbd; + } + + // We have found at least one duplicate block, so resolve was a success + BEESCOUNT(chase_hit); + + // Matching block + BEESTRACE("Constructing dst_bfr { " << BeesFileId(haystack_bbd.fd()) << ", " << to_hex(haystack_bbd.begin()) << ".." 
<< to_hex(haystack_bbd.end()) << " }"); + BeesFileRange dst_bfr(BeesFileId(haystack_bbd.fd()), haystack_bbd.begin(), haystack_bbd.end()); + + return dst_bfr; +} + +void +BeesResolver::replace_src(const BeesFileRange &src_bfr) +{ + BEESTRACE("replace_src src_bfr " << src_bfr); + THROW_CHECK0(runtime_error, !m_is_toxic); + BEESCOUNT(replacesrc_try); + + // Open src, reuse it for all dst + auto i_bfr = src_bfr; + BEESNOTE("Opening src bfr " << i_bfr); + BEESTRACE("Opening src bfr " << i_bfr); + i_bfr.fd(m_ctx); + + BeesBlockData bbd(i_bfr); + + for_each_extent_ref(bbd, [&](const BeesFileRange &j) -> bool { + // Open dst + auto j_bfr = j; + BEESNOTE("Opening dst bfr " << j_bfr); + BEESTRACE("Opening dst bfr " << j_bfr); + j_bfr.fd(m_ctx); + + if (i_bfr.overlaps(j_bfr)) { + BEESCOUNT(replacesrc_overlaps); + return false; // i.e. continue + } + + // Make pair(src, dst) + BEESTRACE("creating brp (" << i_bfr << ", " << j_bfr << ")"); + BeesRangePair brp(i_bfr, j_bfr); + BEESTRACE("Found matching range: " << brp); + + // Extend range at beginning + BEESNOTE("Extending matching range: " << brp); + // No particular reason to be constrained? + if (brp.grow(m_ctx, true)) { + BEESCOUNT(replacesrc_grown); + } + + // Dedup + BEESNOTE("dedup " << brp); + if (m_ctx->dedup(brp)) { + BEESCOUNT(replacesrc_dedup_hit); + m_found_dup = true; + } else { + BEESCOUNT(replacesrc_dedup_miss); + } + return false; // i.e. continue + }); +} + +void +BeesResolver::find_matches(bool just_one, BeesBlockData &bbd) +{ + // Walk through the (ino, offset, root) tuples until we find a match. + BEESTRACE("finding all matches for " << bbd << " at " << m_addr << ": " << m_biors.size() << " found"); + THROW_CHECK0(runtime_error, !m_is_toxic); + bool stop_now = false; + for (auto ino_off_root : m_biors) { + if (m_wrong_data) { + return; + } + + BEESTRACE("ino_off_root " << ino_off_root); + BeesFileId this_fid(ino_off_root.m_root, ino_off_root.m_inum); + + // Silently ignore blacklisted files, e.g. 
BeesTempFile files + if (m_ctx->is_blacklisted(this_fid)) { + continue; + } + + // Look at the old data + catch_all([&]() { + BEESTRACE("chase_extent_ref ino " << ino_off_root << " bbd " << bbd); + auto new_range = chase_extent_ref(ino_off_root, bbd); + if (new_range) { + m_ranges.insert(new_range.copy_closed()); + stop_now = true; + } + }); + + if (just_one && stop_now) { + break; + } + } +} + +bool +BeesResolver::for_each_extent_ref(BeesBlockData bbd, function visitor) +{ + // Walk through the (ino, offset, root) tuples until we are told to stop + BEESTRACE("for_each_extent_ref " << bbd << " at " << m_addr << ": " << m_biors.size() << " found"); + THROW_CHECK0(runtime_error, !m_is_toxic); + bool stop_now = false; + for (auto ino_off_root : m_biors) { + BEESTRACE("ino_off_root " << ino_off_root); + BeesFileId this_fid(ino_off_root.m_root, ino_off_root.m_inum); + + // Silently ignore blacklisted files, e.g. BeesTempFile files + if (m_ctx->is_blacklisted(this_fid)) { + continue; + } + + // Look at the old data + catch_all([&]() { + BEESTRACE("chase_extent_ref ino " << ino_off_root << " bbd " << bbd); + auto new_range = chase_extent_ref(ino_off_root, bbd); + // XXX: should we catch visitor's exceptions here? + if (new_range) { + stop_now = visitor(new_range); + } else { + // We have reliable block addresses now, so we guarantee we can hit the desired block. + // Failure in chase_extent_ref means we are done, and don't need to look up all the + // other references. 
+ stop_now = true; + } + }); + + if (stop_now) { + break; + } + } + return stop_now; +} + +BeesFileRange +BeesResolver::replace_dst(const BeesFileRange &dst_bfr) +{ + BEESTRACE("replace_dst dst_bfr " << dst_bfr); + BEESCOUNT(replacedst_try); + + // Open dst, reuse it for all src + BEESNOTE("Opening dst bfr " << dst_bfr); + BEESTRACE("Opening dst bfr " << dst_bfr); + dst_bfr.fd(m_ctx); + + BeesFileRange overlap_bfr; + BEESTRACE("overlap_bfr " << overlap_bfr); + + BeesBlockData bbd(dst_bfr); + + for_each_extent_ref(bbd, [&](const BeesFileRange &src_bfr) -> bool { + // Open src + BEESNOTE("Opening src bfr " << src_bfr); + BEESTRACE("Opening src bfr " << src_bfr); + src_bfr.fd(m_ctx); + + if (dst_bfr.overlaps(src_bfr)) { + BEESCOUNT(replacedst_overlaps); + return false; // i.e. continue + } + + // If dst is already occupying src, skip. + // FIXME: BeesContext::scan_one_extent should be weeding these out, but does not. + BeesBlockData src_bbd(src_bfr.fd(), src_bfr.begin(), min(BLOCK_SIZE_SUMS, src_bfr.size())); + if (bbd.addr().get_physical_or_zero() == src_bbd.addr().get_physical_or_zero()) { + BEESCOUNT(replacedst_same); + return false; // i.e. continue + } + + // Make pair(src, dst) + BEESTRACE("creating brp (" << src_bfr << ", " << dst_bfr << ")"); + BeesRangePair brp(src_bfr, dst_bfr); + BEESTRACE("Found matching range: " << brp); + + // Extend range at beginning + BEESNOTE("Extending matching range: " << brp); + // 'false' Has nasty loops, and may not be faster. + // 'true' At best, keeps fragmentation constant...but can also make it worse + if (brp.grow(m_ctx, true)) { + BEESCOUNT(replacedst_grown); + } + + // Dedup + BEESNOTE("dedup " << brp); + if (m_ctx->dedup(brp)) { + BEESCOUNT(replacedst_dedup_hit); + m_found_dup = true; + overlap_bfr = brp.second; + // FIXME: find best range first, then dedup that + return true; // i.e. break + } else { + BEESCOUNT(replacedst_dedup_miss); + return false; // i.e. 
continue + } + }); + // BEESLOG("overlap_bfr after " << overlap_bfr); + return overlap_bfr.copy_closed(); +} + +BeesFileRange +BeesResolver::find_one_match(BeesBlockData &bbd) +{ + THROW_CHECK0(runtime_error, !m_is_toxic); + find_matches(true, bbd); + if (m_ranges.empty()) { + return BeesFileRange(); + } else { + return *m_ranges.begin(); + } +} + +set +BeesResolver::find_all_matches(BeesBlockData &bbd) +{ + THROW_CHECK0(runtime_error, !m_is_toxic); + find_matches(false, bbd); + return m_ranges; +} + +bool +BeesResolver::operator<(const BeesResolver &that) const +{ + if (that.m_bior_count < m_bior_count) { + return true; + } else if (m_bior_count < that.m_bior_count) { + return false; + } + return m_addr < that.m_addr; +} + diff --git a/src/bees-roots.cc b/src/bees-roots.cc new file mode 100644 index 0000000..8f11570 --- /dev/null +++ b/src/bees-roots.cc @@ -0,0 +1,823 @@ +#include "bees.h" + +#include "crucible/cache.h" +#include "crucible/string.h" + +#include +#include + +using namespace crucible; +using namespace std; + +string +format_time(time_t t) +{ + struct tm *tmp = localtime(&t); + char buf[1024]; + strftime(buf, sizeof(buf), "%Y-%m-%d-%H-%M-%S", tmp); + return buf; +} + +ostream & +operator<<(ostream &os, const BeesCrawlState &bcs) +{ + time_t now = time(NULL); + auto age = now - bcs.m_started; + return os << "BeesCrawlState " + << bcs.m_root << ":" << bcs.m_objectid << " offset " << to_hex(bcs.m_offset) + << " transid " << bcs.m_min_transid << ".." 
<< bcs.m_max_transid + << " started " << format_time(bcs.m_started) << " (" << age << "s ago)"; +} + +BeesCrawlState::BeesCrawlState() : + m_root(0), + m_objectid(0), + m_offset(0), + m_min_transid(0), + m_max_transid(0), + m_started(time(NULL)) +{ +} + +bool +BeesCrawlState::operator<(const BeesCrawlState &that) const +{ + return tie(m_root, m_objectid, m_offset, m_min_transid, m_max_transid) + < tie(that.m_root, that.m_objectid, that.m_offset, that.m_min_transid, that.m_max_transid); +} + +string +BeesRoots::crawl_state_filename() const +{ + string rv; + rv += "beescrawl."; + rv += m_ctx->root_uuid(); + rv += ".dat"; + return rv; +} + +void +BeesRoots::state_save() +{ + // Make sure we have a full complement of crawlers + insert_new_crawl(); + + BEESNOTE("saving crawl state"); + BEESLOG("Saving crawl state"); + BEESTOOLONG("Saving crawl state"); + + Timer save_time; + + unique_lock lock(m_mutex); + + // We don't have ofstreamat or ofdstream in C++11, so we're building a string and writing it with raw syscalls. 
+ ostringstream ofs; + + if (!m_crawl_dirty) { + BEESLOG("Nothing to save"); + return; + } + + for (auto i : m_root_crawl_map) { + auto ibcs = i.second->get_state(); + if (ibcs.m_max_transid) { + ofs << "root " << ibcs.m_root << " "; + ofs << "objectid " << ibcs.m_objectid << " "; + ofs << "offset " << ibcs.m_offset << " "; + ofs << "min_transid " << ibcs.m_min_transid << " "; + ofs << "max_transid " << ibcs.m_max_transid << " "; + ofs << "started " << ibcs.m_started << " "; + ofs << "start_ts " << format_time(ibcs.m_started) << "\n"; + } + } + + if (ofs.str().empty()) { + BEESLOG("Crawl state empty!"); + m_crawl_dirty = false; + return; + } + + lock.unlock(); + + m_crawl_state_file.write(ofs.str()); + + BEESNOTE("relocking crawl state"); + lock.lock(); + // Not really correct but probably close enough + m_crawl_dirty = false; + BEESLOG("Saved crawl state in " << save_time << "s"); +} + +BeesCrawlState +BeesRoots::crawl_state_get(uint64_t rootid) +{ + unique_lock lock(m_mutex); + auto rv = m_root_crawl_map.at(rootid)->get_state(); + THROW_CHECK2(runtime_error, rv.m_root, rootid, rv.m_root == rootid); + return rv; +} + +void +BeesRoots::crawl_state_set_dirty() +{ + unique_lock lock(m_mutex); + m_crawl_dirty = true; +} + +void +BeesRoots::crawl_state_erase(const BeesCrawlState &bcs) +{ + unique_lock lock(m_mutex); + + // Do not delete the last entry, it holds our max_transid + if (m_root_crawl_map.size() < 2) { + BEESCOUNT(crawl_no_empty); + return; + } + + if (m_root_crawl_map.count(bcs.m_root)) { + m_root_crawl_map.erase(bcs.m_root); + m_crawl_dirty = true; + } +} + +uint64_t +BeesRoots::transid_min() +{ + BEESNOTE("Calculating transid_min"); + unique_lock lock(m_mutex); + if (m_root_crawl_map.empty()) { + return 0; + } + uint64_t rv = numeric_limits::max(); + for (auto i : m_root_crawl_map) { + rv = min(rv, i.second->get_state().m_min_transid); + } + return rv; +} + +uint64_t +BeesRoots::transid_max() +{ + BEESNOTE("Calculating transid_max"); + uint64_t rv = 0; + 
uint64_t root = 0; + BEESTRACE("Calculating transid_max..."); + do { + root = next_root(root); + if (root) { + catch_all([&]() { + auto transid = btrfs_get_root_transid(open_root(root)); + rv = max(rv, transid); + // BEESLOG("\troot " << root << " transid " << transid << " max " << rv); + }); + } + } while (root); + return rv; +} + +void +BeesRoots::crawl_roots() +{ + BEESNOTE("Crawling roots"); + + unique_lock lock(m_mutex); + if (m_root_crawl_map.empty()) { + BEESNOTE("idle, crawl map is empty"); + m_condvar.wait(lock); + // Don't count the time we were waiting as part of the crawl time + m_crawl_timer.reset(); + } + + // Work from a copy because BeesCrawl might change the world under us + auto crawl_map_copy = m_root_crawl_map; + lock.unlock(); + + BeesFileRange first_range; + shared_ptr first_crawl; + for (auto i : crawl_map_copy) { + auto this_crawl = i.second; + auto this_range = this_crawl->peek_front(); + if (this_range) { + auto tuple_this = make_tuple(this_range.fid().ino(), this_range.fid().root(), this_range.begin()); + auto tuple_first = make_tuple(first_range.fid().ino(), first_range.fid().root(), first_range.begin()); + if (!first_range || tuple_this < tuple_first) { + first_crawl = this_crawl; + first_range = this_range; + } + } + } + + if (first_range) { + catch_all([&]() { + // BEESINFO("scan_forward " << first_range); + m_ctx->scan_forward(first_range); + }); + BEESCOUNT(crawl_scan); + m_crawl_current = first_crawl->get_state(); + auto first_range_popped = first_crawl->pop_front(); + THROW_CHECK2(runtime_error, first_range, first_range_popped, first_range == first_range_popped); + return; + } + + BEESLOG("Crawl ran out of data after " << m_crawl_timer.lap() << "s, waiting for more..."); + BEESCOUNT(crawl_done); + BEESNOTE("idle, waiting for more data"); + lock.lock(); + m_condvar.wait(lock); + + // Don't count the time we were waiting as part of the crawl time + m_crawl_timer.reset(); +} + +void +BeesRoots::crawl_thread() +{ + 
BEESNOTE("crawling"); + while (1) { + catch_all([&]() { + crawl_roots(); + }); + } +} + +void +BeesRoots::writeback_thread() +{ + while (1) { + BEESNOTE(m_crawl_current << (m_crawl_dirty ? " (dirty)" : "")); + + catch_all([&]() { + BEESNOTE("saving crawler state"); + state_save(); + }); + + nanosleep(BEES_WRITEBACK_INTERVAL); + + } +} + +void +BeesRoots::insert_root(const BeesCrawlState &new_bcs) +{ + unique_lock lock(m_mutex); + if (!m_root_crawl_map.count(new_bcs.m_root)) { + auto new_bcp = make_shared(m_ctx, new_bcs); + auto new_pair = make_pair(new_bcs.m_root, new_bcp); + m_root_crawl_map.insert(new_pair); + m_crawl_dirty = true; + } +} + +void +BeesRoots::insert_new_crawl() +{ + BEESNOTE("adding crawlers for new subvols and removing crawlers for removed subvols"); + + BeesCrawlState new_bcs; + // Avoid a wasted loop iteration by starting from root 5 + new_bcs.m_root = BTRFS_FS_TREE_OBJECTID; + new_bcs.m_min_transid = transid_min(); + new_bcs.m_max_transid = transid_max(); + + unique_lock lock(m_mutex); + set excess_roots; + for (auto i : m_root_crawl_map) { + excess_roots.insert(i.first); + } + lock.unlock(); + + while (new_bcs.m_root) { + excess_roots.erase(new_bcs.m_root); + insert_root(new_bcs); + BEESCOUNT(crawl_create); + new_bcs.m_root = next_root(new_bcs.m_root); + } + + for (auto i : excess_roots) { + new_bcs.m_root = i; + crawl_state_erase(new_bcs); + } + + // Wake up crawl_roots if sleeping + lock.lock(); + m_condvar.notify_all(); +} + +void +BeesRoots::state_load() +{ + BEESNOTE("loading crawl state"); + BEESLOG("loading crawl state"); + + string crawl_data = m_crawl_state_file.read(); + + for (auto line : split("\n", crawl_data)) { + BEESLOG("Read line: " << line); + map d; + auto words = split(" ", line); + for (auto it = words.begin(); it < words.end(); ++it) { + auto it1 = it; + ++it; + THROW_CHECK1(out_of_range, words.size(), it < words.end()); + string key = *it1; + uint64_t val = from_hex(*it); + BEESTRACE("key " << key << " val " << val); + 
auto result = d.insert(make_pair(key, val)); + THROW_CHECK0(runtime_error, result.second); + } + BeesCrawlState loaded_state; + loaded_state.m_root = d.at("root"); + loaded_state.m_objectid = d.at("objectid"); + loaded_state.m_offset = d.at("offset"); + loaded_state.m_min_transid = d.count("gen_current") ? d.at("gen_current") : d.at("min_transid"); + loaded_state.m_max_transid = d.count("gen_next") ? d.at("gen_next") : d.at("max_transid"); + if (d.count("started")) { + loaded_state.m_started = d.at("started"); + } + BEESLOG("loaded_state " << loaded_state); + insert_root(loaded_state); + } +} + +BeesRoots::BeesRoots(shared_ptr ctx) : + m_ctx(ctx), + m_crawl_state_file(ctx->home_fd(), crawl_state_filename()), + m_crawl_thread("crawl " + ctx->root_path()), + m_writeback_thread("crawl_writeback " + ctx->root_path()) +{ + m_crawl_thread.exec([&]() { + catch_all([&]() { + state_load(); + }); + m_writeback_thread.exec([&]() { + writeback_thread(); + }); + crawl_thread(); + }); +} + +Fd +BeesRoots::open_root_nocache(uint64_t rootid) +{ + BEESTRACE("open_root_nocache " << rootid); + BEESNOTE("open_root_nocache " << rootid); + + // Stop recursion at the root of the filesystem tree + if (rootid == BTRFS_FS_TREE_OBJECTID) { + return m_ctx->root_fd(); + } + + // Find backrefs for this rootid and follow up to root + BtrfsIoctlSearchKey sk; + sk.tree_id = BTRFS_ROOT_TREE_OBJECTID; + sk.min_objectid = sk.max_objectid = rootid; + sk.min_type = sk.max_type = BTRFS_ROOT_BACKREF_KEY; + + BEESTRACE("sk " << sk); + while (sk.min_objectid <= rootid) { + sk.nr_items = 1024; + sk.do_ioctl(m_ctx->root_fd()); + + if (sk.m_result.empty()) { + break; + } + + for (auto i : sk.m_result) { + sk.next_min(i); + if (i.type == BTRFS_ROOT_BACKREF_KEY && i.objectid == rootid) { + auto dirid = call_btrfs_get(btrfs_stack_root_ref_dirid, i.m_data); + auto name_len = call_btrfs_get(btrfs_stack_root_ref_name_len, i.m_data); + auto name_start = sizeof(struct btrfs_root_ref); + auto name_end = name_len + 
name_start; + THROW_CHECK2(runtime_error, i.m_data.size(), name_end, i.m_data.size() >= name_end); + string name(i.m_data.data() + name_start, i.m_data.data() + name_end); + + auto parent_rootid = i.offset; + // BEESLOG("parent_rootid " << parent_rootid << " dirid " << dirid << " name " << name); + BEESTRACE("parent_rootid " << parent_rootid << " dirid " << dirid << " name " << name); + Fd parent_fd = open_root(parent_rootid); + if (!parent_fd) { + BEESLOGTRACE("no parent_fd"); + continue; + } + + if (dirid != BTRFS_FIRST_FREE_OBJECTID) { + BEESTRACE("dirid " << dirid << " root " << rootid << " INO_PATH"); + BtrfsIoctlInoPathArgs ino(dirid); + if (!ino.do_ioctl_nothrow(parent_fd)) { + BEESINFO("dirid " << dirid << " inode path lookup failed in parent_fd " << name_fd(parent_fd)); + continue; + } + if (ino.m_paths.empty()) { + BEESINFO("dirid " << dirid << " inode has no paths in parent_fd " << name_fd(parent_fd)); + continue; + } + BEESTRACE("dirid " << dirid << " path " << ino.m_paths.at(0)); + parent_fd = openat(parent_fd, ino.m_paths.at(0).c_str(), FLAGS_OPEN_DIR); + if (!parent_fd) { + BEESLOGTRACE("no parent_fd from dirid"); + continue; + } + } + // BEESLOG("openat(" << name_fd(parent_fd) << ", " << name << ")"); + BEESTRACE("openat(" << name_fd(parent_fd) << ", " << name << ")"); + Fd rv = openat(parent_fd, name.c_str(), FLAGS_OPEN_DIR); + if (!rv) { + BEESLOGTRACE("open failed for name " << name); + continue; + } + BEESCOUNT(root_found); + + // Verify correct root ID + auto new_root_id = btrfs_get_root_id(rv); + THROW_CHECK2(runtime_error, new_root_id, rootid, new_root_id == rootid); + Stat st(rv); + THROW_CHECK1(runtime_error, st.st_ino, st.st_ino == BTRFS_FIRST_FREE_OBJECTID); + BEESINFO("open_root_nocache " << rootid << ": " << name_fd(rv)); + return rv; + } + } + } + BEESINFO("No path for rootid " << rootid); + BEESCOUNT(root_notfound); + return Fd(); +} + +Fd +BeesRoots::open_root(uint64_t rootid) +{ + // Ignore some of the crap that comes out of 
LOGICAL_INO + if (rootid == BTRFS_ROOT_TREE_OBJECTID) { + return Fd(); + } + + return m_ctx->fd_cache()->open_root(m_ctx, rootid); +} + + +uint64_t +BeesRoots::next_root(uint64_t root) +{ + BEESNOTE("Next root from " << root); + BEESTRACE("Next root from " << root); + + // BTRFS_FS_TREE_OBJECTID has no backref keys so we can't find it that way + if (root < BTRFS_FS_TREE_OBJECTID) { + // BEESLOG("First root is BTRFS_FS_TREE_OBJECTID = " << BTRFS_FS_TREE_OBJECTID); + return BTRFS_FS_TREE_OBJECTID; + } + + BtrfsIoctlSearchKey sk; + sk.tree_id = BTRFS_ROOT_TREE_OBJECTID; + sk.min_type = sk.max_type = BTRFS_ROOT_BACKREF_KEY; + sk.min_objectid = root + 1; + + while (true) { + sk.nr_items = 1024; + sk.do_ioctl(m_ctx->root_fd()); + + if (sk.m_result.empty()) { + return 0; + } + + for (auto i : sk.m_result) { + sk.next_min(i); + if (i.type == BTRFS_ROOT_BACKREF_KEY) { + // BEESLOG("Found root " << i.objectid << " parent " << i.offset); + return i.objectid; + } + } + } +} + +Fd +BeesRoots::open_root_ino_nocache(uint64_t root, uint64_t ino) +{ + BEESTRACE("opening root " << root << " ino " << ino); + + Fd root_fd = open_root(root); + if (!root_fd) { + return root_fd; + } + + BEESTOOLONG("open_root_ino(root " << root << ", ino " << ino << ")"); + + BEESTRACE("looking up ino " << ino); + BtrfsIoctlInoPathArgs ipa(ino); + if (!ipa.do_ioctl_nothrow(root_fd)) { + BEESINFO("Lookup root " << root << " ino " << ino << " failed: " << strerror(errno)); + return Fd(); + } + + BEESTRACE("searching paths for root " << root << " ino " << ino); + Fd rv; + if (ipa.m_paths.empty()) { + BEESLOG("No paths for root " << root << " ino " << ino); + } + for (auto file_path : ipa.m_paths) { + BEESTRACE("Looking up root " << root << " ino " << ino << " in dir " << name_fd(root_fd) << " path " << file_path); + BEESCOUNT(open_file); + // Try to open file RW, fall back to RO + const char *fp_cstr = file_path.c_str(); + rv = openat(root_fd, fp_cstr, FLAGS_OPEN_FILE); + if (!rv) { + BEESCOUNT(open_fail); 
+ // errno == ENOENT is common during snapshot delete, ignore it + if (errno != ENOENT) { + BEESLOG("Could not open path '" << file_path << "' at root " << root << " " << name_fd(root_fd) << ": " << strerror(errno)); + BEESNOTE("ipa" << ipa); + } + continue; + } + + // Correct inode? + Stat file_stat(rv); + if (file_stat.st_ino != ino) { + BEESLOG("Opening " << name_fd(root_fd) << "/" << file_path << " found wrong inode " << file_stat.st_ino << " instead of " << ino); + rv = Fd(); + BEESCOUNT(open_wrong_ino); + break; + } + + // Correct root? + auto file_root = btrfs_get_root_id(rv); + if (file_root != root) { + BEESLOG("Opening " << name_fd(root_fd) << "/" << file_path << " found wrong root " << file_root << " instead of " << root); + rv = Fd(); + BEESCOUNT(open_wrong_root); + break; + } + + // Same filesystem? + Stat root_stat(root_fd); + if (root_stat.st_dev != file_stat.st_dev) { + BEESLOG("Opening root " << name_fd(root_fd) << " path " << file_path << " found path st_dev " << file_stat.st_dev << " but root st_dev is " << root_stat.st_dev); + rv = Fd(); + BEESCOUNT(open_wrong_dev); + break; + } + + BEESTRACE("mapped " << BeesFileId(root, ino)); + BEESTRACE("\tto " << name_fd(rv)); + BEESCOUNT(open_hit); + return rv; + } + + // Odd, we didn't find a path. 
+ return Fd(); +} + +Fd +BeesRoots::open_root_ino(uint64_t root, uint64_t ino) +{ + return m_ctx->fd_cache()->open_root_ino(m_ctx, root, ino); +} + +BeesCrawl::BeesCrawl(shared_ptr ctx, BeesCrawlState initial_state) : + m_ctx(ctx), + m_state(initial_state) +{ +} + +bool +BeesCrawl::next_transid() +{ + // If this crawl is recently empty, quickly and _silently_ bail out + auto current_time = time(NULL); + auto crawl_state = get_state(); + auto elapsed_time = current_time - crawl_state.m_started; + if (elapsed_time < BEES_COMMIT_INTERVAL) { + if (!m_deferred) { + BEESLOG("Deferring next transid in " << get_state()); + } + m_deferred = true; + BEESCOUNT(crawl_defer); + return false; + } + + // Log performance stats from the old crawl + BEESLOG("Next transid in " << get_state()); + + // Start new crawl + m_deferred = false; + auto roots = m_ctx->roots(); + crawl_state.m_min_transid = crawl_state.m_max_transid; + crawl_state.m_max_transid = roots->transid_max(); + crawl_state.m_objectid = 0; + crawl_state.m_offset = 0; + crawl_state.m_started = current_time; + BEESLOG("Restarting crawl " << get_state()); + BEESCOUNT(crawl_restart); + set_state(crawl_state); + return true; +} + +bool +BeesCrawl::fetch_extents() +{ + THROW_CHECK1(runtime_error, m_extents.size(), m_extents.empty()); + + auto old_state = get_state(); + if (m_deferred || old_state.m_max_transid <= old_state.m_min_transid) { + BEESTRACE("Nothing to crawl in " << get_state()); + return next_transid(); + } + + BEESNOTE("crawling " << get_state()); + BEESLOG("Crawling " << get_state()); + + Timer crawl_timer; + + BtrfsIoctlSearchKey sk; + sk.tree_id = old_state.m_root; + sk.min_objectid = old_state.m_objectid; + sk.min_type = sk.max_type = BTRFS_EXTENT_DATA_KEY; + sk.min_offset = old_state.m_offset; + sk.min_transid = old_state.m_min_transid; + sk.max_transid = old_state.m_max_transid; + sk.nr_items = BEES_MAX_CRAWL_SIZE; + + // Lock in the old state + set_state(old_state); + + BEESTRACE("Searching crawl sk " << 
static_cast(sk)); + bool ioctl_ok = false; + { + BEESNOTE("searching crawl sk " << static_cast(sk)); + BEESTOOLONG("Searching crawl sk " << static_cast(sk)); + ioctl_ok = sk.do_ioctl_nothrow(m_ctx->root_fd()); + } + + if (ioctl_ok) { + BEESCOUNT(crawl_search); + } else { + BEESLOG("Search ioctl failed: " << strerror(errno)); + BEESCOUNT(crawl_fail); + } + + if (!ioctl_ok || sk.m_result.empty()) { + BEESCOUNT(crawl_empty); + BEESLOG("Crawl empty " << get_state()); + return next_transid(); + } + + BEESLOG("Crawling " << sk.m_result.size() << " results from " << get_state()); + auto results_left = sk.m_result.size(); + BEESNOTE("crawling " << results_left << " results from " << get_state()); + size_t count_other = 0; + size_t count_inline = 0; + size_t count_unknown = 0; + size_t count_data = 0; + size_t count_low = 0; + size_t count_high = 0; + BeesFileRange last_bfr; + for (auto i : sk.m_result) { + sk.next_min(i); + --results_left; + BEESCOUNT(crawl_items); + + BEESTRACE("i = " << i); + +#if 1 + // We need the "+ 1" and objectid rollover that next_min does. + auto new_state = get_state(); + new_state.m_objectid = sk.min_objectid; + new_state.m_offset = sk.min_offset; + + // Saving state here means we can skip a search result + // if we are interrupted. Not saving state here means we + // can fail to make forward progress in cases where there + // is a lot of metadata we can't process. Favor forward + // progress over losing search results. + set_state(new_state); +#endif + + // Ignore things that aren't EXTENT_DATA_KEY + if (i.type != BTRFS_EXTENT_DATA_KEY) { + ++count_other; + BEESCOUNT(crawl_nondata); + continue; + } + + auto gen = call_btrfs_get(btrfs_stack_file_extent_generation, i.m_data); + if (gen < get_state().m_min_transid) { + BEESCOUNT(crawl_gen_low); + ++count_low; + // We probably want (need?) to scan these anyway. 
+ // continue; + } + if (gen > get_state().m_max_transid) { + BEESCOUNT(crawl_gen_high); + ++count_high; + // This shouldn't ever happen + // continue; + } + + auto type = call_btrfs_get(btrfs_stack_file_extent_type, i.m_data); + switch (type) { + default: + BEESINFO("Unhandled file extent type " << type << " in root " << get_state().m_root << " ino " << i.objectid << " offset " << to_hex(i.offset)); + ++count_unknown; + BEESCOUNT(crawl_unknown); + break; + case BTRFS_FILE_EXTENT_INLINE: + // Ignore these for now. + // BEESINFO("Ignored file extent type INLINE in root " << get_state().m_root << " ino " << i.objectid << " offset " << to_hex(i.offset)); + ++count_inline; + // TODO: replace with out-of-line dup extents + BEESCOUNT(crawl_inline); + break; + case BTRFS_FILE_EXTENT_PREALLOC: + BEESCOUNT(crawl_prealloc); + case BTRFS_FILE_EXTENT_REG: { + auto physical = call_btrfs_get(btrfs_stack_file_extent_disk_bytenr, i.m_data); + auto ram = call_btrfs_get(btrfs_stack_file_extent_ram_bytes, i.m_data); + auto len = call_btrfs_get(btrfs_stack_file_extent_num_bytes, i.m_data); + auto offset = call_btrfs_get(btrfs_stack_file_extent_offset, i.m_data); + BEESTRACE("Root " << get_state().m_root << " ino " << i.objectid << " physical " << to_hex(physical) + << " logical " << to_hex(i.offset) << ".." 
<< to_hex(i.offset + len) + << " gen " << gen); + ++count_data; + if (physical) { + THROW_CHECK1(runtime_error, ram, ram > 0); + THROW_CHECK1(runtime_error, len, len > 0); + THROW_CHECK2(runtime_error, offset, ram, offset < ram); + BeesFileId bfi(get_state().m_root, i.objectid); + if (m_ctx->is_blacklisted(bfi)) { + BEESCOUNT(crawl_blacklisted); + } else { + BeesFileRange bfr(bfi, i.offset, i.offset + len); + // BEESNOTE("pushing bfr " << bfr << " limit " << BEES_MAX_QUEUE_SIZE); + m_extents.insert(bfr); + BEESCOUNT(crawl_push); + } + } else { + BEESCOUNT(crawl_hole); + } + break; + } + } + } + BEESLOG("Crawled inline " << count_inline << " data " << count_data << " other " << count_other << " unknown " << count_unknown << " gen_low " << count_low << " gen_high " << count_high << " " << get_state() << " in " << crawl_timer << "s"); + + return true; +} + +void +BeesCrawl::fetch_extents_harder() +{ + BEESNOTE("fetch_extents_harder " << get_state() << " with " << m_extents.size() << " extents"); + while (m_extents.empty()) { + bool progress_made = fetch_extents(); + if (!progress_made) { + return; + } + } +} + +BeesFileRange +BeesCrawl::peek_front() +{ + unique_lock lock(m_mutex); + fetch_extents_harder(); + if (m_extents.empty()) { + return BeesFileRange(); + } + return *m_extents.begin(); +} + +BeesFileRange +BeesCrawl::pop_front() +{ + unique_lock lock(m_mutex); + fetch_extents_harder(); + if (m_extents.empty()) { + return BeesFileRange(); + } + auto rv = *m_extents.begin(); + m_extents.erase(m_extents.begin()); +#if 0 + auto state = get_state(); + state.m_objectid = rv.fid().ino(); + state.m_offset = rv.begin(); + set_state(state); +#endif + return rv; +} + +BeesCrawlState +BeesCrawl::get_state() +{ + unique_lock lock(m_state_mutex); + return m_state; +} + +void +BeesCrawl::set_state(const BeesCrawlState &bcs) +{ + unique_lock lock(m_state_mutex); + m_state = bcs; + lock.unlock(); + m_ctx->roots()->crawl_state_set_dirty(); +} diff --git a/src/bees-thread.cc 
b/src/bees-thread.cc new file mode 100644 index 0000000..a702231 --- /dev/null +++ b/src/bees-thread.cc @@ -0,0 +1,91 @@ +#include "bees.h" + +using namespace crucible; +using namespace std; + +BeesThread::BeesThread(string name) : + m_name(name) +{ + THROW_CHECK1(invalid_argument, name, !name.empty()); +} + +void +BeesThread::exec(function func) +{ + m_timer.reset(); + BEESLOG("BeesThread exec " << m_name); + m_thread_ptr = make_shared([=]() { + BEESLOG("Starting thread " << m_name); + BeesNote::set_name(m_name); + BEESNOTE("thread function"); + Timer thread_time; + catch_all([&]() { + DIE_IF_MINUS_ERRNO(pthread_setname_np(pthread_self(), m_name.c_str())); + }); + catch_all([&]() { + func(); + }); + BEESLOG("Exiting thread " << m_name << ", " << thread_time << " sec"); + }); +} + +BeesThread::BeesThread(string name, function func) : + m_name(name) +{ + THROW_CHECK1(invalid_argument, name, !name.empty()); + BEESLOG("BeesThread construct " << m_name); + exec(func); +} + +void +BeesThread::join() +{ + if (!m_thread_ptr) { + BEESLOG("Thread " << m_name << " no thread ptr"); + return; + } + + BEESLOG("BeesThread::join " << m_name); + if (m_thread_ptr->joinable()) { + BEESLOG("Joining thread " << m_name); + Timer thread_time; + m_thread_ptr->join(); + BEESLOG("Waited for " << m_name << ", " << thread_time << " sec"); + } else if (!m_name.empty()) { + BEESLOG("BeesThread " << m_name << " not joinable"); + } else { + BEESLOG("BeesThread else " << m_name); + } +} + +void +BeesThread::set_name(const string &name) +{ + m_name = name; +} + +BeesThread::~BeesThread() +{ + if (!m_thread_ptr) { + BEESLOG("Thread " << m_name << " no thread ptr"); + return; + } + + BEESLOG("BeesThread destructor " << m_name); + if (m_thread_ptr->joinable()) { + BEESLOG("Cancelling thread " << m_name); + int rv = pthread_cancel(m_thread_ptr->native_handle()); + if (rv) { + BEESLOG("pthread_cancel returned " << strerror(-rv)); + } + BEESLOG("Waiting for thread " << m_name); + Timer thread_time; + 
m_thread_ptr->join(); + BEESLOG("Waited for " << m_name << ", " << thread_time << " sec"); + } else if (!m_name.empty()) { + BEESLOG("Thread " << m_name << " not joinable"); + } else { + BEESLOG("Thread destroy else " << m_name); + } +} + diff --git a/src/bees-types.cc b/src/bees-types.cc new file mode 100644 index 0000000..95ad945 --- /dev/null +++ b/src/bees-types.cc @@ -0,0 +1,1006 @@ +#include "bees.h" + +#include "crucible/crc64.h" +#include "crucible/limits.h" +#include "crucible/ntoa.h" +#include "crucible/string.h" + +#include +#include + +using namespace crucible; +using namespace std; + +ostream & +operator<<(ostream &os, const BeesFileId &bfi) +{ + return os << bfi.root() << ":" << bfi.ino(); +} + +bool +BeesFileId::operator<(const BeesFileId &that) const +{ + // Order by inode first so we get good locality when scanning across snapshots + return tie(m_ino, m_root) < tie(that.m_ino, that.m_root); +} + +bool +BeesFileId::operator==(const BeesFileId &that) const +{ + return m_root == that.m_root && m_ino == that.m_ino; +} + +bool +BeesFileId::operator!=(const BeesFileId &that) const +{ + return m_root != that.m_root || m_ino != that.m_ino; +} + +BeesFileId::operator bool() const +{ + return m_root && m_ino; +} + +BeesFileId::BeesFileId(const BtrfsInodeOffsetRoot &bior) : + m_root(bior.m_root), + m_ino(bior.m_inum) +{ +} + +BeesFileId::BeesFileId(uint64_t root, uint64_t ino) : + m_root(root), + m_ino(ino) +{ +} + +BeesFileId::BeesFileId(int fd) : + m_root(btrfs_get_root_id(fd)), + m_ino(Stat(fd).st_ino) +{ +} + +BeesFileId::BeesFileId() : + m_root(0), + m_ino(0) +{ +} + +ostream & +operator<<(ostream &os, const BeesFileRange &bfr) +{ + if (bfr.end() == numeric_limits::max()) { + os << "- [" << to_hex(bfr.begin()) << "..eof]"; + } else { + os << pretty(bfr.size()) << " [" << to_hex(bfr.begin()) << ".." 
<< to_hex(bfr.end()) << "]"; + } + if (bfr.m_fid) { + os << " fid = " << bfr.m_fid; + } + if (!!bfr.m_fd) { + os << " fd = " << bfr.m_fd << " '" << name_fd(bfr.m_fd) << "'"; + } + return os; +} + +ostream & +operator<<(ostream &os, const BeesRangePair &brp) +{ + return os << "BeesRangePair: " << pretty(brp.first.size()) + << " src[" << to_hex(brp.first.begin()) << ".." << to_hex(brp.first.end()) << "]" + << " dst[" << to_hex(brp.second.begin()) << ".." << to_hex(brp.second.end()) << "]" + << "\nsrc = " << brp.first.fd() << " " << name_fd(brp.first.fd()) + << "\ndst = " << brp.second.fd() << " " << name_fd(brp.second.fd()); +} + +mutex BeesFileRange::s_mutex; + +bool +BeesFileRange::operator<(const BeesFileRange &that) const +{ + // Read file blocks in order + return make_tuple(fid(), m_begin, m_end) < make_tuple(that.fid(), that.m_begin, that.m_end); + // Faster to read big chunks first? Probably confuses the hell + // out of crawl state, so let's only keep this if there's a clear + // performance win. 
+ // return make_tuple(that.size(), fid(), m_begin, m_end) < make_tuple(size(), that.fid(), that.m_begin, that.m_end); +} + +bool +BeesFileRange::operator==(const BeesFileRange &that) const +{ + // These fields are cheap to compare and have the most variety + if (m_begin != that.m_begin || m_end != that.m_end) { + return false; + } + // If they both have the same fd they're equal, + // but different fds are not necessarily distinct + if (!!m_fd && !!that.m_fd && m_fd == that.m_fd) { + return true; + } + // OK now we have to go check their FileIds + return fid() == that.fid(); +} + +bool +BeesFileRange::operator!=(const BeesFileRange &that) const +{ + return !((*this) == that); +} + +bool +BeesFileRange::empty() const +{ + THROW_CHECK2(invalid_argument, m_begin, m_end, m_begin <= m_end); + return m_begin >= m_end; +} + +off_t +BeesFileRange::size() const +{ + THROW_CHECK2(invalid_argument, m_begin, m_end, m_begin <= m_end); + return m_end - m_begin; +} + +off_t +BeesFileRange::file_size() const +{ + if (m_file_size <= 0) { + // Use method fd() not member m_fd() so we hold lock + Stat st(fd()); + m_file_size = st.st_size; + // These checks could trigger on valid input, but that would mean we have + // lost a race (e.g. a file was truncated while we were building a + // matching range pair with it). In such cases we should probably stop + // whatever we were doing and backtrack to some higher level anyway. 
+ THROW_CHECK1(invalid_argument, m_file_size, m_file_size > 0); + // THROW_CHECK2(invalid_argument, m_file_size, m_end, m_end <= m_file_size || m_end == numeric_limits::max()); + } + return m_file_size; +} + +off_t +BeesFileRange::grow_end(off_t delta) +{ + THROW_CHECK1(invalid_argument, delta, delta > 0); + m_end = min(m_end + delta, file_size()); + THROW_CHECK2(runtime_error, m_file_size, m_end, m_end <= m_file_size); + return m_end; +} + +off_t +BeesFileRange::grow_begin(off_t delta) +{ + THROW_CHECK1(invalid_argument, delta, delta > 0); + m_begin -= min(delta, m_begin); + return m_begin; +} + +BeesFileRange::BeesFileRange(const BeesBlockData &bbd) : + m_fd(bbd.fd()), + m_begin(bbd.begin()), + m_end(bbd.end()), + m_file_size(-1) +{ +} + +BeesFileRange::BeesFileRange(Fd fd, off_t begin, off_t end) : + m_fd(fd), + m_begin(begin), + m_end(end), + m_file_size(-1) +{ +} + +BeesFileRange::BeesFileRange(const BeesFileId &fid, off_t begin, off_t end) : + m_fid(fid), + m_begin(begin), + m_end(end), + m_file_size(-1) +{ +} + +BeesFileRange::BeesFileRange() : + m_begin(0), + m_end(0), + m_file_size(-1) +{ +} + +bool +BeesFileRange::is_same_file(const BeesFileRange &that) const +{ + // If we have two FDs, start by comparing those + if (!!m_fd && !!that.m_fd && m_fd == that.m_fd) { + return true; + } + // OK have to go fetch the fid from both files and compare them + return fid() == that.fid(); +} + +bool +BeesFileRange::overlaps(const BeesFileRange &that) const +{ + // Determine whether the byte ranges overlap before doing syscalls on file descriptors + + pair a(m_begin, m_end); + pair b(that.m_begin, that.m_end); + + // range a starts lower than or equal b + if (b.first < a.first) { + swap(a, b); + } + + // if b starts within a, they overlap + // (and the intersecting region is b.first..min(a.second, b.second)) + // (and the union region is a.first..max(a.second, b.second)) + if (b.first >= a.first && b.first < a.second) { + return is_same_file(that); + } + + return false; 
+} + +bool +BeesFileRange::coalesce(const BeesFileRange &that) +{ + // Let's define coalesce-with-null as identity, + // and coalesce-null-with-null as coalesced + if (!*this) { + operator=(that); + return true; + } + if (!that) { + return true; + } + + // Can't coalesce different files + if (!is_same_file(that)) return false; + + pair a(m_begin, m_end); + pair b(that.m_begin, that.m_end); + + // range a starts lower than or equal b + if (b.first < a.first) { + swap(a, b); + } + + // if b starts within a, they overlap + // (and the intersecting region is b.first..min(a.second, b.second)) + // (and the union region is a.first..max(a.second, b.second)) + if (b.first >= a.first && b.first < a.second) { + m_begin = a.first; + m_end = max(a.second, b.second); + return true; + } + + return false; +} + +BeesFileRange::operator BeesBlockData() const +{ + BEESTRACE("operator BeesBlockData " << *this); + return BeesBlockData(m_fd, m_begin, m_end - m_begin); +} + +Fd +BeesFileRange::fd() const +{ + unique_lock lock(s_mutex); + return m_fd; +} + +Fd +BeesFileRange::fd(const shared_ptr &ctx) const +{ + unique_lock lock(s_mutex); + // If we don't have a fid we can't do much here + if (m_fid) { + if (!m_fd) { + // If we don't have a fd, open by fid + if (m_fid && ctx) { + lock.unlock(); + Fd new_fd = ctx->roots()->open_root_ino(m_fid); + lock.lock(); + m_fd = new_fd; + } + } else { + // If we have both fid and fd, make sure they match + BeesFileId fd_fid(m_fd); + THROW_CHECK2(invalid_argument, fd_fid, m_fid, fd_fid == m_fid); + } + } + // We either had a fid and opened it, or we didn't and we're just stuck with our fd + return m_fd; +} + +BeesFileRange +BeesFileRange::copy_closed() const +{ + return BeesFileRange(fid(), m_begin, m_end); +} + +BeesFileId +BeesFileRange::fid() const +{ + if (!m_fid) { + if (!!m_fd) { + m_fid = BeesFileId(m_fd); + } + } + return m_fid; +} + +BeesRangePair::BeesRangePair(const BeesFileRange &src, const BeesFileRange &dst) : + pair(src, dst) +{ + 
BEESTRACE("checking constraints on " << *this); + + // Must not initially overlap + THROW_CHECK2(invalid_argument, first, second, !first.overlaps(second)); + + // Must initially be equal + THROW_CHECK2(invalid_argument, first, second, first.size() == second.size()); + + // Can't check content unless open + if (!first.fd() || !second.fd()) { + return; + } + + // Must check every block individually + off_t first_begin = first.begin(); + off_t second_begin = second.begin(); + off_t size = first.size(); + while (size) { + off_t len = min(BLOCK_SIZE_SUMS, size); + BeesBlockData first_bbd(first.fd(), first_begin, len); + BeesBlockData second_bbd(second.fd(), second_begin, len); + THROW_CHECK2(invalid_argument, first_bbd, second_bbd, first_bbd.is_data_equal(second_bbd)); + first_begin += len; + second_begin += len; + size -= len; + } +} + +bool +BeesRangePair::operator<(const BeesRangePair &that) const +{ + // Order by destination then source + return tie(second, first) < tie(that.second, that.first); +} + +bool +BeesRangePair::grow(shared_ptr ctx, bool constrained) +{ + BEESTOOLONG("grow constrained = " << constrained << " *this = " << *this); + BEESTRACE("grow constrained = " << constrained << " *this = " << *this); + bool rv = false; + + THROW_CHECK1(invalid_argument, first.begin(), (first.begin() & BLOCK_MASK_CLONE) == 0); + THROW_CHECK1(invalid_argument, second.begin(), (second.begin() & BLOCK_MASK_CLONE) == 0); + + // We should not be overlapping already + THROW_CHECK2(invalid_argument, first, second, !first.overlaps(second)); + + BtrfsExtentWalker ew_second(second.fd()); + + // Stop on aligned extent boundary + ew_second.seek(second.begin()); + + Extent e_second = ew_second.current(); + BEESTRACE("e_second " << e_second); + + // Preread entire extent + posix_fadvise(second.fd(), e_second.begin(), e_second.size(), POSIX_FADV_WILLNEED); + posix_fadvise(first.fd(), e_second.begin() + first.begin() - second.begin(), e_second.size(), POSIX_FADV_WILLNEED); + + auto 
hash_table = ctx->hash_table(); + + // Look backward + BEESTRACE("grow_backward " << *this); + while (first.size() < BLOCK_SIZE_MAX_EXTENT) { + if (second.begin() <= e_second.begin()) { +#if 0 + if (constrained) { + break; + } + BEESCOUNT(pairbackward_extent); + ew_second.seek(second.begin() - min(BLOCK_SIZE_CLONE, second.begin())); + e_second = ew_second.current(); + if (e_second.flags() & Extent::HOLE) { + BEESCOUNT(pairbackward_hole); + break; + } + posix_fadvise(second.fd(), e_second.begin(), e_second.size(), POSIX_FADV_WILLNEED); +#else + // This tends to repeatedly process extents that were recently processed. + // We tend to catch duplicate blocks early since we scan them forwards. + // Also, reading backwards is slow so we probably don't want to do it much. + break; +#endif + } + BEESCOUNT(pairbackward_try); + + // Extend first range. If we hit BOF we can go no further. + BeesFileRange new_first = first; + BEESTRACE("new_first = " << new_first); + new_first.grow_begin(BLOCK_SIZE_CLONE); + if (new_first.begin() == first.begin()) { + BEESCOUNT(pairbackward_bof_first); + break; + } + + // Source extent cannot be toxic + BeesAddress first_addr(first.fd(), new_first.begin()); + if (!first_addr.is_magic()) { + auto first_resolved = ctx->resolve_addr(first_addr); + if (first_resolved.is_toxic()) { + BEESLOG("WORKAROUND: not growing matching pair backward because src addr is toxic:\n" << *this); + BEESCOUNT(pairbackward_toxic_addr); + break; + } + } + + // Extend second range. If we hit BOF we can go no further. 
+ BeesFileRange new_second = second; + BEESTRACE("new_second = " << new_second); + new_second.grow_begin(BLOCK_SIZE_CLONE); + if (new_second.begin() == second.begin()) { + BEESCOUNT(pairbackward_bof_second); + break; + } + + // If the ranges now overlap we went too far + if (new_first.overlaps(new_second)) { + BEESCOUNT(pairbackward_overlap); + break; + } + + BEESTRACE("first " << first << " new_first " << new_first); + BeesBlockData first_bbd(first.fd(), new_first.begin(), first.begin() - new_first.begin()); + BEESTRACE("first_bbd " << first_bbd); + BEESTRACE("second " << second << " new_second " << new_second); + BeesBlockData second_bbd(second.fd(), new_second.begin(), second.begin() - new_second.begin()); + BEESTRACE("second_bbd " << second_bbd); + + // Both blocks must have identical content + if (!first_bbd.is_data_equal(second_bbd)) { + BEESCOUNT(pairbackward_miss); + break; + } + + // Physical blocks must be distinct + if (first_bbd.addr().get_physical_or_zero() == second_bbd.addr().get_physical_or_zero()) { + BEESCOUNT(pairbackward_same); + break; + } + + // Source block cannot be zero in a non-compressed non-magic extent + if (first_bbd.is_data_zero() && !first_addr.is_magic() && !first_addr.is_compressed()) { + BEESCOUNT(pairbackward_zero); + break; + } + + // Source block cannot have a toxic hash + auto found_hashes = hash_table->find_cell(first_bbd.hash()); + bool found_toxic = false; + for (auto i : found_hashes) { + if (BeesAddress(i.e_addr).is_toxic()) { + found_toxic = true; + break; + } + } + if (found_toxic) { + BEESLOG("WORKAROUND: found toxic hash in " << first_bbd << " while extending backward:\n" << *this); + BEESCOUNT(pairbackward_toxic_hash); + break; + } + + THROW_CHECK2(invalid_argument, new_first.size(), new_second.size(), new_first.size() == new_second.size()); + first = new_first; + second = new_second; + rv = true; + BEESCOUNT(pairbackward_hit); + } + BEESCOUNT(pairbackward_stop); + + // Look forward + BEESTRACE("grow_forward " << 
*this); + while (first.size() < BLOCK_SIZE_MAX_EXTENT) { + if (second.end() >= e_second.end()) { + if (constrained) { + break; + } + BEESCOUNT(pairforward_extent); + ew_second.seek(second.end()); + e_second = ew_second.current(); + if (e_second.flags() & Extent::HOLE) { + BEESCOUNT(pairforward_hole); + break; + } + posix_fadvise(second.fd(), e_second.begin(), e_second.size(), POSIX_FADV_WILLNEED); + } + BEESCOUNT(pairforward_try); + + // Extend first range. If we hit EOF we can go no further. + BeesFileRange new_first = first; + BEESTRACE("new_first = " << new_first); + new_first.grow_end(BLOCK_SIZE_CLONE); + if (new_first.end() == first.end()) { + BEESCOUNT(pairforward_eof_first); + break; + } + + // Source extent cannot be toxic + BeesAddress first_addr(first.fd(), new_first.begin()); + if (!first_addr.is_magic()) { + auto first_resolved = ctx->resolve_addr(first_addr); + if (first_resolved.is_toxic()) { + BEESLOG("WORKAROUND: not growing matching pair forward because src is toxic:\n" << *this); + BEESCOUNT(pairforward_toxic); + break; + } + } + + // Extend second range. If we hit EOF we can go no further. + BeesFileRange new_second = second; + BEESTRACE("new_second = " << new_second); + new_second.grow_end(BLOCK_SIZE_CLONE); + if (new_second.end() == second.end()) { + BEESCOUNT(pairforward_eof_second); + break; + } + + // If we have hit an unaligned EOF then it has to be the same unaligned EOF. + // If we haven't hit EOF then the ends of the ranges are still aligned, + // so the misalignment (zero) will be equal. 
+ if ((new_second.end() & BLOCK_MASK_CLONE) != (new_first.end() & BLOCK_MASK_CLONE)) { + BEESCOUNT(pairforward_eof_malign); + break; + } + + // If the ranges now overlap we went too far + if (new_first.overlaps(new_second)) { + BEESCOUNT(pairforward_overlap); + break; + } + + BEESTRACE("first " << first << " new_first " << new_first); + BeesBlockData first_bbd(first.fd(), first.end(), new_first.end() - first.end()); + BEESTRACE("first_bbd " << first_bbd); + BEESTRACE("second " << second << " new_second " << new_second); + BeesBlockData second_bbd(second.fd(), second.end(), new_second.end() - second.end()); + BEESTRACE("second_bbd " << second_bbd); + + // Both blocks must have identical content + if (!first_bbd.is_data_equal(second_bbd)) { + BEESCOUNT(pairforward_miss); + break; + } + + // Physical blocks must be distinct + if (first_bbd.addr().get_physical_or_zero() == second_bbd.addr().get_physical_or_zero()) { + BEESCOUNT(pairforward_same); + break; + } + + // Source block cannot be zero in a non-compressed non-magic extent + if (first_bbd.is_data_zero() && !first_addr.is_magic() && !first_addr.is_compressed()) { + BEESCOUNT(pairforward_zero); + break; + } + + // Source block cannot have a toxic hash + auto found_hashes = hash_table->find_cell(first_bbd.hash()); + bool found_toxic = false; + for (auto i : found_hashes) { + if (BeesAddress(i.e_addr).is_toxic()) { + found_toxic = true; + break; + } + } + if (found_toxic) { + BEESLOG("WORKAROUND: found toxic hash in " << first_bbd << " while extending forward:\n" << *this); + BEESCOUNT(pairforward_toxic_hash); + break; + } + + // OK, next block + THROW_CHECK2(invalid_argument, new_first.size(), new_second.size(), new_first.size() == new_second.size()); + first = new_first; + second = new_second; + rv = true; + BEESCOUNT(pairforward_hit); + } + + if (first.overlaps(second)) { + BEESLOGTRACE("after grow, first " << first << "\n\toverlaps " << second); + BEESCOUNT(bug_grow_pair_overlaps); + } + + 
BEESCOUNT(pairforward_stop); + return rv; +} + +BeesRangePair +BeesRangePair::copy_closed() const +{ + return BeesRangePair(first.copy_closed(), second.copy_closed()); +} + +ostream & +operator<<(ostream &os, const BeesAddress &ba) +{ + if (ba.is_magic()) { + enum { + ZERO = BeesAddress::MagicValue::ZERO, + DELALLOC = BeesAddress::MagicValue::DELALLOC, + HOLE = BeesAddress::MagicValue::HOLE, + UNUSABLE = BeesAddress::MagicValue::UNUSABLE, + }; + static const bits_ntoa_table table[] = { + NTOA_TABLE_ENTRY_ENUM(ZERO), + NTOA_TABLE_ENTRY_ENUM(DELALLOC), + NTOA_TABLE_ENTRY_ENUM(HOLE), + NTOA_TABLE_ENTRY_ENUM(UNUSABLE), + NTOA_TABLE_ENTRY_END() + }; + return os << bits_ntoa(static_cast(ba), table); + } + + auto gpz = ba.get_physical_or_zero(); + if (gpz == 0x1000) { + os << "NIL"; + } else { + os << to_hex(gpz); + } + + if (ba.is_toxic()) { + os << "t"; + } + + if (ba.is_unaligned_eof()) { + os << "u"; + } + + if (ba.is_compressed()) { + os << "z"; + if (ba.has_compressed_offset()) { + os << astringprintf("%" PRIx64, ba.get_compressed_offset()); + } + } + + return os; +} + +bool +BeesAddress::magic_check(uint64_t flags) +{ + // This one isn't FIEMAP + if (flags & Extent::HOLE) { + m_addr = HOLE; + BEESCOUNT(addr_hole); + return true; + } + + // These trigger extra processing steps for compressed extents + static const unsigned compressed_flags = FIEMAP_EXTENT_ENCODED; + + // These indicate the extent is not yet on disk (try again with sync) + static const unsigned delalloc_flags = FIEMAP_EXTENT_UNKNOWN | FIEMAP_EXTENT_DELALLOC; + + // These flags are irrelevant to extent-same + static const unsigned ignore_flags = FIEMAP_EXTENT_LAST | FIEMAP_EXTENT_SHARED; + + // These flags mean we can't use extent-same + static const unsigned unusable_flags = FIEMAP_EXTENT_NOT_ALIGNED | FIEMAP_EXTENT_DATA_INLINE; + + // All of the above (any other flag is a new feature we maybe can't cope with) + static const unsigned recognized_flags = compressed_flags | delalloc_flags | ignore_flags 
| unusable_flags; + + if (flags & ~recognized_flags) { + BEESLOGTRACE("Unrecognized flags in " << fiemap_extent_flags_ntoa(flags)); + m_addr = UNUSABLE; + // maybe we throw here? + BEESCOUNT(addr_unrecognized); + return true; + } + + if (flags & unusable_flags) { + // we know these, but can't touch them + BEESCOUNT(addr_unusable); + m_addr = UNUSABLE; + return true; + } + + if (flags & delalloc_flags) { + // delayed allocation, try again with force + BEESCOUNT(addr_delalloc); + m_addr = DELALLOC; + return true; + } + + return false; +} + +BeesAddress::BeesAddress(const Extent &e, off_t offset) : + m_addr(ZERO) +{ + BEESTRACE("BeesAddress " << e << " offset " << to_hex(offset)); + Type new_addr = 0; + + THROW_CHECK1(invalid_argument, e, (e.physical() & BLOCK_MASK_CLONE) == 0); + THROW_CHECK1(invalid_argument, e, (e.begin() & BLOCK_MASK_CLONE) == 0); + THROW_CHECK1(invalid_argument, e, (offset & BLOCK_MASK_CLONE) == 0); + THROW_CHECK1(invalid_argument, e, e.end() > e.begin()); + + if (magic_check(e.flags())) { + BEESCOUNT(addr_magic); + return; + } + + // All addresses from here on are physical + THROW_CHECK1(invalid_argument, e, e.physical() > 0); + + if (e.flags() & FIEMAP_EXTENT_ENCODED) { + THROW_CHECK1(invalid_argument, e, (e.offset() & BLOCK_MASK_CLONE) == 0); + THROW_CHECK1(invalid_argument, e, e.offset() >= 0 && e.offset() < BLOCK_SIZE_MAX_COMPRESSED_EXTENT); + int extent_offset = offset - e.begin() + e.offset(); + BEESTRACE("extent_offset = " << to_hex(extent_offset)); + THROW_CHECK1(invalid_argument, extent_offset, extent_offset >= 0 && extent_offset < BLOCK_SIZE_MAX_COMPRESSED_EXTENT); + THROW_CHECK1(invalid_argument, extent_offset, (extent_offset & BLOCK_MASK_CLONE) == 0); + unsigned offset_bits = (extent_offset / BLOCK_SIZE_CLONE) + 1; + BEESTRACE("offset_bits = " << offset_bits); + THROW_CHECK1(invalid_argument, offset_bits, offset_bits >= c_offset_min && offset_bits <= c_offset_max); + THROW_CHECK1(invalid_argument, offset_bits, (offset_bits & 
~c_offset_mask) == 0); +#if 1 + new_addr = e.physical() | c_compressed_mask | offset_bits; + BEESCOUNT(addr_compressed_offset); +#else + new_addr = e.physical() | c_compressed_mask; + BEESCOUNT(addr_compressed); +#endif + } else { + new_addr = e.physical() + (offset - e.begin()); + BEESCOUNT(addr_uncompressed); + } + + if ((e.flags() & FIEMAP_EXTENT_LAST) && (e.end() & BLOCK_MASK_CLONE) != 0 && (offset & ~BLOCK_MASK_CLONE) == (e.end() & ~BLOCK_MASK_CLONE)) { + new_addr |= c_eof_mask; + BEESCOUNT(addr_eof_e); + } + + m_addr = new_addr; + BEESCOUNT(addr_block); +} + +BeesAddress::BeesAddress(int fd, off_t offset) : + m_addr(ZERO) +{ + BEESTOOLONG("BeesAddress(fd " << fd << " " << name_fd(fd) << " offset " << to_hex(offset) << ")"); + BEESTRACE("BeesAddress(fd " << fd << " " << name_fd(fd) << " offset " << to_hex(offset) << ")"); + + Type uoffset = ranged_cast(offset); + + THROW_CHECK1(invalid_argument, uoffset, (uoffset & c_all_mask) == 0); + THROW_CHECK1(invalid_argument, uoffset, (uoffset & BLOCK_MASK_CLONE) == 0); + + Timer extentwalker_timer; + BtrfsExtentWalker ew(fd, uoffset); + Extent e = ew.current(); + BEESCOUNT(addr_from_fd); + BEESCOUNTADD(addr_ms, extentwalker_timer.age() * 1000); + + *this = BeesAddress(e, offset); +} + +BeesAddress::BeesAddress(int fd, off_t offset, shared_ptr ctx) : + m_addr(ZERO) +{ + BEESTOOLONG("BeesAddress(fd " << fd << " " << name_fd(fd) << " offset " << to_hex(offset) << " ctx " << ctx->root_path() << ")"); + BEESTRACE("BeesAddress(fd " << fd << " " << name_fd(fd) << " offset " << to_hex(offset) << " ctx " << ctx->root_path() << ")"); + + Type uoffset = ranged_cast(offset); + + THROW_CHECK1(invalid_argument, uoffset, (uoffset & c_all_mask) == 0); + THROW_CHECK1(invalid_argument, uoffset, (uoffset & BLOCK_MASK_CLONE) == 0); + + Timer extentwalker_timer; + BtrfsExtentWalker ew(fd, uoffset, ctx->root_fd()); + Extent e = ew.current(); + BEESCOUNT(addr_from_root_fd); + BEESCOUNTADD(addr_ms, extentwalker_timer.age() * 1000); + + *this 
= BeesAddress(e, offset); +} + +// Get just the physical address with no extra bits or compressed block offset (magic values become zero) + +BeesAddress::Type +BeesAddress::get_physical_or_zero() const +{ + if (is_magic()) { + return 0; + } else { + return m_addr & ~c_all_mask; + } +} + +// A compressed block address is divided into two fields: +// the beginning of the physical extent, +// and the distance (in CLONE blocks) from the start of the extent to the current block. +// Throws an exception if has_compressed_offset is not true. + +BeesAddress::Type +BeesAddress::get_compressed_offset() const +{ + THROW_CHECK1(invalid_argument, *this, has_compressed_offset()); + return ((m_addr & c_offset_mask) - 1) * BLOCK_SIZE_CLONE; +} + +void +BeesAddress::set_toxic() +{ + THROW_CHECK1(invalid_argument, *this, !is_magic()); + m_addr |= c_toxic_mask; +} + +bool +BeesAddress::operator==(const BeesAddress &that) const +{ + // If one side has an offset and the other doesn't, compare without checking offset bits + // This returns the right result for comparisons between magic and non-magic values, + // even though the math is all wrong. 
+ if (has_compressed_offset() != that.has_compressed_offset()) { + return (m_addr & ~c_offset_mask) == (that.m_addr & ~c_offset_mask); + } else { + return m_addr == that.m_addr; + } +} + +bool +BeesAddress::operator<(const BeesAddress &that) const +{ + if (has_compressed_offset() != that.has_compressed_offset()) { + return (m_addr & ~c_offset_mask) < (that.m_addr & ~c_offset_mask); + } else { + return m_addr < that.m_addr; + } +} + +ostream & +operator<<(ostream &os, const BeesBlockData &bbd) +{ + os << "BeesBlockData { " << pretty(bbd.m_length) << " " << to_hex(bbd.m_offset) << " fd = " << bbd.m_fd << " '" << name_fd(bbd.m_fd) << "'"; + if (bbd.m_addr != BeesAddress::ZERO) { + os << ", address = " << bbd.m_addr; + } + if (bbd.m_hash_done) { + os << ", hash = " << bbd.m_hash; + } + if (!bbd.m_data.empty()) { + os << ", data[" << bbd.m_data.size() << "] = '"; + + size_t max_print = 12; + size_t to_print = min(bbd.m_data.size(), max_print); + for (size_t i = 0; i < to_print; ++i) { + uint8_t c = bbd.m_data[i]; + // We are ASCII heathens here + if (c >= 32 && c < 127 && c != '\\') { + os << c; + } else { + char buf[8]; + sprintf(buf, "\\x%02x", c); + os << buf; + } + } + os << "...'"; + } + return os << " }"; +} + +BeesBlockData::BeesBlockData(Fd fd, off_t offset, size_t read_length) : + m_fd(fd), + m_offset(offset), + m_length(read_length) +{ + BEESTRACE("Constructing " << *this); + THROW_CHECK1(invalid_argument, m_length, m_length > 0); + THROW_CHECK1(invalid_argument, m_length, m_length <= BLOCK_SIZE_SUMS); + THROW_CHECK1(invalid_argument, m_offset, (m_offset % BLOCK_SIZE_SUMS) == 0); +} + +BeesBlockData::BeesBlockData() : + m_offset(0), + m_length(0) +{ +} + +BeesAddress +BeesBlockData::addr() const +{ + if (m_addr == BeesAddress::ZERO) { + m_addr = BeesAddress(fd(), m_offset); + } + return m_addr; +} + +BeesBlockData & +BeesBlockData::addr(const BeesAddress &a) +{ + m_addr = a; + return *this; +} + +const BeesBlockData::Blob & +BeesBlockData::data() const +{ + if 
(m_data.empty()) { + THROW_CHECK1(invalid_argument, size(), size() > 0); + BEESTOOLONG("Reading BeesBlockData " << *this); + Timer read_timer; + + Blob rv(m_length); + pread_or_die(m_fd, rv, m_offset); + THROW_CHECK2(runtime_error, rv.size(), m_length, ranged_cast(rv.size()) == m_length); + m_data = rv; + BEESCOUNT(block_read); + BEESCOUNTADD(block_bytes, rv.size()); + BEESCOUNTADD(block_ms, read_timer.age() * 1000); + } + + return m_data; +} + +BeesHash +BeesBlockData::hash() const +{ + if (!m_hash_done) { + // We can only dedup unaligned EOF blocks against other unaligned EOF blocks, + // so we do NOT round up to a full sum block size. + const Blob &blob = data(); + // TODO: It turns out that file formats with 4K block + // alignment and embedded CRC64 do exist, and every block + // of such files has the same hash. Could use a subset + // of SHA1 here instead. + m_hash = Digest::CRC::crc64(blob.data(), blob.size()); + m_hash_done = true; + BEESCOUNT(block_hash); + } + + return m_hash; +} + +bool +BeesBlockData::is_data_zero() const +{ + // The CRC64 of zero is zero, so skip some work if we already know the CRC + if (m_hash_done && m_hash != 0) { + return false; + } + + // OK read block (maybe) and check every byte + for (auto c : data()) { + if (c != '\0') { + return false; + } + } + + BEESCOUNT(block_zero); + return true; +} + +bool +BeesBlockData::is_data_equal(const BeesBlockData &that) const +{ + BEESTRACE("is_data_equal this = " << *this << ", that = " << that); + THROW_CHECK1(invalid_argument, size(), size() > 0); + THROW_CHECK2(invalid_argument, size(), that.size(), size() == that.size()); + + // skip some work if we already know the CRCs don't match + if (m_hash_done && that.m_hash_done && m_hash != that.m_hash) { + return false; + } + + return data() == that.data(); +} + diff --git a/src/bees.cc b/src/bees.cc new file mode 100644 index 0000000..6f90cad --- /dev/null +++ b/src/bees.cc @@ -0,0 +1,599 @@ +#include "bees.h" + +#include "crucible/interp.h" 
+#include "crucible/limits.h" +#include "crucible/process.h" +#include "crucible/string.h" + +#include +#include + +#include +#include + +// PRIx64 +#include + +#include +#include + +#include +#include + +using namespace crucible; +using namespace std; + +int +do_cmd_help(const ArgList &argv) +{ + cerr << "Usage: " << argv[0] << " fs-root-path [fs-root-path-2...]\n" + "Performs best-effort extent-same deduplication on btrfs.\n" + "\n" + "fs-root-path MUST be the root of a btrfs filesystem tree (id 5).\n" + "Other directories will be rejected.\n" + "\n" + "Multiple filesystems can share a single hash table (BEESHOME)\n" + "but this only works well if the content of each filesystem\n" + "is distinct from all the others.\n" + "\n" + "Required environment variables:\n" + "\tBEESHOME\tPath to hash table and configuration files\n" + "\n" + "Optional environment variables:\n" + "\tBEESSTATUS\tFile to write status to (tmpfs recommended, e.g. /run)\n" + "\n" + << endl; + return 0; +} + +// tracing ---------------------------------------- + +RateLimiter bees_info_rate_limit(BEES_INFO_RATE, BEES_INFO_BURST); + +thread_local BeesTracer *BeesTracer::s_next_tracer = nullptr; + +BeesTracer::~BeesTracer() +{ + if (uncaught_exception()) { + m_func(); + if (!m_next_tracer) { + BEESLOG("--- END TRACE --- exception ---"); + } + } + s_next_tracer = m_next_tracer; +} + +BeesTracer::BeesTracer(function f) : + m_func(f) +{ + m_next_tracer = s_next_tracer; + s_next_tracer = this; +} + +void +BeesTracer::trace_now() +{ + BeesTracer *tp = s_next_tracer; + BEESLOG("--- BEGIN TRACE ---"); + while (tp) { + tp->m_func(); + tp = tp->m_next_tracer; + } + BEESLOG("--- END TRACE ---"); +} + +thread_local BeesNote *BeesNote::s_next = nullptr; +mutex BeesNote::s_mutex; +map BeesNote::s_status; +thread_local string BeesNote::s_name; + +BeesNote::~BeesNote() +{ + unique_lock lock(s_mutex); + s_next = m_prev; + if (s_next) { + s_status[gettid()] = s_next; + } else { + s_status.erase(gettid()); + } +} + 
+BeesNote::BeesNote(function f) : + m_func(f) +{ + unique_lock lock(s_mutex); + m_name = s_name; + m_prev = s_next; + s_next = this; + s_status[gettid()] = s_next; +} + +void +BeesNote::set_name(const string &name) +{ + unique_lock lock(s_mutex); + s_name = name; +} + +string +BeesNote::get_name() +{ + unique_lock lock(s_mutex); + if (s_name.empty()) { + return "bees"; + } else { + return s_name; + } +} + +BeesNote::ThreadStatusMap +BeesNote::get_status() +{ + unique_lock lock(s_mutex); + ThreadStatusMap rv; + for (auto t : s_status) { + ostringstream oss; + if (!t.second->m_name.empty()) { + oss << t.second->m_name << ": "; + } + if (t.second->m_timer.age() > BEES_TOO_LONG) { + oss << "[" << t.second->m_timer << "s] "; + } + t.second->m_func(oss); + rv[t.first] = oss.str(); + } + return rv; +} + +// static inline helpers ---------------------------------------- + +static inline +bool +bees_addr_check(uint64_t v) +{ + return !(v & (1ULL << 63)); +} + +static inline +bool +bees_addr_check(int64_t v) +{ + return !(v & (1ULL << 63)); +} + +string +pretty(double d) +{ + static const char * units[] = { "", "K", "M", "G", "T", "P", "E" }; + static const char * *units_stop = units + sizeof(units) / sizeof(units[0]) - 1; + const char * *unit = units; + while (d >= 1024 && unit < units_stop) { + d /= 1024; + ++unit; + } + ostringstream oss; + oss << (round(d * 1000.0) / 1000.0) << *unit; + return oss.str(); +} + +// ostream operators ---------------------------------------- + +template +ostream & +operator<<(ostream &os, const BeesStatTmpl &bs) +{ + unique_lock lock(bs.m_mutex); + bool first = true; + string last_tag; + for (auto i : bs.m_stats_map) { + if (i.second == 0) { + continue; + } + string tag = i.first.substr(0, i.first.find_first_of("_")); + if (!last_tag.empty() && tag != last_tag) { + os << "\n\t"; + } else if (!first) { + os << " "; + } + last_tag = tag; + first = false; + os << i.first << "=" << i.second; + } + return os; +} + +// other 
---------------------------------------- + +template +T& +BeesStatTmpl::at(string idx) +{ + unique_lock lock(m_mutex); + if (!m_stats_map.count(idx)) { + m_stats_map[idx] = 0; + } + return m_stats_map[idx]; +} + +template +T +BeesStatTmpl::at(string idx) const +{ + unique_lock lock(m_mutex); + return m_stats_map.at(idx); +} + +template +void +BeesStatTmpl::add_count(string idx, size_t amount) +{ + unique_lock lock(m_mutex); + if (!m_stats_map.count(idx)) { + m_stats_map[idx] = 0; + } + m_stats_map.at(idx) += amount; +} + +template +BeesStatTmpl::BeesStatTmpl(const BeesStatTmpl &that) +{ + if (&that == this) return; + unique_lock lock(m_mutex); + unique_lock lock2(that.m_mutex); + m_stats_map = that.m_stats_map; +} + +template +BeesStatTmpl & +BeesStatTmpl::operator=(const BeesStatTmpl &that) +{ + if (&that == this) return *this; + unique_lock lock(m_mutex); + unique_lock lock2(that.m_mutex); + m_stats_map = that.m_stats_map; + return *this; +} + +BeesStats BeesStats::s_global; + +BeesStats +BeesStats::operator-(const BeesStats &that) const +{ + if (&that == this) return BeesStats(); + unique_lock this_lock(m_mutex); + BeesStats this_copy; + this_copy.m_stats_map = m_stats_map; + unique_lock that_lock(that.m_mutex); + BeesStats that_copy; + that_copy.m_stats_map = that.m_stats_map; + this_lock.unlock(); + that_lock.unlock(); + for (auto i : that.m_stats_map) { + if (i.second != 0) { + this_copy.at(i.first) -= i.second; + } + } + return this_copy; +} + +BeesRates +BeesStats::operator/(double d) const +{ + BeesRates rv; + unique_lock lock(m_mutex); + for (auto i : m_stats_map) { + rv.m_stats_map[i.first] = ceil(i.second / d * 1000) / 1000; + } + return rv; +} + +BeesStats::operator bool() const +{ + unique_lock lock(m_mutex); + for (auto i : m_stats_map) { + if (i.second != 0) { + return true; + } + } + return false; +} + +BeesTooLong::BeesTooLong(const string &s, double limit) : + m_limit(limit), + m_func([s](ostream &os) { os << s; }) +{ +} + 
+BeesTooLong::BeesTooLong(const func_type &func, double limit) : + m_limit(limit), + m_func(func) +{ +} + +void +BeesTooLong::check() const +{ + if (age() > m_limit) { + ostringstream oss; + m_func(oss); + BEESLOG("PERFORMANCE: " << *this << " sec: " << oss.str()); + } +} + +BeesTooLong::~BeesTooLong() +{ + check(); +} + +BeesTooLong & +BeesTooLong::operator=(const func_type &f) +{ + m_func = f; + return *this; +} + +void +bees_sync(int fd) +{ + Timer sync_timer; + BEESNOTE("syncing " << name_fd(fd)); + BEESTOOLONG("syncing " << name_fd(fd)); + DIE_IF_NON_ZERO(fsync(fd)); + BEESCOUNT(sync_count); + BEESCOUNTADD(sync_ms, sync_timer.age() * 1000); +} + +BeesStringFile::BeesStringFile(Fd dir_fd, string name, size_t limit) : + m_dir_fd(dir_fd), + m_name(name), + m_limit(limit) +{ + BEESLOG("BeesStringFile " << name_fd(m_dir_fd) << "/" << m_name << " max size " << pretty(m_limit)); +} + +string +BeesStringFile::read() +{ + BEESNOTE("opening " << m_name << " in " << name_fd(m_dir_fd)); + Fd fd(openat(m_dir_fd, m_name.c_str(), FLAGS_OPEN_FILE)); + if (!fd) { + return string(); + } + + BEESNOTE("sizing " << m_name << " in " << name_fd(m_dir_fd)); + Stat st(fd); + THROW_CHECK1(out_of_range, st.st_size, st.st_size > 0); + THROW_CHECK1(out_of_range, st.st_size, st.st_size < ranged_cast(m_limit)); + + BEESNOTE("reading " << m_name << " in " << name_fd(m_dir_fd)); + return read_string(fd, st.st_size); +} + +void +BeesStringFile::write(string contents) +{ + THROW_CHECK2(out_of_range, contents.size(), m_limit, contents.size() < m_limit); + auto tmpname = m_name + ".tmp"; + + BEESNOTE("unlinking " << tmpname << " in " << name_fd(m_dir_fd)); + unlinkat(m_dir_fd, tmpname.c_str(), 0); + // ignore error + + BEESNOTE("closing " << tmpname << " in " << name_fd(m_dir_fd)); + { + Fd ofd = openat_or_die(m_dir_fd, tmpname, FLAGS_CREATE_FILE, S_IRUSR | S_IWUSR); + BEESNOTE("writing " << tmpname << " in " << name_fd(m_dir_fd)); + write_or_die(ofd, contents); + BEESNOTE("fsyncing " << tmpname 
<< " in " << name_fd(m_dir_fd)); + DIE_IF_NON_ZERO(fsync(ofd)); + } + BEESNOTE("renaming " << tmpname << " to " << m_name << " in FD " << name_fd(m_dir_fd)); + BEESTRACE("renaming " << tmpname << " to " << m_name << " in FD " << name_fd(m_dir_fd)); + renameat_or_die(m_dir_fd, tmpname, m_dir_fd, m_name); +} + +void +BeesTempFile::create() +{ + // BEESLOG("creating temporary file in " << m_ctx->root_path()); + BEESNOTE("creating temporary file in " << m_ctx->root_path()); + BEESTOOLONG("creating temporary file in " << m_ctx->root_path()); + + DIE_IF_MINUS_ONE(m_fd = openat(m_ctx->root_fd(), ".", FLAGS_OPEN_TMPFILE, S_IRUSR | S_IWUSR)); + BEESCOUNT(tmp_create); + + // Can't reopen this file, so don't allow any resolves there + // Resolves won't work there anyway. There are lots of tempfiles + // and they're short-lived, so this ends up being just a memory leak + // m_ctx->blacklist_add(BeesFileId(m_fd)); + m_ctx->insert_root_ino(m_fd); + + // Set compression attribute + int flags = 0; + BEESTRACE("Getting FS_COMPR_FL on m_fd " << name_fd(m_fd) << " flags " << to_hex(flags)); + DIE_IF_MINUS_ONE(ioctl(m_fd, FS_IOC_GETFLAGS, &flags)); + flags |= FS_COMPR_FL; + BEESTRACE("Setting FS_COMPR_FL on m_fd " << name_fd(m_fd) << " flags " << to_hex(flags)); + DIE_IF_MINUS_ONE(ioctl(m_fd, FS_IOC_SETFLAGS, &flags)); + + // Always leave first block empty to avoid creating a file with an inline extent + m_end_offset = BLOCK_SIZE_CLONE; +} + +void +BeesTempFile::resize(off_t offset) +{ + BEESTOOLONG("Resizing temporary file to " << to_hex(offset)); + BEESNOTE("Resizing temporary file " << name_fd(m_fd) << " to " << to_hex(offset)); + BEESTRACE("Resizing temporary file " << name_fd(m_fd) << " to " << to_hex(offset)); + + // Ensure that file covers m_end_offset..offset + THROW_CHECK2(invalid_argument, m_end_offset, offset, m_end_offset < offset); + + // Truncate + DIE_IF_NON_ZERO(ftruncate(m_fd, offset)); + BEESCOUNT(tmp_resize); + + // Success + m_end_offset = offset; +} + 
+BeesTempFile::BeesTempFile(shared_ptr ctx) : + m_ctx(ctx), + m_end_offset(0) +{ + create(); +} + +void +BeesTempFile::realign() +{ + if (m_end_offset > BLOCK_SIZE_MAX_TEMP_FILE) { + BEESLOG("temporary file size " << to_hex(m_end_offset) << " > max " << BLOCK_SIZE_MAX_TEMP_FILE); + BEESCOUNT(tmp_trunc); + return create(); + } + if (m_end_offset & BLOCK_MASK_CLONE) { + // BEESTRACE("temporary file size " << to_hex(m_end_offset) << " not aligned"); + BEESCOUNT(tmp_realign); + return create(); + } + // OK as is + BEESCOUNT(tmp_aligned); +} + +BeesFileRange +BeesTempFile::make_hole(off_t count) +{ + THROW_CHECK1(invalid_argument, count, count > 0); + realign(); + + BEESTRACE("make hole at " << m_end_offset); + + auto end = m_end_offset + count; + BeesFileRange rv(m_fd, m_end_offset, end); + + resize(end); + + BEESTRACE("created temporary hole " << rv); + BEESCOUNT(tmp_hole); + return rv; +} + +BeesFileRange +BeesTempFile::make_copy(const BeesFileRange &src) +{ + BEESLOG("copy: " << src); + BEESNOTE("Copying " << src); + BEESTRACE("Copying " << src); + + THROW_CHECK1(invalid_argument, src, src.size() > 0); + + // FIXME: don't know where these come from, but we can't handle them. + // Grab a trace for the log. 
+ THROW_CHECK1(invalid_argument, src, src.size() < BLOCK_SIZE_MAX_TEMP_FILE); + + realign(); + + auto begin = m_end_offset; + auto end = m_end_offset + src.size(); + resize(end); + + BeesFileRange rv(m_fd, begin, end); + BEESTRACE("copying to: " << rv); + BEESNOTE("copying " << src << " to " << rv); + + auto src_p = src.begin(); + auto dst_p = begin; + + bool did_block_write = false; + while (dst_p < end) { + auto len = min(BLOCK_SIZE_CLONE, end - dst_p); + BeesBlockData bbd(src.fd(), src_p, len); + // Don't fill in holes + if (bbd.is_data_zero()) { + BEESCOUNT(tmp_block_zero); + } else { + BEESNOTE("copying " << src << " to " << rv << "\n" + "\tpwrite " << bbd << " to " << name_fd(m_fd) << " offset " << to_hex(dst_p) << " len " << len); + pwrite_or_die(m_fd, bbd.data().data(), len, dst_p); + did_block_write = true; + BEESCOUNT(tmp_block); + BEESCOUNTADD(tmp_bytes, len); + } + src_p += len; + dst_p += len; + } + + // We seem to get lockups without this! + if (did_block_write) { + bees_sync(m_fd); + } + + BEESCOUNT(tmp_copy); + return rv; +} + +int +bees_main(ArgList args) +{ + set_catch_explainer([&](string s) { + BEESLOG("\n\n*** EXCEPTION ***\n\t" << s << "\n***\n"); + BEESCOUNT(exception_caught); + }); + + BEESNOTE("main"); + BeesNote::set_name("main"); + + list> all_contexts; + shared_ptr bc; + + // Subscribe to fanotify events + bool did_subscription = false; + for (string arg : args) { + catch_all([&]() { + bc = make_shared(bc); + bc->set_root_path(arg); + did_subscription = true; + }); + } + + if (!did_subscription) { + BEESLOG("WARNING: no filesystems added"); + } + + BeesThread status_thread("status", [&]() { + bc->dump_status(); + }); + + // Now we just wait forever + bc->show_progress(); + + // That is all. 
+ return 0; +} + +int +main(int argc, const char **argv) +{ + if (argc < 2) { + do_cmd_help(argv); + return 2; + } + + ArgList args(argv + 1); + + int rv = 1; + catch_and_explain([&]() { + rv = bees_main(args); + }); + return rv; +} + +// instantiate templates for linkage ---------------------------------------- + +template class BeesStatTmpl; +template ostream & operator<<(ostream &os, const BeesStatTmpl &bs); + +template class BeesStatTmpl; +template ostream & operator<<(ostream &os, const BeesStatTmpl &bs); diff --git a/src/bees.h b/src/bees.h new file mode 100644 index 0000000..cc472ea --- /dev/null +++ b/src/bees.h @@ -0,0 +1,828 @@ +#ifndef BEES_H +#define BEES_H + +#include "crucible/bool.h" +#include "crucible/cache.h" +#include "crucible/chatter.h" +#include "crucible/error.h" +#include "crucible/extentwalker.h" +#include "crucible/fd.h" +#include "crucible/fs.h" +#include "crucible/lockset.h" +#include "crucible/time.h" +#include "crucible/timequeue.h" +#include "crucible/workqueue.h" + +#include +#include +#include +#include +#include +#include + +#include + +using namespace crucible; +using namespace std; + +// Block size for clone alignment (FIXME: should read this from /sys/fs/btrfs//clone_alignment) +const off_t BLOCK_SIZE_CLONE = 4096; + +// Block size for dedup checksums (arbitrary, but must be a multiple of clone alignment) +const off_t BLOCK_SIZE_SUMS = 4096; + +// Block size for memory allocations and file mappings (FIXME: should be CPU page size) +const off_t BLOCK_SIZE_MMAP = 4096; + +// Maximum length parameter to extent-same ioctl (FIXME: hardcoded in kernel) +const off_t BLOCK_SIZE_MAX_EXTENT_SAME = 4096 * 4096; + +// Maximum length of a compressed extent in bytes +const off_t BLOCK_SIZE_MAX_COMPRESSED_EXTENT = 128 * 1024; + +// Try to combine smaller extents into larger ones +const off_t BLOCK_SIZE_MIN_EXTENT_DEFRAG = BLOCK_SIZE_MAX_COMPRESSED_EXTENT; + +// Avoid splitting extents that are already too small +const off_t 
BLOCK_SIZE_MIN_EXTENT_SPLIT = BLOCK_SIZE_MAX_COMPRESSED_EXTENT; +// const off_t BLOCK_SIZE_MIN_EXTENT_SPLIT = 1024LL * 1024 * 1024 * 1024; + +// Maximum length of any extent in bytes +// except we've seen 1.03G extents... +// ...FIEMAP is slow and full of lies +const off_t BLOCK_SIZE_MAX_EXTENT = 128 * 1024 * 1024; + +// Masks, so we don't have to write "(BLOCK_SIZE_CLONE - 1)" everywhere +const off_t BLOCK_MASK_CLONE = BLOCK_SIZE_CLONE - 1; +const off_t BLOCK_MASK_SUMS = BLOCK_SIZE_SUMS - 1; +const off_t BLOCK_MASK_MMAP = BLOCK_SIZE_MMAP - 1; +const off_t BLOCK_MASK_MAX_COMPRESSED_EXTENT = BLOCK_SIZE_MAX_COMPRESSED_EXTENT * 2 - 1; + +// Maximum temporary file size +const off_t BLOCK_SIZE_MAX_TEMP_FILE = 1024 * 1024 * 1024; + +// Bucket size for hash table (size of one hash bucket) +const off_t BLOCK_SIZE_HASHTAB_BUCKET = BLOCK_SIZE_MMAP; + +// Extent size for hash table (since the nocow file attribute does not seem to be working today) +const off_t BLOCK_SIZE_HASHTAB_EXTENT = 16 * 1024 * 1024; + +// Bytes per second we want to flush (8GB every two hours) +const double BEES_FLUSH_RATE = 8.0 * 1024 * 1024 * 1024 / 7200.0; + +// Interval between writing non-hash-table things to disk (15 minutes) +const int BEES_WRITEBACK_INTERVAL = 900; + +// Statistics reports while scanning +const int BEES_STATS_INTERVAL = 3600; + +// Progress shows instantaneous rates and thread status +const int BEES_PROGRESS_INTERVAL = 3600; + +// Status is output every freakin second. Use a ramdisk. 
+const int BEES_STATUS_INTERVAL = 1; + +// Log warnings when an operation takes too long +const double BEES_TOO_LONG = 2.5; + +// Avoid any extent where LOGICAL_INO takes this long +const double BEES_TOXIC_DURATION = 9.9; + +// How long we should wait for new btrfs transactions +const double BEES_COMMIT_INTERVAL = 900; + +// How long between hash table histograms +const double BEES_HASH_TABLE_ANALYZE_INTERVAL = 3600; + +// Rate limiting of informational messages +const double BEES_INFO_RATE = 10.0; +const double BEES_INFO_BURST = 1.0; + +// After we have this many events queued, wait +const size_t BEES_MAX_QUEUE_SIZE = 1024; + +// Read this many items at a time in SEARCHv2 +const size_t BEES_MAX_CRAWL_SIZE = 4096; + +// If an extent has this many refs, pretend it does not exist +// to avoid a crippling btrfs performance bug +// The actual limit in LOGICAL_INO seems to be 2730, but let's leave a little headroom +const size_t BEES_MAX_EXTENT_REF_COUNT = 2560; + +// Flags +const int FLAGS_OPEN_COMMON = O_NOFOLLOW | O_NONBLOCK | O_CLOEXEC | O_NOATIME | O_LARGEFILE | O_NOCTTY; +const int FLAGS_OPEN_DIR = FLAGS_OPEN_COMMON | O_RDONLY | O_DIRECTORY; +const int FLAGS_OPEN_FILE = FLAGS_OPEN_COMMON | O_RDONLY; +const int FLAGS_OPEN_FILE_RW = FLAGS_OPEN_COMMON | O_RDWR; +const int FLAGS_OPEN_TMPFILE = FLAGS_OPEN_FILE_RW | O_TMPFILE | O_TRUNC | O_EXCL; +const int FLAGS_CREATE_FILE = FLAGS_OPEN_COMMON | O_WRONLY | O_CREAT | O_EXCL; + +// Fanotify allows O_APPEND, O_DSYNC, O_NOATIME, O_NONBLOCK, O_CLOEXEC, O_LARGEFILE +const int FLAGS_OPEN_FANOTIFY = O_RDWR | O_NOATIME | O_CLOEXEC | O_LARGEFILE; + +// macros ---------------------------------------- + +#define BEESLOG(x) do { Chatter c(BeesNote::get_name()); c << x; } while (0) +#define BEESLOGTRACE(x) do { BEESLOG(x); BeesTracer::trace_now(); } while (0) + +#define BEESTRACE(x) BeesTracer SRSLY_WTF_C(beesTracer_, __LINE__) ([&]() { BEESLOG(x); }) +#define BEESTOOLONG(x) BeesTooLong SRSLY_WTF_C(beesTooLong_, __LINE__) 
([&](ostream &_btl_os) { _btl_os << x; }) +#define BEESNOTE(x) BeesNote SRSLY_WTF_C(beesNote_, __LINE__) ([&](ostream &_btl_os) { _btl_os << x; }) +#define BEESINFO(x) do { \ + if (bees_info_rate_limit.is_ready()) { \ + bees_info_rate_limit.borrow(1); \ + Chatter c(BeesNote::get_name()); \ + c << x; \ + } \ +} while (0) + +#define BEESCOUNT(stat) do { \ + BeesStats::s_global.add_count(#stat); \ +} while (0) + +#define BEESCOUNTADD(stat, amount) do { \ + BeesStats::s_global.add_count(#stat, (amount)); \ +} while (0) + +// ---------------------------------------- + +template class BeesStatTmpl; +template ostream& operator<<(ostream &os, const BeesStatTmpl &bs); + +template +class BeesStatTmpl { + map m_stats_map; + mutable mutex m_mutex; + +public: + BeesStatTmpl() = default; + BeesStatTmpl(const BeesStatTmpl &that); + BeesStatTmpl &operator=(const BeesStatTmpl &that); + void add_count(string idx, size_t amount = 1); + T& at(string idx); + T at(string idx) const; + +friend ostream& operator<< <>(ostream &os, const BeesStatTmpl &bs); +friend class BeesStats; +}; + +using BeesRates = BeesStatTmpl; + +struct BeesStats : public BeesStatTmpl { + static BeesStats s_global; + + BeesStats operator-(const BeesStats &that) const; + BeesRates operator/(double d) const; + explicit operator bool() const; +}; + +class BeesContext; +class BeesBlockData; + +class BeesTracer { + function m_func; + BeesTracer *m_next_tracer = 0; + + thread_local static BeesTracer *s_next_tracer; +public: + BeesTracer(function f); + ~BeesTracer(); + static void trace_now(); +}; + +class BeesNote { + function m_func; + BeesNote *m_prev; + Timer m_timer; + string m_name; + + static mutex s_mutex; + static map s_status; + + thread_local static BeesNote *s_next; + thread_local static string s_name; + +public: + BeesNote(function f); + ~BeesNote(); + + using ThreadStatusMap = map; + + static ThreadStatusMap get_status(); + + static void set_name(const string &name); + static string get_name(); +}; + +// C++ 
threads dumbed down even further +class BeesThread { + string m_name; + Timer m_timer; + shared_ptr m_thread_ptr; + +public: + ~BeesThread(); + BeesThread(string name); + BeesThread(string name, function args); + void exec(function args); + void join(); + void set_name(const string &name); +}; + +class BeesFileId { + uint64_t m_root; + uint64_t m_ino; + +public: + uint64_t root() const { return m_root; } + uint64_t ino() const { return m_ino; } + bool operator<(const BeesFileId &that) const; + bool operator!=(const BeesFileId &that) const; + bool operator==(const BeesFileId &that) const; + operator bool() const; + BeesFileId(const BtrfsInodeOffsetRoot &bior); + BeesFileId(int fd); + BeesFileId(uint64_t root, uint64_t ino); + BeesFileId(); +}; + +ostream& operator<<(ostream &os, const BeesFileId &bfi); + +class BeesFileRange { +protected: + static mutex s_mutex; + mutable Fd m_fd; + mutable BeesFileId m_fid; + off_t m_begin, m_end; + mutable off_t m_file_size; + +public: + + BeesFileRange(); + BeesFileRange(Fd fd, off_t begin, off_t end); + BeesFileRange(const BeesFileId &fid, off_t begin, off_t end); + BeesFileRange(const BeesBlockData &bbd); + + operator BeesBlockData() const; + + bool operator<(const BeesFileRange &that) const; + bool operator==(const BeesFileRange &that) const; + bool operator!=(const BeesFileRange &that) const; + + bool empty() const; + bool is_same_file(const BeesFileRange &that) const; + bool overlaps(const BeesFileRange &that) const; + + // If file ranges overlap, extends this to include that. 
+ // Coalesce with empty bfr = non-empty bfr + bool coalesce(const BeesFileRange &that); + + // Remove that from this, creating 0, 1, or 2 new objects + pair subtract(const BeesFileRange &that) const; + + off_t begin() const { return m_begin; } + off_t end() const { return m_end; } + off_t size() const; + + // Lazy accessors + off_t file_size() const; + BeesFileId fid() const; + + // Get the fd if there is one + Fd fd() const; + + // Get the fd, opening it if necessary + Fd fd(const shared_ptr &ctx) const; + + BeesFileRange copy_closed() const; + + // Is it defined? + operator bool() const { return !!m_fd || m_fid; } + + // Make range larger + off_t grow_end(off_t delta); + off_t grow_begin(off_t delta); + +friend ostream & operator<<(ostream &os, const BeesFileRange &bfr); +}; + +class BeesAddress { +public: + using Type = uint64_t; +private: + Type m_addr = ZERO; + bool magic_check(uint64_t flags); +public: + + // Blocks with no physical address (not yet allocated, hole, or "other"). + // PREALLOC blocks have a physical address so they're not magic enough to be handled here. + // Compressed blocks have a physical address but it's two-dimensional. 
+ enum MagicValue { + ZERO, // BeesAddress uninitialized + DELALLOC, // delayed allocation + HOLE, // no extent present, no space allocated + UNUSABLE, // inline extent or unrecognized FIEMAP flags + LAST, // all further values are non-magic + }; + + BeesAddress(Type addr = ZERO) : m_addr(addr) {} + BeesAddress(MagicValue addr) : m_addr(addr) {} + BeesAddress& operator=(const BeesAddress &that) = default; + operator Type() const { return m_addr; } + bool operator==(const BeesAddress &that) const; + bool operator==(const MagicValue that) const { return *this == BeesAddress(that); } + bool operator!=(const BeesAddress &that) const { return !(*this == that); } + bool operator!=(const MagicValue that) const { return *this != BeesAddress(that); } + bool operator<(const BeesAddress &that) const; + + static const Type c_offset_min = 1; + static const Type c_offset_max = BLOCK_SIZE_MAX_COMPRESSED_EXTENT / BLOCK_SIZE_CLONE; + + // if this isn't 0x3f we will have problems + static const Type c_offset_mask = (c_offset_max - 1) | (c_offset_max); + + static const Type c_compressed_mask = 1 << 11; + static const Type c_eof_mask = 1 << 10; + static const Type c_toxic_mask = 1 << 9; + + static const Type c_all_mask = c_compressed_mask | c_eof_mask | c_offset_mask | c_toxic_mask; + + bool is_compressed() const { return m_addr >= MagicValue::LAST && (m_addr & c_compressed_mask); } + bool has_compressed_offset() const { return m_addr >= MagicValue::LAST && (m_addr & c_compressed_mask) && (m_addr & c_offset_mask); } + bool is_toxic() const { return m_addr >= MagicValue::LAST && (m_addr & c_toxic_mask); } + bool is_unaligned_eof() const { return m_addr >= MagicValue::LAST && (m_addr & c_eof_mask); } + bool is_magic() const { return m_addr < MagicValue::LAST; } + + Type get_compressed_offset() const; + Type get_physical_or_zero() const; + + void set_toxic(); + + BeesAddress(int fd, off_t offset); + BeesAddress(int fd, off_t offset, shared_ptr ctx); + BeesAddress(const Extent &e, off_t 
offset); +}; + +ostream & operator<<(ostream &os, const BeesAddress &ba); + +class BeesStringFile { + Fd m_dir_fd; + string m_name; + size_t m_limit; + +public: + BeesStringFile(Fd dir_fd, string name, size_t limit = 1024 * 1024); + string read(); + void write(string contents); +}; + +class BeesHashTable { + shared_ptr m_ctx; +public: + using HashType = uint64_t; + using AddrType = uint64_t; + + struct Cell { + HashType e_hash; + AddrType e_addr; + Cell(const Cell &) = default; + Cell(HashType hash, AddrType addr) : e_hash(hash), e_addr(addr) { } + bool operator==(const Cell &e) const { return tie(e_hash, e_addr) == tie(e.e_hash, e.e_addr); } + bool operator!=(const Cell &e) const { return tie(e_hash, e_addr) != tie(e.e_hash, e.e_addr); } + bool operator<(const Cell &e) const { return tie(e_hash, e_addr) < tie(e.e_hash, e.e_addr); } + } __attribute__((packed)); + +private: + static const uint64_t c_cells_per_bucket = BLOCK_SIZE_HASHTAB_BUCKET / sizeof(Cell); + static const uint64_t c_buckets_per_extent = BLOCK_SIZE_HASHTAB_EXTENT / BLOCK_SIZE_HASHTAB_BUCKET; + +public: + union Bucket { + Cell p_cells[c_cells_per_bucket]; + uint8_t p_byte[BLOCK_SIZE_HASHTAB_BUCKET]; + } __attribute__((packed)); + + union Extent { + Bucket p_buckets[BLOCK_SIZE_HASHTAB_EXTENT / BLOCK_SIZE_HASHTAB_BUCKET]; + uint8_t p_byte[BLOCK_SIZE_HASHTAB_EXTENT]; + } __attribute__((packed)); + + BeesHashTable(shared_ptr ctx, string filename); + ~BeesHashTable(); + + vector find_cell(HashType hash); + bool push_random_hash_addr(HashType hash, AddrType addr); + void erase_hash_addr(HashType hash, AddrType addr); + bool push_front_hash_addr(HashType hash, AddrType addr); + + void set_shared(bool shared); + +private: + string m_filename; + Fd m_fd; + uint64_t m_size; + union { + void *m_void_ptr; // Save some casting + uint8_t *m_byte_ptr; // for pointer arithmetic + Cell *m_cell_ptr; // pointer to one table cell (entry) + Bucket *m_bucket_ptr; // all cells in one LRU unit + Extent *m_extent_ptr; // 
all buckets in one I/O unit + }; + union { + void *m_void_ptr_end; + uint8_t *m_byte_ptr_end; + Cell *m_cell_ptr_end; + Bucket *m_bucket_ptr_end; + Extent *m_extent_ptr_end; + }; + uint64_t m_buckets; + uint64_t m_extents; + uint64_t m_cells; + set m_buckets_dirty; + set m_buckets_missing; + BeesThread m_writeback_thread; + BeesThread m_prefetch_thread; + RateLimiter m_flush_rate_limit; + RateLimiter m_prefetch_rate_limit; + mutex m_extent_mutex; + mutex m_bucket_mutex; + condition_variable m_condvar; + set m_toxic_hashes; + BeesStringFile m_stats_file; + + LockSet m_extent_lock_set; + + DefaultBool m_shared; + + void writeback_loop(); + void prefetch_loop(); + void try_mmap_flags(int flags); + pair get_cell_range(HashType hash); + pair get_extent_range(HashType hash); + void fetch_missing_extent(HashType hash); + void set_extent_dirty(HashType hash); + void flush_dirty_extents(); + bool is_toxic_hash(HashType h) const; + + bool using_shared_map() const { return false; } + + BeesHashTable(const BeesHashTable &) = delete; + BeesHashTable &operator=(const BeesHashTable &) = delete; +}; + +ostream &operator<<(ostream &os, const BeesHashTable::Cell &bhte); + +struct BeesCrawlState { + uint64_t m_root; + uint64_t m_objectid; + uint64_t m_offset; + uint64_t m_min_transid; + uint64_t m_max_transid; + time_t m_started; + BeesCrawlState(); + bool operator<(const BeesCrawlState &that) const; +}; + +class BeesCrawl { + shared_ptr m_ctx; + + mutex m_mutex; + set m_extents; + DefaultBool m_deferred; + + mutex m_state_mutex; + BeesCrawlState m_state; + + bool fetch_extents(); + void fetch_extents_harder(); + bool next_transid(); + +public: + BeesCrawl(shared_ptr ctx, BeesCrawlState initial_state); + BeesFileRange peek_front(); + BeesFileRange pop_front(); + BeesCrawlState get_state(); + void set_state(const BeesCrawlState &bcs); +}; + +class BeesRoots { + shared_ptr m_ctx; + + BeesStringFile m_crawl_state_file; + BeesCrawlState m_crawl_current; + map> m_root_crawl_map; + mutex 
m_mutex; + condition_variable m_condvar; + DefaultBool m_crawl_dirty; + Timer m_crawl_timer; + BeesThread m_crawl_thread; + BeesThread m_writeback_thread; + + void insert_new_crawl(); + void insert_root(const BeesCrawlState &bcs); + Fd open_root_nocache(uint64_t root); + Fd open_root_ino_nocache(uint64_t root, uint64_t ino); + uint64_t transid_min(); + uint64_t transid_max(); + void state_load(); + void state_save(); + void crawl_roots(); + string crawl_state_filename() const; + BeesCrawlState crawl_state_get(uint64_t root); + void crawl_state_set_dirty(); + void crawl_state_erase(const BeesCrawlState &bcs); + void crawl_thread(); + void writeback_thread(); + uint64_t next_root(uint64_t root = 0); + void current_state_set(const BeesCrawlState &bcs); + +friend class BeesFdCache; +friend class BeesCrawl; + +public: + BeesRoots(shared_ptr ctx); + Fd open_root(uint64_t root); + Fd open_root_ino(uint64_t root, uint64_t ino); + Fd open_root_ino(const BeesFileId &bfi) { return open_root_ino(bfi.root(), bfi.ino()); } +}; + +struct BeesHash { + using Type = uint64_t; + + BeesHash() : m_hash(0) { } + BeesHash(Type that) : m_hash(that) { } + operator Type() const { return m_hash; } + BeesHash& operator=(const Type that) { m_hash = that; return *this; } +private: + Type m_hash; + +}; + +ostream & operator<<(ostream &os, const BeesHash &bh); + +class BeesBlockData { + using Blob = vector; + + mutable Fd m_fd; + off_t m_offset; + off_t m_length; + mutable BeesAddress m_addr; + mutable Blob m_data; + mutable BeesHash m_hash; + mutable DefaultBool m_hash_done; + +public: + // Constructor with the immutable fields + BeesBlockData(Fd fd, off_t offset, size_t read_length = BLOCK_SIZE_SUMS); + BeesBlockData(); + + // Non-lazy accessors + Fd fd() const { return m_fd; } + + // Renaming + off_t begin() const { return m_offset; } + off_t end() const { return m_offset + m_length; } + off_t size() const { return m_length; } + bool empty() const { return !m_length; } + + // Lazy accessors 
may modify const things + const Blob &data() const; + BeesHash hash() const; + BeesAddress addr() const; + bool is_data_zero() const; + bool is_data_equal(const BeesBlockData &that) const; + + // Setters + BeesBlockData &addr(const BeesAddress &a); + +friend ostream &operator<<(ostream &, const BeesBlockData &); +}; + +class BeesRangePair : public pair { +public: + BeesRangePair(const BeesFileRange &src, const BeesFileRange &dst); + bool grow(shared_ptr ctx, bool constrained); + BeesRangePair copy_closed() const; + bool operator<(const BeesRangePair &that) const; +friend ostream & operator<<(ostream &os, const BeesRangePair &brp); +}; + +class BeesWorkQueueBase { + string m_name; + +protected: + static mutex s_mutex; + static set s_all_workers; + +public: + virtual ~BeesWorkQueueBase(); + BeesWorkQueueBase(const string &name); + + string name() const; + void name(const string &new_name); + + virtual size_t active_size() const = 0; + virtual list peek_active(size_t count) const = 0; + + static void for_each_work_queue(function f); +}; + +template +class BeesWorkQueue : public BeesWorkQueueBase { + WorkQueue m_active_queue; + +public: + BeesWorkQueue(const string &name); + ~BeesWorkQueue(); + void push_active(const Task &task, size_t limit); + void push_active(const Task &task); + + size_t active_size() const override; + list peek_active(size_t count) const override; + + Task pop(); +}; + +class BeesTempFile { + shared_ptr m_ctx; + Fd m_fd; + off_t m_end_offset; + + void create(); + void realign(); + void resize(off_t new_end_offset); + +public: + BeesTempFile(shared_ptr ctx); + BeesFileRange make_hole(off_t count); + BeesFileRange make_copy(const BeesFileRange &src); +}; + +class BeesFdCache { + LRUCache, uint64_t> m_root_cache; + LRUCache, uint64_t, uint64_t> m_file_cache; + Timer m_root_cache_timer; + +public: + BeesFdCache(); + Fd open_root(shared_ptr ctx, uint64_t root); + Fd open_root_ino(shared_ptr ctx, uint64_t root, uint64_t ino); + void 
insert_root_ino(shared_ptr ctx, Fd fd); +}; + +struct BeesResolveAddrResult { + BeesResolveAddrResult(); + vector m_biors; + DefaultBool m_is_toxic; + bool is_toxic() const { return m_is_toxic; } +}; + +class BeesContext : public enable_shared_from_this { + shared_ptr m_parent_ctx; + + Fd m_home_fd; + + shared_ptr m_fd_cache; + shared_ptr m_hash_table; + shared_ptr m_roots; + + map> m_tmpfiles; + + LRUCache m_resolve_cache; + + string m_root_path; + Fd m_root_fd; + string m_root_uuid; + + mutable mutex m_blacklist_mutex; + set m_blacklist; + + string m_uuid; + + Timer m_total_timer; + + void set_root_fd(Fd fd); + + BeesResolveAddrResult resolve_addr_uncached(BeesAddress addr); + + BeesFileRange scan_one_extent(const BeesFileRange &bfr, const Extent &e); + void rewrite_file_range(const BeesFileRange &bfr); + +public: + BeesContext(shared_ptr parent_ctx = nullptr); + + void set_root_path(string path); + + Fd root_fd() const { return m_root_fd; } + Fd home_fd() const { return m_home_fd; } + string root_path() const { return m_root_path; } + string root_uuid() const { return m_root_uuid; } + + BeesFileRange scan_forward(const BeesFileRange &bfr); + + BeesRangePair dup_extent(const BeesFileRange &src); + bool dedup(const BeesRangePair &brp); + + void blacklist_add(const BeesFileId &fid); + bool is_blacklisted(const BeesFileId &fid) const; + + BeesResolveAddrResult resolve_addr(BeesAddress addr); + void invalidate_addr(BeesAddress addr); + + void dump_status(); + void show_progress(); + + shared_ptr fd_cache(); + shared_ptr hash_table(); + shared_ptr roots(); + shared_ptr tmpfile(); + + const Timer &total_timer() const { return m_total_timer; } + + // TODO: move the rest of the FD cache methods here + void insert_root_ino(Fd fd); +}; + +class BeesResolver { + shared_ptr m_ctx; + BeesAddress m_addr; + vector m_biors; + set m_ranges; + unsigned m_bior_count; + + // We found matching data, so we can dedup + DefaultBool m_found_data; + + // We found matching data, so we 
*did* dedup + DefaultBool m_found_dup; + + // We found matching hash, so the hash table is still correct + DefaultBool m_found_hash; + + // We found matching physical address, so the hash table isn't totally wrong + DefaultBool m_found_addr; + + // We found matching physical address, but data did not match + DefaultBool m_wrong_data; + + // The whole thing is a placebo to avoid crippling btrfs performance bugs + DefaultBool m_is_toxic; + + BeesFileRange chase_extent_ref(const BtrfsInodeOffsetRoot &bior, BeesBlockData &needle_bbd); + BeesBlockData adjust_offset(const BeesFileRange &haystack, const BeesBlockData &needle); + void find_matches(bool just_one, BeesBlockData &bbd); + + // FIXME: Do we need these? We probably always have at least one BBD + BeesFileRange chase_extent_ref(const BtrfsInodeOffsetRoot &bior, BeesHash hash); + BeesBlockData adjust_offset(const BeesFileRange &haystack, bool inexact, BeesHash needle); + void find_matches(bool just_one, BeesHash hash); + +public: + BeesResolver(shared_ptr ctx, BeesAddress addr); + BeesAddress addr(BeesAddress new_addr); + + // visitor returns true to stop loop, false to continue + bool for_each_extent_ref(BeesBlockData bbd, function visitor); + + set find_all_matches(BeesBlockData &bbd); + set find_all_matches(BeesHash hash); + + // TODO: Replace these with "for_each_extent_ref" + BeesFileRange find_one_match(BeesBlockData &bbd); + BeesFileRange find_one_match(BeesHash hash); + + void replace_src(const BeesFileRange &src_bfr); + BeesFileRange replace_dst(const BeesFileRange &dst_bfr); + + bool found_addr() const { return m_found_addr; } + bool found_data() const { return m_found_data; } + bool found_dup() const { return m_found_dup; } + bool found_hash() const { return m_found_hash; } + bool is_toxic() const { return m_is_toxic; } + size_t count() const { return m_bior_count; } + BeesAddress addr() const { return m_addr; } + + bool operator<(const BeesResolver &that) const; +}; + +class BeesTooLong : public Timer { 
+ using func_type = function; + double m_limit; + func_type m_func; + +public: + BeesTooLong(const func_type &func = [](ostream &os) { os << __PRETTY_FUNCTION__; }, double limit = BEES_TOO_LONG); + BeesTooLong(const string &s, double limit = BEES_TOO_LONG); + BeesTooLong &operator=(const func_type &s); + ~BeesTooLong(); + void check() const; + +}; + +// And now, a giant pile of extern declarations +string pretty(double d); +extern RateLimiter bees_info_rate_limit; +void bees_sync(int fd); +string format_time(time_t t); + +#endif diff --git a/src/fiemap.cc b/src/fiemap.cc new file mode 100644 index 0000000..247de39 --- /dev/null +++ b/src/fiemap.cc @@ -0,0 +1,52 @@ +#include "crucible/fd.h" +#include "crucible/fs.h" +#include "crucible/error.h" +#include "crucible/string.h" + +#include + +#include +#include +#include + +using namespace crucible; +using namespace std; + +int +main(int argc, char **argv) +{ + catch_all([&]() { + THROW_CHECK1(invalid_argument, argc, argc > 1); + string filename = argv[1]; + + + cout << "File: " << filename << endl; + Fd fd = open_or_die(filename, O_RDONLY); + Fiemap fm; + fm.m_max_count = 100; + if (argc > 2) { fm.fm_start = stoull(argv[2], nullptr, 0); } + if (argc > 3) { fm.fm_length = stoull(argv[3], nullptr, 0); } + if (argc > 4) { fm.fm_flags = stoull(argv[4], nullptr, 0); } + fm.fm_length = min(fm.fm_length, FIEMAP_MAX_OFFSET - fm.fm_start); + uint64_t stop_at = fm.fm_start + fm.fm_length; + uint64_t last_byte = fm.fm_start; + do { + fm.do_ioctl(fd); + // cerr << fm; + uint64_t last_logical = FIEMAP_MAX_OFFSET; + for (auto &extent : fm.m_extents) { + if (extent.fe_logical > last_byte) { + cout << "Log " << to_hex(last_byte) << ".." << to_hex(extent.fe_logical) << " Hole" << endl; + } + cout << "Log " << to_hex(extent.fe_logical) << ".." << to_hex(extent.fe_logical + extent.fe_length) + << " Phy " << to_hex(extent.fe_physical) << ".." 
<< to_hex(extent.fe_physical + extent.fe_length) + << " Flags " << fiemap_extent_flags_ntoa(extent.fe_flags) << endl; + last_logical = extent.fe_logical + extent.fe_length; + last_byte = last_logical; + } + fm.fm_start = last_logical; + } while (fm.fm_start < stop_at); + }); + exit(EXIT_SUCCESS); +} + diff --git a/src/fiewalk.cc b/src/fiewalk.cc new file mode 100644 index 0000000..91b906b --- /dev/null +++ b/src/fiewalk.cc @@ -0,0 +1,40 @@ +#include "crucible/extentwalker.h" +#include "crucible/error.h" +#include "crucible/string.h" + +#include + +#include +#include + +using namespace crucible; +using namespace std; + +int +main(int argc, char **argv) +{ + catch_all([&]() { + THROW_CHECK1(invalid_argument, argc, argc > 1); + string filename = argv[1]; + + cout << "File: " << filename << endl; + Fd fd = open_or_die(filename, O_RDONLY); + BtrfsExtentWalker ew(fd); + off_t pos = 0; + if (argc > 2) { pos = stoull(argv[2], nullptr, 0); } + ew.seek(pos); + do { + // cout << "\n\n>>>" << ew.current() << "<<<\n\n" << endl; + cout << ew.current() << endl; + } while (ew.next()); +#if 0 + cout << "\n\n\nAnd now, backwards...\n\n\n" << endl; + do { + cout << "\n\n>>>" << ew.current() << "<<<\n\n" << endl; + } while (ew.prev()); + cout << "\n\n\nDone!\n\n\n" << endl; +#endif + }); + exit(EXIT_SUCCESS); +} + diff --git a/test/.gitignore b/test/.gitignore new file mode 100644 index 0000000..70dadc1 --- /dev/null +++ b/test/.gitignore @@ -0,0 +1,5 @@ +* +!Makefile +!*.c +!*.cc +!*.h diff --git a/test/Makefile b/test/Makefile new file mode 100644 index 0000000..1170161 --- /dev/null +++ b/test/Makefile @@ -0,0 +1,36 @@ +PROGRAMS = \ + chatter \ + crc64 \ + execpipe \ + fd \ + interp \ + limits \ + path \ + process \ + +all: test + +test: $(PROGRAMS) + set -x; for prog in $(PROGRAMS); do ./$$prog || exit 1; done + +include ../makeflags + +LIBS = -lcrucible +LDFLAGS = -L../lib -Wl,-rpath=$(shell realpath ../lib) + +depends.mk: *.cc + for x in *.cc; do $(CXX) $(CXXFLAGS) -M "$$x"; 
done >> depends.mk.new + mv -fv depends.mk.new depends.mk + +-include depends.mk + +%.o: %.cc %.h ../makeflags + -echo "Implicit rule %.o: %.cc" >&2 + $(CXX) $(CXXFLAGS) -o "$@" -c "$<" + +%: %.o ../makeflags + -echo "Implicit rule %: %.o" >&2 + $(CXX) $(CXXFLAGS) -o "$@" "$<" $(LDFLAGS) $(LIBS) + +clean: + -rm -fv *.o diff --git a/test/chatter.cc b/test/chatter.cc new file mode 100644 index 0000000..f838b5e --- /dev/null +++ b/test/chatter.cc @@ -0,0 +1,49 @@ +#include "tests.h" + +#include "crucible/chatter.h" + +#include +#include +#include +#include + +#include + +using namespace crucible; + +static +void +test_chatter_one() +{ + cerr << endl; + CHATTER("simple chatter case"); +} + +static +void +test_chatter_two() +{ + cerr << endl; + CHATTER("two lines\nof chatter"); +} + +static +void +test_chatter_three() +{ + cerr << endl; + Chatter c("tct"); + c << "More complicated"; + c << "\ncase with\n"; + c << "some \\ns"; +} + +int +main(int, char**) +{ + RUN_A_TEST(test_chatter_one()); + RUN_A_TEST(test_chatter_two()); + RUN_A_TEST(test_chatter_three()); + + exit(EXIT_SUCCESS); +} diff --git a/test/crc64.cc b/test/crc64.cc new file mode 100644 index 0000000..df3c674 --- /dev/null +++ b/test/crc64.cc @@ -0,0 +1,39 @@ +#include "tests.h" +#include "crucible/crc64.h" + +#include + +using namespace crucible; + +static +void +test_getcrc64_strings() +{ + assert(Digest::CRC::crc64("John") == 5942451273432301568); + assert(Digest::CRC::crc64("Paul") == 5838402100630913024); + assert(Digest::CRC::crc64("George") == 6714394476893704192); + assert(Digest::CRC::crc64("Ringo") == 6038837226071130112); + assert(Digest::CRC::crc64("") == 0); + assert(Digest::CRC::crc64("\377\277\300\200") == 15615382887346470912ULL); +} + +static +void +test_getcrc64_byte_arrays() +{ + assert(Digest::CRC::crc64("John", 4) == 5942451273432301568); + assert(Digest::CRC::crc64("Paul", 4) == 5838402100630913024); + assert(Digest::CRC::crc64("George", 6) == 6714394476893704192); + 
assert(Digest::CRC::crc64("Ringo", 5) == 6038837226071130112); + assert(Digest::CRC::crc64("", 0) == 0); + assert(Digest::CRC::crc64("\377\277\300\200", 4) == 15615382887346470912ULL); +} + +int +main(int, char**) +{ + RUN_A_TEST(test_getcrc64_strings()); + RUN_A_TEST(test_getcrc64_byte_arrays()); + + exit(EXIT_SUCCESS); +} diff --git a/test/execpipe.cc b/test/execpipe.cc new file mode 100644 index 0000000..e28886f --- /dev/null +++ b/test/execpipe.cc @@ -0,0 +1,64 @@ +#include "tests.h" + +#include "crucible/execpipe.h" + +#include +#include +#include +#include +#include + +#include + +using namespace crucible; +using namespace std; + +#if 1 // Needs rework +static inline +void +test_hello_world() +{ + // alarm(9); + Fd fd = popen([]() { return system("echo Hello, World!"); }); + char buf[1024]; + size_t rv = -1; + read_partial_or_die(fd, buf, rv); + assert(rv > 0); + string b(buf, buf + rv - 1); + // cerr << "hello_world says: '" << b << "'" << endl; + assert(b == "Hello, World!"); +} + +static inline +void +test_read_limit(size_t limit = 4096) +{ + alarm(9); + Fd fd = popen([]() { return system("yes Hello!"); }); + try { + string b = read_all(fd, limit); + } catch (out_of_range &re) { + return; + } + assert(!"no exception thrown by read_all"); +} +#endif + +namespace crucible { + extern bool assert_no_leaked_fds(); +}; + +int +main(int, char**) +{ +#if 1 + RUN_A_TEST(test_hello_world()); + assert(assert_no_leaked_fds()); + RUN_A_TEST(test_read_limit(4095)); + RUN_A_TEST(test_read_limit(4096)); + RUN_A_TEST(test_read_limit(4097)); + assert(assert_no_leaked_fds()); +#endif + + exit(EXIT_SUCCESS); +} diff --git a/test/fd.cc b/test/fd.cc new file mode 100644 index 0000000..de89a8d --- /dev/null +++ b/test/fd.cc @@ -0,0 +1,393 @@ +// TEST DATA DO NOT REMOVE THIS LINE + +#include "tests.h" + +#include "crucible/chatter.h" +#include "crucible/error.h" +#include "crucible/fd.h" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include 
+#include + +using namespace crucible; + +static +void +test_default_constructor_and_destructor() +{ + Fd f; +} + +static +void +test_basic_read() +{ + Fd f = open_or_die("fd.cc"); + const char test_string[] = "// TEST DATA DO NOT REMOVE THIS LINE"; + const int test_string_len = sizeof(test_string) - 1; + char read_buf[test_string_len]; + read_or_die(f, read_buf); + assert(!strncmp(read_buf, test_string, test_string_len)); + f->close(); +} + +static +void +test_create_read_write() +{ + Fd f = open_or_die("tmp/fd-read-write", O_CREAT | O_RDWR | O_TRUNC); + + struct test_str_out { + int i; + float f; + } tso = { + .i = 5, + .f = 3.14159, + }, tsi = { + .i = 0, + .f = 0, + }; + + size_t bytes_read = 0; + read_partial_or_die(f, tsi, bytes_read); + assert(bytes_read == 0); + assert(tsi.i == 0); + assert(tsi.f == 0); + + pwrite_or_die(f, tso, 1024); + pread_or_die(f, tsi, 1024); + assert(!memcmp(&tsi, &tso, sizeof(tsi))); + +} + +static +void +test_flags() +{ +#define FLAG_TEST(x) cerr << #x << ": " << flush; cerr << x << endl; + + FLAG_TEST(o_flags_ntoa(O_RDONLY)); + FLAG_TEST(o_flags_ntoa(O_WRONLY)); + FLAG_TEST(o_flags_ntoa(O_RDWR)); + FLAG_TEST(o_flags_ntoa(O_CREAT|O_WRONLY|O_TRUNC)); + FLAG_TEST(o_mode_ntoa(0001)); + FLAG_TEST(o_mode_ntoa(0002)); + FLAG_TEST(o_mode_ntoa(0004)); + FLAG_TEST(o_mode_ntoa(0010)); + FLAG_TEST(o_mode_ntoa(0020)); + FLAG_TEST(o_mode_ntoa(0040)); + FLAG_TEST(o_mode_ntoa(0100)); + FLAG_TEST(o_mode_ntoa(0200)); + FLAG_TEST(o_mode_ntoa(0400)); + FLAG_TEST(o_mode_ntoa(01000)); + FLAG_TEST(o_mode_ntoa(02000)); + FLAG_TEST(o_mode_ntoa(04000)); + FLAG_TEST(o_mode_ntoa(010000)); + FLAG_TEST(o_mode_ntoa(020000)); + FLAG_TEST(o_mode_ntoa(040000)); + FLAG_TEST(o_mode_ntoa(0777)); + FLAG_TEST(o_mode_ntoa(02775)); + FLAG_TEST(o_mode_ntoa(01777)); + FLAG_TEST(o_mode_ntoa(022)); + FLAG_TEST(o_mode_ntoa(077)); +} + +// Test code +namespace crucible { + extern bool assert_no_leaked_fds(); +}; + +struct FdChecker { + ~FdChecker() + { + 
assert_no_leaked_fds(); + } +}; + +static FdChecker fd_destructor_check; + +static inline void assert_is_closed(int i, bool closed = true) +{ + pid_t self_pid = getpid(); + char buf[1024]; + snprintf(buf, sizeof(buf), "/proc/%d/fd/%d", self_pid, i); + assert(access(buf, F_OK) ? closed : !closed); +} + +static void test_construct_destroy() +{ + int i; + { + Fd fd(open("fd.cc", O_RDONLY)); + i = fd; + } + assert_is_closed(i); +} + +static void test_construct_copy() +{ + int i; + { + Fd fd(open("fd.cc", O_RDONLY)); + i = fd; + Fd fd2(fd); + int j = fd2; + assert(i == j); + } + assert_is_closed(i); +} + +static void test_construct_default_assign() +{ + int i; + { + i = open("fd.cc", O_RDONLY); + Fd fd; + fd = i; + Fd fd2; + fd2 = fd; + int j = fd2; + assert(i == j); + } + assert_is_closed(i); +} + +static void test_assign_int() +{ + int i; + { + i = open("fd.cc", O_RDONLY); + Fd fd; + fd = i; + Fd fd2; + fd2 = i; + int j = fd2; + assert(i == j); + } + assert_is_closed(i); +} + +static void test_assign_int_survives_scope() +{ + int i, j; + { + Fd fd2; + { + i = open("fd.cc", O_RDONLY); + Fd fd; + fd = i; + fd2 = i; + j = fd2; + assert(i == j); + } + assert_is_closed(i, false); + } + assert_is_closed(i, true); +} + +static void test_assign_int_close() +{ + int i; + { + Fd fd(open("fd.cc", O_RDONLY)); + i = fd; + assert_is_closed(i, false); + fd = -1; + assert_is_closed(i, true); + int j = fd; + assert(j == -1); + // Bonus conversion operator tests + assert(fd == -1); + // Chasing a closed ref now triggers an exception + assert(catch_all([&]() { return fd->get_fd() == -1; })); + } + assert_is_closed(i, true); +} + +static void test_assign_int_close_2() +{ + int i; + { + Fd fd(open("fd.cc", O_RDONLY)); + i = fd; + assert_is_closed(i, false); + // -2 is null... 
+ fd = -2; + assert_is_closed(i, true); + int j = fd; + // ...but it will come back as -1 + assert(j == -1); + // Bonus conversion operator tests + assert(fd == -1); + // Chasing a closed ref now triggers an exception + assert(catch_all([&]() { return fd->get_fd() == -1; })); + } + assert_is_closed(i, true); +} + +static void test_map() +{ + int a, b, c; + map fds; + { + Fd fd_dot_cc = open("fd.cc", O_RDONLY); + a = fd_dot_cc; + assert_is_closed(a, false); + Fd fd_tests_h = open("tests.h", O_RDONLY); + b = fd_tests_h; + assert_is_closed(b, false); + Fd fd_makefile = open("Makefile", O_RDONLY); + c = fd_makefile; + assert_is_closed(c, false); + fds["fd.cc"] = fd_dot_cc; + fds.insert(make_pair("tests.h", fd_tests_h)); + int j = fds["Makefile"]; + assert(j == -1); + fds["Makefile"] = fd_makefile; + assert_is_closed(a, false); + assert_is_closed(b, false); + assert_is_closed(c, false); + } + assert_is_closed(a, false); + assert_is_closed(b, false); + assert_is_closed(c, false); +} + +static void test_close_method() +{ + Fd fd = open("fd.cc", O_RDONLY); + int i = fd; + assert_is_closed(i, false); + fd->close(); + assert_is_closed(i, true); +} + +static void test_shared_close_method() +{ + Fd fd = open("fd.cc", O_RDONLY); + int i = fd; + Fd fd2 = fd; + assert_is_closed(i, false); + assert_is_closed(fd2, false); + fd->close(); + assert_is_closed(i, true); + assert_is_closed(fd2, true); +} + +struct DerivedFdResource : public Fd::resource_type { + string m_name; + DerivedFdResource(string name) : m_name(name) { + Fd::resource_type::operator=(open(name.c_str(), O_RDONLY)); + assert_is_closed(this->get_fd(), false); + } + const string &name() const { return m_name; } +}; + +struct DerivedFd : public Fd { + using resource_type = DerivedFdResource; + DerivedFd(string name) { + shared_ptr ptr = make_shared(name); + Fd::operator=(static_pointer_cast(ptr)); + } + shared_ptr operator->() const { + shared_ptr rv = cast(); + THROW_CHECK1(out_of_range, rv, rv); + return rv; + } 
+private: + DerivedFd() = default; +}; + +static void test_derived_resource_type() +{ + DerivedFd fd("fd.cc"); + assert_is_closed(fd, false); + assert(fd->name() == "fd.cc"); + DerivedFd fd3(fd); + assert_is_closed(fd, false); + assert_is_closed(fd3, false); + Fd fd2(fd3); + assert_is_closed(fd, false); + assert_is_closed(fd2, false); + assert_is_closed(fd3, false); +} + +static void test_derived_cast() +{ + DerivedFd fd("fd.cc"); + assert_is_closed(fd, false); + Fd fd2(fd); + Fd fd3 = open("fd.cc", O_RDONLY); + assert(fd->name() == "fd.cc"); + assert(fd.cast()); + assert(fd.cast()); + assert(fd2.cast()); + assert(fd2.cast()); + assert(fd3.cast()); + assert(catch_all([&](){ assert(!fd3.cast()); } )); +} + +static void test_derived_map() +{ + int a, b, c; + map fds; + { + DerivedFd fd_dot_cc("fd.cc"); + a = fd_dot_cc; + assert_is_closed(a, false); + Fd fd_tests_h = open("tests.h", O_RDONLY); + b = fd_tests_h; + assert_is_closed(b, false); + DerivedFd fd_makefile("Makefile"); + c = fd_makefile; + assert_is_closed(c, false); + fds["fd.cc"] = fd_dot_cc; + fds.insert(make_pair("tests.h", fd_tests_h)); + int j = fds["Makefile"]; + assert(j == -1); + fds["Makefile"] = fd_makefile; + assert_is_closed(a, false); + assert_is_closed(b, false); + assert_is_closed(c, false); + } + assert_is_closed(a, false); + assert_is_closed(b, false); + assert_is_closed(c, false); +} + +int main(int, const char **) +{ + + RUN_A_TEST(test_default_constructor_and_destructor()); + RUN_A_TEST(test_basic_read()); + RUN_A_TEST(test_create_read_write()); + + RUN_A_TEST(test_flags()); + + RUN_A_TEST(test_construct_destroy()); + RUN_A_TEST(test_construct_copy()); + RUN_A_TEST(test_construct_default_assign()); + RUN_A_TEST(test_assign_int()); + RUN_A_TEST(test_assign_int_survives_scope()); + RUN_A_TEST(test_assign_int_close()); + RUN_A_TEST(test_assign_int_close_2()); + RUN_A_TEST(test_map()); + RUN_A_TEST(test_close_method()); + RUN_A_TEST(test_shared_close_method()); + 
RUN_A_TEST(test_derived_resource_type()); + RUN_A_TEST(test_derived_map()); + RUN_A_TEST(test_derived_cast()); + + assert_no_leaked_fds(); + + return 0; +} diff --git a/test/interp.cc b/test/interp.cc new file mode 100644 index 0000000..dd96124 --- /dev/null +++ b/test/interp.cc @@ -0,0 +1,88 @@ +#include "tests.h" + +#include "crucible/interp.h" + +using namespace crucible; +using namespace std; + +/*********************************************************************** + +How this should work: + +Interpreter reads an arg list: + + argv[0] --method0args --method1arg arg1 --method1arg=arg1 -- args... + +argv[0] should look up a shared_ptr which creates an object of +type shared_ptr. This object is used to receive args by +method calls or one at a time. + + and can be the same object, or not. + +Process p methods: + + p->spawn(Interp*) -> Process + p->exec(ArgList) -> Process / Result + p->method (from ArgParser<>) + p->finish() -> void (destroys object without early destruction warnings...?) + p->~Process() -> complains loudly if finish() not called first...? + +Result might be a pair of Process, string. Or just string. 
+ +ArgParser should be more like GetOpt: + + build a dictionary and an arg list from arguments + Process methods should interrogate ArgParser + ArgParser might have a table of boolean and string option names so it can reject invalid options + but if it had that, we could also pass in Process and have it call methods on it + ...but that is a _lot_ of pointer-hiding when we could KISS + ...but if we had that solved, argparser tables look like lists of method names + ArgParser has a table of names and methods on object of type T + ArgParser hides everything behind void* and hands off to a compiled implementation to do callbacks + +Extreme simplification: arguments are themselves executable + + so '--method_foo arg' really means construct MethodFoo(arg) and cast to shared_ptr + then Process->invokeSomething(ProcArg) + too extreme, use argparser instead + +***********************************************************************/ + +void +test_arg_parser() +{ + ArgParser ap; + ArgList al( { "abc", "--def", "ghi" } ); + ap.parse(NULL, al); +} + +struct Thing { + int m_i; + double m_d; + string m_s; + + void set_i(int i) { cerr << "i = " << i << endl; m_i = i; } + void set_d(double d) { cerr << "d = " << d << endl; m_d = d; } + void set_s(string s) { cerr << "s = " << s << endl; m_s = s; } +}; + +template +void +assign(T& t, F f, A a) +{ + cerr << __PRETTY_FUNCTION__ << " - a = " << a << endl; + (t.*f)(a); +} + +int +main(int, char**) +{ + RUN_A_TEST(test_arg_parser()); + + Thing p; + assign(p, &Thing::set_i, 5); + + cerr << "p.m_i = " << p.m_i << endl; + + exit(EXIT_SUCCESS); +} diff --git a/test/limits.cc b/test/limits.cc new file mode 100644 index 0000000..846c9d7 --- /dev/null +++ b/test/limits.cc @@ -0,0 +1,325 @@ +#include "tests.h" +#include "crucible/error.h" +#include "crucible/limits.h" + +#include + +using namespace crucible; + +// Like catch_all but don't bother printing anything +static +int +silent_catch_all(const function &f) +{ + try { + f(); + return 0; + } 
catch (const exception &) { + return 1; + } catch (...) { + return -1; + } +} + + +#define SHOULD_FAIL(expr) assert(1 == silent_catch_all([&]() { (expr); })) + +#define SHOULD_PASS(expr, result) assert(0 == silent_catch_all([&]() { assert((result) == (expr)); })) + +static +void +test_cast_signed_negative_to_unsigned() +{ + off_t tv = -1; + SHOULD_FAIL(ranged_cast(tv)); + SHOULD_FAIL(ranged_cast(tv)); + SHOULD_FAIL(ranged_cast(tv)); + SHOULD_FAIL(ranged_cast(tv)); + SHOULD_FAIL(ranged_cast(tv)); + SHOULD_FAIL(ranged_cast(tv)); + SHOULD_FAIL(ranged_cast(tv)); + SHOULD_FAIL(ranged_cast(tv)); + SHOULD_FAIL(ranged_cast(tv)); +} + +static +void +test_cast_1_to_things() +{ + auto tv = 1; + SHOULD_PASS(ranged_cast(tv), 1); + SHOULD_PASS(ranged_cast(tv), 1); + SHOULD_PASS(ranged_cast(tv), 1); + SHOULD_PASS(ranged_cast(tv), 1); + SHOULD_PASS(ranged_cast(tv), 1); + SHOULD_PASS(ranged_cast(tv), 1); + SHOULD_PASS(ranged_cast(tv), 1); + SHOULD_PASS(ranged_cast(tv), 1); + SHOULD_PASS(ranged_cast(tv), 1); + SHOULD_PASS(ranged_cast(tv), 1); + SHOULD_PASS(ranged_cast(tv), 1); + SHOULD_PASS(ranged_cast(tv), 1); + SHOULD_PASS(ranged_cast(tv), 1); + SHOULD_PASS(ranged_cast(tv), 1); + SHOULD_PASS(ranged_cast(tv), 1); + SHOULD_PASS(ranged_cast(tv), 1); + SHOULD_PASS(ranged_cast(tv), 1); + SHOULD_PASS(ranged_cast(tv), 1); + SHOULD_PASS(ranged_cast(tv), 1); +} + +static +void +test_cast_128_to_things() +{ + auto tv = 128; + SHOULD_PASS(ranged_cast(tv), 128); + SHOULD_PASS(ranged_cast(tv), 128); + SHOULD_PASS(ranged_cast(tv), 128); + SHOULD_PASS(ranged_cast(tv), 128); + SHOULD_PASS(ranged_cast(tv), 128); + SHOULD_PASS(ranged_cast(tv), 128); + SHOULD_PASS(ranged_cast(tv), 128); + SHOULD_PASS(ranged_cast(tv), 128); + SHOULD_FAIL(ranged_cast(tv)); + SHOULD_PASS(ranged_cast(tv), 128); + SHOULD_PASS(ranged_cast(tv), 128); + SHOULD_PASS(ranged_cast(tv), 128); + SHOULD_PASS(ranged_cast(tv), 128); + SHOULD_PASS(ranged_cast(tv), 128); + SHOULD_PASS(ranged_cast(tv), 128); + 
SHOULD_PASS(ranged_cast(tv), 128); + SHOULD_PASS(ranged_cast(tv), 128); + SHOULD_PASS(ranged_cast(tv), 128); + SHOULD_FAIL(ranged_cast(tv)); +} + +static +void +test_cast_256_to_things() +{ + auto tv = 256; + SHOULD_PASS(ranged_cast(tv), 256); + SHOULD_PASS(ranged_cast(tv), 256); + SHOULD_PASS(ranged_cast(tv), 256); + SHOULD_PASS(ranged_cast(tv), 256); + SHOULD_FAIL(ranged_cast(tv)); + SHOULD_PASS(ranged_cast(tv), 256); + SHOULD_PASS(ranged_cast(tv), 256); + SHOULD_PASS(ranged_cast(tv), 256); + SHOULD_FAIL(ranged_cast(tv)); + SHOULD_PASS(ranged_cast(tv), 256); + SHOULD_PASS(ranged_cast(tv), 256); + SHOULD_PASS(ranged_cast(tv), 256); + SHOULD_PASS(ranged_cast(tv), 256); + SHOULD_FAIL(ranged_cast(tv)); + SHOULD_PASS(ranged_cast(tv), 256); + SHOULD_PASS(ranged_cast(tv), 256); + SHOULD_PASS(ranged_cast(tv), 256); + SHOULD_PASS(ranged_cast(tv), 256); + SHOULD_FAIL(ranged_cast(tv)); +} + +static +void +test_cast_0x80000000_to_things() +{ + auto sv = 0x80000000LL; + auto uv = 0x80000000ULL; + SHOULD_PASS(ranged_cast(sv), sv); + SHOULD_PASS(ranged_cast(uv), uv); + SHOULD_PASS(ranged_cast(uv), uv); + SHOULD_FAIL(ranged_cast(uv)); + SHOULD_FAIL(ranged_cast(uv)); + SHOULD_PASS(ranged_cast(sv), sv); + SHOULD_FAIL(ranged_cast(sv)); + SHOULD_FAIL(ranged_cast(sv)); + SHOULD_FAIL(ranged_cast(sv)); + SHOULD_PASS(ranged_cast(uv), uv); + SHOULD_PASS(ranged_cast(uv), uv); + SHOULD_PASS(ranged_cast(uv), uv); + SHOULD_FAIL(ranged_cast(uv)); + SHOULD_FAIL(ranged_cast(uv)); + SHOULD_PASS(ranged_cast(sv), sv); + SHOULD_PASS(ranged_cast(sv), sv); + SHOULD_FAIL(ranged_cast(sv)); + SHOULD_FAIL(ranged_cast(sv)); + if (sizeof(int) == 4) { + SHOULD_FAIL(ranged_cast(sv)); + } else if (sizeof(int) == 8) { + SHOULD_PASS(ranged_cast(sv), sv); + } else { + assert(!"unhandled case, please add code here"); + } +} + +static +void +test_cast_0xffffffff_to_things() +{ + auto sv = 0xffffffffLL; + auto uv = 0xffffffffULL; + SHOULD_PASS(ranged_cast(sv), sv); + SHOULD_PASS(ranged_cast(uv), uv); + 
SHOULD_PASS(ranged_cast(uv), uv); + SHOULD_FAIL(ranged_cast(uv)); + SHOULD_FAIL(ranged_cast(uv)); + SHOULD_PASS(ranged_cast(sv), sv); + SHOULD_FAIL(ranged_cast(sv)); + SHOULD_FAIL(ranged_cast(sv)); + SHOULD_FAIL(ranged_cast(sv)); + SHOULD_PASS(ranged_cast(uv), uv); + SHOULD_PASS(ranged_cast(uv), uv); + SHOULD_PASS(ranged_cast(uv), uv); + SHOULD_FAIL(ranged_cast(uv)); + SHOULD_FAIL(ranged_cast(uv)); + SHOULD_PASS(ranged_cast(sv), sv); + SHOULD_PASS(ranged_cast(sv), sv); + SHOULD_FAIL(ranged_cast(sv)); + SHOULD_FAIL(ranged_cast(sv)); + if (sizeof(int) == 4) { + SHOULD_FAIL(ranged_cast(sv)); + } else if (sizeof(int) == 8) { + SHOULD_PASS(ranged_cast(sv), sv); + } else { + assert(!"unhandled case, please add code here"); + } +} + +static +void +test_cast_0xfffffffff_to_things() +{ + auto sv = 0xfffffffffLL; + auto uv = 0xfffffffffULL; + SHOULD_PASS(ranged_cast(sv), sv); + SHOULD_PASS(ranged_cast(uv), uv); + SHOULD_FAIL(ranged_cast(uv)); + SHOULD_FAIL(ranged_cast(uv)); + SHOULD_FAIL(ranged_cast(uv)); + SHOULD_PASS(ranged_cast(sv), sv); + SHOULD_FAIL(ranged_cast(sv)); + SHOULD_FAIL(ranged_cast(sv)); + SHOULD_FAIL(ranged_cast(sv)); + SHOULD_PASS(ranged_cast(uv), uv); + SHOULD_FAIL(ranged_cast(uv)); + SHOULD_FAIL(ranged_cast(uv)); + SHOULD_PASS(ranged_cast(sv), sv); + SHOULD_FAIL(ranged_cast(sv)); + SHOULD_FAIL(ranged_cast(sv)); + if (sizeof(int) == 4) { + SHOULD_FAIL(ranged_cast(sv)); + SHOULD_FAIL(ranged_cast(uv)); + } else if (sizeof(int) == 8) { + SHOULD_PASS(ranged_cast(sv), sv); + SHOULD_PASS(ranged_cast(uv), uv); + } else { + assert(!"unhandled case, please add code here"); + } + if (sizeof(long) == 4) { + SHOULD_FAIL(ranged_cast(sv)); + SHOULD_FAIL(ranged_cast(uv)); + } else if (sizeof(long) == 8) { + SHOULD_PASS(ranged_cast(sv), sv); + SHOULD_PASS(ranged_cast(uv), uv); + } else { + assert(!"unhandled case, please add code here"); + } +} + +static +void +test_cast_0x8000000000000000_to_things() +{ + auto sv = 0x8000000000000000LL; + auto uv = 0x8000000000000000ULL; 
+ SHOULD_FAIL(ranged_cast(sv)); + SHOULD_PASS(ranged_cast(uv), uv); + SHOULD_FAIL(ranged_cast(uv)); + SHOULD_FAIL(ranged_cast(uv)); + SHOULD_FAIL(ranged_cast(uv)); + SHOULD_FAIL(ranged_cast(sv)); + SHOULD_FAIL(ranged_cast(sv)); + SHOULD_FAIL(ranged_cast(sv)); + SHOULD_FAIL(ranged_cast(sv)); + SHOULD_PASS(ranged_cast(uv), uv); + SHOULD_FAIL(ranged_cast(uv)); + SHOULD_FAIL(ranged_cast(uv)); + SHOULD_FAIL(ranged_cast(sv)); + SHOULD_FAIL(ranged_cast(sv)); + SHOULD_FAIL(ranged_cast(sv)); + SHOULD_FAIL(ranged_cast(sv)); + SHOULD_FAIL(ranged_cast(sv)); + if (sizeof(int) == 4) { + SHOULD_FAIL(ranged_cast(uv)); + } else if (sizeof(int) == 8) { + SHOULD_PASS(ranged_cast(uv), uv); + } else { + assert(!"unhandled case, please add code here"); + } + if (sizeof(long) == 4) { + SHOULD_FAIL(ranged_cast(uv)); + } else if (sizeof(long) == 8) { + SHOULD_PASS(ranged_cast(uv), uv); + } else { + assert(!"unhandled case, please add code here"); + } +} + +static +void +test_cast_0xffffffffffffffff_to_things() +{ + auto sv = 0xffffffffffffffffLL; + auto uv = 0xffffffffffffffffULL; + SHOULD_FAIL(ranged_cast(sv)); + SHOULD_PASS(ranged_cast(uv), uv); + SHOULD_FAIL(ranged_cast(uv)); + SHOULD_FAIL(ranged_cast(uv)); + SHOULD_FAIL(ranged_cast(uv)); + SHOULD_FAIL(ranged_cast(sv)); + SHOULD_FAIL(ranged_cast(sv)); + SHOULD_FAIL(ranged_cast(sv)); + SHOULD_FAIL(ranged_cast(sv)); + SHOULD_PASS(ranged_cast(uv), uv); + SHOULD_FAIL(ranged_cast(uv)); + SHOULD_FAIL(ranged_cast(uv)); + SHOULD_FAIL(ranged_cast(sv)); + SHOULD_FAIL(ranged_cast(sv)); + SHOULD_FAIL(ranged_cast(sv)); + SHOULD_FAIL(ranged_cast(sv)); + SHOULD_FAIL(ranged_cast(sv)); + if (sizeof(int) == 4) { + SHOULD_FAIL(ranged_cast(uv)); + } else if (sizeof(int) == 8) { + SHOULD_PASS(ranged_cast(uv), uv); + } else { + assert(!"unhandled case, please add code here"); + } + if (sizeof(long) == 4) { + SHOULD_FAIL(ranged_cast(uv)); + } else if (sizeof(long) == 8) { + SHOULD_PASS(ranged_cast(uv), uv); + } else { + assert(!"unhandled case, please add 
code here"); + } +} + +// OK enough with the small values. We want to know if 32-bit machines break. + +int +main(int, char**) +{ + RUN_A_TEST(test_cast_signed_negative_to_unsigned()); + RUN_A_TEST(test_cast_1_to_things()); + RUN_A_TEST(test_cast_128_to_things()); + RUN_A_TEST(test_cast_256_to_things()); + RUN_A_TEST(test_cast_0x80000000_to_things()); + RUN_A_TEST(test_cast_0xffffffff_to_things()); + RUN_A_TEST(test_cast_0xfffffffff_to_things()); + RUN_A_TEST(test_cast_0x8000000000000000_to_things()); + RUN_A_TEST(test_cast_0xffffffffffffffff_to_things()); + + exit(EXIT_SUCCESS); +} + diff --git a/test/path.cc b/test/path.cc new file mode 100644 index 0000000..c309480 --- /dev/null +++ b/test/path.cc @@ -0,0 +1,40 @@ +#include "tests.h" + +#include "crucible/path.h" + +#include +#include +#include +#include + +#include + +using namespace crucible; + +unsigned failures = 0; +static +void +test_path_basename(string input, string expected) +{ + string result = basename(input); + if (expected != result) { + std::cerr << "result was \"" << result << "\"" << std::endl; + ++failures; + } +} + +int +main(int, char**) +{ + RUN_A_TEST(test_path_basename("/foo/bar.c", "bar.c")); + RUN_A_TEST(test_path_basename("/foo/bar/", "")); + RUN_A_TEST(test_path_basename("/foo/", "")); + RUN_A_TEST(test_path_basename("/", "")); + RUN_A_TEST(test_path_basename("foo/bar.c", "bar.c")); + RUN_A_TEST(test_path_basename("bar.c", "bar.c")); + RUN_A_TEST(test_path_basename("", "")); + + assert(!failures); + + exit(EXIT_SUCCESS); +} diff --git a/test/process.cc b/test/process.cc new file mode 100644 index 0000000..a24eb0e --- /dev/null +++ b/test/process.cc @@ -0,0 +1,65 @@ +#include "tests.h" + +#include "crucible/process.h" + +#include +#include +#include +#include +#include + +#include + +using namespace crucible; +using namespace std; + +static inline +int +return_value(int val) +{ + // cerr << "pid " << getpid() << " returning " << val << endl; + return val; +} + +static inline +int 
+return_value_2(int val, int val2) +{ + return val + val2; +} + +static inline +void +test_fork_return(int val) +{ + Pid child(return_value, val); + assert(child == child->get_id()); + assert(child == child->native_handle()); + int status = child->join(); + int rv_status = WEXITSTATUS(status); + assert(WIFEXITED(status)); + assert(rv_status == val); +} + +static inline +void +test_fork_return(int val, int val2) +{ + Pid child(return_value_2, val, val2); + int status = child->join(); + int rv_status = WEXITSTATUS(status); + assert(WIFEXITED(status)); + assert(rv_status == val + val2); +} + +int +main(int, char**) +{ + RUN_A_TEST(test_fork_return(0)); + RUN_A_TEST(test_fork_return(1)); + RUN_A_TEST(test_fork_return(9)); + RUN_A_TEST(test_fork_return(2, 3)); + RUN_A_TEST(test_fork_return(7, 9)); + + exit(EXIT_SUCCESS); +} diff --git a/test/tests.h b/test/tests.h new file mode 100644 index 0000000..41ce0fa --- /dev/null +++ b/test/tests.h @@ -0,0 +1,14 @@ +#ifndef CRUCIBLE_TESTS_H +#define CRUCIBLE_TESTS_H + +#undef NDEBUG + +#include + +#define RUN_A_TEST(test) do { \ + std::cerr << "Testing " << #test << "..." << std::flush; \ + do { test ; } while (0); \ + std::cerr << "OK" << std::endl; \ +} while (0) + +#endif // CRUCIBLE_TESTS_H diff --git a/test/tmp/.gitignore b/test/tmp/.gitignore new file mode 100644 index 0000000..72e8ffc --- /dev/null +++ b/test/tmp/.gitignore @@ -0,0 +1 @@ +*