diff --git a/src/bees-hash.cc b/src/bees-hash.cc index f17ecc8..d6b0204 100644 --- a/src/bees-hash.cc +++ b/src/bees-hash.cc @@ -1,5 +1,6 @@ #include "bees.h" +#include "crucible/city.h" #include "crucible/crc64.h" #include "crucible/string.h" @@ -11,6 +12,12 @@ using namespace crucible; using namespace std; +BeesHash::BeesHash(const uint8_t *ptr, size_t len) : + // m_hash(CityHash64(reinterpret_cast<const char *>(ptr), len)) + m_hash(Digest::CRC::crc64(ptr, len)) +{ +} + ostream & operator<<(ostream &os, const BeesHash &bh) { diff --git a/src/bees-types.cc b/src/bees-types.cc index 211d87e..5960892 100644 --- a/src/bees-types.cc +++ b/src/bees-types.cc @@ -1,6 +1,5 @@ #include "bees.h" -#include "crucible/crc64.h" #include "crucible/limits.h" #include "crucible/ntoa.h" #include "crucible/string.h" @@ -964,11 +963,7 @@ BeesBlockData::hash() const // We can only dedup unaligned EOF blocks against other unaligned EOF blocks, // so we do NOT round up to a full sum block size. const Blob &blob = data(); - // TODO: It turns out that file formats with 4K block - // alignment and embedded CRC64 do exist, and every block - // of such files has the same hash. Could use a subset - // of SHA1 here instead. - m_hash = Digest::CRC::crc64(blob.data(), blob.size()); + m_hash = BeesHash(blob.data(), blob.size()); m_hash_done = true; BEESCOUNT(block_hash); } @@ -980,9 +975,8 @@ bool BeesBlockData::is_data_zero() const { // The CRC64 of zero is zero, so skip some work if we already know the CRC - if (m_hash_done && m_hash != 0) { - return false; - } + // ...but that doesn't work for any other hash function, and it + // saves us next to nothing. 
// OK read block (maybe) and check every byte for (auto c : data()) { diff --git a/src/bees.h b/src/bees.h index f2b6a04..baa7185 100644 --- a/src/bees.h +++ b/src/bees.h @@ -605,6 +605,7 @@ struct BeesHash { BeesHash(Type that) : m_hash(that) { } operator Type() const { return m_hash; } BeesHash& operator=(const Type that) { m_hash = that; return *this; } + BeesHash(const uint8_t *ptr, size_t len); private: Type m_hash;