diff --git a/PSync/detail/bloom-filter.cpp b/PSync/detail/bloom-filter.cpp index f700f16..89447c0 100644 --- a/PSync/detail/bloom-filter.cpp +++ b/PSync/detail/bloom-filter.cpp @@ -47,22 +47,19 @@ #include "PSync/detail/util.hpp" #include -#include #include #include #include -#include -#include #include - -// https://github.com/ArashPartow/bloom - -NDN_LOG_INIT(psync.BloomFilter); +#include namespace psync { -static const std::size_t bits_per_char = 0x08; +// https://github.com/ArashPartow/bloom + +static const std::size_t bits_per_char = 0x08; // 8 bits in 1 char(unsigned) + static const unsigned char bit_mask[bits_per_char] = { 0x01, //00000001 0x02, //00000010 @@ -74,89 +71,154 @@ static const unsigned char bit_mask[bits_per_char] = { 0x80 //10000000 }; -BloomParameters::BloomParameters() -: minimum_size(1) -, maximum_size(std::numeric_limits::max()) -, minimum_number_of_hashes(1) -, maximum_number_of_hashes(std::numeric_limits::max()) -, projected_element_count(200) -, false_positive_probability(1.0 / projected_element_count) -, random_seed(0xA5A5A5A55A5A5A5AULL) -{} - -bool -BloomParameters::compute_optimal_parameters() +class bloom_parameters { - if (!(*this)) { - return false; - } +public: - double min_m = std::numeric_limits::infinity(); - double min_k = 0.0; - double curr_m = 0.0; - double k = 1.0; + bloom_parameters() + : minimum_size(1), + maximum_size(std::numeric_limits::max()), + minimum_number_of_hashes(1), + maximum_number_of_hashes(std::numeric_limits::max()), + projected_element_count(200), + false_positive_probability(1.0 / projected_element_count), + random_seed(0xA5A5A5A55A5A5A5AULL) + {} - while (k < 1000.0) - { - double numerator = (- k * projected_element_count); - double denominator = std::log(1.0 - std::pow(false_positive_probability, 1.0 / k)); - curr_m = numerator / denominator; - if (curr_m < min_m) - { - min_m = curr_m; - min_k = k; - } - k += 1.0; - } + inline bool operator!() + { + return (minimum_size > maximum_size) || + (minimum_number_of_hashes > maximum_number_of_hashes) || + (minimum_number_of_hashes < 1) || + (0 == maximum_number_of_hashes) || + (0 == projected_element_count) || + (false_positive_probability < 0.0) || + (std::numeric_limits::infinity() == std::abs(false_positive_probability)) || + (0 == random_seed) || + (0xFFFFFFFFFFFFFFFFULL == random_seed); + } - optimal_parameters_t& optp = optimal_parameters; + // Allowable min/max size of the bloom filter in bits + unsigned int minimum_size; + unsigned int maximum_size; - optp.number_of_hashes = static_cast(min_k); - optp.table_size = static_cast(min_m); - optp.table_size += (((optp.table_size % bits_per_char) != 0) ? (bits_per_char - (optp.table_size % bits_per_char)) : 0); + // Allowable min/max number of hash functions + unsigned int minimum_number_of_hashes; + unsigned int maximum_number_of_hashes; - if (optp.number_of_hashes < minimum_number_of_hashes) - optp.number_of_hashes = minimum_number_of_hashes; - else if (optp.number_of_hashes > maximum_number_of_hashes) - optp.number_of_hashes = maximum_number_of_hashes; + // The approximate number of elements to be inserted + // into the bloom filter, should be within one order + // of magnitude. The default is 200. + unsigned int projected_element_count; - if (optp.table_size < minimum_size) - optp.table_size = minimum_size; - else if (optp.table_size > maximum_size) - optp.table_size = maximum_size; + // The approximate false positive probability expected + // from the bloom filter. The default is assumed to be + // the reciprocal of the projected_element_count. + double false_positive_probability; - return true; -} + unsigned long long int random_seed; -BloomFilter::BloomFilter() - : bit_table_(0) - , salt_count_(0) - , table_size_(0) - , raw_table_size_(0) - , projected_element_count_(0) - , inserted_element_count_(0) - , random_seed_(0) - , desired_false_positive_probability_(0.0) -{} + struct optimal_parameters_t + { + optimal_parameters_t() + : number_of_hashes(0), + table_size(0) + {} -BloomFilter::BloomFilter(const BloomParameters& p) - : bit_table_(0) - , projected_element_count_(p.projected_element_count) - , inserted_element_count_(0) - , random_seed_((p.random_seed * 0xA5A5A5A5) + 1) - , desired_false_positive_probability_(p.false_positive_probability) + unsigned int number_of_hashes; + unsigned int table_size; + }; + + optimal_parameters_t optimal_parameters; + + bool compute_optimal_parameters() + { + /* + Note: + The following will attempt to find the number of hash functions + and minimum amount of storage bits required to construct a bloom + filter consistent with the user defined false positive probability + and estimated element insertion count. + */ + + if (!(*this)) + return false; + + double min_m = std::numeric_limits::infinity(); + double min_k = 0.0; + double k = 1.0; + + while (k < 1000.0) + { + const double numerator = (- k * projected_element_count); + const double denominator = std::log(1.0 - std::pow(false_positive_probability, 1.0 / k)); + + const double curr_m = numerator / denominator; + + if (curr_m < min_m) + { + min_m = curr_m; + min_k = k; + } + + k += 1.0; + } + + optimal_parameters_t& optp = optimal_parameters; + + optp.number_of_hashes = static_cast(min_k); + + optp.table_size = static_cast(min_m); + + optp.table_size += (((optp.table_size % bits_per_char) != 0) ? (bits_per_char - (optp.table_size % bits_per_char)) : 0); + + if (optp.number_of_hashes < minimum_number_of_hashes) + optp.number_of_hashes = minimum_number_of_hashes; + else if (optp.number_of_hashes > maximum_number_of_hashes) + optp.number_of_hashes = maximum_number_of_hashes; + + if (optp.table_size < minimum_size) + optp.table_size = minimum_size; + else if (optp.table_size > maximum_size) + optp.table_size = maximum_size; + + return true; + } + +}; + + +BloomFilter::BloomFilter(const bloom_parameters& p) +: projected_element_count_(p.projected_element_count), + inserted_element_count_(0), + random_seed_((p.random_seed * 0xA5A5A5A5) + 1), + desired_false_positive_probability_(p.false_positive_probability) { salt_count_ = p.optimal_parameters.number_of_hashes; table_size_ = p.optimal_parameters.table_size; + generate_unique_salt(); - raw_table_size_ = table_size_ / bits_per_char; - //bit_table_ = new cell_type[static_cast(raw_table_size_)]; - bit_table_.resize(static_cast(raw_table_size_), 0x00); + + bit_table_.resize(table_size_ / bits_per_char, static_cast(0x00)); +} + +static bloom_parameters +makeParameters(unsigned int projected_element_count, + double false_positive_probability) +{ + bloom_parameters p; + p.projected_element_count = projected_element_count; + p.false_positive_probability = false_positive_probability; + + if (!p.compute_optimal_parameters()) { + NDN_THROW(BloomFilter::Error("Bloom filter parameters are not correct!")); + } + return p; } BloomFilter::BloomFilter(unsigned int projected_element_count, double false_positive_probability) - : BloomFilter(getParameters(projected_element_count, false_positive_probability)) + : BloomFilter(makeParameters(projected_element_count, false_positive_probability)) { } @@ -165,42 +227,25 @@ BloomFilter::BloomFilter(unsigned int projected_element_count, const ndn::name::Component& bfName) : BloomFilter(projected_element_count, false_positive_probability) { - std::vector table(bfName.value_begin(), bfName.value_end()); - - if (table.size() != raw_table_size_) { - NDN_THROW(Error("Received BloomFilter cannot be decoded!")); + std::vector table(bfName.value_begin(), bfName.value_end()); + if (table.size() != table_size_ / bits_per_char) { + NDN_THROW(Error("Bloom filter cannot be decoded!")); } - bit_table_ = table; -} - -BloomParameters -BloomFilter::getParameters(unsigned int projected_element_count, - double false_positive_probability) -{ - BloomParameters opt; - opt.false_positive_probability = false_positive_probability; - opt.projected_element_count = projected_element_count; - - if (!opt) { - NDN_LOG_WARN("Bloom parameters are not correct!"); - } - - opt.compute_optimal_parameters(); - return opt; + bit_table_ = std::move(table); } void BloomFilter::appendToName(ndn::Name& name) const { name.appendNumber(projected_element_count_); - name.appendNumber((int)(desired_false_positive_probability_ * 1000)); + name.appendNumber(static_cast(desired_false_positive_probability_ * 1000)); name.append(bit_table_.begin(), bit_table_.end()); } void BloomFilter::clear() { - bit_table_.resize(static_cast(raw_table_size_), 0x00); + std::fill(bit_table_.begin(), bit_table_.end(), static_cast(0x00)); inserted_element_count_ = 0; } @@ -208,12 +253,15 @@ void BloomFilter::insert(const std::string& key) { std::size_t bit_index = 0; - std::size_t bit = 0; + std::size_t bit = 0; + for (std::size_t i = 0; i < salt_.size(); ++i) { - compute_indices(murmurHash3(salt_[i], key), bit_index, bit); - bit_table_[bit_index/bits_per_char] |= bit_mask[bit]; + compute_indices(murmurHash3(salt_[i], key), bit_index, bit); + + bit_table_[bit_index / bits_per_char] |= bit_mask[bit]; } + ++inserted_element_count_; } @@ -221,12 +269,14 @@ bool BloomFilter::contains(const std::string& key) const { std::size_t bit_index = 0; - std::size_t bit = 0; + std::size_t bit = 0; for (std::size_t i = 0; i < salt_.size(); ++i) { compute_indices(murmurHash3(salt_[i], key), bit_index, bit); - if ((bit_table_[bit_index/bits_per_char] & bit_mask[bit]) != bit_mask[bit]) { + + if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit]) + { return false; } } @@ -234,10 +284,11 @@ BloomFilter::contains(const std::string& key) const return true; } -std::vector -BloomFilter::table() const +void +BloomFilter::compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const { - return bit_table_; + bit_index = hash % table_size_; + bit = bit_index % bits_per_char; } void @@ -250,6 +301,7 @@ BloomFilter::generate_unique_salt() hash function with different values seems to be adequate. */ const unsigned int predef_salt_count = 128; + static const bloom_type predef_salt[predef_salt_count] = { 0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC, @@ -291,19 +343,31 @@ BloomFilter::generate_unique_salt() std::copy(predef_salt, predef_salt + salt_count_, std::back_inserter(salt_)); - for (unsigned int i = 0; i < salt_.size(); ++i) + + for (std::size_t i = 0; i < salt_.size(); ++i) { + /* + Note: + This is done to integrate the user defined random seed, + so as to allow for the generation of unique bloom filter + instances. + */ salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + static_cast(random_seed_); } } else { - std::copy(predef_salt,predef_salt + predef_salt_count,std::back_inserter(salt_)); + std::copy(predef_salt, predef_salt + predef_salt_count, std::back_inserter(salt_)); + srand(static_cast(random_seed_)); + while (salt_.size() < salt_count_) { bloom_type current_salt = static_cast(rand()) * static_cast(rand()); - if (0 == current_salt) continue; + + if (0 == current_salt) + continue; + if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt)) { salt_.push_back(current_salt); @@ -312,39 +376,4 @@ BloomFilter::generate_unique_salt() } } -void -BloomFilter::compute_indices(const bloom_type& hash, - std::size_t& bit_index, std::size_t& bit) const -{ - bit_index = hash % table_size_; - bit = bit_index % bits_per_char; -} - -bool -operator==(const BloomFilter& bf1, const BloomFilter& bf2) -{ - auto table1 = bf1.table(); - auto table2 = bf2.table(); - - if (table1.size() != table2.size()) { - return false; - } - - for (size_t i = 0; i < table1.size(); i++) { - if (table1[i] != table2[i]) { - return false; - } - } - return true; -} - -std::ostream& -operator<<(std::ostream& out, const BloomFilter& bf) -{ - for (const auto& element : bf.table()) { - out << unsigned(element); - } - return out; -} - } // namespace psync diff --git a/PSync/detail/bloom-filter.hpp b/PSync/detail/bloom-filter.hpp index f7e3726..ff0d213 100644 --- a/PSync/detail/bloom-filter.hpp +++ b/PSync/detail/bloom-filter.hpp @@ -47,63 +47,17 @@ #define PSYNC_DETAIL_BLOOM_FILTER_HPP #include +#include #include #include -#include -#include namespace psync { -struct optimal_parameters_t -{ - optimal_parameters_t() - : number_of_hashes(0), - table_size(0) - {} - - unsigned int number_of_hashes; - unsigned int table_size; -}; - -class BloomParameters -{ -public: - BloomParameters(); - - bool - compute_optimal_parameters(); - - bool operator!() const - { - return (minimum_size > maximum_size) || - (minimum_number_of_hashes > maximum_number_of_hashes) || - (minimum_number_of_hashes < 1) || - (0 == maximum_number_of_hashes) || - (0 == projected_element_count) || - (false_positive_probability < 0.0) || - (std::numeric_limits::infinity() == std::abs(false_positive_probability)) || - (0 == random_seed) || - (0xFFFFFFFFFFFFFFFFULL == random_seed); - } - - unsigned int minimum_size; - unsigned int maximum_size; - unsigned int minimum_number_of_hashes; - unsigned int maximum_number_of_hashes; - unsigned int projected_element_count; - double false_positive_probability; - unsigned long long int random_seed; - optimal_parameters_t optimal_parameters; -}; +class bloom_parameters; class BloomFilter { -protected: - typedef uint32_t bloom_type; - typedef uint8_t cell_type; - typedef std::vector ::iterator Iterator; - public: class Error : public std::runtime_error { @@ -111,9 +65,7 @@ public: using std::runtime_error::runtime_error; }; - BloomFilter(); - - explicit BloomFilter(const BloomParameters& p); + BloomFilter() = default; BloomFilter(unsigned int projected_element_count, double false_positive_probability); @@ -122,10 +74,6 @@ public: double false_positive_probability, const ndn::name::Component& bfName); - BloomParameters - getParameters(unsigned int projected_element_count, - double false_positive_probability); - /** * @brief Append our bloom filter to the given name * @@ -146,35 +94,53 @@ public: bool contains(const std::string& key) const; - std::vector - table() const; - private: + typedef uint32_t bloom_type; + typedef uint8_t cell_type; + + explicit + BloomFilter(const bloom_parameters& p); + + void + compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const; + void generate_unique_salt(); - void - compute_indices(const bloom_type& hash, - std::size_t& bit_index, std::size_t& bit) const; +private: // non-member operators + // NOTE: the following "hidden friend" operators are available via + // argument-dependent lookup only and must be defined inline. + + friend bool + operator==(const BloomFilter& lhs, const BloomFilter& rhs) + { + return lhs.bit_table_ == rhs.bit_table_; + } + + friend bool + operator!=(const BloomFilter& lhs, const BloomFilter& rhs) + { + return lhs.bit_table_ != rhs.bit_table_; + } + + friend std::ostream& + operator<<(std::ostream& os, const BloomFilter& bf) + { + ndn::printHex(os, bf.bit_table_.data(), bf.bit_table_.size(), false); + return os; + } private: - std::vector salt_; - std::vector bit_table_; - unsigned int salt_count_; - unsigned int table_size_; // 8 * raw_table_size; - unsigned int raw_table_size_; - unsigned int projected_element_count_; - unsigned int inserted_element_count_; - unsigned long long int random_seed_; - double desired_false_positive_probability_; + std::vector salt_; + std::vector bit_table_; + unsigned int salt_count_ = 0; + unsigned int table_size_ = 0; + unsigned int projected_element_count_ = 0; + unsigned int inserted_element_count_ = 0; + unsigned long long int random_seed_ = 0; + double desired_false_positive_probability_ = 0.0; }; -bool -operator==(const BloomFilter& bf1, const BloomFilter& bf2); - -std::ostream& -operator<<(std::ostream& out, const BloomFilter& bf); - } // namespace psync #endif // PSYNC_DETAIL_BLOOM_FILTER_HPP