bloom-filter: code cleanup

Change-Id: I72cec068778d26c62b05060030013f148ef7c5da
This commit is contained in:
Davide Pesavento
2020-12-18 23:38:06 -05:00
parent b60398c5fc
commit aafef2bd91
2 changed files with 213 additions and 218 deletions
+171 -142
View File
@@ -47,22 +47,19 @@
#include "PSync/detail/util.hpp"
#include <ndn-cxx/util/exception.hpp>
#include <ndn-cxx/util/logger.hpp>
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iterator>
#include <limits>
#include <cstdlib>
// https://github.com/ArashPartow/bloom
NDN_LOG_INIT(psync.BloomFilter);
#include <limits>
namespace psync {
static const std::size_t bits_per_char = 0x08;
// https://github.com/ArashPartow/bloom
static const std::size_t bits_per_char = 0x08; // 8 bits in 1 char(unsigned)
static const unsigned char bit_mask[bits_per_char] = {
0x01, //00000001
0x02, //00000010
@@ -74,89 +71,154 @@ static const unsigned char bit_mask[bits_per_char] = {
0x80 //10000000
};
BloomParameters::BloomParameters()
: minimum_size(1)
, maximum_size(std::numeric_limits<unsigned int>::max())
, minimum_number_of_hashes(1)
, maximum_number_of_hashes(std::numeric_limits<unsigned int>::max())
, projected_element_count(200)
, false_positive_probability(1.0 / projected_element_count)
, random_seed(0xA5A5A5A55A5A5A5AULL)
{}
bool
BloomParameters::compute_optimal_parameters()
class bloom_parameters
{
if (!(*this)) {
return false;
}
public:
double min_m = std::numeric_limits<double>::infinity();
double min_k = 0.0;
double curr_m = 0.0;
double k = 1.0;
bloom_parameters()
: minimum_size(1),
maximum_size(std::numeric_limits<unsigned int>::max()),
minimum_number_of_hashes(1),
maximum_number_of_hashes(std::numeric_limits<unsigned int>::max()),
projected_element_count(200),
false_positive_probability(1.0 / projected_element_count),
random_seed(0xA5A5A5A55A5A5A5AULL)
{}
while (k < 1000.0)
{
double numerator = (- k * projected_element_count);
double denominator = std::log(1.0 - std::pow(false_positive_probability, 1.0 / k));
curr_m = numerator / denominator;
if (curr_m < min_m)
{
min_m = curr_m;
min_k = k;
}
k += 1.0;
}
inline bool operator!()
{
return (minimum_size > maximum_size) ||
(minimum_number_of_hashes > maximum_number_of_hashes) ||
(minimum_number_of_hashes < 1) ||
(0 == maximum_number_of_hashes) ||
(0 == projected_element_count) ||
(false_positive_probability < 0.0) ||
(std::numeric_limits<double>::infinity() == std::abs(false_positive_probability)) ||
(0 == random_seed) ||
(0xFFFFFFFFFFFFFFFFULL == random_seed);
}
optimal_parameters_t& optp = optimal_parameters;
// Allowable min/max size of the bloom filter in bits
unsigned int minimum_size;
unsigned int maximum_size;
optp.number_of_hashes = static_cast<unsigned int>(min_k);
optp.table_size = static_cast<unsigned int>(min_m);
optp.table_size += (((optp.table_size % bits_per_char) != 0) ? (bits_per_char - (optp.table_size % bits_per_char)) : 0);
// Allowable min/max number of hash functions
unsigned int minimum_number_of_hashes;
unsigned int maximum_number_of_hashes;
if (optp.number_of_hashes < minimum_number_of_hashes)
optp.number_of_hashes = minimum_number_of_hashes;
else if (optp.number_of_hashes > maximum_number_of_hashes)
optp.number_of_hashes = maximum_number_of_hashes;
// The approximate number of elements to be inserted
// into the bloom filter, should be within one order
// of magnitude. The default is 200.
unsigned int projected_element_count;
if (optp.table_size < minimum_size)
optp.table_size = minimum_size;
else if (optp.table_size > maximum_size)
optp.table_size = maximum_size;
// The approximate false positive probability expected
// from the bloom filter. The default is assumed to be
// the reciprocal of the projected_element_count.
double false_positive_probability;
return true;
}
unsigned long long int random_seed;
BloomFilter::BloomFilter()
: bit_table_(0)
, salt_count_(0)
, table_size_(0)
, raw_table_size_(0)
, projected_element_count_(0)
, inserted_element_count_(0)
, random_seed_(0)
, desired_false_positive_probability_(0.0)
{}
struct optimal_parameters_t
{
optimal_parameters_t()
: number_of_hashes(0),
table_size(0)
{}
BloomFilter::BloomFilter(const BloomParameters& p)
: bit_table_(0)
, projected_element_count_(p.projected_element_count)
, inserted_element_count_(0)
, random_seed_((p.random_seed * 0xA5A5A5A5) + 1)
, desired_false_positive_probability_(p.false_positive_probability)
unsigned int number_of_hashes;
unsigned int table_size;
};
optimal_parameters_t optimal_parameters;
/**
 * Attempts to find the number of hash functions and the minimum amount of
 * storage bits required to construct a bloom filter consistent with the
 * user defined false positive probability and estimated element insertion
 * count. The result is stored in optimal_parameters.
 *
 * @return false when the parameter set is invalid (see operator!) or when no
 *         finite, non-negative table size can be derived from it; true
 *         otherwise.
 */
bool compute_optimal_parameters()
{
  if (!(*this))
    return false;

  double min_m = std::numeric_limits<double>::infinity();
  double min_k = 0.0;

  // Try every hash count in [1, 1000) and keep the one needing the fewest bits.
  for (double k = 1.0; k < 1000.0; k += 1.0)
  {
    const double numerator = (- k * projected_element_count);
    const double denominator = std::log(1.0 - std::pow(false_positive_probability, 1.0 / k));
    const double curr_m = numerator / denominator;

    if (curr_m < min_m)
    {
      min_m = curr_m;
      min_k = k;
    }
  }

  // Degenerate probabilities (0, values above 1, NaN) leave min_m infinite or
  // negative, or never update it at all. Converting such a double to
  // unsigned int is undefined behavior, so report failure instead.
  if (min_k < 1.0 || !std::isfinite(min_m) || min_m < 0.0)
    return false;

  optimal_parameters_t& optp = optimal_parameters;
  optp.number_of_hashes = static_cast<unsigned int>(min_k);

  // Saturate rather than overflow when the ideal size exceeds unsigned int.
  const unsigned int uint_max = std::numeric_limits<unsigned int>::max();
  if (min_m >= static_cast<double>(uint_max))
    optp.table_size = uint_max;
  else
    optp.table_size = static_cast<unsigned int>(min_m);

  // Round up to a whole number of bytes; if that would wrap around, round
  // down to the previous byte boundary instead.
  const unsigned int remainder = static_cast<unsigned int>(optp.table_size % bits_per_char);
  if (remainder != 0)
  {
    const unsigned int padding = static_cast<unsigned int>(bits_per_char) - remainder;
    if (optp.table_size <= uint_max - padding)
      optp.table_size += padding;
    else
      optp.table_size -= remainder;
  }

  // Clamp both results into the user supplied bounds.
  if (optp.number_of_hashes < minimum_number_of_hashes)
    optp.number_of_hashes = minimum_number_of_hashes;
  else if (optp.number_of_hashes > maximum_number_of_hashes)
    optp.number_of_hashes = maximum_number_of_hashes;

  if (optp.table_size < minimum_size)
    optp.table_size = minimum_size;
  else if (optp.table_size > maximum_size)
    optp.table_size = maximum_size;

  return true;
}
};
BloomFilter::BloomFilter(const bloom_parameters& p)
: projected_element_count_(p.projected_element_count),
inserted_element_count_(0),
random_seed_((p.random_seed * 0xA5A5A5A5) + 1),
desired_false_positive_probability_(p.false_positive_probability)
{
salt_count_ = p.optimal_parameters.number_of_hashes;
table_size_ = p.optimal_parameters.table_size;
generate_unique_salt();
raw_table_size_ = table_size_ / bits_per_char;
//bit_table_ = new cell_type[static_cast<std::size_t>(raw_table_size_)];
bit_table_.resize(static_cast<std::size_t>(raw_table_size_), 0x00);
bit_table_.resize(table_size_ / bits_per_char, static_cast<cell_type>(0x00));
}
// Builds a parameter set for the given element count and false positive
// probability and derives its optimal table size and hash count.
// Throws BloomFilter::Error when compute_optimal_parameters() reports failure.
static bloom_parameters
makeParameters(unsigned int projected_element_count,
               double false_positive_probability)
{
  bloom_parameters params;
  params.projected_element_count = projected_element_count;
  params.false_positive_probability = false_positive_probability;

  const bool isUsable = params.compute_optimal_parameters();
  if (!isUsable) {
    NDN_THROW(BloomFilter::Error("Bloom filter parameters are not correct!"));
  }

  return params;
}
BloomFilter::BloomFilter(unsigned int projected_element_count,
double false_positive_probability)
: BloomFilter(getParameters(projected_element_count, false_positive_probability))
: BloomFilter(makeParameters(projected_element_count, false_positive_probability))
{
}
@@ -165,42 +227,25 @@ BloomFilter::BloomFilter(unsigned int projected_element_count,
const ndn::name::Component& bfName)
: BloomFilter(projected_element_count, false_positive_probability)
{
std::vector<BloomFilter::cell_type> table(bfName.value_begin(), bfName.value_end());
if (table.size() != raw_table_size_) {
NDN_THROW(Error("Received BloomFilter cannot be decoded!"));
std::vector<cell_type> table(bfName.value_begin(), bfName.value_end());
if (table.size() != table_size_ / bits_per_char) {
NDN_THROW(Error("Bloom filter cannot be decoded!"));
}
bit_table_ = table;
}
BloomParameters
BloomFilter::getParameters(unsigned int projected_element_count,
double false_positive_probability)
{
BloomParameters opt;
opt.false_positive_probability = false_positive_probability;
opt.projected_element_count = projected_element_count;
if (!opt) {
NDN_LOG_WARN("Bloom parameters are not correct!");
}
opt.compute_optimal_parameters();
return opt;
bit_table_ = std::move(table);
}
void
BloomFilter::appendToName(ndn::Name& name) const
{
name.appendNumber(projected_element_count_);
name.appendNumber((int)(desired_false_positive_probability_ * 1000));
name.appendNumber(static_cast<uint64_t>(desired_false_positive_probability_ * 1000));
name.append(bit_table_.begin(), bit_table_.end());
}
void
BloomFilter::clear()
{
bit_table_.resize(static_cast<std::size_t>(raw_table_size_), 0x00);
std::fill(bit_table_.begin(), bit_table_.end(), static_cast<cell_type>(0x00));
inserted_element_count_ = 0;
}
@@ -208,12 +253,15 @@ void
BloomFilter::insert(const std::string& key)
{
std::size_t bit_index = 0;
std::size_t bit = 0;
std::size_t bit = 0;
for (std::size_t i = 0; i < salt_.size(); ++i)
{
compute_indices(murmurHash3(salt_[i], key), bit_index, bit);
bit_table_[bit_index/bits_per_char] |= bit_mask[bit];
compute_indices(murmurHash3(salt_[i], key), bit_index, bit);
bit_table_[bit_index / bits_per_char] |= bit_mask[bit];
}
++inserted_element_count_;
}
@@ -221,12 +269,14 @@ bool
BloomFilter::contains(const std::string& key) const
{
std::size_t bit_index = 0;
std::size_t bit = 0;
std::size_t bit = 0;
for (std::size_t i = 0; i < salt_.size(); ++i)
{
compute_indices(murmurHash3(salt_[i], key), bit_index, bit);
if ((bit_table_[bit_index/bits_per_char] & bit_mask[bit]) != bit_mask[bit]) {
if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit])
{
return false;
}
}
@@ -234,10 +284,11 @@ BloomFilter::contains(const std::string& key) const
return true;
}
std::vector <BloomFilter::cell_type>
BloomFilter::table() const
void
BloomFilter::compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
{
return bit_table_;
bit_index = hash % table_size_;
bit = bit_index % bits_per_char;
}
void
@@ -250,6 +301,7 @@ BloomFilter::generate_unique_salt()
hash function with different values seems to be adequate.
*/
const unsigned int predef_salt_count = 128;
static const bloom_type predef_salt[predef_salt_count] =
{
0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC,
@@ -291,19 +343,31 @@ BloomFilter::generate_unique_salt()
std::copy(predef_salt,
predef_salt + salt_count_,
std::back_inserter(salt_));
for (unsigned int i = 0; i < salt_.size(); ++i)
for (std::size_t i = 0; i < salt_.size(); ++i)
{
/*
Note:
This is done to integrate the user defined random seed,
so as to allow for the generation of unique bloom filter
instances.
*/
salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + static_cast<bloom_type>(random_seed_);
}
}
else
{
std::copy(predef_salt,predef_salt + predef_salt_count,std::back_inserter(salt_));
std::copy(predef_salt, predef_salt + predef_salt_count, std::back_inserter(salt_));
srand(static_cast<unsigned int>(random_seed_));
while (salt_.size() < salt_count_)
{
bloom_type current_salt = static_cast<bloom_type>(rand()) * static_cast<bloom_type>(rand());
if (0 == current_salt) continue;
if (0 == current_salt)
continue;
if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt))
{
salt_.push_back(current_salt);
@@ -312,39 +376,4 @@ BloomFilter::generate_unique_salt()
}
}
// Maps a hash value to a position in the bit table.
//   bit_index receives the hash reduced modulo table_size_;
//   bit       receives that position's offset within its cell.
void
BloomFilter::compute_indices(const bloom_type& hash,
                             std::size_t& bit_index, std::size_t& bit) const
{
  const std::size_t position = hash % table_size_;
  bit_index = position;
  bit = position % bits_per_char;
}
// Two filters are equal when their bit tables have the same size and the
// same contents; std::vector's own equality performs exactly that check,
// replacing the hand-written size test and element loop.
bool
operator==(const BloomFilter& bf1, const BloomFilter& bf2)
{
  return bf1.table() == bf2.table();
}
// Writes every cell of the filter's bit table to the stream as an unsigned
// decimal number, with no separator between cells.
std::ostream&
operator<<(std::ostream& out, const BloomFilter& bf)
{
  const auto cells = bf.table();
  for (auto it = cells.begin(); it != cells.end(); ++it) {
    out << static_cast<unsigned>(*it);
  }
  return out;
}
} // namespace psync
+42 -76
View File
@@ -47,63 +47,17 @@
#define PSYNC_DETAIL_BLOOM_FILTER_HPP
#include <ndn-cxx/name.hpp>
#include <ndn-cxx/util/string-helper.hpp>
#include <string>
#include <vector>
#include <cmath>
#include <cstdlib>
namespace psync {
// Result pair produced when sizing a bloom filter: how many hash functions to
// use and how large the table is. Both fields start at zero.
struct optimal_parameters_t
{
  unsigned int number_of_hashes;
  unsigned int table_size;

  optimal_parameters_t()
    : number_of_hashes{0}
    , table_size{0}
  {
  }
};
/**
 * User supplied constraints for sizing a bloom filter, together with the
 * table size and hash count derived from them.
 */
class BloomParameters
{
public:
  BloomParameters();

  /**
   * Attempts to find the number of hash functions and the amount of storage
   * bits for the configured false positive probability and projected element
   * count, storing the result in optimal_parameters.
   *
   * @return false if this parameter set is invalid (see operator!).
   */
  bool
  compute_optimal_parameters();

  /**
   * Validity check: evaluates to true when the parameter set is NOT usable,
   * i.e. when bounds are inverted or zero, no elements are projected, the
   * false positive probability is negative or not finite (infinity or NaN),
   * or the random seed is all-zeros or all-ones.
   */
  bool operator!() const
  {
    return (minimum_size > maximum_size) ||
           (minimum_number_of_hashes > maximum_number_of_hashes) ||
           (minimum_number_of_hashes < 1) ||
           (0 == maximum_number_of_hashes) ||
           (0 == projected_element_count) ||
           (false_positive_probability < 0.0) ||
           // std::isfinite rejects infinity and also NaN, which compares
           // false against everything and would pass the "< 0.0" test.
           !std::isfinite(false_positive_probability) ||
           (0 == random_seed) ||
           (0xFFFFFFFFFFFFFFFFULL == random_seed);
  }

  // Allowable min/max size of the bloom filter in bits
  unsigned int minimum_size;
  unsigned int maximum_size;

  // Allowable min/max number of hash functions
  unsigned int minimum_number_of_hashes;
  unsigned int maximum_number_of_hashes;

  // The approximate number of elements to be inserted into the bloom filter
  unsigned int projected_element_count;

  // The approximate false positive probability expected from the bloom filter
  double false_positive_probability;

  unsigned long long int random_seed;

  // Output of compute_optimal_parameters()
  optimal_parameters_t optimal_parameters;
};
class bloom_parameters;
class BloomFilter
{
protected:
typedef uint32_t bloom_type;
typedef uint8_t cell_type;
typedef std::vector <cell_type>::iterator Iterator;
public:
class Error : public std::runtime_error
{
@@ -111,9 +65,7 @@ public:
using std::runtime_error::runtime_error;
};
BloomFilter();
explicit BloomFilter(const BloomParameters& p);
BloomFilter() = default;
BloomFilter(unsigned int projected_element_count,
double false_positive_probability);
@@ -122,10 +74,6 @@ public:
double false_positive_probability,
const ndn::name::Component& bfName);
BloomParameters
getParameters(unsigned int projected_element_count,
double false_positive_probability);
/**
* @brief Append our bloom filter to the given name
*
@@ -146,35 +94,53 @@ public:
bool
contains(const std::string& key) const;
std::vector<cell_type>
table() const;
private:
typedef uint32_t bloom_type;
typedef uint8_t cell_type;
explicit
BloomFilter(const bloom_parameters& p);
void
compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const;
void
generate_unique_salt();
void
compute_indices(const bloom_type& hash,
std::size_t& bit_index, std::size_t& bit) const;
private: // non-member operators
// NOTE: the following "hidden friend" operators are available via
// argument-dependent lookup only and must be defined inline.
// Equality is decided by the bit tables alone.
friend bool
operator==(const BloomFilter& lhs, const BloomFilter& rhs)
{
  const auto& lhsBits = lhs.bit_table_;
  const auto& rhsBits = rhs.bit_table_;
  return lhsBits == rhsBits;
}
// Inequality is the negation of bit table equality.
friend bool
operator!=(const BloomFilter& lhs, const BloomFilter& rhs)
{
  return !(lhs.bit_table_ == rhs.bit_table_);
}
// Streams the contents of the bit table through ndn::printHex.
// NOTE(review): the trailing `false` presumably selects lowercase hex digits;
// confirm against the ndn-cxx string-helper declaration.
friend std::ostream&
operator<<(std::ostream& os, const BloomFilter& bf)
{
  const auto& bits = bf.bit_table_;
  ndn::printHex(os, bits.data(), bits.size(), false);
  return os;
}
private:
std::vector <bloom_type> salt_;
std::vector <cell_type> bit_table_;
unsigned int salt_count_;
unsigned int table_size_; // 8 * raw_table_size;
unsigned int raw_table_size_;
unsigned int projected_element_count_;
unsigned int inserted_element_count_;
unsigned long long int random_seed_;
double desired_false_positive_probability_;
std::vector<bloom_type> salt_;
std::vector<cell_type> bit_table_;
unsigned int salt_count_ = 0;
unsigned int table_size_ = 0;
unsigned int projected_element_count_ = 0;
unsigned int inserted_element_count_ = 0;
unsigned long long int random_seed_ = 0;
double desired_false_positive_probability_ = 0.0;
};
bool
operator==(const BloomFilter& bf1, const BloomFilter& bf2);
std::ostream&
operator<<(std::ostream& out, const BloomFilter& bf);
} // namespace psync
#endif // PSYNC_DETAIL_BLOOM_FILTER_HPP