Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 15 additions & 2 deletions src/util/irep_hash.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,22 @@ Author: Michael Tautschnig, mt@eecs.qmul.ac.uk

// you need to pick one of the following options

#define IREP_HASH_BASIC
// #define IREP_HASH_BASIC
// #define IREP_HASH_MURMURHASH2A
// #define IREP_HASH_MURMURHASH3
#define IREP_HASH_MURMURHASH3

Comment thread
tautschnig marked this conversation as resolved.
#if !defined(IREP_HASH_BASIC) && !defined(IREP_HASH_MURMURHASH2A) && \
!defined(IREP_HASH_MURMURHASH3)
# error \
"Exactly one of IREP_HASH_BASIC, IREP_HASH_MURMURHASH2A, or IREP_HASH_MURMURHASH3 must be defined"
#endif

#if(defined(IREP_HASH_BASIC) && defined(IREP_HASH_MURMURHASH2A)) || \
(defined(IREP_HASH_BASIC) && defined(IREP_HASH_MURMURHASH3)) || \
(defined(IREP_HASH_MURMURHASH2A) && defined(IREP_HASH_MURMURHASH3))
# error \
"Exactly one of IREP_HASH_BASIC, IREP_HASH_MURMURHASH2A, or IREP_HASH_MURMURHASH3 must be defined"
#endif

// Comparison for OS X, 64 bit, LLVM version 5.1:
//
Expand Down
67 changes: 55 additions & 12 deletions src/util/string_hash.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,23 +11,66 @@ Author: Daniel Kroening, kroening@kroening.com

#include "string_hash.h"

size_t hash_string(const std::string &s)
{
size_t h=0;
size_t size=s.size();
#include <cstdint>
#include <cstring>

for(unsigned i=0; i<size; i++)
h=(h<<5)-h+s[i];
// Use a hash with good avalanche properties for CBMC's symbol names,
// which often share long common prefixes (e.g., main::1::x!0@1#1).
//
// We pick the FNV-1a variant width to match `std::size_t` so that
// 32-bit builds use 32-bit constants and arithmetic (avoiding libgcc
// 64-bit multiplication helpers) and 64-bit builds use the wider
// 64-bit variant. The Murmur fmix finalizer is applied at the end for
// additional avalanche.

return h;
template <std::size_t bits>
static inline std::size_t hash_string_impl(const char *data, std::size_t len);

template <>
inline std::size_t hash_string_impl<64>(const char *data, std::size_t len)
{
// FNV-1a-64.
std::uint64_t h = 0xcbf29ce484222325ULL; // offset basis
// Mix in the length first to differentiate strings that are prefixes
// of each other.
h ^= len;
for(std::size_t i = 0; i < len; ++i)
h = (h ^ static_cast<unsigned char>(data[i])) * 0x100000001b3ULL; // prime
// Murmur fmix64 finalizer (kept in sync with fmix32 / fmix64 in
// irep_hash.h).
h ^= h >> 33;
h *= 0xff51afd7ed558ccdULL;
h ^= h >> 33;
h *= 0xc4ceb9fe1a85ec53ULL;
h ^= h >> 33;
return static_cast<std::size_t>(h);
}

size_t hash_string(const char *s)
template <>
inline std::size_t hash_string_impl<32>(const char *data, std::size_t len)
{
size_t h=0;
// FNV-1a-32.
std::uint32_t h = 0x811c9dc5u; // offset basis
// Mix in the length first to differentiate strings that are prefixes
// of each other.
h ^= static_cast<std::uint32_t>(len);
for(std::size_t i = 0; i < len; ++i)
h = (h ^ static_cast<unsigned char>(data[i])) * 0x01000193u; // prime
// Murmur fmix32 finalizer (kept in sync with fmix32 in irep_hash.h).
h ^= h >> 16;
h *= 0x85ebca6bu;
h ^= h >> 13;
h *= 0xc2b2ae35u;
h ^= h >> 16;
return h;
}

for(; *s!=0; s++)
h=(h<<5)-h+*s;
size_t hash_string(const std::string &s)
{
return hash_string_impl<sizeof(std::size_t) * 8>(s.data(), s.size());
}

return h;
size_t hash_string(const char *s)
{
return hash_string_impl<sizeof(std::size_t) * 8>(s, std::strlen(s));
}
1 change: 1 addition & 0 deletions unit/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,7 @@ SRC += analyses/ai/ai.cpp \
util/ssa_expr.cpp \
util/std_expr.cpp \
util/string2int.cpp \
util/string_hash.cpp \
util/structured_data.cpp \
util/string_utils/capitalize.cpp \
util/string_utils/escape_non_alnum.cpp \
Expand Down
83 changes: 83 additions & 0 deletions unit/util/string_hash.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
/*******************************************************************\

Module: Unit tests for string_hash.h

Author: Diffblue Ltd.

\*******************************************************************/

#include <util/string_hash.h>

#include <testing-utils/use_catch.h>

#include <set>
#include <string>

TEST_CASE(
"hash_string is deterministic for equal inputs",
"[core][util][string_hash]")
{
// Pure determinism: equal inputs map to equal hash values, no matter
// whether we go through the std::string overload or the const char *
// overload, and no matter how many times we compute it.
const std::string s = "main::1::x!0@1#1";
REQUIRE(hash_string(s) == hash_string(s));
REQUIRE(hash_string(s) == hash_string(s.c_str()));
REQUIRE(hash_string("") == hash_string(std::string{}));
}

TEST_CASE(
"hash_string distinguishes two-character strings that differ in one bit",
"[core][util][string_hash]")
{
// The classic worst-case for hash functions with poor avalanche: tiny
// inputs that differ in a single bit. FNV-1a with the Murmur fmix
// finalizer should map these to clearly distinct hash values.
CHECK(hash_string("a") != hash_string("b"));
CHECK(hash_string("ab") != hash_string("ac"));
CHECK(hash_string("ab") != hash_string("aa"));
CHECK(hash_string("ab") != hash_string("ba"));
}

TEST_CASE(
"hash_string distinguishes strings that are prefixes of one another",
"[core][util][string_hash]")
{
// CBMC produces lots of SSA-renamed symbol names that share long
// common prefixes (e.g., main::1::x!0@1#1, main::1::x!0@1#2). Mixing
// the length into the hash before processing the bytes ensures that
// a string and any of its prefixes hash to different values.
CHECK(hash_string("main::1::x") != hash_string("main::1::x!"));
CHECK(hash_string("main::1::x!0@1#1") != hash_string("main::1::x!0@1#11"));
CHECK(hash_string("") != hash_string("a"));
}

TEST_CASE(
"hash_string has acceptable distribution on CBMC-style symbol names",
"[core][util][string_hash]")
{
// Sanity check: hashing a small batch of CBMC-style SSA-renamed
// symbol names should produce no collisions at all in 64-bit (and
// very few in 32-bit). The previous djb2-variant hash would
// collide noticeably on inputs of this shape, which is what
// motivated this change.
std::set<std::size_t> seen;
std::size_t inserted = 0;
for(int frame = 0; frame < 16; ++frame)
{
for(int var = 0; var < 16; ++var)
{
for(int ssa = 0; ssa < 16; ++ssa)
{
const std::string name = "main::" + std::to_string(frame) + "::x" +
std::to_string(var) + "!0@1#" +
std::to_string(ssa);
seen.insert(hash_string(name));
++inserted;
}
}
}
// Allow a small margin so 32-bit builds (which truncate to 32 bits)
// are not flaky if a handful of collisions show up.
REQUIRE(seen.size() >= inserted * 95 / 100);
}
Loading