/** @file Table.h Templated hash table class. @maintainer Morgan McGuire, http://graphics.cs.williams.edu @created 2001-04-22 @edited 2013-01-22 Copyright 2000-2013, Morgan McGuire. All rights reserved. */ #ifndef G3D_Table_h #define G3D_Table_h #include #include #include "G3D/platform.h" #include "G3D/Array.h" #include "G3D/debug.h" #include "G3D/System.h" #include "G3D/g3dmath.h" #include "G3D/EqualsTrait.h" #include "G3D/HashTrait.h" #include "G3D/MemoryManager.h" #ifdef _MSC_VER # pragma warning (push) // Debug name too long warning # pragma warning (disable : 4786) #endif namespace G3D { /** An unordered data structure mapping keys to values. There are two ways of definining custom hash functions (G3D provides built-in ones for most classes):

 class Foo {
 public:
     std::string     name;
     int             index;
     static size_t hashCode(const Foo& key) {
          return HashTrait::hashCode(key.name) + key.index;
     }
  };

  template<> struct HashTrait {
       static size_t hashCode(const Foo& key) { return HashTrait::hashCode(key.name) + key.index; }
  }; 


  // Use Foo::hashCode
  Table fooTable1;

  // Use HashTrait
  Table      fooTable2;

Key must be a pointer, an int, a std::string or provide overloads for:

    template<> struct HashTrait {
        static size_t hashCode(const Key& key) { return reinterpret_cast( ... ); }
    };

and one of

    template<> struct EqualsTrait{
         static bool equals(const Key& a, const Key& b) { return ... ; }
    };


    bool operator==(const Key&, const Key&);

G3D pre-defines HashTrait specializations for common types (like int and std::string). If you use a Table with a different type you must write those functions yourself. For example, an enum would use:

    template<> struct HashTrait {
        static size_t hashCode(const MyEnum& key) const { return reinterpret_cast( key ); }
    };

And rely on the default enum operator==. Periodically check that debugGetLoad() is low (> 0.1). When it gets near 1.0 your hash function is badly designed and maps too many inputs to the same output. */ template, class EqualsFunc = EqualsTrait > class Table { public: /** The pairs returned by iterator. */ class Entry { public: Key key; Value value; Entry() {} Entry(const Key& k) : key(k) {} Entry(const Key& k, const Value& v) : key(k), value(v) {} bool operator==(const Entry &peer) const { return (key == peer.key && value == peer.value); } bool operator!=(const Entry &peer) const { return !operator==(peer); } }; private: typedef Table ThisType; /** Linked list nodes used internally by HashTable. */ class Node { public: Entry entry; size_t hashCode; Node* next; private: // Private to require use of the allocator Node(const Key& k, const Value& v, size_t h, Node* n) : entry(k, v), hashCode(h), next(n) { debugAssert((next == NULL) || isValidHeapPointer(next)); } Node(const Key& k, size_t h, Node* n) : entry(k), hashCode(h), next(n) { debugAssert((next == NULL) || isValidHeapPointer(next)); } public: static Node* create(const Key& k, const Value& v, size_t h, Node* n, MemoryManager::Ref& mm) { Node* node = (Node*)mm->alloc(sizeof(Node)); return new (node) Node(k, v, h, n); } static Node* create(const Key& k, size_t hashCode, Node* n, MemoryManager::Ref& mm) { Node* node = (Node*)mm->alloc(sizeof(Node)); return new (node) Node(k, hashCode, n); } static void destroy(Node* n, MemoryManager::Ref& mm) { n->~Node(); mm->free(n); } /** Clones a whole chain; */ Node* clone(MemoryManager::Ref& mm) { return create(this->entry.key, this->entry.value, hashCode, (next == NULL) ? NULL : next->clone(mm), mm); } }; void checkIntegrity() const { # ifdef G3D_DEBUG debugAssert(m_bucket == NULL || isValidHeapPointer(m_bucket)); for (size_t b = 0; b < m_numBuckets; ++b) { Node* node = m_bucket[b]; debugAssert(node == NULL || isValidHeapPointer(node)); while (node != NULL) { debugAssert(node == NULL || isValidHeapPointer(node)); node = node->next; } } # endif } /** Number of elements in the table.*/ size_t m_size; /** Array of Node*. We don't use Array because Table is lower-level than Array. Some elements may be NULL. */ Node** m_bucket; /** Length of the m_bucket array. */ size_t m_numBuckets; MemoryManager::Ref m_memoryManager; void* alloc(size_t s) const { return m_memoryManager->alloc(s); } void free(void* p) const { return m_memoryManager->free(p); } /** Re-hashes for a larger m_bucket size. */ void resize(size_t newSize) { // Hang onto the old m_bucket array Node** oldBucket = m_bucket; // Allocate a new m_bucket array with the new size m_bucket = (Node**)alloc(sizeof(Node*) * newSize); alwaysAssertM(m_bucket != NULL, "MemoryManager::alloc returned NULL. Out of memory."); // Set all pointers to NULL System::memset(m_bucket, 0, newSize * sizeof(Node*)); // Move each node to its new hash location for (size_t b = 0; b < m_numBuckets; ++b) { Node* node = oldBucket[b]; // There is a linked list of nodes at this m_bucket while (node != NULL) { // Hang onto the old next pointer Node* nextNode = node->next; // Insert at the head of the list for m_bucket[i] size_t i = node->hashCode % newSize; node->next = m_bucket[i]; m_bucket[i] = node; // Move on to the next node node = nextNode; } // Drop the old pointer for cleanliness when debugging oldBucket[b] = NULL; } // Delete the old storage free(oldBucket); this->m_numBuckets = newSize; checkIntegrity(); } void copyFrom(const ThisType& h) { if (&h == this) { return; } debugAssert(m_bucket == NULL); m_size = h.m_size; m_numBuckets = h.m_numBuckets; m_bucket = (Node**)alloc(sizeof(Node*) * m_numBuckets); // No need to NULL elements since we're about to overwrite them for (size_t b = 0; b < m_numBuckets; ++b) { if (h.m_bucket[b] != NULL) { m_bucket[b] = h.m_bucket[b]->clone(m_memoryManager); } else { m_bucket[b] = NULL; } } checkIntegrity(); } /** Frees the heap structures for the nodes. */ void freeMemory() { checkIntegrity(); for (size_t b = 0; b < m_numBuckets; ++b) { Node* node = m_bucket[b]; while (node != NULL) { Node* next = node->next; Node::destroy(node, m_memoryManager); node = next; } m_bucket[b] = NULL; } free(m_bucket); m_bucket = NULL; m_numBuckets = 0; m_size = 0; } public: /** Creates an empty hash table using the default MemoryManager. */ Table() : m_bucket(NULL) { m_memoryManager = MemoryManager::create(); m_numBuckets = 0; m_size = 0; m_bucket = NULL; checkIntegrity(); } /** Changes the internal memory manager to m */ void clearAndSetMemoryManager(const MemoryManager::Ref& m) { clear(); debugAssert(m_bucket == NULL); m_memoryManager = m; } /** Recommends that the table resize to anticipate at least this number of elements. */ void setSizeHint(size_t n) { size_t s = n * 3; if (s > m_numBuckets) { resize(s); } } /** Destroys all of the memory allocated by the table, but does not call delete on keys or values if they are pointers. If you want to deallocate things that the table points at, use getKeys() and Array::deleteAll() to delete them. */ virtual ~Table() { freeMemory(); } /** Uses the default memory manager */ Table(const ThisType& h) { m_memoryManager = MemoryManager::create(); m_numBuckets = 0; m_size = 0; m_bucket = NULL; this->copyFrom(h); checkIntegrity(); } Table& operator=(const ThisType& h) { // No need to copy if the argument is this if (this != &h) { // Free the existing nodes freeMemory(); this->copyFrom(h); checkIntegrity(); } return *this; } /** Returns the length of the deepest m_bucket. */ size_t debugGetDeepestBucketSize() const { size_t deepest = 0; for (size_t b = 0; b < m_numBuckets; ++b) { size_t count = 0; Node* node = m_bucket[b]; while (node != NULL) { node = node->next; ++count; } if (count > deepest) { deepest = count; } } return deepest; } /** Returns the average size of non-empty buckets. */ float debugGetAverageBucketSize() const { uint64 num = 0; for (size_t b = 0; b < m_numBuckets; ++b) { Node* node = m_bucket[b]; if (node != NULL) { ++num; } } return (float)((double)size() / num); } /** A small load (close to zero) means the hash table is acting very efficiently most of the time. A large load (close to 1) means the hash table is acting poorly-- all operations will be very slow. A large load will result from a bad hash function that maps too many keys to the same code. */ double debugGetLoad() const { return (double)size() / m_numBuckets; } /** Returns the number of buckets. */ size_t debugGetNumBuckets() const { return m_numBuckets; } /** C++ STL style iterator variable. See begin(). */ class Iterator { private: friend class Table; /** Bucket index. */ size_t index; /** Linked list node. */ Node* node; size_t m_numBuckets; Node** m_bucket; bool isDone; /** Creates the end iterator. */ Iterator() : index(0), node(NULL), m_bucket(NULL) { isDone = true; } Iterator(size_t numBuckets, Node** m_bucket) : index(0), node(NULL), m_numBuckets(numBuckets), m_bucket(m_bucket) { if (m_numBuckets == 0) { // Empty table isDone = true; return; } # ifdef G3D_DEBUG for (unsigned int i = 0; i < m_numBuckets; ++i) { debugAssert((m_bucket[i] == NULL) || isValidHeapPointer(m_bucket[i])); } # endif index = 0; node = m_bucket[index]; debugAssert((node == NULL) || isValidHeapPointer(node)); isDone = false; findNext(); debugAssert((node == NULL) || isValidHeapPointer(node)); } /** If node is NULL, then finds the next element by searching through the bucket array. Sets isDone if no more nodes are available. */ void findNext() { while (node == NULL) { ++index; if (index >= m_numBuckets) { m_bucket = NULL; index = 0; isDone = true; return; } else { node = m_bucket[index]; debugAssert((node == NULL) || isValidHeapPointer(node)); } } debugAssert(isValidHeapPointer(node)); } public: inline bool operator!=(const Iterator& other) const { return !(*this == other); } bool operator==(const Iterator& other) const { if (other.isDone || isDone) { // Common case; check against isDone. return (isDone == other.isDone); } else { return (node == other.node) && (index == other.index); } } /** Pre increment. */ Iterator& operator++() { debugAssert(! isDone); debugAssert(node != NULL); debugAssert(isValidHeapPointer(node)); debugAssert((node->next == NULL) || isValidHeapPointer(node->next)); node = node->next; findNext(); debugAssert(isDone || isValidHeapPointer(node)); return *this; } /** Post increment (slower than preincrement). */ Iterator operator++(int) { Iterator old = *this; ++(*this); return old; } const Entry& operator*() const { return node->entry; } const Value& value() const { return node->entry.value; } const Key& key() const { return node->entry.key; } Entry* operator->() const { debugAssert(isValidHeapPointer(node)); return &(node->entry); } operator Entry*() const { debugAssert(isValidHeapPointer(node)); return &(node->entry); } bool isValid() const { return ! isDone; } /** @deprecated Use isValid */ bool hasMore() const { return ! isDone; } }; /** C++ STL style iterator method. Returns the first Entry, which contains a key and value. Use preincrement (++entry) to get to the next element. Do not modify the table while iterating. */ Iterator begin() const { return Iterator(m_numBuckets, m_bucket); } /** C++ STL style iterator method. Returns one after the last iterator element. */ const Iterator end() const { return Iterator(); } /** Removes all elements. Guaranteed to free all memory associated with the table. */ void clear() { freeMemory(); m_numBuckets = 0; m_size = 0; m_bucket = NULL; } /** Returns the number of keys. */ size_t size() const { return m_size; } /** If you insert a pointer into the key or value of a table, you are responsible for deallocating the object eventually. Inserting key into a table is O(1), but may cause a potentially slow rehashing. */ void set(const Key& key, const Value& value) { getCreateEntry(key).value = value; } private: /** Helper for remove() and getRemove() */ bool remove(const Key& key, Key& removedKey, Value& removedValue, bool updateRemoved) { if (m_numBuckets == 0) { return false; } const size_t code = HashFunc::hashCode(key); const size_t b = code % m_numBuckets; // Go to the m_bucket Node* n = m_bucket[b]; if (n == NULL) { return false; } Node* previous = NULL; // Try to find the node do { if ((code == n->hashCode) && EqualsFunc::equals(n->entry.key, key)) { // This is the node; remove it // Replace the previous's next pointer if (previous == NULL) { m_bucket[b] = n->next; } else { previous->next = n->next; } if (updateRemoved) { removedKey = n->entry.key; removedValue = n->entry.value; } // Delete the node Node::destroy(n, m_memoryManager); --m_size; //checkIntegrity(); return true; } previous = n; n = n->next; } while (n != NULL); //checkIntegrity(); return false; } public: /** If @a member is present, sets @a removed to the element being removed and returns true. Otherwise returns false and does not write to @a removed. */ bool getRemove(const Key& key, Key& removedKey, Value& removedValue) { return remove(key, removedKey, removedValue, true); } /** Removes an element from the table if it is present. @return true if the element was found and removed, otherwise false */ bool remove(const Key& key) { Key x; Value v; return remove(key, x, v, false); } private: Entry* getEntryPointer(const Key& key) const { if (m_numBuckets == 0) { return NULL; } size_t code = HashFunc::hashCode(key); size_t b = code % m_numBuckets; Node* node = m_bucket[b]; while (node != NULL) { if ((node->hashCode == code) && EqualsFunc::equals(node->entry.key, key)) { return &(node->entry); } node = node->next; } return NULL; } public: /** If a value that is EqualsFunc to @a member is present, returns a pointer to the version stored in the data structure, otherwise returns NULL. */ const Key* getKeyPointer(const Key& key) const { const Entry* e = getEntryPointer(key); if (e == NULL) { return NULL; } else { return &(e->key); } } /** Returns the value associated with key. @deprecated Use get(key, val) or getPointer(key) */ Value& get(const Key& key) const { Entry* e = getEntryPointer(key); debugAssertM(e != NULL, "Key not found"); return e->value; } /** Returns a pointer to the element if it exists, or NULL if it does not. Note that if your value type is a pointer, the return value is a pointer to a pointer. Do not remove the element while holding this pointer. It is easy to accidentally mis-use this method. Consider making a Table and using get(key, val) instead, which makes you manage the memory for the values yourself and is less likely to result in pointer errors. */ Value* getPointer(const Key& key) const { if (m_numBuckets == 0) { return NULL; } size_t code = HashFunc::hashCode(key); size_t b = code % m_numBuckets; Node* node = m_bucket[b]; while (node != NULL) { if ((node->hashCode == code) && EqualsFunc::equals(node->entry.key, key)) { // found key return &(node->entry.value); } node = node->next; } // Failed to find key return NULL; } /** If the key is present in the table, val is set to the associated value and returns true. If the key is not present, returns false. */ bool get(const Key& key, Value& val) const { Value* v = getPointer(key); if (v != NULL) { val = *v; return true; } else { return false; } } /** Called by getCreate() and set() \param created Set to true if the entry was created by this method. */ Entry& getCreateEntry(const Key& key, bool& created) { created = false; if (m_numBuckets == 0) { resize(10); } size_t code = HashFunc::hashCode(key); size_t b = code % m_numBuckets; // Go to the m_bucket Node* n = m_bucket[b]; // No m_bucket, so this must be the first if (n == NULL) { m_bucket[b] = Node::create(key, code, NULL, m_memoryManager); ++m_size; created = true; //checkIntegrity(); return m_bucket[b]->entry; } size_t bucketLength = 1; // Sometimes a bad hash code will cause all elements // to collide. Detect this case and don't rehash when // it occurs; nothing good will come from the rehashing. bool allSameCode = true; // Try to find the node do { allSameCode = allSameCode && (code == n->hashCode); if ((code == n->hashCode) && EqualsFunc::equals(n->entry.key, key)) { // This is the a pre-existing node //checkIntegrity(); return n->entry; } n = n->next; ++bucketLength; } while (n != NULL); // Allow the load factor to rise as the table gets huge const int bucketsPerElement = (m_size > 50000) ? 3 : ((m_size > 10000) ? 5 : ((m_size > 5000) ? 10 : 15)); const size_t maxBucketLength = 3; // (Don't bother changing the size of the table if all entries // have the same hashcode--they'll still collide) if ((bucketLength > maxBucketLength) && ! allSameCode && (m_numBuckets < m_size * bucketsPerElement)) { // This m_bucket was really large; rehash if all elements // don't have the same hashcode the number of buckets is // reasonable. // Back off the scale factor as the number of buckets gets // large float f = 3.0f; if (m_numBuckets > 1000000) { f = 1.5f; } else if (m_numBuckets > 100000) { f = 2.0f; } int newSize = iMax((int)(m_numBuckets * f) + 1, (int)(m_size * f)); resize(newSize); } // Not found; insert at the head. b = code % m_numBuckets; m_bucket[b] = Node::create(key, code, m_bucket[b], m_memoryManager); ++m_size; created = true; //checkIntegrity(); return m_bucket[b]->entry; } Entry& getCreateEntry(const Key& key) { bool ignore; return getCreateEntry(key, ignore); } /** Returns the current value that key maps to, creating it if necessary.*/ Value& getCreate(const Key& key) { return getCreateEntry(key).value; } /** \param created True if the element was created. */ Value& getCreate(const Key& key, bool& created) { return getCreateEntry(key, created).value; } /** Returns true if key is in the table. */ bool containsKey(const Key& key) const { if (m_numBuckets == 0) { return false; } size_t code = HashFunc::hashCode(key); size_t b = code % m_numBuckets; Node* node = m_bucket[b]; while (node != NULL) { if ((node->hashCode == code) && EqualsFunc::equals(node->entry.key, key)) { return true; } node = node->next; } return false; } /** Short syntax for get. */ inline Value& operator[](const Key &key) const { return get(key); } /** Returns an array of all of the keys in the table. You can iterate over the keys to get the values. @deprecated */ Array getKeys() const { Array keyArray; getKeys(keyArray); return keyArray; } void getKeys(Array& keyArray) const { keyArray.resize(0, DONT_SHRINK_UNDERLYING_ARRAY); for (size_t i = 0; i < m_numBuckets; ++i) { Node* node = m_bucket[i]; while (node != NULL) { keyArray.append(node->entry.key); node = node->next; } } } /** Will contain duplicate values if they exist in the table. This array is parallel to the one returned by getKeys() if the table has not been modified. */ void getValues(Array& valueArray) const { valueArray.resize(0, DONT_SHRINK_UNDERLYING_ARRAY); for (size_t i = 0; i < m_numBuckets; ++i) { Node* node = m_bucket[i]; while (node != NULL) { valueArray.append(node->entry.value); node = node->next; } } } /** Calls delete on all of the keys and then clears the table. */ void deleteKeys() { for (size_t i = 0; i < m_numBuckets; ++i) { Node* node = m_bucket[i]; while (node != NULL) { delete node->entry.key; node->entry.key = NULL; node = node->next; } } clear(); } /** Calls delete on all of the values. This is unsafe-- do not call unless you know that each value appears at most once. Does not clear the table, so you are left with a table of NULL pointers. */ void deleteValues() { for (size_t i = 0; i < m_numBuckets; ++i) { Node* node = m_bucket[i]; while (node != NULL) { delete node->entry.value; node->entry.value = NULL; node = node->next; } } } template bool operator==(const Table& other) const { if (size() != other.size()) { return false; } for (Iterator it = begin(); it.hasMore(); ++it) { const Value* v = other.getPointer(it->key); if ((v == NULL) || (*v != it->value)) { // Either the key did not exist or the value was not the same return false; } } // this and other have the same number of keys, so we don't // have to check for extra keys in other. return true; } template bool operator!=(const Table& other) const { return ! (*this == other); } void debugPrintStatus() { debugPrintf("Deepest bucket size = %d\n", (int)debugGetDeepestBucketSize()); debugPrintf("Average bucket size = %g\n", debugGetAverageBucketSize()); debugPrintf("Load factor = %g\n", debugGetLoad()); } }; } // namespace #ifdef _MSC_VER # pragma warning (pop) #endif #endif