Simulation implementation and encapsulation of unordered_map and unordered_set

Closed hash hash table and open hash hash bucket data structure

enum State//state
    {
        EMPTY, //empty
        EXIST, // exists
        DELETE//delete
    };

template<class K, class V>
struct HashData//data structure
    {
        pair<K, V> _kv;
        State_state = EMPTY;
    };
template<class T>
struct HashNode
    {
        HashNode<T>* _next;
        T_data;

        HashNode(const T & data)
                :_next(nullptr)
                , _data(data)
        {}
    };

The closed hash hash table is actually a vector array, which is the same in essence as the sequence table. In order to facilitate deletion, we directly set a State value for each data. When it is EMPTY, it is empty and can insert values. When it is EXIST, it is Because it exists, a value cannot be inserted. When we need to delete, we directly set the status value to DELETE, which is regarded as deletion. When inserting a value, we only need to judge whether it is EMPTY or DELETE.
To open a hash bucket is to store pointers in the vector, and then link the key values.

Hash Table

 class HashTable
    {
    public:
bool Insert(const pair<K, V> & kv);
HashData<K, V>* Find(const K & amp; key);
bool Erase(const K & amp; key);
    private:
        vector<HashData<K, V>> _tables;
        size_t _n = 0; // the number of stored data
    };

Find()

 HashData<K, V>* Find(const K & amp; key)
        {
            if (_tables. size() == 0)
            {
                return nullptr;
            }

            size_t hash = key % _tables. size();

            // linear detection
            size_t i = 1;
            size_t index = hash;
            while (_tables[index]._state != EMPTY)
            {
                if (_tables[index]._state == EXIST
                         & amp; & amp; _tables[index]._kv.first == key)
                {
                    return &_tables[index];
                }

                index = hashi + i;//Look backward
                index %= _tables.size();//After searching around index==hashi
                 + + i;
                // Exit the loop if not found
                if (index == hash)
                {
                    break;
                }
            }

            return nullptr;
        }

Data lookup only needs to judge whether the value exists, and then look back one by one. Use hash to locate the key position. If the hash conflicts, the key is hashed. To look back, add an i value to the index One finds backwards, and then modifies % when the search is completed, and the search is completed when it is equal.

Erase()

 bool Erase(const K & amp; key)
        {
            HashData<K, V>* ret = Find(key);
            if (ret)
            {
                ret->_state = DELETE;//Modify the state value to "delete"
                --_n;
                return true;
            }
            else
            {
                return false;
            }
        }

After finding the value, modify the State state to DELETE.

Insert()

First judge whether expansion is needed, if the load factor>=7, then expand

 //Determine whether to expand
//if ((double)_n / (double)_tables. size() >= 0.7)
if (_tables. size() == 0 || _n * 10 / _tables. size() >= 7)
            {
                size_t newsize = _tables. size() == 0 ? 10 : _tables. size() * 2;
                HashTable<K, V> newht;
                newht._tables.resize(newsize);

                // traverse the old table and remap to the new table
                for (auto & data : _tables)
                {
                    if (data._state == EXIST)
                    {
                        newht.Insert(data._kv);
                    }
                }

                _tables. swap(newht. _tables);
            }

            size_t hash = kv.first % _tables.size();

The method of expansion is to traverse the old table, reinsert it into the new table, then exchange the two tables, and update the mapping relationship hashi

 bool Insert(const pair<K, V> & kv)
        {
            if (Find(kv.first))//if found, it will not be inserted
                return false;
            // Judgment expansion
//
            // linear detection
            size_t i = 1;
            size_t index = hash;
            while (_tables[index]._state == EXIST)
            {
                index = hash + i;
                index %= _tables. size();
                 + + i;
            }

            _tables[index]._kv = kv;
            _tables[index]._state = EXIST;
            _n++;

            return true;
        }

After the expansion is completed, continue the linear detection to find an empty position to insert.

Iterator

 template<class K, class T, class Ref, class Ptr, class KeyOfT, class Hash>
    struct __HashIterator
    {
        typedef HashNode<T> Node;
        typedef HashTable<K, T, KeyOfT, Hash> HT;
        typedef __HashIterator<K, T, Ref, Ptr, KeyOfT, Hash> Self;

        typedef __HashIterator<K, T, T &, T*, KeyOfT, Hash> Iterator;

        Node* _node;
        const HT* _ht;

        __HashIterator(Node* node, const HT* ht)
                :_node(node)
                , _ht(ht)
        {}

        __HashIterator(const Iterator & it)
                :_node(it._node)
                , _ht(it._ht)
        {}
Ref operator*()
        {
            return _node->_data;
        }

        Ptr operator->()
        {
            return &_node->_data;
        }

        bool operator!=(const Self & s)
        {
            return _node != s._node;
        }

The difficulty in the implementation of iterators lies in the set of templates. Like red-black tree iterators, const iterators can also be constructed with non-const

operator + + ()

 Self & operator + + ()
        {
            if (_node->_next != nullptr)
            {
                _node = _node->_next;
            }
            else
            {
                // Find the next non-empty bucket
                KeyOfT kot;
                Hash hash;
                // figure out my current bucket position
                size_t hash = hash(kot(_node->_data)) % _ht->_tables.size();
                 + +hashi;
                while (hashi < _ht->_tables. size())
                {
                    if (_ht->_tables[hashi])
                    {
                        _node = _ht->_tables[hashi];
                        break;
                    }
                    else
                    {
                         + +hashi;
                    }
                }

                // No non-empty bucket found
                if (hashi == _ht->_tables. size())
                {
                    _node = nullptr;
                }
            }

            return *this;
        }

After the linked list of a bucket is over, find the next bucket that is not empty. Two functors kot and hash are used here, which is to pass the key value in _data and compare it with size. The functor will be used to encapsulate unorderedmap and unorderedset later.

struct MapKeyOfT
        {
            const K & amp; operator()(const pair<K, V> & amp; kv)
            {
                return kv.first;
            }
        };
struct SetKeyOfT
        {
            const K & amp; operator()(const K & amp; key)
            {
                return key;
            }
        };

Hash is a functor that converts the key value into [integer that can be modulo]

template<class K>
struct HashFunc
{
    size_t operator()(const K & key)
    {
        return key;
    }
};

// specialization
template<>
struct HashFunc<string>
{
    // BKDR
    size_t operator()(const string & s)
    {
        size_t hash = 0;
        for (auto ch : s)
        {
            hash += ch;
            hash *= 31;
        }

        return hash;
    }
};

Due to the particularity of the string, it needs to be processed separately. The case of specialization is used here; multiplying by 31 is to deal with the case where the characters are the same but in different order.

Hash bucket

template<class K, class T, class KeyOfT, class Hash>
class HashTable
{
  template<class K1, class T1, class Ref, class Ptr, class KeyOfT1, class Hash1>
  friend struct __HashIterator;

  typedef HashNode<T> Node;
public:
  typedef __HashIterator<K, T, T &, T*, KeyOfT, Hash> iterator;
  typedef __HashIterator<K, T, const T &, const T*, KeyOfT, Hash> const_iterator;
private:
        vector<Node*> _tables; // array of pointers
        size_t _n = 0; // Store the number of valid data
    };

To allow the iterator to access private members, set it as a friend function. Note that the template names cannot conflict.

 iterator begin()
        {
            Node* cur = nullptr;
            for (size_t i = 0; i < _tables. size(); + + i)
            {
                cur = _tables[i];
                if (cur)
                {
                    break;
                }
            }

            return iterator(cur, this);
        }

        iterator end()
        {
            return iterator(nullptr, this);
        }

        const_iterator begin() const
        {
            Node* cur = nullptr;
            for (size_t i = 0; i < _tables. size(); + + i)
            {
                cur = _tables[i];
                if (cur)
                {
                    break;
                }
            }

            return const_iterator(cur, this);
        }

        const_iterator end() const
        {
            return const_iterator(nullptr, this);
        }

Begin only needs to find the first node in the first non-empty bucket, and end is empty.

Find()

 iterator Find(const K & amp; key)
        {
            if (_tables. size() == 0)
                return end();

            KeyOfT kot;
            Hash hash;
            size_t hash = hash(key) % _tables. size();
            Node* cur = _tables[hashi];
            while (cur)
            {
                if (kot(cur->_data) == key)
                {
                    return iterator(cur, this);
                }

                cur = cur->_next;
            }

            return end();
        }

find has nothing to say, just find the mapped bucket, and then traverse the linked list

Erase()

 bool Erase(const K & amp; key)
        {
            Hash hash;
            KeyOfT kot;
            size_t hash = hash(key) % _tables. size();
            Node* prev = nullptr;
            Node* cur = _tables[hashi];
            while (cur)
            {
                if (kot(cur->_data) == key)
                {
                    if (prev == nullptr)
                    {
                        _tables[hashi] = cur->_next;
                    }
                    else
                    {
                        prev->_next = cur->_next;
                    }
                    delete cur;

                    return true;
                }
                else
                {
                    prev = cur;
                    cur = cur->_next;
                }
            }

            return false;
        }

To delete is to find the node, and then delete the node first.

Insert()

 pair<iterator, bool> Insert(const T & amp; data)
        {
            KeyOfT kot;
            iterator it = Find(kot(data));
            if (it != end())
            {
                return make_pair(it, false);
            }
            Hash hash;
            // Expansion when load factor==1
            if (_n == _tables. size())
                size_t newsize = GetNextPrime(_tables. size());
                vector<Node*> newtables(newsize, nullptr);
                //for (Node* & amp; cur : _tables)
                for (auto & cur : _tables)
                {
                    while (cur)
                    {
                        Node* next = cur->_next;

                        size_t hash = hash(kot(cur->_data)) % newtables. size();

                        // insert head into new table
                        cur->_next = newtables[hashi];
                        newtables[hashi] = cur;

                        cur = next;
                    }
                }

                _tables. swap(newtables);
            }

            size_t hash = hash(kot(data)) % _tables. size();
            // plug
            Node* newnode = new Node(data);
            newnode->_next = _tables[hashi];
            _tables[hashi] = newnode;

             + + _n;
            return make_pair(iterator(newnode, this), false);
        }

Insertion is mainly due to the troublesome expansion. Here, prime number expansion is used. This is what is used in the library:

size_t GetNextPrime(size_t prime)
        {
            // SGI
            static const int __stl_num_primes = 28;
            static const unsigned long __stl_prime_list[__stl_num_primes] =
                    {
                            53, 97, 193, 389, 769,
                            1543, 3079, 6151, 12289, 24593,
                            49157, 98317, 196613, 393241, 786433,
                            1572869, 3145739, 6291469, 12582917, 25165843,
                            50331653, 100663319, 201326611, 402653189, 805306457,
                            1610612741, 3221225473, 4294967291
                    };

            size_t i = 0;
            for (; i < __stl_num_primes; + + i)
            {
                if (__stl_prime_list[i] > prime)
                    return __stl_prime_list[i];
            }

            return __stl_prime_list[i];
        }

Maximum bucket

 size_t MaxBucketSize()
        {
            size_t max = 0;
            for (size_t i = 0; i < _tables. size(); + + i)
            {
                auto cur = _tables[i];
                size_t size = 0;
                while (cur)
                {
                     + + size;
                    cur = cur->_next;
                }

                //printf("[%d]->%d\\
", i, size);
                if (size > max)
                {
                    max = size;
                }
            }

            return max;
        }

After the hash table is implemented, it only needs to be simply encapsulated for unordered_map/set

#include "HashTable.h"

namespace lty
{
    template<class K, class V, class Hash = HashFunc<K>>
    class unordered_map
    {
    public:
        struct MapKeyOfT
        {
            const K & amp; operator()(const pair<K, V> & amp; kv)
            {
                return kv.first;
            }
        };
    public:
        typedef typename HashBucket::HashTable<K, pair<const K, V>, MapKeyOfT, Hash>::iterator iterator;
        typedef typename HashBucket::HashTable<K, pair<const K, V>, MapKeyOfT, Hash>::const_iterator const_iterator;

        iterator begin()
        {
            return _ht.begin();
        }

        iterator end()
        {
            return _ht. end();
        }

        const_iterator begin() const
        {
            return _ht.begin();
        }

        const_iterator end() const
        {
            return _ht. end();
        }

        pair<iterator, bool> insert(const pair<K, V> & kv)
        {
            return _ht. Insert(kv);
        }

        V & amp; operator[](const K & amp; key)
        {
            pair<iterator, bool> ret = insert(make_pair(key, V()));
            return ret.first->second;
            //first==iterator first-> == _node->_data
              first->second==_node->_data->second, which is the V value
        }

        iterator find(const K & key)
        {
            return _ht. Find(key);
        }

        bool erase(const K & key)
        {
            return _ht. Erase(key);
        }

    private:
        HashBucket::HashTable<K, pair<const K, V>, MapKeyOfT, Hash> _ht;
    };

In addition to the difference in K and V values between set and map, map is overloaded with [], which is the same as [] of red-black tree, returning the second value of pari

#pragma once
#include "HashTable.h"

namespace lty
{
    template<class K, class Hash = HashFunc<K>>
    class unordered_set
    {
    public:
        struct SetKeyOfT
        {
            const K & amp; operator()(const K & amp; key)
            {
                return key;
            }
        };
    public:
        typedef typename HashBucket::HashTable<K, K, SetKeyOfT, Hash>::const_iterator iterator;
        typedef typename HashBucket::HashTable<K, K, SetKeyOfT, Hash>::const_iterator const_iterator;

        iterator begin()
        {
            return _ht.begin();
        }

        iterator end()
        {
            return _ht. end();
        }

        const_iterator begin() const
        {
            return _ht.begin();
        }

        const_iterator end() const
        {
            return _ht. end();
        }

        pair<iterator, bool> insert(const K & amp; key)
        {
            return _ht. Insert(key);
        }

        iterator find(const K & key)
        {
            return _ht. Find(key);
        }

        bool erase(const K & key)
        {
            return _ht. Erase(key);
        }

    private:
        HashBucket::HashTable<K, K, SetKeyOfT, Hash> _ht;
    };

The iterator can directly call the one in the hash.

The difficulty in the implementation of the hash table lies in the application of the template, which is very convoluted. As long as you understand how the template is applied, the implementation will become simple, as long as you know the principle of the hash table.