kiss_header

Most of our data in Backworlds is stored in either XML or plain text. There are a lot of benefits to using human-readable data, it is usually trivial to add or remove information to the format without destroying backwards-compatibility and it is easy to make changes even before any tools have been written. That said, it is a lot less efficient than storing things in ready-made binary chunks and to reduce the performance loss we use a lot of hashing.

This is not particularly revolutionary – I got most of this from a studio I worked at – but it is a useful programming trick so I’ll provide a short overview. For strings, we use the djb2 hash method as follows:

uint32 StringHash(const char * _pStr)
{
  uint32 Hs = 5381;

  if( _pStr )
  {
    while( char Ch = *_pStr++ )
      Hs = ((Hs << 5) + Hs) + ((Ch >= 'A' && Ch <= 'Z') ? Ch + ('a'-'A') : Ch);
  }
  return Hs - 5381;
}

… Note that our hash function is case-insensitive, and that we remove the 5381-seed so that empty strings will always be hashed to 0. I have been using this method fairly frequently for 8 or so years and I’ve yet to see a hash conflict, but for difficult-to-detect cases and dynamic data it is prudent to check for them.

The advantage of hashing strings like this is that an expensive string comparison operation can be turned into a much cheaper integer comparison – but the only real win is if we precalculate the hash values. While doing this in a loading step and storing the values is certainly a possibility, for data that is to be parsed according to a precompiled spec we can just have the preprocessor calculate it for us.

The djb2 hash function simply starts with a seed, multiplies it by 33 and adds the current character to create a new hash value – then repeats this for every character in the string. In order to keep this as clean as possible we hash characters four at a time with the following macros:

//Extract a character from uint32 (case-insensitive)
#define _STRHASH_U8_LWR(c)    (((c) >= 'A' && (c) <= 'Z') ? ((c) | 0x20) : (c))
#define _STRHASH_U32_CH0(c) _STRHASH_U8_LWR(c & 0xff)
#define _STRHASH_U32_CH1(c) _STRHASH_U8_LWR((c >> 8) & 0xff)
#define _STRHASH_U32_CH2(c) _STRHASH_U8_LWR((c >> 16) & 0xff)
#define _STRHASH_U32_CH3(c) _STRHASH_U8_LWR((c >> 24) & 0xff)

//Multiply
#define _STRHASH_HSMUL(a) uint32(uint64(a) * 33)

//Hash value of 1, 2, 3 or 4 characters in uint32
#define _STRHASH_HS1(Hash, c0)    (_STRHASH_HSMUL(Hash) + _STRHASH_U32_CH0(c0))
#define _STRHASH_HS2(Hash, c0)    (_STRHASH_HSMUL(_STRHASH_HSMUL(Hash) + _STRHASH_U32_CH1(c0)) + _STRHASH_U32_CH0(c0))
#define _STRHASH_HS3(Hash, c0)    (_STRHASH_HSMUL(_STRHASH_HSMUL(_STRHASH_HSMUL(Hash) + _STRHASH_U32_CH2(c0)) + _STRHASH_U32_CH1(c0)) + _STRHASH_U32_CH0(c0))
#define _STRHASH_HS4(Hash, c0)    (_STRHASH_HSMUL(_STRHASH_HSMUL(_STRHASH_HSMUL(_STRHASH_HSMUL(Hash) + _STRHASH_U32_CH3(c0)) + _STRHASH_U32_CH2(c0)) + _STRHASH_U32_CH1(c0)) + _STRHASH_U32_CH0(c0))

//Hash value of uint32 dependent on character count
#define _STRHASH_HSH(Hash, c0)    (_STRHASH_U32_CH3(c0) ? _STRHASH_HS4(Hash,c0) : (_STRHASH_U32_CH2(c0) ? _STRHASH_HS3(Hash,c0) : (_STRHASH_U32_CH1(c0) ? _STRHASH_HS2(Hash,c0) : (_STRHASH_U32_CH0(c0) ? _STRHASH_HS1(Hash,c0) : Hash))))

//Hash value of 4-character string
#define _STRHASH_STRH(Hs,c0)        (_STRHASH_HSH(Hs,c0)-5381)

… Since the hash repeats, we can then simply use a combination of these in sequence to calculate the hash of strings of arbitrary length. This part can use preprocessor defines as well but I opted to use templates as the compiler ran out of memory in a case on an old machine.

template<uint32 _UIHs, uint32 _UI0>
class TK_StrHash
{
  public:
    enum { HASH = _STRHASH_STRH(_UIHs,_UI0) };
};

template<uint32 _UI0, uint32 _UI1>
class TK_StrHash2
{
  public:
    enum { HASH = TK_StrHash<TK_StrHash<5381,_UI0>::HASH + 5381, _UI1>::HASH };
};

template<uint32 _UI0, uint32 _UI1, uint32 _UI2>
class TK_StrHash3
{
  public:
    enum { HASH = TK_StrHash<TK_StrHash2<_UI0,_UI1>::HASH + 5381, _UI2>::HASH };
};

template<uint32 _UI0, uint32 _UI1, uint32 _UI2, uint32 _UI3>
class TK_StrHash4
{
  public:
    enum { HASH = TK_StrHash<TK_StrHash3<_UI0,_UI1,_UI2>::HASH + 5381, _UI3>::HASH };
};

// etc

Finally, we can use the following define to perform a stringhash – this means that every hash will use the full 10 objects (up to 40 characters), but since it’s done in the preprocessor that doesn’t really matter.

#define STRHASH(...)    cloud::TK_StrHash10<__VA_ARGS__>::HASH

// Example code
switch( StringHash(strObjectType) )
{
case STRHASH('play','er'): // "player"
    return new CPlayerObject;
case STRHASH('enem','y'): // "enemy"
    return new CEnemyObject;
case STRHASH('craz','y po','weru','p'): // "crazy powerup"
    return new CCrazyPowerupObject;
}

… An added benefit in the example is that since the switch statement produces an error on duplicate case labels it will protect us from hash collisions. Note that I added the string as a comment after every use of the macro – this is good practice so searching the files for identifiers gets easier.

For reference, this is the entire hash macro definition;

#pragma once

//Extract a character from uint32 (case-insensitive)
#define _STRHASH_U8_LWR(c)    (((c) >= 'A' && (c) <= 'Z') ? ((c) | 0x20) : (c))
#define _STRHASH_U32_CH0(c) _STRHASH_U8_LWR(c & 0xff)
#define _STRHASH_U32_CH1(c) _STRHASH_U8_LWR((c >> 8) & 0xff)
#define _STRHASH_U32_CH2(c) _STRHASH_U8_LWR((c >> 16) & 0xff)
#define _STRHASH_U32_CH3(c) _STRHASH_U8_LWR((c >> 24) & 0xff)

//Multiply
#define _STRHASH_HSMUL(a) uint32(uint64(a) * 33)

//Hash value of 1, 2, 3 or 4 characters in uint32
#define _STRHASH_HS1(Hash, c0)    (_STRHASH_HSMUL(Hash) + _STRHASH_U32_CH0(c0))
#define _STRHASH_HS2(Hash, c0)    (_STRHASH_HSMUL(_STRHASH_HSMUL(Hash) + _STRHASH_U32_CH1(c0)) + _STRHASH_U32_CH0(c0))
#define _STRHASH_HS3(Hash, c0)    (_STRHASH_HSMUL(_STRHASH_HSMUL(_STRHASH_HSMUL(Hash) + _STRHASH_U32_CH2(c0)) + _STRHASH_U32_CH1(c0)) + _STRHASH_U32_CH0(c0))
#define _STRHASH_HS4(Hash, c0)    (_STRHASH_HSMUL(_STRHASH_HSMUL(_STRHASH_HSMUL(_STRHASH_HSMUL(Hash) + _STRHASH_U32_CH3(c0)) + _STRHASH_U32_CH2(c0)) + _STRHASH_U32_CH1(c0)) + _STRHASH_U32_CH0(c0))

//Hash value of uint32 dependent on character count
#define _STRHASH_HSH(Hash, c0)    (_STRHASH_U32_CH3(c0) ? _STRHASH_HS4(Hash,c0) : (_STRHASH_U32_CH2(c0) ? _STRHASH_HS3(Hash,c0) : (_STRHASH_U32_CH1(c0) ? _STRHASH_HS2(Hash,c0) : (_STRHASH_U32_CH0(c0) ? _STRHASH_HS1(Hash,c0) : Hash))))

//Hash value of 4-character string
#define _STRHASH_STRH(Hs,c0)        (_STRHASH_HSH(Hs,c0)-5381)

template<uint32 _UIHs, uint32 _UI0>
class TK_StrHash
{
public:
    enum { HASH = _STRHASH_STRH(_UIHs,_UI0) };
};

template<uint32 _UI0, uint32 _UI1>
class TK_StrHash2
{
public:
    enum { HASH = TK_StrHash<TK_StrHash<5381,_UI0>::HASH + 5381, _UI1>::HASH };
};

template<uint32 _UI0, uint32 _UI1, uint32 _UI2>
class TK_StrHash3
{
public:
    enum { HASH = TK_StrHash<TK_StrHash2<_UI0,_UI1>::HASH + 5381, _UI2>::HASH };
};

template<uint32 _UI0, uint32 _UI1, uint32 _UI2, uint32 _UI3>
class TK_StrHash4
{
public:
    enum { HASH = TK_StrHash<TK_StrHash3<_UI0,_UI1,_UI2>::HASH + 5381, _UI3>::HASH };
};

template<uint32 _UI0, uint32 _UI1, uint32 _UI2, uint32 _UI3, uint32 _UI4>
class TK_StrHash5
{
public:
    enum { HASH = TK_StrHash<TK_StrHash4<_UI0,_UI1,_UI2,_UI3>::HASH + 5381, _UI4>::HASH };
};

template<uint32 _UI0, uint32 _UI1, uint32 _UI2, uint32 _UI3, uint32 _UI4, uint32 _UI5>
class TK_StrHash6
{
public:
    enum { HASH = TK_StrHash<TK_StrHash5<_UI0,_UI1,_UI2,_UI3,_UI4>::HASH + 5381, _UI5>::HASH };
};

template<uint32 _UI0, uint32 _UI1, uint32 _UI2, uint32 _UI3, uint32 _UI4, uint32 _UI5, uint32 _UI6>
class TK_StrHash7
{
public:
    enum { HASH = TK_StrHash<TK_StrHash6<_UI0,_UI1,_UI2,_UI3,_UI4,_UI5>::HASH + 5381, _UI6>::HASH };
};

template<uint32 _UI0, uint32 _UI1, uint32 _UI2, uint32 _UI3, uint32 _UI4, uint32 _UI5, uint32 _UI6,uint32 _UI7>
class TK_StrHash8
{
public:
    enum { HASH = TK_StrHash<TK_StrHash7<_UI0,_UI1,_UI2,_UI3,_UI4,_UI5,_UI6>::HASH + 5381, _UI7>::HASH };
};

template<uint32 _UI0, uint32 _UI1, uint32 _UI2, uint32 _UI3, uint32 _UI4, uint32 _UI5, uint32 _UI6,uint32 _UI7,uint32 _UI8>
class TK_StrHash9
{
public:
    enum { HASH = TK_StrHash<TK_StrHash8<_UI0,_UI1,_UI2,_UI3,_UI4,_UI5,_UI6,_UI7>::HASH + 5381, _UI8>::HASH };
};

template<uint32 _UI0, uint32 _UI1=0, uint32 _UI2=0, uint32 _UI3=0, uint32 _UI4=0, uint32 _UI5=0, uint32 _UI6=0,uint32 _UI7=0,uint32 _UI8=0,uint32 _UI9=0>
class TK_StrHash10
{
public:
    enum { HASH = TK_StrHash<TK_StrHash9<_UI0,_UI1,_UI2,_UI3,_UI4,_UI5,_UI6,_UI7,_UI8>::HASH + 5381, _UI9>::HASH };
};

//Cleanup
#undef _STRHASH_HS_MUL
#undef _STRHASH_U32_CH0
#undef _STRHASH_U32_CH1
#undef _STRHASH_U32_CH2
#undef _STRHASH_U32_CH3
#undef _STRHASH_HSMUL
#undef _STRHASH_HS1
#undef _STRHASH_HS2
#undef _STRHASH_HS3
#undef _STRHASH_HS4
#undef _STRHASH_HSH
#undef _STRHASH_STRH

//Usable
#define STRHASH(...)                                TK_StrHash10<__VA_ARGS__>::HASH