Most of our data in Backworlds is stored in either XML or plain text. There are a lot of benefits to using human-readable data, it is usually trivial to add or remove information to the format without destroying backwards-compatibility and it is easy to make changes even before any tools have been written. That said, it is a lot less efficient than storing things in ready-made binary chunks and to reduce the performance loss we use a lot of hashing.
This is not particularly revolutionary – I got most of this from a studio I worked at – but it is a useful programming trick so I’ll provide a short overview. For strings, we use the djb2 hash method as follows:
uint32 StringHash(const char * _pStr) { uint32 Hs = 5381; if( _pStr ) { while( char Ch = *_pStr++ ) Hs = ((Hs << 5) + Hs) + ((Ch >= 'A' && Ch <= 'Z') ? Ch + ('a'-'A') : Ch); } return Hs - 5381; }
… Note that our hash function is case-insensitive, and that we remove the 5381-seed so that empty strings will always be hashed to 0. I have been using this method fairly frequently for 8 or so years and I’ve yet to see a hash conflict, but for difficult-to-detect cases and dynamic data it is prudent to check for them.
The advantage of hashing strings like this is that an expensive string comparison operation can be turned into a much cheaper integer comparison – but the only real win is if we precalculate the hash values. While doing this in a loading step and storing the values is certainly a possibility, for data that is to be parsed according to a precompiled spec we can just have the preprocessor calculate it for us.
The djb2 hash function simply starts with a seed, multiplies it by 33 and adds the current character to create a new hash value – then repeats this for every character in the string. In order to keep this as clean as possible we hash characters four at a time with the following macros:
//Extract a character from uint32 (case-insensitive) #define _STRHASH_U8_LWR(c) (((c) >= 'A' && (c) <= 'Z') ? ((c) | 0x20) : (c)) #define _STRHASH_U32_CH0(c) _STRHASH_U8_LWR(c & 0xff) #define _STRHASH_U32_CH1(c) _STRHASH_U8_LWR((c >> 8) & 0xff) #define _STRHASH_U32_CH2(c) _STRHASH_U8_LWR((c >> 16) & 0xff) #define _STRHASH_U32_CH3(c) _STRHASH_U8_LWR((c >> 24) & 0xff) //Multiply #define _STRHASH_HSMUL(a) uint32(uint64(a) * 33) //Hash value of 1, 2, 3 or 4 characters in uint32 #define _STRHASH_HS1(Hash, c0) (_STRHASH_HSMUL(Hash) + _STRHASH_U32_CH0(c0)) #define _STRHASH_HS2(Hash, c0) (_STRHASH_HSMUL(_STRHASH_HSMUL(Hash) + _STRHASH_U32_CH1(c0)) + _STRHASH_U32_CH0(c0)) #define _STRHASH_HS3(Hash, c0) (_STRHASH_HSMUL(_STRHASH_HSMUL(_STRHASH_HSMUL(Hash) + _STRHASH_U32_CH2(c0)) + _STRHASH_U32_CH1(c0)) + _STRHASH_U32_CH0(c0)) #define _STRHASH_HS4(Hash, c0) (_STRHASH_HSMUL(_STRHASH_HSMUL(_STRHASH_HSMUL(_STRHASH_HSMUL(Hash) + _STRHASH_U32_CH3(c0)) + _STRHASH_U32_CH2(c0)) + _STRHASH_U32_CH1(c0)) + _STRHASH_U32_CH0(c0)) //Hash value of uint32 dependent on character count #define _STRHASH_HSH(Hash, c0) (_STRHASH_U32_CH3(c0) ? _STRHASH_HS4(Hash,c0) : (_STRHASH_U32_CH2(c0) ? _STRHASH_HS3(Hash,c0) : (_STRHASH_U32_CH1(c0) ? _STRHASH_HS2(Hash,c0) : (_STRHASH_U32_CH0(c0) ? _STRHASH_HS1(Hash,c0) : Hash)))) //Hash value of 4-character string #define _STRHASH_STRH(Hs,c0) (_STRHASH_HSH(Hs,c0)-5381)
… Since the hash repeats, we can then simply use a combination of these in sequence to calculate the hash of strings of arbitrary length. This part can use preprocessor defines as well but I opted to use templates as the compiler ran out of memory in a case on an old machine.
template<uint32 _UIHs, uint32 _UI0> class TK_StrHash { public: enum { HASH = _STRHASH_STRH(_UIHs,_UI0) }; }; template<uint32 _UI0, uint32 _UI1> class TK_StrHash2 { public: enum { HASH = TK_StrHash<TK_StrHash<5381,_UI0>::HASH + 5381, _UI1>::HASH }; }; template<uint32 _UI0, uint32 _UI1, uint32 _UI2> class TK_StrHash3 { public: enum { HASH = TK_StrHash<TK_StrHash2<_UI0,_UI1>::HASH + 5381, _UI2>::HASH }; }; template<uint32 _UI0, uint32 _UI1, uint32 _UI2, uint32 _UI3> class TK_StrHash4 { public: enum { HASH = TK_StrHash<TK_StrHash3<_UI0,_UI1,_UI2>::HASH + 5381, _UI3>::HASH }; }; // etc
Finally, we can use the following define to perform a stringhash – this means that every hash will use the full 10 objects (up to 40 characters), but since it’s done in the preprocessor that doesn’t really matter.
#define STRHASH(...) cloud::TK_StrHash10<__VA_ARGS__>::HASH // Example code switch( StringHash(strObjectType) ) { case STRHASH('play','er'): // "player" return new CPlayerObject; case STRHASH('enem','y'): // "enemy" return new CEnemyObject; case STRHASH('craz','y po','weru','p'): // "crazy powerup" return new CCrazyPowerupObject; }
… An added benefit in the example is that since the switch statement produces an error on duplicate case labels it will protect us from hash collisions. Note that I added the string as a comment after every use of the macro – this is good practice so searching the files for identifiers gets easier.
For reference, this is the entire hash macro definition;
#pragma once //Extract a character from uint32 (case-insensitive) #define _STRHASH_U8_LWR(c) (((c) >= 'A' && (c) <= 'Z') ? ((c) | 0x20) : (c)) #define _STRHASH_U32_CH0(c) _STRHASH_U8_LWR(c & 0xff) #define _STRHASH_U32_CH1(c) _STRHASH_U8_LWR((c >> 8) & 0xff) #define _STRHASH_U32_CH2(c) _STRHASH_U8_LWR((c >> 16) & 0xff) #define _STRHASH_U32_CH3(c) _STRHASH_U8_LWR((c >> 24) & 0xff) //Multiply #define _STRHASH_HSMUL(a) uint32(uint64(a) * 33) //Hash value of 1, 2, 3 or 4 characters in uint32 #define _STRHASH_HS1(Hash, c0) (_STRHASH_HSMUL(Hash) + _STRHASH_U32_CH0(c0)) #define _STRHASH_HS2(Hash, c0) (_STRHASH_HSMUL(_STRHASH_HSMUL(Hash) + _STRHASH_U32_CH1(c0)) + _STRHASH_U32_CH0(c0)) #define _STRHASH_HS3(Hash, c0) (_STRHASH_HSMUL(_STRHASH_HSMUL(_STRHASH_HSMUL(Hash) + _STRHASH_U32_CH2(c0)) + _STRHASH_U32_CH1(c0)) + _STRHASH_U32_CH0(c0)) #define _STRHASH_HS4(Hash, c0) (_STRHASH_HSMUL(_STRHASH_HSMUL(_STRHASH_HSMUL(_STRHASH_HSMUL(Hash) + _STRHASH_U32_CH3(c0)) + _STRHASH_U32_CH2(c0)) + _STRHASH_U32_CH1(c0)) + _STRHASH_U32_CH0(c0)) //Hash value of uint32 dependent on character count #define _STRHASH_HSH(Hash, c0) (_STRHASH_U32_CH3(c0) ? _STRHASH_HS4(Hash,c0) : (_STRHASH_U32_CH2(c0) ? _STRHASH_HS3(Hash,c0) : (_STRHASH_U32_CH1(c0) ? _STRHASH_HS2(Hash,c0) : (_STRHASH_U32_CH0(c0) ? _STRHASH_HS1(Hash,c0) : Hash)))) //Hash value of 4-character string #define _STRHASH_STRH(Hs,c0) (_STRHASH_HSH(Hs,c0)-5381) template<uint32 _UIHs, uint32 _UI0> class TK_StrHash { public: enum { HASH = _STRHASH_STRH(_UIHs,_UI0) }; }; template<uint32 _UI0, uint32 _UI1> class TK_StrHash2 { public: enum { HASH = TK_StrHash<TK_StrHash<5381,_UI0>::HASH + 5381, _UI1>::HASH }; }; template<uint32 _UI0, uint32 _UI1, uint32 _UI2> class TK_StrHash3 { public: enum { HASH = TK_StrHash<TK_StrHash2<_UI0,_UI1>::HASH + 5381, _UI2>::HASH }; }; template<uint32 _UI0, uint32 _UI1, uint32 _UI2, uint32 _UI3> class TK_StrHash4 { public: enum { HASH = TK_StrHash<TK_StrHash3<_UI0,_UI1,_UI2>::HASH + 5381, _UI3>::HASH }; }; template<uint32 _UI0, uint32 _UI1, uint32 _UI2, uint32 _UI3, uint32 _UI4> class TK_StrHash5 { public: enum { HASH = TK_StrHash<TK_StrHash4<_UI0,_UI1,_UI2,_UI3>::HASH + 5381, _UI4>::HASH }; }; template<uint32 _UI0, uint32 _UI1, uint32 _UI2, uint32 _UI3, uint32 _UI4, uint32 _UI5> class TK_StrHash6 { public: enum { HASH = TK_StrHash<TK_StrHash5<_UI0,_UI1,_UI2,_UI3,_UI4>::HASH + 5381, _UI5>::HASH }; }; template<uint32 _UI0, uint32 _UI1, uint32 _UI2, uint32 _UI3, uint32 _UI4, uint32 _UI5, uint32 _UI6> class TK_StrHash7 { public: enum { HASH = TK_StrHash<TK_StrHash6<_UI0,_UI1,_UI2,_UI3,_UI4,_UI5>::HASH + 5381, _UI6>::HASH }; }; template<uint32 _UI0, uint32 _UI1, uint32 _UI2, uint32 _UI3, uint32 _UI4, uint32 _UI5, uint32 _UI6,uint32 _UI7> class TK_StrHash8 { public: enum { HASH = TK_StrHash<TK_StrHash7<_UI0,_UI1,_UI2,_UI3,_UI4,_UI5,_UI6>::HASH + 5381, _UI7>::HASH }; }; template<uint32 _UI0, uint32 _UI1, uint32 _UI2, uint32 _UI3, uint32 _UI4, uint32 _UI5, uint32 _UI6,uint32 _UI7,uint32 _UI8> class TK_StrHash9 { public: enum { HASH = TK_StrHash<TK_StrHash8<_UI0,_UI1,_UI2,_UI3,_UI4,_UI5,_UI6,_UI7>::HASH + 5381, _UI8>::HASH }; }; template<uint32 _UI0, uint32 _UI1=0, uint32 _UI2=0, uint32 _UI3=0, uint32 _UI4=0, uint32 _UI5=0, uint32 _UI6=0,uint32 _UI7=0,uint32 _UI8=0,uint32 _UI9=0> class TK_StrHash10 { public: enum { HASH = TK_StrHash<TK_StrHash9<_UI0,_UI1,_UI2,_UI3,_UI4,_UI5,_UI6,_UI7,_UI8>::HASH + 5381, _UI9>::HASH }; }; //Cleanup #undef _STRHASH_HS_MUL #undef _STRHASH_U32_CH0 #undef _STRHASH_U32_CH1 #undef _STRHASH_U32_CH2 #undef _STRHASH_U32_CH3 #undef _STRHASH_HSMUL #undef _STRHASH_HS1 #undef _STRHASH_HS2 #undef _STRHASH_HS3 #undef _STRHASH_HS4 #undef _STRHASH_HSH #undef _STRHASH_STRH //Usable #define STRHASH(...) TK_StrHash10<__VA_ARGS__>::HASH