1
0
Fork 0
mirror of https://github.com/VSadov/Satori.git synced 2025-06-09 09:34:49 +09:00

Replace MD5 with MurmurHash3_128 in SPMI (#78527)

Co-authored-by: Jakob Botsch Nielsen <Jakob.botsch.nielsen@gmail.com>
Co-authored-by: Jan Kotas <jkotas@microsoft.com>
This commit is contained in:
Egor Bogatov 2022-11-20 03:44:51 +01:00 committed by GitHub
parent 37cb86cb82
commit 5c420f1aa7
Signed by: github
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 127 additions and 190 deletions

View file

@ -1206,3 +1206,12 @@ As an exception, if, as a result of your compiling your source code, portions
of this Software are embedded into a machine-executable object form of such
source code, you may redistribute such embedded portions in such object form
without including the above copyright and permission notices.
License for MurmurHash3
--------------------------------------
https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
MurmurHash3 was written by Austin Appleby, and is placed in the public
domain. The author hereby disclaims copyright to this source code.

View file

@ -66,8 +66,8 @@ bool RemoveDup::unique(MethodContext* mc)
// Assume that there are lots of duplicates, so don't allocate a new buffer for the hash data
// until we know we're going to add it to the map.
char md5Buff[MD5_HASH_BUFFER_SIZE];
mc->dumpMethodMD5HashToBuffer(md5Buff, MD5_HASH_BUFFER_SIZE, /* ignoreMethodName */ true, &newInfo, newFlags);
char md5Buff[MM3_HASH_BUFFER_SIZE];
mc->dumpMethodHashToBuffer(md5Buff, MM3_HASH_BUFFER_SIZE, /* ignoreMethodName */ true, &newInfo, newFlags);
if (m_inFile->GetIndex(newInfo.ILCodeSize) == -1)
m_inFile->Add(newInfo.ILCodeSize, new DenseLightWeightMap<char*>());
@ -77,14 +77,14 @@ bool RemoveDup::unique(MethodContext* mc)
for (unsigned i = 0; i < ourRank->GetCount(); i++)
{
char* md5Buff2 = ourRank->Get(i);
if (strncmp(md5Buff, md5Buff2, MD5_HASH_BUFFER_SIZE) == 0)
if (strncmp(md5Buff, md5Buff2, MM3_HASH_BUFFER_SIZE) == 0)
{
return false;
}
}
char* newmd5Buff = new char[MD5_HASH_BUFFER_SIZE];
memcpy(newmd5Buff, md5Buff, MD5_HASH_BUFFER_SIZE);
char* newmd5Buff = new char[MM3_HASH_BUFFER_SIZE];
memcpy(newmd5Buff, md5Buff, MM3_HASH_BUFFER_SIZE);
ourRank->Append(newmd5Buff);
return true;
}

View file

@ -37,7 +37,7 @@ int verbTOC::DoWork(const char* nameOfInput)
MethodContext* mc = mci.Current();
TOCElementNode* nxt = new TOCElementNode(mci.MethodContextNumber(), mci.CurrentPos());
mc->dumpMethodMD5HashToBuffer(nxt->tocElement.Hash, MD5_HASH_BUFFER_SIZE);
mc->dumpMethodHashToBuffer(nxt->tocElement.Hash, MM3_HASH_BUFFER_SIZE);
if (curElem != nullptr)
{

View file

@ -2,87 +2,110 @@
// The .NET Foundation licenses this file to you under the MIT license.
//----------------------------------------------------------
// hash.cpp - Class for hashing a text stream using MD5 hashing
//
// Note that on Windows, acquiring the Crypto hash provider is expensive, so
// only do that once and cache it.
// hash.cpp - Class for hashing a text stream using MurmurHash3 hashing
//----------------------------------------------------------
#include "standardpch.h"
#include "runtimedetails.h"
#include "errorhandling.h"
#include "md5.h"
#include "hash.h"
Hash::Hash()
#ifndef TARGET_UNIX
: m_Initialized(false)
, m_hCryptProv(NULL)
#endif // !TARGET_UNIX
// MurmurHash3 was written by Austin Appleby, and is placed in the public
// domain. The author hereby disclaims copyright to this source code.
//
// Implementation was copied from https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
// with changes around strict-aliasing/unaligned reads
// Rotates the 64-bit value 'x' left by 'r' bit positions and returns the result.
//
// Both shift counts are reduced modulo 64. Without the masking, r == 0 would evaluate
// 'x >> 64', a shift by the full width of the type, which is undefined behavior in C++.
// For the rotation counts in the range 1..63 the result is identical to the classic
// '(x << r) | (x >> (64 - r))' formulation.
inline uint64_t ROTL64(uint64_t x, int8_t r)
{
    const unsigned shift = static_cast<unsigned>(r) & 63u;
    return (x << shift) | (x >> ((64u - shift) & 63u));
}
Hash::~Hash()
inline uint64_t getblock64(const uint8_t* ptr)
{
Destroy(); // Ignoring return code.
uint64_t val = 0;
memcpy(&val, ptr, sizeof(uint64_t));
return val;
}
// static
bool Hash::Initialize()
inline void setblock64(uint8_t* ptr, uint64_t val)
{
#ifdef TARGET_UNIX
memcpy(ptr, &val, sizeof(uint64_t));
}
// No initialization necessary.
return true;
// MurmurHash3 64-bit finalizer: alternates xor-shifts by 33 bits with two multiplications
// so that every bit of the input block avalanches across the output.
inline uint64_t fmix64(uint64_t k)
{
    const uint64_t firstMultiplier  = 0xff51afd7ed558ccdLLU;
    const uint64_t secondMultiplier = 0xc4ceb9fe1a85ec53LLU;

    k = (k ^ (k >> 33)) * firstMultiplier;
    k = (k ^ (k >> 33)) * secondMultiplier;
    return k ^ (k >> 33);
}
#else // !TARGET_UNIX
static void MurmurHash3_128(const void* key, const size_t len, const uint32_t seed, void* out)
{
const uint8_t* data = static_cast<const uint8_t*>(key);
const size_t nblocks = len / MM3_HASH_BYTE_SIZE;
uint64_t h1 = seed;
uint64_t h2 = seed;
const uint64_t c1 = 0x87c37b91114253d5LLU;
const uint64_t c2 = 0x4cf5ad432745937fLLU;
if (m_Initialized)
// body
for (size_t i = 0; i < nblocks; i++)
{
LogError("Hash class has already been initialized");
return false;
uint64_t k1 = getblock64(data + (i * 2 + 0) * sizeof(uint64_t));
uint64_t k2 = getblock64(data + (i * 2 + 1) * sizeof(uint64_t));
k1 *= c1; k1 = ROTL64(k1, 31); k1 *= c2; h1 ^= k1;
h1 = ROTL64(h1, 27); h1 += h2; h1 = h1 * 5 + 0x52dce729;
k2 *= c2; k2 = ROTL64(k2, 33); k2 *= c1; h2 ^= k2;
h2 = ROTL64(h2, 31); h2 += h1; h2 = h2 * 5 + 0x38495ab5;
}
// Get handle to the crypto provider
if (!CryptAcquireContextA(&m_hCryptProv, NULL, NULL, PROV_RSA_FULL, CRYPT_VERIFYCONTEXT))
goto OnError;
// tail
const uint8_t* tail = data + nblocks * MM3_HASH_BYTE_SIZE;
uint64_t k1 = 0;
uint64_t k2 = 0;
m_Initialized = true;
return true;
OnError:
LogError("Failed to create a hash using the Crypto API (Error 0x%X)", GetLastError());
if (m_hCryptProv != NULL)
CryptReleaseContext(m_hCryptProv, 0);
m_Initialized = false;
return false;
#endif // !TARGET_UNIX
}
// static
bool Hash::Destroy()
{
#ifdef TARGET_UNIX
// No destruction necessary.
return true;
#else // !TARGET_UNIX
// Should probably check Crypt() function return codes.
if (m_hCryptProv != NULL)
switch (len & 15)
{
CryptReleaseContext(m_hCryptProv, 0);
m_hCryptProv = NULL;
case 15: k2 ^= static_cast<uint64_t>(tail[14]) << 48; FALLTHROUGH;
case 14: k2 ^= static_cast<uint64_t>(tail[13]) << 40; FALLTHROUGH;
case 13: k2 ^= static_cast<uint64_t>(tail[12]) << 32; FALLTHROUGH;
case 12: k2 ^= static_cast<uint64_t>(tail[11]) << 24; FALLTHROUGH;
case 11: k2 ^= static_cast<uint64_t>(tail[10]) << 16; FALLTHROUGH;
case 10: k2 ^= static_cast<uint64_t>(tail[9]) << 8; FALLTHROUGH;
case 9: k2 ^= static_cast<uint64_t>(tail[8]) << 0;
k2 *= c2; k2 = ROTL64(k2, 33); k2 *= c1; h2 ^= k2;
FALLTHROUGH;
case 8: k1 ^= static_cast<uint64_t>(tail[7]) << 56; FALLTHROUGH;
case 7: k1 ^= static_cast<uint64_t>(tail[6]) << 48; FALLTHROUGH;
case 6: k1 ^= static_cast<uint64_t>(tail[5]) << 40; FALLTHROUGH;
case 5: k1 ^= static_cast<uint64_t>(tail[4]) << 32; FALLTHROUGH;
case 4: k1 ^= static_cast<uint64_t>(tail[3]) << 24; FALLTHROUGH;
case 3: k1 ^= static_cast<uint64_t>(tail[2]) << 16; FALLTHROUGH;
case 2: k1 ^= static_cast<uint64_t>(tail[1]) << 8; FALLTHROUGH;
case 1: k1 ^= static_cast<uint64_t>(tail[0]) << 0;
k1 *= c1; k1 = ROTL64(k1, 31); k1 *= c2; h1 ^= k1;
break;
}
m_Initialized = false;
return true;
// finalization
h1 ^= len;
h2 ^= len;
h1 += h2;
h2 += h1;
h1 = fmix64(h1);
h2 = fmix64(h2);
h1 += h2;
h2 += h1;
#endif // !TARGET_UNIX
setblock64(static_cast<uint8_t*>(out), h1);
setblock64(static_cast<uint8_t*>(out) + sizeof(uint64_t), h2);
}
// Hash::WriteHashValueAsText - Take a binary hash value in the array of bytes pointed to by
@ -94,7 +117,7 @@ bool Hash::WriteHashValueAsText(const BYTE* pHash, size_t cbHash, char* hashText
{
// This could be:
//
// for (DWORD i = 0; i < MD5_HASH_BYTE_SIZE; i++)
// for (DWORD i = 0; i < MM3_HASH_BYTE_SIZE; i++)
// {
// sprintf_s(hash + i * 2, hashLen - i * 2, "%02X", bHash[i]);
// }
@ -121,77 +144,18 @@ bool Hash::WriteHashValueAsText(const BYTE* pHash, size_t cbHash, char* hashText
return true;
}
// Hash::HashBuffer - Compute an MD5 hash of the data pointed to by 'pBuffer', of 'bufLen' bytes,
// Hash::HashBuffer - Compute a MurmurHash3 hash of the data pointed to by 'pBuffer', of 'bufLen' bytes,
// writing the hexadecimal ASCII text representation of the hash to the buffer pointed to by 'hash',
// of 'hashLen' bytes in size, which must be at least MD5_HASH_BUFFER_SIZE bytes.
// of 'hashLen' bytes in size, which must be at least MM3_HASH_BUFFER_SIZE bytes.
//
// Returns the number of bytes written, or -1 on error.
int Hash::HashBuffer(BYTE* pBuffer, size_t bufLen, char* hash, size_t hashLen)
{
#ifdef TARGET_UNIX
uint8_t murMurHash[MM3_HASH_BYTE_SIZE];
MurmurHash3_128(pBuffer, bufLen, 0, murMurHash);
MD5HASHDATA md5_hashdata;
MD5 md5_hasher;
if (hashLen < MD5_HASH_BUFFER_SIZE)
if (!WriteHashValueAsText(murMurHash, MM3_HASH_BYTE_SIZE, hash, hashLen))
return -1;
md5_hasher.Hash(pBuffer, (ULONG)bufLen, &md5_hashdata);
DWORD md5_hashdata_size = sizeof(md5_hashdata.rgb) / sizeof(BYTE);
Assert(md5_hashdata_size == MD5_HASH_BYTE_SIZE);
if (!WriteHashValueAsText(md5_hashdata.rgb, md5_hashdata_size, hash, hashLen))
return -1;
return MD5_HASH_BUFFER_SIZE; // if we had success we wrote MD5_HASH_BUFFER_SIZE bytes to the buffer
#else // !TARGET_UNIX
if (!m_Initialized)
{
LogError("Hash class not initialized");
return -1;
}
HCRYPTHASH hCryptHash;
BYTE bHash[MD5_HASH_BYTE_SIZE];
DWORD cbHash = MD5_HASH_BYTE_SIZE;
if (hashLen < MD5_HASH_BUFFER_SIZE)
return -1;
if (!CryptCreateHash(m_hCryptProv, CALG_MD5, 0, 0, &hCryptHash))
goto OnError;
if (!CryptHashData(hCryptHash, pBuffer, (DWORD)bufLen, 0))
goto OnError;
if (!CryptGetHashParam(hCryptHash, HP_HASHVAL, bHash, &cbHash, 0))
goto OnError;
if (cbHash != MD5_HASH_BYTE_SIZE)
goto OnError;
if (!WriteHashValueAsText(bHash, cbHash, hash, hashLen))
return -1;
// Clean up.
CryptDestroyHash(hCryptHash);
hCryptHash = NULL;
return MD5_HASH_BUFFER_SIZE; // if we had success we wrote MD5_HASH_BUFFER_SIZE bytes to the buffer
OnError:
LogError("Failed to create a hash using the Crypto API (Error 0x%X)", GetLastError());
if (hCryptHash != NULL)
{
CryptDestroyHash(hCryptHash);
hCryptHash = NULL;
}
return -1;
#endif // !TARGET_UNIX
return MM3_HASH_BUFFER_SIZE; // if we had success we wrote MM3_HASH_BUFFER_SIZE bytes to the buffer
}

View file

@ -7,39 +7,16 @@
#ifndef _hash
#define _hash
#define MD5_HASH_BYTE_SIZE 16 // MD5 is 128-bit, so we need 16 bytes to store it
#define MD5_HASH_BUFFER_SIZE 33 // MD5 is 128-bit, so we need 32 chars + 1 char to store null-terminator
#define MM3_HASH_BYTE_SIZE 16 // MurMurHash3 is 128-bit, so we need 16 bytes to store it
#define MM3_HASH_BUFFER_SIZE 33 // MurMurHash3 is 128-bit, so we need 32 chars + 1 char to store null-terminator
class Hash
{
public:
Hash();
~Hash();
bool Initialize();
bool Destroy();
bool IsInitialized()
{
#ifdef TARGET_UNIX
return true; // No initialization necessary.
#else // TARGET_UNIX
return m_Initialized;
#endif // !TARGET_UNIX
}
int HashBuffer(BYTE* pBuffer, size_t bufLen, char* hash, size_t hashLen);
static int HashBuffer(BYTE* pBuffer, size_t bufLen, char* hash, size_t hashLen);
private:
bool WriteHashValueAsText(const BYTE* pHash, size_t cbHash, char* hashTextBuffer, size_t hashTextBufferLen);
#ifndef TARGET_UNIX
bool m_Initialized;
HCRYPTPROV m_hCryptProv;
#endif // !TARGET_UNIX
static bool WriteHashValueAsText(const BYTE* pHash, size_t cbHash, char* hashTextBuffer, size_t hashTextBufferLen);
};
#endif

View file

@ -18,9 +18,6 @@
#define sparseMC // Support filling in details where guesses are okay and will still generate good code. (i.e. helper
// function addresses)
// static variable initialization
Hash MethodContext::m_hash;
MethodContext::MethodContext()
{
methodSize = 0;
@ -7245,8 +7242,8 @@ int MethodContext::dumpMethodIdentityInfoToBuffer(char* buff, int len, bool igno
}
// Hash the IL Code for this method and append it to the ID info
char ilHash[MD5_HASH_BUFFER_SIZE];
dumpMD5HashToBuffer(pInfo->ILCode, pInfo->ILCodeSize, ilHash, MD5_HASH_BUFFER_SIZE);
char ilHash[MM3_HASH_BUFFER_SIZE];
dumpHashToBuffer(pInfo->ILCode, pInfo->ILCodeSize, ilHash, MM3_HASH_BUFFER_SIZE);
t = sprintf_s(buff, len, " ILCode Hash: %s", ilHash);
buff += t;
len -= t;
@ -7305,9 +7302,9 @@ int MethodContext::dumpMethodIdentityInfoToBuffer(char* buff, int len, bool igno
//
if (minOffset < maxOffset)
{
char pgoHash[MD5_HASH_BUFFER_SIZE];
dumpMD5HashToBuffer(schemaData + minOffset, (int)(maxOffset - minOffset), pgoHash,
MD5_HASH_BUFFER_SIZE);
char pgoHash[MM3_HASH_BUFFER_SIZE];
dumpHashToBuffer(schemaData + minOffset, (int)(maxOffset - minOffset), pgoHash,
MM3_HASH_BUFFER_SIZE);
t = sprintf_s(buff, len, " Pgo Counters %u, Count %llu, Hash: %s", schemaCount, totalCount, pgoHash);
buff += t;
@ -7319,7 +7316,7 @@ int MethodContext::dumpMethodIdentityInfoToBuffer(char* buff, int len, bool igno
return (int)(buff - obuff);
}
int MethodContext::dumpMethodMD5HashToBuffer(char* buff, int len, bool ignoreMethodName /* = false */, CORINFO_METHOD_INFO* optInfo /* = nullptr */, unsigned optFlags /* = 0 */)
int MethodContext::dumpMethodHashToBuffer(char* buff, int len, bool ignoreMethodName /* = false */, CORINFO_METHOD_INFO* optInfo /* = nullptr */, unsigned optFlags /* = 0 */)
{
char bufferIdentityInfo[METHOD_IDENTITY_INFO_SIZE];
@ -7328,24 +7325,14 @@ int MethodContext::dumpMethodMD5HashToBuffer(char* buff, int len, bool ignoreMet
if (cbLen < 0)
return cbLen;
cbLen = dumpMD5HashToBuffer((BYTE*)bufferIdentityInfo, cbLen, buff, len);
cbLen = dumpHashToBuffer((BYTE*)bufferIdentityInfo, cbLen, buff, len);
return cbLen;
}
int MethodContext::dumpMD5HashToBuffer(BYTE* pBuffer, int bufLen, char* hash, int hashLen)
int MethodContext::dumpHashToBuffer(BYTE* pBuffer, int bufLen, char* hash, int hashLen)
{
// Lazy initialize the MD5 hasher.
if (!m_hash.IsInitialized())
{
if (!m_hash.Initialize())
{
AssertMsg(false, "Failed to initialize the MD5 hasher");
return -1;
}
}
return m_hash.HashBuffer(pBuffer, bufLen, hash, hashLen);
return Hash::HashBuffer(pBuffer, bufLen, hash, hashLen);
}
bool MethodContext::hasPgoData(bool& hasEdgeProfile, bool& hasClassProfile, bool& hasMethodProfile, bool& hasLikelyClass, bool& hasLikelyMethod, ICorJitInfo::PgoSource& pgoSource)

View file

@ -83,7 +83,7 @@ private:
bool Initialize(int mcIndex, unsigned char* buff, DWORD size);
bool Initialize(int mcIndex, HANDLE hFile);
int dumpMD5HashToBuffer(BYTE* pBuffer, int bufLen, char* buff, int len);
int dumpHashToBuffer(BYTE* pBuffer, int bufLen, char* buff, int len);
public:
static bool Initialize(int mcIndex, unsigned char* buff, DWORD size, /* OUT */ MethodContext** ppmc);
@ -110,7 +110,7 @@ public:
int methodSize;
int dumpMethodIdentityInfoToBuffer(char* buff, int len, bool ignoreMethodName = false, CORINFO_METHOD_INFO* optInfo = nullptr, unsigned optFlags = 0);
int dumpMethodMD5HashToBuffer(char* buff, int len, bool ignoreMethodName = false, CORINFO_METHOD_INFO* optInfo = nullptr, unsigned optFlags = 0);
int dumpMethodHashToBuffer(char* buff, int len, bool ignoreMethodName = false, CORINFO_METHOD_INFO* optInfo = nullptr, unsigned optFlags = 0);
bool hasPgoData(bool& hasEdgeProfile, bool& hasClassProfile, bool& hasMethodProfile, bool& hasLikelyClass, bool& hasLikelyMethod, ICorJitInfo::PgoSource& pgoSource);

View file

@ -308,7 +308,7 @@ MethodContextBuffer MethodContextReader::GetNextMethodContextFromHash()
// one-by-one till we find a matching hash
for (; curTOCIndex < (int)this->tocFile.GetTocCount(); curTOCIndex++)
{
if (_strnicmp(this->Hash, this->tocFile.GetElementPtr(curTOCIndex)->Hash, MD5_HASH_BUFFER_SIZE) == 0)
if (_strnicmp(this->Hash, this->tocFile.GetElementPtr(curTOCIndex)->Hash, MM3_HASH_BUFFER_SIZE) == 0)
{
// We found a match, return this specific method
return this->GetSpecificMethodContext(this->tocFile.GetElementPtr(curTOCIndex++)->Number);
@ -330,7 +330,7 @@ MethodContextBuffer MethodContextReader::GetNextMethodContextFromHash()
if (mcb.allDone() || mcb.Error())
return mcb;
char mcHash[MD5_HASH_BUFFER_SIZE];
char mcHash[MM3_HASH_BUFFER_SIZE];
// Create a temporary copy of mcb.buff plus ending 2-byte canary
// this will get freed up by MethodContext constructor
@ -342,10 +342,10 @@ MethodContextBuffer MethodContextReader::GetNextMethodContextFromHash()
if (!MethodContext::Initialize(-1, buff, mcb.size, &mc))
return MethodContextBuffer(-1);
mc->dumpMethodMD5HashToBuffer(mcHash, MD5_HASH_BUFFER_SIZE);
mc->dumpMethodHashToBuffer(mcHash, MM3_HASH_BUFFER_SIZE);
delete mc;
if (_strnicmp(this->Hash, mcHash, MD5_HASH_BUFFER_SIZE) == 0)
if (_strnicmp(this->Hash, mcHash, MM3_HASH_BUFFER_SIZE) == 0)
{
// We found a match, return this specific method
return mcb;
@ -532,7 +532,7 @@ void MethodContextReader::ReadExcludedMethods(std::string mchFileName)
curr++;
}
if (hash.length() == MD5_HASH_BUFFER_SIZE - 1)
if (hash.length() == MM3_HASH_BUFFER_SIZE - 1)
{
StringList* node = new StringList();
node->hash = hash;
@ -566,8 +566,8 @@ bool MethodContextReader::IsMethodExcluded(MethodContext* mc)
{
if (excludedMethodsList != nullptr)
{
char md5HashBuf[MD5_HASH_BUFFER_SIZE] = {0};
mc->dumpMethodMD5HashToBuffer(md5HashBuf, MD5_HASH_BUFFER_SIZE);
char md5HashBuf[MM3_HASH_BUFFER_SIZE] = {0};
mc->dumpMethodHashToBuffer(md5HashBuf, MM3_HASH_BUFFER_SIZE);
for (StringList* node = excludedMethodsList; node != nullptr; node = node->next)
{
if (strcmp(node->hash.c_str(), md5HashBuf) == 0)

View file

@ -14,7 +14,7 @@ class TOCElement
public:
__int64 Offset;
int Number;
char Hash[MD5_HASH_BUFFER_SIZE];
char Hash[MM3_HASH_BUFFER_SIZE];
TOCElement()
{

View file

@ -456,7 +456,7 @@ bool CommandLine::Parse(int argc, char* argv[], /* OUT */ Options* o)
return false;
}
if (strlen(argv[i]) != (MD5_HASH_BUFFER_SIZE - 1))
if (strlen(argv[i]) != (MM3_HASH_BUFFER_SIZE - 1))
{
LogError("Arg '%s' is invalid, needed a valid method context hash.", argv[i]);
DumpHelp(argv[0]);

View file

@ -45,8 +45,8 @@ void MethodStatsEmitter::Emit(int methodNumber, MethodContext* mc, ULONGLONG fir
if (strchr(statsTypes, '*') != NULL || strchr(statsTypes, 'h') != NULL || strchr(statsTypes, 'H') != NULL)
{
// Obtain the method Hash
char md5Hash[MD5_HASH_BUFFER_SIZE];
if (mc->dumpMethodMD5HashToBuffer(md5Hash, MD5_HASH_BUFFER_SIZE) != MD5_HASH_BUFFER_SIZE)
char md5Hash[MM3_HASH_BUFFER_SIZE];
if (mc->dumpMethodHashToBuffer(md5Hash, MM3_HASH_BUFFER_SIZE) != MM3_HASH_BUFFER_SIZE)
md5Hash[0] = 0;
charCount += sprintf_s(rowData + charCount, ARRAY_SIZE(rowData) - charCount, "%s,", md5Hash);