You can skip all the code and assembly and still understand the article. I included it in case you want to reproduce the results on your own machine.
Why People Read Assembly
Hash functions can be called millions, sometimes billions, of times in a few seconds. Let's take a look at MurmurHash64A, a good public-domain hash function that produces a 64-bit number. What's wrong with this function? It's not a trick question.
u64 MurmurHash64A(const void*data_, s64 len, u64 seed)
{
    const u8*data = reinterpret_cast<const u8*>(data_);
    const u64 m = 0xc6a4a7935bd1e995ULL;
    const int r = 47;
    u64 h = seed ^ (len * m);
    const auto*end=data + (len & -8);
    while(data < end) {
        u64 k;
        memcpy(&k, data, 8);
        data += 8;
        k *= m;
        k ^= k >> r;
        k *= m;
        h ^= k;
        h *= m;
    }
    if (len & 7) {
        u64 temp = 0;
        memcpy(&temp, data, len&7);
        h ^= temp;
    }
    h *= m;
    h ^= h >> r;
    h *= m;
    h ^= h >> r;
    return h;
}
There's not a thing wrong with it. Let's take a look at the x86-64 assembly GCC produces:
MurmurHash64A(void const*, long long, unsigned long long):
mov r10, rsi
mov r9, rdi
movabs rsi, -4132994306676758123
mov rdi, r10
mov r11, r10
imul rdi, rsi
and r11, -8
lea r8, [r9+r11]
xor rdi, rdx
cmp r9, r8
jnb .L2
mov rcx, r9
.L3:
mov rdx, QWORD PTR [rcx]
add rcx, 8
imul rdx, rsi
mov rax, rdx
shr rax, 47
xor rax, rdx
imul rax, rsi
xor rax, rdi
imul rax, rsi
mov rdi, rax
cmp rcx, r8
jb .L3
add r9, r11
.L2:
and r10d, 7
jne .L13
.L4:
movabs rax, -4132994306676758123
imul rdi, rax
mov rdx, rdi
shr rdx, 47
xor rdx, rdi
imul rdx, rax
mov rax, rdx
shr rax, 47
xor rax, rdx
ret
.L13:
mov QWORD PTR [rsp-8], 0
test r10d, r10d
je .L6
xor eax, eax
.L5:
mov edx, eax
add eax, 1
movzx ecx, BYTE PTR [r9+rdx]
mov BYTE PTR [rsp-8+rdx], cl
cmp eax, r10d
jb .L5
.L6:
xor rdi, QWORD PTR [rsp-8]
jmp .L4

Duplicating 0xc6a4a7935bd1e995 (that's the -4132994306676758123) isn't bad, it's just a little strange. It costs about a cycle and a few bytes of instruction cache, which works out to well under a nanosecond per call (on a typical ~4 GHz core, one cycle is roughly 0.25 ns). Considering there are a million nanoseconds in a millisecond, it isn't a problem unless you need to do it several million times a second. Let's see how LLVM does:
MurmurHash64A(void const*, long long, unsigned long long):
push r14
push rbx
push rax
mov rax, rsi
mov rsi, rdi
movabs rbx, -4132994306676758123
mov rcx, rax
imul rcx, rbx
xor rcx, rdx
cmp rax, 8
jl .LBB0_1
mov rdx, rax
and rdx, -8
add rdx, rsi
.LBB0_3:
mov rdi, qword ptr [rsi]
imul rdi, rbx
add rsi, 8
mov r14, rdi
shr r14, 47
xor r14, rdi
imul r14, rbx
xor r14, rcx
imul r14, rbx
mov rcx, r14
cmp rsi, rdx
jb .LBB0_3
and rax, 7
je .LBB0_6
.LBB0_5:
mov qword ptr [rsp], 0
mov rdi, rsp
mov rdx, rax
call memcpy@PLT
xor r14, qword ptr [rsp]
.LBB0_6:
imul r14, rbx
mov rcx, r14
shr rcx, 47
xor rcx, r14
imul rcx, rbx
mov rax, rcx
shr rax, 47
xor rax, rcx
add rsp, 8
pop rbx
pop r14
ret

This is bad, but we don't know how bad until we measure. There's no reason not to inline that memcpy, and you can see its cost in the multiple pushes at the start of the function. If you want to see the two registers holding the constant in the GCC version, compile with -O2 and run the line below, though it may not work if your environment and compiler are different from mine:
gdb ./a.out -batch -ex "b *0x5555555551fb" -ex r -ex "i r"
Let's try to improve the LLVM-produced code by replacing the second memcpy.
u64 MurmurHash64A_v2(const void*data_, s64 len, u64 seed)
{
    const u8*data = reinterpret_cast<const u8*>(data_);
    const u64 m = 0xc6a4a7935bd1e995ULL;
    const int r = 47;
    u64 h = seed ^ (len * m);
    const auto*end=data + (len & -8);
    while(data < end) {
        u64 k;
        memcpy(&k, data, 8);
        data += 8;
        k *= m;
        k ^= k >> r;
        k *= m;
        h ^= k;
        h *= m;
    }
    if (len & 7) {
        // new code
        u64 temp=0;
        for(int i=(len & 7) - 1; i >= 0; i--) {
            temp = (temp<<8) | data[i];
        }
        h ^= temp;
    }
    h *= m;
    h ^= h >> r;
    h *= m;
    h ^= h >> r;
    return h;
}
I'll show the results at the end when we measure everything. For GCC this is better: there are two je .L17 instructions back to back for some reason, and it still loads the constant twice, but when measuring I get better numbers. I was more concerned about LLVM not inlining memcpy; in this version, LLVM uses the loop instead of calling memcpy, but the loop is completely unrolled. I'm not sure whether that's a good idea, but it might be.
I know that for my use case I never call the hash function on an array that doesn't have 8+ readable bytes past the end of the data (padding or data I can ignore), so instead of a loop copying one byte at a time, I can do an unaligned load. If you dereference an unaligned pointer with the undefined behavior sanitizer enabled, you'll get a runtime error. Fortunately, we can avoid that by using an atomic load, specifically the relaxed one, the weakest ordering, which shouldn't be used in most lockless code:
__atomic_load_n((u64*)data, __ATOMIC_RELAXED)
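Putting that together, the tail handling turns into one 8-byte load plus a mask that keeps only the bytes we actually own. Here's a minimal sketch of that block (it assumes a little-endian target like x86-64; the full function appears as MurmurHash64A_v3 in impl.cpp below):
if (len & 7) {
    // Load 8 bytes even though only (len & 7) of them belong to the input.
    // Only safe because, for my data, at least 8 readable bytes always exist
    // at this position (padding or data I can ignore).
    u64 temp = __atomic_load_n((const u64*)data, __ATOMIC_RELAXED);
    // Keep just the low (len & 7) bytes: a mask of (len & 7) * 8 one bits.
    temp &= (1ULL << ((len & 7) * 8)) - 1;
    h ^= temp;
}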
With this change, the GCC output no longer has the duplicate je or the byte loop, and the Clang output also looks better. Let's measure all three functions. For this we'll use nanobench; we only need its single header file.
https://godbolt.org/z/5v4qoTEdf, using "-O2 -march=x86-64-v4"
main.cpp
#include <stdio.h>
#include <cassert>
#include <cstdlib>
#include <vector>
#define ANKERL_NANOBENCH_IMPLEMENT
#include "nanobench.h"
typedef unsigned long long u64;
typedef long long s64;
typedef unsigned char u8;
u64 MurmurHash64A(const void*data_, s64 len, u64 seed);
u64 MurmurHash64A_v2(const void*data_, s64 len, u64 seed);
u64 MurmurHash64A_v3(const void*data_, s64 len, u64 seed);
int main(int argc, char*argv[])
{
    auto hashOrig = MurmurHash64A("MurmurHash64A-test", 8, 0x9714F115FCA80DE7);
    auto hashOrig2 = MurmurHash64A_v2("MurmurHash64A-test", 8, 0x9714F115FCA80DE7);
    auto hashOrig3 = MurmurHash64A_v3("MurmurHash64A-test", 8, 0x9714F115FCA80DE7);
    assert(hashOrig == hashOrig2);
    assert(hashOrig == hashOrig3);
    std::vector<u8> v;
    srand(0x12345678);
    for(int i=0; i<1<<20; i++) {
        v.push_back(rand() & 15);
    }
    int i;
    for(int repeat = 0; repeat<2; repeat++) {
        i=0;
        ankerl::nanobench::Bench().run("Original", [&] { ankerl::nanobench::doNotOptimizeAway(MurmurHash64A("a string that isn't big", 18 - v[i & v.size()-1], 0x9714F115FCA80DE7)); });
        i=0;
        ankerl::nanobench::Bench().run("v2", [&] { ankerl::nanobench::doNotOptimizeAway(MurmurHash64A_v2("a string that isn't big", 18 - v[i & v.size()-1], 0x9714F115FCA80DE7)); });
        i=0;
        ankerl::nanobench::Bench().run("v3", [&] { ankerl::nanobench::doNotOptimizeAway(MurmurHash64A_v3("a string that isn't big", 18 - v[i & v.size()-1], 0x9714F115FCA80DE7)); });
    }
}
impl.cpp
#include <string.h>
typedef unsigned long long u64;
typedef long long s64;
typedef unsigned char u8;
u64 MurmurHash64A(const void*data_, s64 len, u64 seed)
{
    const u8*data = reinterpret_cast<const u8*>(data_);
    const u64 m = 0xc6a4a7935bd1e995ULL;
    const int r = 47;
    u64 h = seed ^ (len * m);
    const auto*end=data + (len & -8);
    while(data < end) {
        u64 k;
        memcpy(&k, data, 8);
        data += 8;
        k *= m;
        k ^= k >> r;
        k *= m;
        h ^= k;
        h *= m;
    }
    if (len & 7) {
        u64 temp = 0;
        memcpy(&temp, data, len&7);
        h ^= temp;
    }
    h *= m;
    h ^= h >> r;
    h *= m;
    h ^= h >> r;
    return h;
}
u64 MurmurHash64A_v2(const void*data_, s64 len, u64 seed)
{
    const u8*data = reinterpret_cast<const u8*>(data_);
    const u64 m = 0xc6a4a7935bd1e995ULL;
    const int r = 47;
    u64 h = seed ^ (len * m);
    const auto*end=data + (len & -8);
    while(data < end) {
        u64 k;
        memcpy(&k, data, 8);
        data += 8;
        k *= m;
        k ^= k >> r;
        k *= m;
        h ^= k;
        h *= m;
    }
    if (len & 7) {
        u64 temp=0;
        for(int i=(len & 7) - 1; i >= 0; i--) {
            temp = (temp<<8) | data[i];
        }
        h ^= temp;
    }
    h *= m;
    h ^= h >> r;
    h *= m;
    h ^= h >> r;
    return h;
}
u64 MurmurHash64A_v3(const void*data_, s64 len, u64 seed)
{
    const u8*data = reinterpret_cast<const u8*>(data_);
    const u64 m = 0xc6a4a7935bd1e995ULL;
    const int r = 47;
    u64 h = seed ^ (len * m);
    const auto*end=data + (len & -8);
    while(data < end) {
        u64 k;
        memcpy(&k, data, 8);
        data += 8;
        k *= m;
        k ^= k >> r;
        k *= m;
        h ^= k;
        h *= m;
    }
    if (len & 7) {
        u64 temp = __atomic_load_n((u64*)data, __ATOMIC_RELAXED);
        temp &= (1ULL << (len & 7) * 8) - 1;
        h ^= temp;
    }
    h *= m;
    h ^= h >> r;
    h *= m;
    h ^= h >> r;
    return h;
}
Here are my results using clang++ -O2 -g main.cpp impl.cpp. On GCC, v2 is a tiny bit slower than v3, but the Clang and GCC builds are both very fast and about the same speed:
| ns/op | op/s | err% | ins/op | cyc/op | IPC | bra/op | miss% | total | benchmark
|--------------------:|--------------------:|--------:|----------------:|----------------:|-------:|---------------:|--------:|----------:|:----------
| 9.24 | 108,282,640.08 | 0.1% | 82.00 | 37.01 | 2.216 | 13.00 | 0.0% | 0.01 | `Original`
| 5.27 | 189,682,784.59 | 0.1% | 87.00 | 20.99 | 4.144 | 11.00 | 0.0% | 0.01 | `v2`
| 5.31 | 188,296,425.14 | 0.3% | 87.00 | 21.00 | 4.144 | 11.00 | 0.0% | 0.01 | `v3`
| 9.29 | 107,613,999.16 | 0.2% | 82.00 | 37.01 | 2.216 | 13.00 | 0.0% | 0.01 | `Original`
| 5.28 | 189,504,945.68 | 0.4% | 87.00 | 20.99 | 4.144 | 11.00 | 0.0% | 0.01 | `v2`
| 5.27 | 189,666,894.59 | 0.1% | 87.00 | 21.00 | 4.144 | 11.00 | 0.0% | 0.01 | `v3`
Since v3 is the same speed as v2 under LLVM (and almost the same under GCC), and v2 works in more situations, I'd stick with v2. v3 does give the best, and nearly identical, numbers on both GCC and Clang, about 40% faster than the initial version.
The reason people look at assembly is that optimizers sometimes do funny things, and it can take fewer than 5 lines of C code to correct them.