Code Style & Taste
A blog where I share thoughts on code and practices

You can skip all the code and assembly and still understand the article. I included it in case you want to reproduce the results on your own machine.

Why People Read Assembly

Hash functions can be called millions and sometimes billions of times in a few seconds. Let's take a look at MurmurHash64A, a good public domain hash function that produces a 64-bit number. What's wrong with this function? It's not a trick question.

u64 MurmurHash64A(const void*data_, s64 len, u64 seed)
{
	const u8*data = reinterpret_cast<const u8*>(data_);
	const u64 m = 0xc6a4a7935bd1e995ULL;
	const int r = 47;

	u64 h = seed ^ (len * m);

	const auto*end=data + (len & -8);

	while(data < end) {
		u64 k;
		memcpy(&k, data, 8);
		data += 8;

		k *= m;
		k ^= k >> r;
		k *= m;

		h ^= k;
		h *= m;
	}
	
	if (len & 7) {
		u64 temp = 0;
		memcpy(&temp, data, len&7);
		h ^= temp;
	}
	h *= m;

	h ^= h >> r;
	h *= m;
	h ^= h >> r;

	return h;
}

There's not a thing wrong with this. Let's take a look at the x86-64 assembly gcc produces:

MurmurHash64A(void const*, long long, unsigned long long):
	mov     r10, rsi
	mov     r9, rdi
	movabs  rsi, -4132994306676758123
	mov     rdi, r10
	mov     r11, r10
	imul    rdi, rsi
	and     r11, -8
	lea     r8, [r9+r11]
	xor     rdi, rdx
	cmp     r9, r8
	jnb     .L2
	mov     rcx, r9
.L3:
	mov     rdx, QWORD PTR [rcx]
	add     rcx, 8
	imul    rdx, rsi
	mov     rax, rdx
	shr     rax, 47
	xor     rax, rdx
	imul    rax, rsi
	xor     rax, rdi
	imul    rax, rsi
	mov     rdi, rax
	cmp     rcx, r8
	jb      .L3
	add     r9, r11
.L2:
	and     r10d, 7
	jne     .L13
.L4:
	movabs  rax, -4132994306676758123
	imul    rdi, rax
	mov     rdx, rdi
	shr     rdx, 47
	xor     rdx, rdi
	imul    rdx, rax
	mov     rax, rdx
	shr     rax, 47
	xor     rax, rdx
	ret
.L13:
	mov     QWORD PTR [rsp-8], 0
	test    r10d, r10d
	je      .L6
	xor     eax, eax
.L5:
	mov     edx, eax
	add     eax, 1
	movzx   ecx, BYTE PTR [r9+rdx]
	mov     BYTE PTR [rsp-8+rdx], cl
	cmp     eax, r10d
	jb      .L5
.L6:
	xor     rdi, QWORD PTR [rsp-8]
	jmp     .L4

Duplicating 0xc6a4a7935bd1e995 (the -4132994306676758123 in the assembly) isn't bad, it's just a little strange. It should cost about a cycle and a little instruction cache, which is well under a nanosecond per call (at 3 GHz a cycle is roughly 0.33 ns). Considering there are a million nanoseconds in a millisecond, it isn't a problem unless you need to do it several million times a second. Let's see how LLVM does:

MurmurHash64A(void const*, long long, unsigned long long):
	push    r14
	push    rbx
	push    rax
	mov     rax, rsi
	mov     rsi, rdi
	movabs  rbx, -4132994306676758123
	mov     rcx, rax
	imul    rcx, rbx
	xor     rcx, rdx
	cmp     rax, 8
	jl      .LBB0_1
	mov     rdx, rax
	and     rdx, -8
	add     rdx, rsi
.LBB0_3:
	mov     rdi, qword ptr [rsi]
	imul    rdi, rbx
	add     rsi, 8
	mov     r14, rdi
	shr     r14, 47
	xor     r14, rdi
	imul    r14, rbx
	xor     r14, rcx
	imul    r14, rbx
	mov     rcx, r14
	cmp     rsi, rdx
	jb      .LBB0_3
	and     rax, 7
	je      .LBB0_6
.LBB0_5:
	mov     qword ptr [rsp], 0
	mov     rdi, rsp
	mov     rdx, rax
	call    memcpy@PLT
	xor     r14, qword ptr [rsp]
.LBB0_6:
	imul    r14, rbx
	mov     rcx, r14
	shr     rcx, 47
	xor     rcx, r14
	imul    rcx, rbx
	mov     rax, rcx
	shr     rax, 47
	xor     rax, rcx
	add     rsp, 8
	pop     rbx
	pop     r14
	ret

This is bad, but we don't know how bad until we measure. There's no reason not to inline memcpy, and you can see the extra pushes at the start of the function because of the call. If you want to see the two registers holding the constant in the gcc build, compile with -O2 and run the line below, though the hardcoded breakpoint address may not match if your environment and compiler are different from mine.

gdb ./a.out -batch -ex "b *0x5555555551fb" -ex r -ex "i r"

Let's try to improve the LLVM-produced code by replacing the second memcpy.

u64 MurmurHash64A_v2(const void*data_, s64 len, u64 seed)
{
	const u8*data = reinterpret_cast<const u8*>(data_);
	const u64 m = 0xc6a4a7935bd1e995ULL;
	const int r = 47;

	u64 h = seed ^ (len * m);

	const auto*end=data + (len & -8);

	while(data < end) {
		u64 k;
		memcpy(&k, data, 8);
		data += 8;

		k *= m;
		k ^= k >> r;
		k *= m;

		h ^= k;
		h *= m;
	}
	
	if (len & 7) {
		// new code
		u64 temp=0;
		for(int i=(len & 7) - 1; i >= 0; i--) {
			temp = (temp<<8) | data[i];
		}
		h ^= temp;
	}
	h *= m;

	h ^= h >> r;
	h *= m;
	h ^= h >> r;

	return h;
}

I'll show the results at the end when we measure everything. For gcc this is better. There are two je .L17 back to back for some reason, and it still loads the constant twice, but when measuring I get better numbers. I'm more concerned about llvm not inlining memcpy. In this version, llvm uses the loop instead of calling memcpy, but the loop is completely unrolled. I'm not sure if that's a good idea, but it might be.
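
As a quick sanity check (this little test program is mine, not part of the benchmark), the byte loop builds the same tail value the original memcpy builds on a little-endian machine: data[0] ends up in the low byte of temp, data[1] in the next one, and so on.

#include <cassert>
#include <string.h>

typedef unsigned long long u64;
typedef unsigned char u8;

int main()
{
	const u8 tail[3] = {0x11, 0x22, 0x33};

	u64 viaMemcpy = 0;
	memcpy(&viaMemcpy, tail, 3);        // original tail handling

	u64 viaLoop = 0;
	for(int i=3 - 1; i >= 0; i--) {     // v2 tail handling
		viaLoop = (viaLoop<<8) | tail[i];
	}

	// Both are 0x332211 on a little-endian machine.
	assert(viaMemcpy == viaLoop);
	return 0;
}

(MurmurHash64A reads the input as native 64-bit words, so the hash itself isn't endian-neutral to begin with; the point is only that v2 and the original agree on the machines the benchmark runs on.)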

I know that for my use case I never call the hash function on an array that doesn't have 8+ readable bytes after it (padding or data I can ignore), so instead of a loop copying 1 byte at a time, I can do an unaligned load. If you dereference an unaligned pointer with the undefined behavior sanitizer enabled you'll get a runtime error. Fortunately, we can solve that by using an atomic load, specifically the weakest ordering (relaxed), which is too weak for most lockless code:

__atomic_load_n((u64*)data, __ATOMIC_RELAXED)
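
This always reads a full 8 bytes, so the bytes beyond len & 7 have to be masked off before the XOR. Here's a minimal sketch of what the tail handling becomes (it's the same thing v3 in impl.cpp below does), assuming the buffer really does have 8+ readable bytes from data:

	if (len & 7) {
		// Read 8 bytes even though only (len & 7) of them belong to the input;
		// this is why the buffer needs the extra readable padding after it.
		u64 temp = __atomic_load_n((u64*)data, __ATOMIC_RELAXED);
		// Keep only the valid low bytes: for len & 7 == 3 the mask is
		// (1ULL << 24) - 1, i.e. 0xFFFFFF.
		temp &= (1ULL << (len & 7)*8) - 1;
		h ^= temp;
	}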

The gcc output no longer has the duplicate je or the byte loop, and the clang output also looks better. Let's measure all three functions. For this we'll use nanobench; we only need its single header file.

https://godbolt.org/z/5v4qoTEdf, using "-O2 -march=x86-64-v4"
main.cpp
#include <stdio.h>
#include <cassert>
#include <cstdlib>
#include <vector>
#define ANKERL_NANOBENCH_IMPLEMENT
#include "nanobench.h"

typedef unsigned long long u64;
typedef long long s64;
typedef unsigned char u8;

u64 MurmurHash64A(const void*data_, s64 len, u64 seed);
u64 MurmurHash64A_v2(const void*data_, s64 len, u64 seed);
u64 MurmurHash64A_v3(const void*data_, s64 len, u64 seed);
int main(int argc, char*argv[])
{
	auto hashOrig = MurmurHash64A("MurmurHash64A-test", 8, 0x9714F115FCA80DE7);
	auto hashOrig2 = MurmurHash64A_v2("MurmurHash64A-test", 8, 0x9714F115FCA80DE7);
	auto hashOrig3 = MurmurHash64A_v3("MurmurHash64A-test", 8, 0x9714F115FCA80DE7);
	assert(hashOrig == hashOrig2);
	assert(hashOrig == hashOrig3);
	std::vector<u8> v;
	srand(0x12345678);
	for(int i=0; i<1<<20; i++) {
		v.push_back(rand() & 15);
	}
	
	int i;
	for(int repeat = 0; repeat<2; repeat++) {
		i=0;
		ankerl::nanobench::Bench().run("Original", [&] { ankerl::nanobench::doNotOptimizeAway(MurmurHash64A("a string that isn't big", 18 - v[i & v.size()-1], 0x9714F115FCA80DE7)); });
		i=0;
		ankerl::nanobench::Bench().run("v2", [&] { ankerl::nanobench::doNotOptimizeAway(MurmurHash64A_v2("a string that isn't big", 18 - v[i & v.size()-1], 0x9714F115FCA80DE7)); });
		ankerl::nanobench::Bench().run("v3", [&] { ankerl::nanobench::doNotOptimizeAway(MurmurHash64A_v2("a string that isn't big", 18 - v[i & v.size()-1], 0x9714F115FCA80DE7)); });
	}
}
impl.cpp
#include <string.h>
typedef unsigned long long u64;
typedef long long s64;
typedef unsigned char u8;

u64 MurmurHash64A(const void*data_, s64 len, u64 seed)
{
	const u8*data = reinterpret_cast<const u8*>(data_);
	const u64 m = 0xc6a4a7935bd1e995ULL;
	const int r = 47;

	u64 h = seed ^ (len * m);

	const auto*end=data + (len & -8);

	while(data < end) {
		u64 k;
		memcpy(&k, data, 8);
		data += 8;

		k *= m;
		k ^= k >> r;
		k *= m;

		h ^= k;
		h *= m;
	}
	
	if (len & 7) {
		u64 temp = 0;
		memcpy(&temp, data, len&7);
		h ^= temp;
	}
	h *= m;

	h ^= h >> r;
	h *= m;
	h ^= h >> r;

	return h;
}

u64 MurmurHash64A_v2(const void*data_, s64 len, u64 seed)
{
	const u8*data = reinterpret_cast<const u8*>(data_);
	const u64 m = 0xc6a4a7935bd1e995ULL;
	const int r = 47;

	u64 h = seed ^ (len * m);

	const auto*end=data + (len & -8);

	while(data < end) {
		u64 k;
		memcpy(&k, data, 8);
		data += 8;

		k *= m;
		k ^= k >> r;
		k *= m;

		h ^= k;
		h *= m;
	}
	
	if (len & 7) {
		u64 temp=0;
		for(int i=(len & 7) - 1; i >= 0; i--) {
			temp = (temp<<8) | data[i];
		}
		h ^= temp;
	}
	h *= m;

	h ^= h >> r;
	h *= m;
	h ^= h >> r;

	return h;
}

u64 MurmurHash64A_v3(const void*data_, s64 len, u64 seed)
{
	const u8*data = reinterpret_cast<const u8*>(data_);
	const u64 m = 0xc6a4a7935bd1e995ULL;
	const int r = 47;

	u64 h = seed ^ (len * m);

	const auto*end=data + (len & -8);

	while(data < end) {
		u64 k;
		memcpy(&k, data, 8);
		data += 8;

		k *= m;
		k ^= k >> r;
		k *= m;

		h ^= k;
		h *= m;
	}
	
	if (len & 7) {
		u64 temp = __atomic_load_n((u64*)data, __ATOMIC_RELAXED);
		temp &= (1ULL << (len&7)*8)-1;
		h ^= temp;
	}
	h *= m;

	h ^= h >> r;
	h *= m;
	h ^= h >> r;

	return h;
}

Here are my results using clang++ -O2 -g main.cpp impl.cpp. On gcc, v2 is a tiny bit slower than v3, but both the clang and gcc builds are very fast and about the same speed.

|               ns/op |                op/s |    err% |          ins/op |          cyc/op |    IPC |         bra/op |   miss% |     total | benchmark
|--------------------:|--------------------:|--------:|----------------:|----------------:|-------:|---------------:|--------:|----------:|:----------
|                9.24 |      108,282,640.08 |    0.1% |           82.00 |           37.01 |  2.216 |          13.00 |    0.0% |      0.01 | `Original`
|                5.27 |      189,682,784.59 |    0.1% |           87.00 |           20.99 |  4.144 |          11.00 |    0.0% |      0.01 | `v2`
|                5.31 |      188,296,425.14 |    0.3% |           87.00 |           21.00 |  4.144 |          11.00 |    0.0% |      0.01 | `v3`
|                9.29 |      107,613,999.16 |    0.2% |           82.00 |           37.01 |  2.216 |          13.00 |    0.0% |      0.01 | `Original`
|                5.28 |      189,504,945.68 |    0.4% |           87.00 |           20.99 |  4.144 |          11.00 |    0.0% |      0.01 | `v2`
|                5.27 |      189,666,894.59 |    0.1% |           87.00 |           21.00 |  4.144 |          11.00 |    0.0% |      0.01 | `v3`

Since version 3 is the same speed as version 2 (in llvm, and almost the same in gcc), and version 2 works in more situations, I'd stick with v2. v3 has the best and nearly identical results in both gcc and clang, roughly 40% faster than the original (9.24 ns down to about 5.3 ns per hash in the table above).

The reason people look at assembly is that optimizers sometimes do funny things, and it can take fewer than 5 lines of C code to correct them.