From ab5bbcecc5ed4d60848496c61cd3c99d1a5789e4 Mon Sep 17 00:00:00 2001 From: lukechampine Date: Thu, 11 Oct 2018 17:48:17 -0400 Subject: [PATCH] speedup Block by casting to array, not slice --- README.md | 8 ++++---- xor_amd64.go | 6 +++--- xor_test.go | 6 ++++-- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 551589c..b17e9fe 100644 --- a/README.md +++ b/README.md @@ -53,9 +53,9 @@ roughly twice as fast as SSE, which is unsurpising since it can operate on twice as many bits per cycle. Lastly, for very small slices, the cost of the function call starts to outweigh the benefit of AVX/SSE (the Go compiler never inlines handwritten asm). If you need to xor exactly 16 bytes (common in block -ciphers), the specialized `Block` function outperforms the more generic -`Bytes`: +ciphers), the specialized `Block` function is about 6 times faster than the +more generic `Bytes`: ``` -BenchmarkBlock-4 1000000000 2.72 ns/op 5888.02 MB/s -``` \ No newline at end of file +BenchmarkBlock-4 2000000000 1.18 ns/op 13546.30 MB/s +``` diff --git a/xor_amd64.go b/xor_amd64.go index a08ea7b..cb2ed16 100644 --- a/xor_amd64.go +++ b/xor_amd64.go @@ -97,9 +97,9 @@ func Byte(dst, a []byte, b byte) int { func Block(dst, a, b []byte) { // profiling indicates that for 16-byte blocks, the cost of a function // call outweighs the SSE/AVX speedup - dw := *(*[]uintptr)(unsafe.Pointer(&dst)) - aw := *(*[]uintptr)(unsafe.Pointer(&a)) - bw := *(*[]uintptr)(unsafe.Pointer(&b)) + dw := (*[2]uintptr)(unsafe.Pointer(&dst[0])) + aw := (*[2]uintptr)(unsafe.Pointer(&a[0])) + bw := (*[2]uintptr)(unsafe.Pointer(&b[0])) dw[0] = aw[0] ^ bw[0] dw[1] = aw[1] ^ bw[1] } diff --git a/xor_test.go b/xor_test.go index 6cf2c0f..bf4e2b3 100644 --- a/xor_test.go +++ b/xor_test.go @@ -181,10 +181,12 @@ func BenchmarkRefByte(b *testing.B) { } func BenchmarkBlock(b *testing.B) { - buf := make([]byte, 16) + dst := make([]byte, 16) + bufA := make([]byte, 16) + bufB := make([]byte, 16) b.SetBytes(16) for i := 0; i < b.N; i++ { - Block(buf, buf, buf) + Block(dst, bufA, bufB) } }