commit a04e125b73fadcf0e63e37a266a2e944d8c91172 Author: lukechampine Date: Thu Jul 5 17:52:38 2018 -0400 initial commit diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..892209a --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2018 Luke Champine + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..61b2c73 --- /dev/null +++ b/README.md @@ -0,0 +1,61 @@ +fastxor +----- + +[![GoDoc](https://godoc.org/github.com/lukechampine/fastxor?status.svg)](https://godoc.org/github.com/lukechampine/fastxor) +[![Go Report Card](http://goreportcard.com/badge/github.com/lukechampine/fastxor)](https://goreportcard.com/report/github.com/lukechampine/fastxor) + +``` +go get github.com/lukechampine/fastxor +``` + +Is there a gaping hole in your heart that can only be filled by xor'ing byte +streams at 20GB/s? If so, you've come to the right place. 
+ +`fastxor` is exactly what it sounds like: a package that xors bytes as fast +as your CPU is capable of. For best results, use a CPU that supports a SIMD +instruction set like SSE or AVX. On other architectures, performance is much +less impressive, but still faster than a naive byte-wise loop. + +I wrote this package to try my hand at writing Go assembly, so please scrutinize +my code and let me know how I could make it faster or cleaner! + + +# Benchmarks + +``` +AVX: + +BenchmarkBytes/16-4 200000000 8.72 ns/op 1835.82 MB/s +BenchmarkBytes/1024-4 50000000 38.1 ns/op 26850.41 MB/s +BenchmarkBytes/65k-4 500000 2738 ns/op 23930.93 MB/s + +SSE: + +BenchmarkBytes/16-4 200000000 8.63 ns/op 1852.98 MB/s +BenchmarkBytes/1024-4 50000000 39.4 ns/op 25993.00 MB/s +BenchmarkBytes/65k-4 500000 2733 ns/op 23975.08 MB/s + +Word-wise: + +BenchmarkBytes/16-4 100000000 10.5 ns/op 1521.66 MB/s +BenchmarkBytes/1024-4 10000000 125 ns/op 8163.59 MB/s +BenchmarkBytes/65k-4 200000 6895 ns/op 9504.62 MB/s + +Byte-wise: + +BenchmarkBytes/16-4 100000000 17.3 ns/op 925.16 MB/s +BenchmarkBytes/1024-4 2000000 841 ns/op 1216.31 MB/s +BenchmarkBytes/65k-4 30000 54100 ns/op 1211.38 MB/s +``` + +Conclusions: `fastxor` is 2-25 times faster than a naive `for` loop. AVX and +SSE performance is roughly equivalent, which makes me suspect that I may be +doing something wrong. Lastly, for very small slices, the cost of the function +call starts to outweigh the benefit of AVX/SSE (the Go compiler never inlines +handwritten asm). 
// Bytes stores (a xor b) in dst, stopping when the end of any slice is
// reached. It returns the number of bytes xor'd.
func Bytes(dst, a, b []byte) int {
	// The xor'd length is bounded by the shortest of the three slices.
	n := len(dst)
	if len(a) < n {
		n = len(a)
	}
	if len(b) < n {
		n = len(b)
	}
	for i := range dst[:n] {
		dst[i] = a[i] ^ b[i]
	}
	return n
}

// Byte xors each byte in a with b and stores the result in dst, stopping when
// the end of either dst or a is reached. It returns the number of bytes
// xor'd.
func Byte(dst, a []byte, b byte) int {
	// Only dst and a bound the length; b is a single byte.
	n := len(dst)
	if len(a) < n {
		n = len(a)
	}
	for i, v := range a[:n] {
		dst[i] = v ^ b
	}
	return n
}
// wordSize is the size in bytes of a machine word on this platform.
const wordSize = int(unsafe.Sizeof(uintptr(0)))

// min returns the smallest of a, b, and c.
func min(a, b, c int) int {
	m := a
	if b < m {
		m = b
	}
	if c < m {
		m = c
	}
	return m
}

// xorBytesGeneric stores (a xor b) in dst, xor'ing word-at-a-time where
// possible and byte-at-a-time for the remainder. The caller must guarantee
// that dst, a, and b each hold at least n bytes.
func xorBytesGeneric(dst, a, b []byte, n int) {
	// Assert dst has enough space.
	_ = dst[n-1]

	nw := n / wordSize
	if nw > 0 {
		// Reinterpret the byte slices as word slices. The length field of
		// each reinterpreted header is still the byte count, but only the
		// first nw entries are touched, which is within bounds since
		// nw*wordSize <= n.
		dstw := *(*[]uintptr)(unsafe.Pointer(&dst))
		aaw := *(*[]uintptr)(unsafe.Pointer(&a))
		bbw := *(*[]uintptr)(unsafe.Pointer(&b))
		for i := range dstw[:nw] {
			dstw[i] = aaw[i] ^ bbw[i]
		}
	}

	// Handle the trailing bytes that don't fill a whole word.
	for i := nw * wordSize; i < n; i++ {
		dst[i] = a[i] ^ b[i]
	}
}
// Byte xors each byte in a with b and stores the result in dst, stopping when
// the end of either dst or a is reached. It returns the number of bytes
// xor'd. The bulk of the work is done word-at-a-time by broadcasting b into
// every byte of a machine word.
func Byte(dst, a []byte, b byte) int {
	n := len(a)
	if len(dst) < n {
		n = len(dst)
	}

	// Broadcast b into every byte of a word.
	const ws = int(unsafe.Sizeof(uintptr(0)))
	var bw uintptr
	for i := 0; i < ws; i++ {
		bw |= uintptr(b) << uint(i*8)
	}

	w := n / ws
	if w > 0 {
		// Reinterpret the byte slices as word slices; only the first w
		// entries are accessed, which is in bounds since w*ws <= n.
		dw := *(*[]uintptr)(unsafe.Pointer(&dst))
		aw := *(*[]uintptr)(unsafe.Pointer(&a))
		for i := 0; i < w; i++ {
			dw[i] = aw[i] ^ bw
		}
	}

	// Finish the trailing bytes that don't fill a whole word.
	for i := n - n%ws; i < n; i++ {
		dst[i] = a[i] ^ b
	}

	return n
}

// Block stores (a xor b) in dst, where a, b, and dst all have length 16.
func Block(dst, a, b []byte) {
	// Bounds assertions, matching the generic implementation of Block: the
	// reinterpreted word slices below keep the *byte* count in their length
	// field, so without these checks a dst as short as 2 bytes would pass
	// the slice bounds check and the stores would write out of range.
	_ = dst[15]
	_ = a[15]
	_ = b[15]

	// profiling indicates that for 16-byte blocks, the cost of a function
	// call outweighs the SSE/AVX speedup, so xor two 8-byte words directly
	// (this file is amd64-only, so a word is 8 bytes).
	dw := *(*[]uintptr)(unsafe.Pointer(&dst))
	aw := *(*[]uintptr)(unsafe.Pointer(&a))
	bw := *(*[]uintptr)(unsafe.Pointer(&b))
	dw[0] = aw[0] ^ bw[0]
	dw[1] = aw[1] ^ bw[1]
}
// func xorBytesAVX(dst, a, b []byte, n int)
//
// Stores (a xor b) in dst using 128-bit VEX-encoded (AVX) instructions.
// Peels off 128-byte, then 64-byte, then 16-byte chunks, and finishes any
// remainder byte-by-byte. The caller guarantees dst, a, and b each hold at
// least n bytes and that n > 0 (the byte loop at the end decrements before
// testing, so n == 0 would underflow).
// Argument offsets follow the stack-based ABI: each slice header is 24 bytes
// (data/len/cap), so dst_data is at 0(FP), a_data at 24(FP), b_data at
// 48(FP), and n at 72(FP).
TEXT ·xorBytesAVX(SB),NOSPLIT,$0
	MOVQ dst_data+0(FP), Dst
	MOVQ a_data+24(FP), A
	MOVQ b_data+48(FP), B
	MOVQ n+72(FP), N

XOR_LOOP_128_AVX:
	// Process 128 bytes per iteration using all eight X registers.
	CMPQ N, $128
	JB XOR_LOOP_64_AVX

	VMOVDQU (A), X0
	VMOVDQU 16(A), X1
	VMOVDQU 32(A), X2
	VMOVDQU 48(A), X3
	VMOVDQU 64(A), X4
	VMOVDQU 80(A), X5
	VMOVDQU 96(A), X6
	VMOVDQU 112(A), X7

	// VPXOR takes a memory operand, so b is never loaded into a register.
	VPXOR (B), X0, X0
	VPXOR 16(B), X1, X1
	VPXOR 32(B), X2, X2
	VPXOR 48(B), X3, X3
	VPXOR 64(B), X4, X4
	VPXOR 80(B), X5, X5
	VPXOR 96(B), X6, X6
	VPXOR 112(B), X7, X7

	VMOVDQU X0, (Dst)
	VMOVDQU X1, 16(Dst)
	VMOVDQU X2, 32(Dst)
	VMOVDQU X3, 48(Dst)
	VMOVDQU X4, 64(Dst)
	VMOVDQU X5, 80(Dst)
	VMOVDQU X6, 96(Dst)
	VMOVDQU X7, 112(Dst)

	ADDQ $128, A
	ADDQ $128, B
	ADDQ $128, Dst
	SUBQ $128, N
	// If exactly 0 bytes remain we are done; otherwise re-test at the top
	// (which falls through to the smaller loops when N < 128).
	JNZ XOR_LOOP_128_AVX
	RET

XOR_LOOP_64_AVX:
	// Process 64 bytes per iteration. Loads use MOVOU here; NOTE(review):
	// only XMM registers are touched in this file, so no VZEROUPPER should
	// be needed for SSE/AVX transition penalties — confirm on target CPUs.
	CMPQ N, $64
	JB XOR_LOOP_16_AVX

	MOVOU (A), X0
	MOVOU 16(A), X1
	MOVOU 32(A), X2
	MOVOU 48(A), X3

	VPXOR (B), X0, X4
	VPXOR 16(B), X1, X5
	VPXOR 32(B), X2, X6
	VPXOR 48(B), X3, X7

	VMOVDQU X4, (Dst)
	VMOVDQU X5, 16(Dst)
	VMOVDQU X6, 32(Dst)
	VMOVDQU X7, 48(Dst)

	ADDQ $64, A
	ADDQ $64, B
	ADDQ $64, Dst
	SUBQ $64, N
	JNZ XOR_LOOP_64_AVX
	RET

XOR_LOOP_16_AVX:
	// Process one 16-byte vector per iteration.
	CMPQ N, $16
	JB XOR_LOOP_FINAL_AVX
	MOVOU (A), X0
	VPXOR (B), X0, X1
	VMOVDQU X1, (Dst)
	ADDQ $16, A
	ADDQ $16, B
	ADDQ $16, Dst
	SUBQ $16, N
	JNZ XOR_LOOP_16_AVX
	RET

XOR_LOOP_FINAL_AVX:
	// Byte-at-a-time tail for the final N < 16 bytes.
	MOVB (A), AL
	MOVB (B), BL
	XORB AL, BL
	MOVB BL, (Dst)
	INCQ A
	INCQ B
	INCQ Dst
	DECQ N
	JNZ XOR_LOOP_FINAL_AVX
	RET
// refBytes is the reference (byte-wise) implementation of Bytes, used to
// validate the optimized versions. It stores (a xor b) in dst, stopping at
// the end of the shortest slice, and returns the number of bytes xor'd.
func refBytes(dst, a, b []byte) int {
	n := len(dst)
	if len(a) < n {
		n = len(a)
	}
	if len(b) < n {
		n = len(b)
	}
	for i := range dst[:n] {
		dst[i] = a[i] ^ b[i]
	}
	return n
}

// refByte is the reference implementation of Byte. It xors each byte of a
// with b, stopping at the end of dst or a, and returns the count.
func refByte(dst, a []byte, b byte) int {
	n := len(dst)
	if len(a) < n {
		n = len(a)
	}
	for i, v := range a[:n] {
		dst[i] = v ^ b
	}
	return n
}

// refBlock is the reference implementation of Block: a 16-byte xor.
func refBlock(dst, a, b []byte) {
	// Assert all three slices hold at least 16 bytes.
	_ = dst[15]
	_ = a[15]
	_ = b[15]

	for i := 0; i < 16; i++ {
		dst[i] = a[i] ^ b[i]
	}
}
+ if len(a) < 8 { + return true + } + // shift alignment randomly + a = a[(a[0] % 8):] + + dst1 := make([]byte, len(a)) + dst2 := make([]byte, len(a)) + Bytes(dst1, a, b) + refBytes(dst2, a, b) + return bytes.Equal(dst1, dst2) + }, &quick.Config{MaxCount: 10000}) + if err != nil { + t.Fatal(err) + } +} + +func TestByte(t *testing.T) { + err := quick.Check(func(a []byte, b byte) bool { + if len(a) < 8 { + return true + } + // shift alignment randomly + a = a[(a[0] % 8):] + + dst1 := make([]byte, len(a)) + dst2 := make([]byte, len(a)) + Byte(dst1, a, b) + refByte(dst2, a, b) + return bytes.Equal(dst1, dst2) + }, &quick.Config{MaxCount: 10000}) + if err != nil { + t.Fatal(err) + } +} + +func TestBlock(t *testing.T) { + err := quick.Check(func(a, b [16]byte) bool { + dst1 := make([]byte, len(a)) + dst2 := make([]byte, len(a)) + Block(dst1, a[:], b[:]) + refBlock(dst2, a[:], b[:]) + return bytes.Equal(dst1, dst2) + }, &quick.Config{MaxCount: 10000}) + if err != nil { + t.Fatal(err) + } +} + +func BenchmarkBytes(b *testing.B) { + benchN := func(n int) func(*testing.B) { + return func(b *testing.B) { + buf := make([]byte, n) + b.SetBytes(int64(len(buf))) + for i := 0; i < b.N; i++ { + Bytes(buf, buf, buf) + } + } + } + b.Run("16", benchN(16)) + b.Run("1024", benchN(1024)) + b.Run("65k", benchN(65536)) +} + +func BenchmarkRefBytes(b *testing.B) { + benchN := func(n int) func(*testing.B) { + return func(b *testing.B) { + buf := make([]byte, n) + b.SetBytes(int64(len(buf))) + for i := 0; i < b.N; i++ { + refBytes(buf, buf, buf) + } + } + } + b.Run("16", benchN(16)) + b.Run("1024", benchN(1024)) + b.Run("65k", benchN(65536)) +} + +func BenchmarkByte(b *testing.B) { + benchN := func(n int) func(*testing.B) { + return func(b *testing.B) { + buf := make([]byte, n) + b.SetBytes(int64(len(buf))) + for i := 0; i < b.N; i++ { + Byte(buf, buf, 'b') + } + } + } + b.Run("16", benchN(16)) + b.Run("1024", benchN(1024)) + b.Run("65k", benchN(65536)) +} + +func BenchmarkRefByte(b 
// wordSize is the size in bytes of a machine word on this platform.
const wordSize = int(unsafe.Sizeof(uintptr(0)))

// Bytes stores (a xor b) in dst, stopping when the end of any slice is
// reached. It returns the number of bytes xor'd.
func Bytes(dst, a, b []byte) int {
	n := len(a)
	if len(b) < n {
		n = len(b)
	}
	// BUG FIX: also clamp n to len(dst). The documented contract (and the
	// other implementations of Bytes in this package) stop at the end of
	// *any* slice; previously a short dst caused an out-of-range panic via
	// the dst[n-1] assertion below instead of xor'ing len(dst) bytes.
	if len(dst) < n {
		n = len(dst)
	}
	if n == 0 {
		return 0
	}
	// Assert dst has enough space (eliminates bounds checks below).
	_ = dst[n-1]

	w := n / wordSize
	if w > 0 {
		// Reinterpret the byte slices as word slices; only the first w
		// entries are accessed, which is in bounds since w*wordSize <= n.
		dw := *(*[]uintptr)(unsafe.Pointer(&dst))
		aw := *(*[]uintptr)(unsafe.Pointer(&a))
		bw := *(*[]uintptr)(unsafe.Pointer(&b))
		for i := 0; i < w; i++ {
			dw[i] = aw[i] ^ bw[i]
		}
	}

	// Finish the trailing bytes that don't fill a whole word.
	for i := (n - n%wordSize); i < n; i++ {
		dst[i] = a[i] ^ b[i]
	}

	return n
}

// Byte xors each byte in a with b and stores the result in dst, stopping when
// the end of either dst or a is reached. It returns the number of bytes
// xor'd.
func Byte(dst, a []byte, b byte) int {
	n := len(a)
	if len(dst) < n {
		n = len(dst)
	}
	for i := 0; i < n; i++ {
		dst[i] = a[i] ^ b
	}
	return n
}
// Block stores (a xor b) in dst, where a, b, and dst all have length 16.
func Block(dst, a, b []byte) {
	// Bounds assertions: the reinterpreted word slices below keep the *byte*
	// count in their length field, so without these checks a short dst would
	// pass the slice bounds check and the stores would write out of range.
	_ = dst[15]
	_ = a[15]
	_ = b[15]

	// BUG FIX: the previous version xor'd exactly two words, which covers
	// only 8 of the 16 bytes when a word is 4 bytes — and this file's build
	// tags include 32-bit 386. Loop over however many words make up 16
	// bytes (2 on 64-bit platforms, 4 on 32-bit).
	const ws = int(unsafe.Sizeof(uintptr(0)))
	dw := *(*[]uintptr)(unsafe.Pointer(&dst))
	aw := *(*[]uintptr)(unsafe.Pointer(&a))
	bw := *(*[]uintptr)(unsafe.Pointer(&b))
	for i := 0; i < 16/ws; i++ {
		dw[i] = aw[i] ^ bw[i]
	}
}