initial commit

This commit is contained in:
lukechampine
2018-07-05 17:52:38 -04:00
commit a04e125b73
7 changed files with 702 additions and 0 deletions
+21
View File
@@ -0,0 +1,21 @@
The MIT License (MIT)
Copyright (c) 2018 Luke Champine
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
+61
View File
@@ -0,0 +1,61 @@
fastxor
-----
[![GoDoc](https://godoc.org/github.com/lukechampine/fastxor?status.svg)](https://godoc.org/github.com/lukechampine/fastxor)
[![Go Report Card](http://goreportcard.com/badge/github.com/lukechampine/fastxor)](https://goreportcard.com/report/github.com/lukechampine/fastxor)
```
go get github.com/lukechampine/fastxor
```
Is there a gaping hole in your heart that can only be filled by xor'ing byte
streams at 20GB/s? If so, you've come to the right place.
`fastxor` is exactly what it sounds like: a package that xors bytes as fast
as your CPU is capable of. For best results, use a CPU that supports a SIMD
instruction set like SSE or AVX. On other architectures, performance is much
less impressive, but still faster than a naive byte-wise loop.
I wrote this package to try my hand at writing Go assembly, so please scrutinize
my code and let me know how I could make it faster or cleaner!
# Benchmarks
```
AVX:
BenchmarkBytes/16-4 200000000 8.72 ns/op 1835.82 MB/s
BenchmarkBytes/1024-4 50000000 38.1 ns/op 26850.41 MB/s
BenchmarkBytes/65k-4 500000 2738 ns/op 23930.93 MB/s
SSE:
BenchmarkBytes/16-4 200000000 8.63 ns/op 1852.98 MB/s
BenchmarkBytes/1024-4 50000000 39.4 ns/op 25993.00 MB/s
BenchmarkBytes/65k-4 500000 2733 ns/op 23975.08 MB/s
Word-wise:
BenchmarkBytes/16-4 100000000 10.5 ns/op 1521.66 MB/s
BenchmarkBytes/1024-4 10000000 125 ns/op 8163.59 MB/s
BenchmarkBytes/65k-4 200000 6895 ns/op 9504.62 MB/s
Byte-wise:
BenchmarkBytes/16-4 100000000 17.3 ns/op 925.16 MB/s
BenchmarkBytes/1024-4 2000000 841 ns/op 1216.31 MB/s
BenchmarkBytes/65k-4 30000 54100 ns/op 1211.38 MB/s
```
Conclusions: `fastxor` is 2-25 times faster than a naive `for` loop. AVX and
SSE performance is roughly equivalent, which makes me suspect that I may be
doing something wrong. Lastly, for very small slices, the cost of the function
call starts to outweigh the benefit of AVX/SSE (the Go compiler never inlines
handwritten asm). If you need to xor exactly 16 bytes (common in block
ciphers), the specialized `Block` function outperforms the more generic
`Bytes`:
```
BenchmarkBlock-4 500000000 3.69 ns/op 4337.88 MB/s
```
+57
View File
@@ -0,0 +1,57 @@
// +build !386,!amd64,!ppc64,!ppc64le,!s390x
package fastxor
// Bytes stores (a xor b) in dst, stopping when the end of any slice is
// reached. It returns the number of bytes xor'd.
func Bytes(dst, a, b []byte) int {
	// n is the length of the shortest of the three slices.
	n := len(dst)
	if len(a) < n {
		n = len(a)
	}
	if len(b) < n {
		n = len(b)
	}
	for i, x := range a[:n] {
		dst[i] = x ^ b[i]
	}
	return n
}
// Byte xors each byte in a with b and stores the result in dst, stopping when
// the end of either dst or a is reached. It returns the number of bytes
// xor'd.
func Byte(dst, a []byte, b byte) int {
	// Truncate a so that ranging over it cannot outrun dst.
	if len(dst) < len(a) {
		a = a[:len(dst)]
	}
	for i, x := range a {
		dst[i] = x ^ b
	}
	return len(a)
}
// Block stores (a xor b) in dst, where a, b, and dst all have length 16.
func Block(dst, a, b []byte) {
	// Panic up front (as the unrolled version's assertions did) if any
	// slice is shorter than 16.
	_ = dst[15]
	_ = a[15]
	_ = b[15]
	for i := 0; i < 16; i++ {
		dst[i] = a[i] ^ b[i]
	}
}
+105
View File
@@ -0,0 +1,105 @@
// +build go1.7,amd64,!gccgo,!appengine,!nacl
package fastxor
import (
"unsafe"
"golang.org/x/sys/cpu"
)
//go:noescape
func xorBytesSSE(dst, a, b []byte, n int)
//go:noescape
func xorBytesAVX(dst, a, b []byte, n int)
// min returns the smallest of a, b, and c.
func min(a, b, c int) int {
	m := a
	if b < m {
		m = b
	}
	if c < m {
		m = c
	}
	return m
}
// Bytes stores (a xor b) in dst, stopping when the end of any slice is
// reached. It returns the number of bytes xor'd.
func Bytes(dst, a, b []byte) int {
	n := min(len(dst), len(a), len(b))
	if n == 0 {
		return 0
	}
	// Dispatch to the widest SIMD routine the CPU supports, preferring
	// AVX over SSE2 and falling back to the word-wise Go implementation.
	if cpu.X86.HasAVX {
		xorBytesAVX(dst, a, b, n)
	} else if cpu.X86.HasSSE2 {
		xorBytesSSE(dst, a, b, n)
	} else {
		xorBytesGeneric(dst, a, b, n)
	}
	return n
}
// wordSize is the size in bytes of a machine word on this platform.
const wordSize = int(unsafe.Sizeof(uintptr(0)))

// xorBytesGeneric xors n bytes of a and b into dst, one machine word at
// a time, handling any trailing bytes individually.
func xorBytesGeneric(dst, a, b []byte, n int) {
	// Panic now if dst cannot hold n bytes; the unsafe word views below
	// would otherwise write out of bounds.
	_ = dst[n-1]
	words := n / wordSize
	if words > 0 {
		// Reinterpret the byte slices as word slices. The fake headers
		// keep the byte lengths, which over-count the word count, so
		// indexing is limited to words below.
		dstW := *(*[]uintptr)(unsafe.Pointer(&dst))
		aW := *(*[]uintptr)(unsafe.Pointer(&a))
		bW := *(*[]uintptr)(unsafe.Pointer(&b))
		for i := range dstW[:words] {
			dstW[i] = aW[i] ^ bW[i]
		}
	}
	// xor the tail bytes that don't fill a whole word.
	for i := words * wordSize; i < n; i++ {
		dst[i] = a[i] ^ b[i]
	}
}
// Byte xors each byte in a with b and stores the result in dst, stopping when
// the end of either dst or a is reached. It returns the number of bytes
// xor'd.
func Byte(dst, a []byte, b byte) int {
	n := len(a)
	if len(dst) < n {
		n = len(dst)
	}
	// Replicate b into every byte of a machine word so that whole words
	// can be xor'd at once.
	const ws = int(unsafe.Sizeof(uintptr(0)))
	var pattern uintptr
	for i := 0; i < ws; i++ {
		pattern = pattern<<8 | uintptr(b)
	}
	if words := n / ws; words > 0 {
		dstW := *(*[]uintptr)(unsafe.Pointer(&dst))
		aW := *(*[]uintptr)(unsafe.Pointer(&a))
		for i := 0; i < words; i++ {
			dstW[i] = aW[i] ^ pattern
		}
	}
	// Handle the tail bytes that don't fill a whole word.
	for i := n - n%ws; i < n; i++ {
		dst[i] = a[i] ^ b
	}
	return n
}
// Block stores (a xor b) in dst, where a, b, and dst all have length 16.
// It panics if any slice is shorter than 16.
func Block(dst, a, b []byte) {
	// Bounds assertions: without these, the unsafe word writes below
	// would silently read/write out of bounds when handed a slice
	// shorter than 16. With them, a too-short slice panics, matching
	// the behavior of the non-amd64 implementation.
	_ = dst[15]
	_ = a[15]
	_ = b[15]
	// profiling indicates that for 16-byte blocks, the cost of a function
	// call outweighs the SSE/AVX speedup
	dw := *(*[]uintptr)(unsafe.Pointer(&dst))
	aw := *(*[]uintptr)(unsafe.Pointer(&a))
	bw := *(*[]uintptr)(unsafe.Pointer(&b))
	// Two 8-byte words cover the 16-byte block (this file is amd64-only,
	// so uintptr is 8 bytes).
	dw[0] = aw[0] ^ bw[0]
	dw[1] = aw[1] ^ bw[1]
}
+211
View File
@@ -0,0 +1,211 @@
// +build amd64,!gccgo,!appengine,!nacl
#include "textflag.h"
#define Dst DI
#define A R8
#define B R9
#define N R12
// func xorBytesSSE(dst, a, b []byte, n int)
//
// xorBytesSSE stores (a xor b) in dst, processing exactly n bytes. It
// works through decreasing chunk sizes: 128 bytes per iteration (all
// eight X registers), then 64, then 16, then byte-at-a-time for the
// tail. Each slice argument is a 24-byte header (data/len/cap), so the
// data pointers sit at FP offsets 0, 24, and 48, with n at offset 72.
//
// NOTE(review): the final byte loop decrements N before testing it, so
// this routine assumes n > 0 (Bytes returns early when n == 0 — confirm
// no other caller can pass 0).
TEXT ·xorBytesSSE(SB),NOSPLIT,$0
MOVQ dst_data+0(FP), Dst
MOVQ a_data+24(FP), A
MOVQ b_data+48(FP), B
MOVQ n+72(FP), N
// While at least 128 bytes remain, xor 128 bytes per iteration.
XOR_LOOP_128_SSE:
CMPQ N, $128
JB XOR_LOOP_64_SSE
MOVOU (A), X0
MOVOU 16(A), X1
MOVOU 32(A), X2
MOVOU 48(A), X3
MOVOU 64(A), X4
MOVOU 80(A), X5
MOVOU 96(A), X6
MOVOU 112(A), X7
PXOR (B), X0
PXOR 16(B), X1
PXOR 32(B), X2
PXOR 48(B), X3
PXOR 64(B), X4
PXOR 80(B), X5
PXOR 96(B), X6
PXOR 112(B), X7
MOVOU X0, (Dst)
MOVOU X1, 16(Dst)
MOVOU X2, 32(Dst)
MOVOU X3, 48(Dst)
MOVOU X4, 64(Dst)
MOVOU X5, 80(Dst)
MOVOU X6, 96(Dst)
MOVOU X7, 112(Dst)
ADDQ $128, A
ADDQ $128, B
ADDQ $128, Dst
// SUBQ sets ZF when N reaches exactly 0 (done); otherwise re-test at
// the loop top, which falls through to the 64-byte loop when N < 128.
SUBQ $128, N
JNZ XOR_LOOP_128_SSE
RET
// 64-byte chunks: interleave a/b loads and xor register pairs.
XOR_LOOP_64_SSE:
CMPQ N, $64
JB XOR_LOOP_16_SSE
MOVOU (A), X0
MOVOU (B), X1
MOVOU 16(A), X2
MOVOU 16(B), X3
MOVOU 32(A), X4
MOVOU 32(B), X5
MOVOU 48(A), X6
MOVOU 48(B), X7
PXOR X0, X1
PXOR X2, X3
PXOR X4, X5
PXOR X6, X7
MOVOU X1, (Dst)
MOVOU X3, 16(Dst)
MOVOU X5, 32(Dst)
MOVOU X7, 48(Dst)
ADDQ $64, A
ADDQ $64, B
ADDQ $64, Dst
SUBQ $64, N
JNZ XOR_LOOP_64_SSE
RET
// 16-byte chunks: one X-register pair per iteration.
XOR_LOOP_16_SSE:
CMPQ N, $16
JB XOR_LOOP_FINAL_SSE
MOVOU (A), X0
MOVOU (B), X1
PXOR X0, X1
MOVOU X1, (Dst)
ADDQ $16, A
ADDQ $16, B
ADDQ $16, Dst
SUBQ $16, N
JNZ XOR_LOOP_16_SSE
RET
// Final tail (1-15 bytes): xor one byte at a time.
XOR_LOOP_FINAL_SSE:
MOVB (A), AL
MOVB (B), BL
XORB AL, BL
MOVB BL, (Dst)
INCQ A
INCQ B
INCQ Dst
DECQ N
JNZ XOR_LOOP_FINAL_SSE
RET
// func xorBytesAVX(dst, a, b []byte, n int)
//
// xorBytesAVX stores (a xor b) in dst, processing exactly n bytes,
// using VEX-encoded 128-bit instructions. Structure mirrors
// xorBytesSSE: 128-byte, 64-byte, and 16-byte loops followed by a
// byte-wise tail. Only X (128-bit) registers are used — no Y registers
// appear, which presumably is why no VZEROUPPER is emitted; confirm.
//
// NOTE(review): the 64- and 16-byte loops load with legacy-SSE MOVOU but
// xor with VEX-encoded VPXOR; mixing encodings can incur transition
// penalties on some CPUs — consider VMOVDQU throughout. TODO confirm.
//
// NOTE(review): like the SSE version, the final byte loop assumes n > 0
// (Bytes returns early when n == 0).
TEXT ·xorBytesAVX(SB),NOSPLIT,$0
MOVQ dst_data+0(FP), Dst
MOVQ a_data+24(FP), A
MOVQ b_data+48(FP), B
MOVQ n+72(FP), N
// While at least 128 bytes remain, xor 128 bytes per iteration using
// all eight X registers; VPXOR takes its second operand from memory.
XOR_LOOP_128_AVX:
CMPQ N, $128
JB XOR_LOOP_64_AVX
VMOVDQU (A), X0
VMOVDQU 16(A), X1
VMOVDQU 32(A), X2
VMOVDQU 48(A), X3
VMOVDQU 64(A), X4
VMOVDQU 80(A), X5
VMOVDQU 96(A), X6
VMOVDQU 112(A), X7
VPXOR (B), X0, X0
VPXOR 16(B), X1, X1
VPXOR 32(B), X2, X2
VPXOR 48(B), X3, X3
VPXOR 64(B), X4, X4
VPXOR 80(B), X5, X5
VPXOR 96(B), X6, X6
VPXOR 112(B), X7, X7
VMOVDQU X0, (Dst)
VMOVDQU X1, 16(Dst)
VMOVDQU X2, 32(Dst)
VMOVDQU X3, 48(Dst)
VMOVDQU X4, 64(Dst)
VMOVDQU X5, 80(Dst)
VMOVDQU X6, 96(Dst)
VMOVDQU X7, 112(Dst)
ADDQ $128, A
ADDQ $128, B
ADDQ $128, Dst
// SUBQ sets ZF when N reaches exactly 0 (done); otherwise re-test at
// the loop top, which falls through to the 64-byte loop when N < 128.
SUBQ $128, N
JNZ XOR_LOOP_128_AVX
RET
// 64-byte chunks: the three-operand VPXOR writes into X4-X7, leaving
// the loaded values in X0-X3 untouched.
XOR_LOOP_64_AVX:
CMPQ N, $64
JB XOR_LOOP_16_AVX
MOVOU (A), X0
MOVOU 16(A), X1
MOVOU 32(A), X2
MOVOU 48(A), X3
VPXOR (B), X0, X4
VPXOR 16(B), X1, X5
VPXOR 32(B), X2, X6
VPXOR 48(B), X3, X7
VMOVDQU X4, (Dst)
VMOVDQU X5, 16(Dst)
VMOVDQU X6, 32(Dst)
VMOVDQU X7, 48(Dst)
ADDQ $64, A
ADDQ $64, B
ADDQ $64, Dst
SUBQ $64, N
JNZ XOR_LOOP_64_AVX
RET
// 16-byte chunks: one load/xor/store per iteration.
XOR_LOOP_16_AVX:
CMPQ N, $16
JB XOR_LOOP_FINAL_AVX
MOVOU (A), X0
VPXOR (B), X0, X1
VMOVDQU X1, (Dst)
ADDQ $16, A
ADDQ $16, B
ADDQ $16, Dst
SUBQ $16, N
JNZ XOR_LOOP_16_AVX
RET
// Final tail (1-15 bytes): xor one byte at a time.
XOR_LOOP_FINAL_AVX:
MOVB (A), AL
MOVB (B), BL
XORB AL, BL
MOVB BL, (Dst)
INCQ A
INCQ B
INCQ Dst
DECQ N
JNZ XOR_LOOP_FINAL_AVX
RET
#undef Dst
#undef A
#undef B
#undef N
+185
View File
@@ -0,0 +1,185 @@
package fastxor
import (
"bytes"
"testing"
"testing/quick"
)
// refBytes is the reference (intentionally naive) implementation of
// Bytes, used to check the optimized versions.
func refBytes(dst, a, b []byte) int {
	n := len(dst)
	if len(a) < n {
		n = len(a)
	}
	if len(b) < n {
		n = len(b)
	}
	for i := range dst[:n] {
		dst[i] = a[i] ^ b[i]
	}
	return n
}
// refByte is the reference (intentionally naive) implementation of
// Byte, used to check the optimized versions.
func refByte(dst, a []byte, b byte) int {
	n := len(dst)
	if len(a) < n {
		n = len(a)
	}
	for i := 0; i < n; i++ {
		dst[i] = a[i] ^ b
	}
	return n
}
// refBlock is the reference (intentionally naive) implementation of
// Block, used to check the optimized versions.
func refBlock(dst, a, b []byte) {
	// Panic early if any slice is shorter than 16, like Block does.
	_ = dst[15]
	_ = a[15]
	_ = b[15]
	for i := 0; i < 16; i++ {
		dst[i] = a[i] ^ b[i]
	}
}
// TestBytes checks Bytes against the reference implementation on
// randomized inputs of varying length and alignment.
func TestBytes(t *testing.T) {
	check := func(a, b []byte) bool {
		// double size to increase chances of reaching 64 bytes
		a = append(a, a...)
		b = append(b, b...)
		if len(a) < 8 {
			return true
		}
		// shift alignment randomly
		a = a[a[0]%8:]
		got := make([]byte, len(a))
		want := make([]byte, len(a))
		Bytes(got, a, b)
		refBytes(want, a, b)
		return bytes.Equal(got, want)
	}
	if err := quick.Check(check, &quick.Config{MaxCount: 10000}); err != nil {
		t.Fatal(err)
	}
}
// TestByte checks Byte against the reference implementation on
// randomized inputs of varying length and alignment.
func TestByte(t *testing.T) {
	check := func(a []byte, b byte) bool {
		if len(a) < 8 {
			return true
		}
		// shift alignment randomly
		a = a[a[0]%8:]
		got := make([]byte, len(a))
		want := make([]byte, len(a))
		Byte(got, a, b)
		refByte(want, a, b)
		return bytes.Equal(got, want)
	}
	if err := quick.Check(check, &quick.Config{MaxCount: 10000}); err != nil {
		t.Fatal(err)
	}
}
// TestBlock checks Block against the reference implementation on
// random 16-byte inputs.
func TestBlock(t *testing.T) {
	check := func(a, b [16]byte) bool {
		got := make([]byte, 16)
		want := make([]byte, 16)
		Block(got, a[:], b[:])
		refBlock(want, a[:], b[:])
		return bytes.Equal(got, want)
	}
	if err := quick.Check(check, &quick.Config{MaxCount: 10000}); err != nil {
		t.Fatal(err)
	}
}
// BenchmarkBytes measures Bytes throughput at several buffer sizes.
func BenchmarkBytes(b *testing.B) {
	for _, c := range []struct {
		name string
		size int
	}{
		{"16", 16},
		{"1024", 1024},
		{"65k", 65536},
	} {
		c := c // pin loop variable for the closure
		b.Run(c.name, func(b *testing.B) {
			buf := make([]byte, c.size)
			b.SetBytes(int64(c.size))
			for i := 0; i < b.N; i++ {
				Bytes(buf, buf, buf)
			}
		})
	}
}
// BenchmarkRefBytes measures the reference implementation's throughput
// at several buffer sizes, for comparison with BenchmarkBytes.
func BenchmarkRefBytes(b *testing.B) {
	for _, c := range []struct {
		name string
		size int
	}{
		{"16", 16},
		{"1024", 1024},
		{"65k", 65536},
	} {
		c := c // pin loop variable for the closure
		b.Run(c.name, func(b *testing.B) {
			buf := make([]byte, c.size)
			b.SetBytes(int64(c.size))
			for i := 0; i < b.N; i++ {
				refBytes(buf, buf, buf)
			}
		})
	}
}
// BenchmarkByte measures Byte throughput at several buffer sizes.
func BenchmarkByte(b *testing.B) {
	for _, c := range []struct {
		name string
		size int
	}{
		{"16", 16},
		{"1024", 1024},
		{"65k", 65536},
	} {
		c := c // pin loop variable for the closure
		b.Run(c.name, func(b *testing.B) {
			buf := make([]byte, c.size)
			b.SetBytes(int64(c.size))
			for i := 0; i < b.N; i++ {
				Byte(buf, buf, 'b')
			}
		})
	}
}
// BenchmarkRefByte measures the reference implementation's throughput
// at several buffer sizes, for comparison with BenchmarkByte.
func BenchmarkRefByte(b *testing.B) {
	for _, c := range []struct {
		name string
		size int
	}{
		{"16", 16},
		{"1024", 1024},
		{"65k", 65536},
	} {
		c := c // pin loop variable for the closure
		b.Run(c.name, func(b *testing.B) {
			buf := make([]byte, c.size)
			b.SetBytes(int64(c.size))
			for i := 0; i < b.N; i++ {
				refByte(buf, buf, 'b')
			}
		})
	}
}
// BenchmarkBlock measures Block throughput on a 16-byte buffer.
func BenchmarkBlock(b *testing.B) {
	block := make([]byte, 16)
	b.SetBytes(16)
	for i := 0; i < b.N; i++ {
		Block(block, block, block)
	}
}
// BenchmarkRefBlock measures the reference implementation's throughput
// on a 16-byte buffer, for comparison with BenchmarkBlock.
func BenchmarkRefBlock(b *testing.B) {
	block := make([]byte, 16)
	b.SetBytes(16)
	for i := 0; i < b.N; i++ {
		refBlock(block, block, block)
	}
}
+62
View File
@@ -0,0 +1,62 @@
// +build 386 amd64,!go1.7 ppc64 ppc64le s390x
package fastxor
import (
"unsafe"
)
// wordSize is the size in bytes of a machine word on this platform.
const wordSize = int(unsafe.Sizeof(uintptr(0)))

// Bytes stores (a xor b) in dst, stopping when the end of any slice is
// reached. It returns the number of bytes xor'd.
func Bytes(dst, a, b []byte) int {
	n := len(a)
	if len(b) < n {
		n = len(b)
	}
	if len(dst) < n {
		// Clamp to dst as documented ("stopping when the end of any
		// slice is reached") instead of panicking; this matches the
		// other implementations of Bytes.
		n = len(dst)
	}
	if n == 0 {
		return 0
	}
	// Assert dst has enough space before the unsafe word writes below.
	_ = dst[n-1]
	w := n / wordSize
	if w > 0 {
		// Reinterpret the byte slices as word slices to xor a machine
		// word at a time.
		dw := *(*[]uintptr)(unsafe.Pointer(&dst))
		aw := *(*[]uintptr)(unsafe.Pointer(&a))
		bw := *(*[]uintptr)(unsafe.Pointer(&b))
		for i := 0; i < w; i++ {
			dw[i] = aw[i] ^ bw[i]
		}
	}
	// xor any trailing bytes that don't fill a whole word.
	for i := (n - n%wordSize); i < n; i++ {
		dst[i] = a[i] ^ b[i]
	}
	return n
}
// Byte xors each byte in a with b and stores the result in dst, stopping when
// the end of either dst or a is reached. It returns the number of bytes
// xor'd.
func Byte(dst, a []byte, b byte) int {
	// Truncate a so that ranging over it cannot outrun dst.
	if len(dst) < len(a) {
		a = a[:len(dst)]
	}
	for i, x := range a {
		dst[i] = x ^ b
	}
	return len(a)
}
// Block stores (a xor b) in dst, where a, b, and dst all have length 16.
// It panics if any slice is shorter than 16.
func Block(dst, a, b []byte) {
	// Bounds assertions: without these, the unsafe word writes below
	// would silently read/write out of bounds when handed a slice
	// shorter than 16.
	_ = dst[15]
	_ = a[15]
	_ = b[15]
	// This file builds on both 32- and 64-bit platforms, so the number
	// of words needed to cover 16 bytes depends on the word size. The
	// previous fixed two-word xor covered only 8 bytes when uintptr is
	// 4 bytes (e.g. 386).
	const ws = int(unsafe.Sizeof(uintptr(0)))
	dw := *(*[]uintptr)(unsafe.Pointer(&dst))
	aw := *(*[]uintptr)(unsafe.Pointer(&a))
	bw := *(*[]uintptr)(unsafe.Pointer(&b))
	for i := 0; i < 16/ws; i++ {
		dw[i] = aw[i] ^ bw[i]
	}
}