initial commit

This commit is contained in:
lukechampine
2018-07-05 17:52:38 -04:00
commit a04e125b73
7 changed files with 702 additions and 0 deletions
+21
View File
@@ -0,0 +1,21 @@
The MIT License (MIT)
Copyright (c) 2018 Luke Champine
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
+61
View File
@@ -0,0 +1,61 @@
fastxor
-----
[![GoDoc](https://godoc.org/github.com/lukechampine/fastxor?status.svg)](https://godoc.org/github.com/lukechampine/fastxor)
[![Go Report Card](http://goreportcard.com/badge/github.com/lukechampine/fastxor)](https://goreportcard.com/report/github.com/lukechampine/fastxor)
```
go get github.com/lukechampine/fastxor
```
Is there a gaping hole in your heart that can only be filled by xor'ing byte
streams at 20GB/s? If so, you've come to the right place.
`fastxor` is exactly what it sounds like: a package that xors bytes as fast
as your CPU is capable of. For best results, use a CPU that supports a SIMD
instruction set like SSE or AVX. On other architectures, performance is much
less impressive, but still faster than a naive byte-wise loop.
I wrote this package to try my hand at writing Go assembly, so please scrutinize
my code and let me know how I could make it faster or cleaner!
# Benchmarks
```
AVX:
BenchmarkBytes/16-4 200000000 8.72 ns/op 1835.82 MB/s
BenchmarkBytes/1024-4 50000000 38.1 ns/op 26850.41 MB/s
BenchmarkBytes/65k-4 500000 2738 ns/op 23930.93 MB/s
SSE:
BenchmarkBytes/16-4 200000000 8.63 ns/op 1852.98 MB/s
BenchmarkBytes/1024-4 50000000 39.4 ns/op 25993.00 MB/s
BenchmarkBytes/65k-4 500000 2733 ns/op 23975.08 MB/s
Word-wise:
BenchmarkBytes/16-4 100000000 10.5 ns/op 1521.66 MB/s
BenchmarkBytes/1024-4 10000000 125 ns/op 8163.59 MB/s
BenchmarkBytes/65k-4 200000 6895 ns/op 9504.62 MB/s
Byte-wise:
BenchmarkBytes/16-4 100000000 17.3 ns/op 925.16 MB/s
BenchmarkBytes/1024-4 2000000 841 ns/op 1216.31 MB/s
BenchmarkBytes/65k-4 30000 54100 ns/op 1211.38 MB/s
```
Conclusions: `fastxor` is 2-25 times faster than a naive `for` loop. AVX and
SSE performance is roughly equivalent, which makes me suspect that I may be
doing something wrong. Lastly, for very small slices, the cost of the function
call starts to outweigh the benefit of AVX/SSE (the Go compiler never inlines
handwritten asm). If you need to xor exactly 16 bytes (common in block
ciphers), the specialized `Block` function outperforms the more generic
`Bytes`:
```
BenchmarkBlock-4 500000000 3.69 ns/op 4337.88 MB/s
```
+57
View File
@@ -0,0 +1,57 @@
// +build !386,!amd64,!ppc64,!ppc64le,!s390x
package fastxor
// Bytes stores (a xor b) in dst, stopping when the end of any slice is
// reached. It returns the number of bytes xor'd.
func Bytes(dst, a, b []byte) int {
	// n is the length of the shortest of the three slices.
	n := len(dst)
	if len(a) < n {
		n = len(a)
	}
	if len(b) < n {
		n = len(b)
	}
	for i, x := range a[:n] {
		dst[i] = x ^ b[i]
	}
	return n
}
// Byte xors each byte in a with b and stores the result in dst, stopping when
// the end of either dst or a is reached. It returns the number of bytes
// xor'd.
func Byte(dst, a []byte, b byte) int {
	// Truncate a so that ranging over it cannot outrun dst.
	if len(dst) < len(a) {
		a = a[:len(dst)]
	}
	for i, x := range a {
		dst[i] = x ^ b
	}
	return len(a)
}
// Block stores (a xor b) in dst, where a, b, and dst all have length 16.
func Block(dst, a, b []byte) {
	// Panic up front (as the unrolled version's assertions did) if any
	// slice is shorter than 16.
	_ = dst[15]
	_ = a[15]
	_ = b[15]
	for i := 0; i < 16; i++ {
		dst[i] = a[i] ^ b[i]
	}
}
+105
View File
@@ -0,0 +1,105 @@
// +build go1.7,amd64,!gccgo,!appengine,!nacl
package fastxor
import (
"unsafe"
"golang.org/x/sys/cpu"
)
//go:noescape
func xorBytesSSE(dst, a, b []byte, n int)
//go:noescape
func xorBytesAVX(dst, a, b []byte, n int)
// min returns the smallest of a, b, and c.
func min(a, b, c int) int {
	m := a
	if b < m {
		m = b
	}
	if c < m {
		m = c
	}
	return m
}
// Bytes stores (a xor b) in dst, stopping when the end of any slice is
// reached. It returns the number of bytes xor'd.
func Bytes(dst, a, b []byte) int {
	n := min(len(dst), len(a), len(b))
	if n == 0 {
		return 0
	}
	// Dispatch to the widest SIMD routine the CPU supports, preferring
	// AVX over SSE2 and falling back to the word-wise Go implementation.
	if cpu.X86.HasAVX {
		xorBytesAVX(dst, a, b, n)
	} else if cpu.X86.HasSSE2 {
		xorBytesSSE(dst, a, b, n)
	} else {
		xorBytesGeneric(dst, a, b, n)
	}
	return n
}
// wordSize is the size in bytes of a machine word on this platform.
const wordSize = int(unsafe.Sizeof(uintptr(0)))

// xorBytesGeneric xors n bytes of a and b into dst, one machine word at
// a time, handling any trailing bytes individually.
func xorBytesGeneric(dst, a, b []byte, n int) {
	// Panic now if dst cannot hold n bytes; the unsafe word views below
	// would otherwise write out of bounds.
	_ = dst[n-1]
	words := n / wordSize
	if words > 0 {
		// Reinterpret the byte slices as word slices. The fake headers
		// keep the byte lengths, which over-count the word count, so
		// indexing is limited to words below.
		dstW := *(*[]uintptr)(unsafe.Pointer(&dst))
		aW := *(*[]uintptr)(unsafe.Pointer(&a))
		bW := *(*[]uintptr)(unsafe.Pointer(&b))
		for i := range dstW[:words] {
			dstW[i] = aW[i] ^ bW[i]
		}
	}
	// xor the tail bytes that don't fill a whole word.
	for i := words * wordSize; i < n; i++ {
		dst[i] = a[i] ^ b[i]
	}
}
// Byte xors each byte in a with b and stores the result in dst, stopping when
// the end of either dst or a is reached. It returns the number of bytes
// xor'd.
func Byte(dst, a []byte, b byte) int {
	n := len(a)
	if len(dst) < n {
		n = len(dst)
	}
	// Replicate b into every byte of a machine word so that whole words
	// can be xor'd at once.
	const ws = int(unsafe.Sizeof(uintptr(0)))
	var pattern uintptr
	for i := 0; i < ws; i++ {
		pattern = pattern<<8 | uintptr(b)
	}
	if words := n / ws; words > 0 {
		dstW := *(*[]uintptr)(unsafe.Pointer(&dst))
		aW := *(*[]uintptr)(unsafe.Pointer(&a))
		for i := 0; i < words; i++ {
			dstW[i] = aW[i] ^ pattern
		}
	}
	// Handle the tail bytes that don't fill a whole word.
	for i := n - n%ws; i < n; i++ {
		dst[i] = a[i] ^ b
	}
	return n
}
// Block stores (a xor b) in dst, where a, b, and dst all have length 16.
// It panics if any slice is shorter than 16.
func Block(dst, a, b []byte) {
	// Bounds assertions: without these, the unsafe word writes below
	// would silently read/write out of bounds when handed a slice
	// shorter than 16. With them, a too-short slice panics, matching
	// the behavior of the non-amd64 implementation.
	_ = dst[15]
	_ = a[15]
	_ = b[15]
	// profiling indicates that for 16-byte blocks, the cost of a function
	// call outweighs the SSE/AVX speedup
	dw := *(*[]uintptr)(unsafe.Pointer(&dst))
	aw := *(*[]uintptr)(unsafe.Pointer(&a))
	bw := *(*[]uintptr)(unsafe.Pointer(&b))
	// Two 8-byte words cover the 16-byte block (this file is amd64-only,
	// so uintptr is 8 bytes).
	dw[0] = aw[0] ^ bw[0]
	dw[1] = aw[1] ^ bw[1]
}
+211
View File
@@ -0,0 +1,211 @@
// +build amd64,!gccgo,!appengine,!nacl
#include "textflag.h"
#define Dst DI
#define A R8
#define B R9
#define N R12
// func xorBytesSSE(dst, a, b []byte, n int)
//
// xorBytesSSE stores (a xor b) in dst, processing exactly n bytes. It
// works through decreasing chunk sizes: 128 bytes per iteration (all
// eight X registers), then 64, then 16, then byte-at-a-time for the
// tail. Each slice argument is a 24-byte header (data/len/cap), so the
// data pointers sit at FP offsets 0, 24, and 48, with n at offset 72.
//
// NOTE(review): the final byte loop decrements N before testing it, so
// this routine assumes n > 0 (Bytes returns early when n == 0 — confirm
// no other caller can pass 0).
TEXT ·xorBytesSSE(SB),NOSPLIT,$0
MOVQ dst_data+0(FP), Dst
MOVQ a_data+24(FP), A
MOVQ b_data+48(FP), B
MOVQ n+72(FP), N
// While at least 128 bytes remain, xor 128 bytes per iteration.
XOR_LOOP_128_SSE:
CMPQ N, $128
JB XOR_LOOP_64_SSE
MOVOU (A), X0
MOVOU 16(A), X1
MOVOU 32(A), X2
MOVOU 48(A), X3
MOVOU 64(A), X4
MOVOU 80(A), X5
MOVOU 96(A), X6
MOVOU 112(A), X7
PXOR (B), X0
PXOR 16(B), X1
PXOR 32(B), X2
PXOR 48(B), X3
PXOR 64(B), X4
PXOR 80(B), X5
PXOR 96(B), X6
PXOR 112(B), X7
MOVOU X0, (Dst)
MOVOU X1, 16(Dst)
MOVOU X2, 32(Dst)
MOVOU X3, 48(Dst)
MOVOU X4, 64(Dst)
MOVOU X5, 80(Dst)
MOVOU X6, 96(Dst)
MOVOU X7, 112(Dst)
ADDQ $128, A
ADDQ $128, B
ADDQ $128, Dst
// SUBQ sets ZF when N reaches exactly 0 (done); otherwise re-test at
// the loop top, which falls through to the 64-byte loop when N < 128.
SUBQ $128, N
JNZ XOR_LOOP_128_SSE
RET
// 64-byte chunks: interleave a/b loads and xor register pairs.
XOR_LOOP_64_SSE:
CMPQ N, $64
JB XOR_LOOP_16_SSE
MOVOU (A), X0
MOVOU (B), X1
MOVOU 16(A), X2
MOVOU 16(B), X3
MOVOU 32(A), X4
MOVOU 32(B), X5
MOVOU 48(A), X6
MOVOU 48(B), X7
PXOR X0, X1
PXOR X2, X3
PXOR X4, X5
PXOR X6, X7
MOVOU X1, (Dst)
MOVOU X3, 16(Dst)
MOVOU X5, 32(Dst)
MOVOU X7, 48(Dst)
ADDQ $64, A
ADDQ $64, B
ADDQ $64, Dst
SUBQ $64, N
JNZ XOR_LOOP_64_SSE
RET
// 16-byte chunks: one X-register pair per iteration.
XOR_LOOP_16_SSE:
CMPQ N, $16
JB XOR_LOOP_FINAL_SSE
MOVOU (A), X0
MOVOU (B), X1
PXOR X0, X1
MOVOU X1, (Dst)
ADDQ $16, A
ADDQ $16, B
ADDQ $16, Dst
SUBQ $16, N
JNZ XOR_LOOP_16_SSE
RET
// Final tail (1-15 bytes): xor one byte at a time.
XOR_LOOP_FINAL_SSE:
MOVB (A), AL
MOVB (B), BL
XORB AL, BL
MOVB BL, (Dst)
INCQ A
INCQ B
INCQ Dst
DECQ N
JNZ XOR_LOOP_FINAL_SSE
RET
// func xorBytesAVX(dst, a, b []byte, n int)
//
// xorBytesAVX stores (a xor b) in dst, processing exactly n bytes,
// using VEX-encoded 128-bit instructions. Structure mirrors
// xorBytesSSE: 128-byte, 64-byte, and 16-byte loops followed by a
// byte-wise tail. Only X (128-bit) registers are used — no Y registers
// appear, which presumably is why no VZEROUPPER is emitted; confirm.
//
// NOTE(review): the 64- and 16-byte loops load with legacy-SSE MOVOU but
// xor with VEX-encoded VPXOR; mixing encodings can incur transition
// penalties on some CPUs — consider VMOVDQU throughout. TODO confirm.
//
// NOTE(review): like the SSE version, the final byte loop assumes n > 0
// (Bytes returns early when n == 0).
TEXT ·xorBytesAVX(SB),NOSPLIT,$0
MOVQ dst_data+0(FP), Dst
MOVQ a_data+24(FP), A
MOVQ b_data+48(FP), B
MOVQ n+72(FP), N
// While at least 128 bytes remain, xor 128 bytes per iteration using
// all eight X registers; VPXOR takes its second operand from memory.
XOR_LOOP_128_AVX:
CMPQ N, $128
JB XOR_LOOP_64_AVX
VMOVDQU (A), X0
VMOVDQU 16(A), X1
VMOVDQU 32(A), X2
VMOVDQU 48(A), X3
VMOVDQU 64(A), X4
VMOVDQU 80(A), X5
VMOVDQU 96(A), X6
VMOVDQU 112(A), X7
VPXOR (B), X0, X0
VPXOR 16(B), X1, X1
VPXOR 32(B), X2, X2
VPXOR 48(B), X3, X3
VPXOR 64(B), X4, X4
VPXOR 80(B), X5, X5
VPXOR 96(B), X6, X6
VPXOR 112(B), X7, X7
VMOVDQU X0, (Dst)
VMOVDQU X1, 16(Dst)
VMOVDQU X2, 32(Dst)
VMOVDQU X3, 48(Dst)
VMOVDQU X4, 64(Dst)
VMOVDQU X5, 80(Dst)
VMOVDQU X6, 96(Dst)
VMOVDQU X7, 112(Dst)
ADDQ $128, A
ADDQ $128, B
ADDQ $128, Dst
// SUBQ sets ZF when N reaches exactly 0 (done); otherwise re-test at
// the loop top, which falls through to the 64-byte loop when N < 128.
SUBQ $128, N
JNZ XOR_LOOP_128_AVX
RET
// 64-byte chunks: the three-operand VPXOR writes into X4-X7, leaving
// the loaded values in X0-X3 untouched.
XOR_LOOP_64_AVX:
CMPQ N, $64
JB XOR_LOOP_16_AVX
MOVOU (A), X0
MOVOU 16(A), X1
MOVOU 32(A), X2
MOVOU 48(A), X3
VPXOR (B), X0, X4
VPXOR 16(B), X1, X5
VPXOR 32(B), X2, X6
VPXOR 48(B), X3, X7
VMOVDQU X4, (Dst)
VMOVDQU X5, 16(Dst)
VMOVDQU X6, 32(Dst)
VMOVDQU X7, 48(Dst)
ADDQ $64, A
ADDQ $64, B
ADDQ $64, Dst
SUBQ $64, N
JNZ XOR_LOOP_64_AVX
RET
// 16-byte chunks: one load/xor/store per iteration.
XOR_LOOP_16_AVX:
CMPQ N, $16
JB XOR_LOOP_FINAL_AVX
MOVOU (A), X0
VPXOR (B), X0, X1
VMOVDQU X1, (Dst)
ADDQ $16, A
ADDQ $16, B
ADDQ $16, Dst
SUBQ $16, N
JNZ XOR_LOOP_16_AVX
RET
// Final tail (1-15 bytes): xor one byte at a time.
XOR_LOOP_FINAL_AVX:
MOVB (A), AL
MOVB (B), BL
XORB AL, BL
MOVB BL, (Dst)
INCQ A
INCQ B
INCQ Dst
DECQ N
JNZ XOR_LOOP_FINAL_AVX
RET
#undef Dst
#undef A
#undef B
#undef N
+185
View File
@@ -0,0 +1,185 @@
package fastxor
import (
"bytes"
"testing"
"testing/quick"
)
// refBytes is the reference (intentionally naive) implementation of
// Bytes, used to check the optimized versions.
func refBytes(dst, a, b []byte) int {
	n := len(dst)
	if len(a) < n {
		n = len(a)
	}
	if len(b) < n {
		n = len(b)
	}
	for i := range dst[:n] {
		dst[i] = a[i] ^ b[i]
	}
	return n
}
// refByte is the reference (intentionally naive) implementation of
// Byte, used to check the optimized versions.
func refByte(dst, a []byte, b byte) int {
	n := len(dst)
	if len(a) < n {
		n = len(a)
	}
	for i := 0; i < n; i++ {
		dst[i] = a[i] ^ b
	}
	return n
}
// refBlock is the reference (intentionally naive) implementation of
// Block, used to check the optimized versions.
func refBlock(dst, a, b []byte) {
	// Panic early if any slice is shorter than 16, like Block does.
	_ = dst[15]
	_ = a[15]
	_ = b[15]
	for i := 0; i < 16; i++ {
		dst[i] = a[i] ^ b[i]
	}
}
// TestBytes checks Bytes against the reference implementation on
// randomized inputs of varying length and alignment.
func TestBytes(t *testing.T) {
	check := func(a, b []byte) bool {
		// double size to increase chances of reaching 64 bytes
		a = append(a, a...)
		b = append(b, b...)
		if len(a) < 8 {
			return true
		}
		// shift alignment randomly
		a = a[a[0]%8:]
		got := make([]byte, len(a))
		want := make([]byte, len(a))
		Bytes(got, a, b)
		refBytes(want, a, b)
		return bytes.Equal(got, want)
	}
	if err := quick.Check(check, &quick.Config{MaxCount: 10000}); err != nil {
		t.Fatal(err)
	}
}
// TestByte checks Byte against the reference implementation on
// randomized inputs of varying length and alignment.
func TestByte(t *testing.T) {
	check := func(a []byte, b byte) bool {
		if len(a) < 8 {
			return true
		}
		// shift alignment randomly
		a = a[a[0]%8:]
		got := make([]byte, len(a))
		want := make([]byte, len(a))
		Byte(got, a, b)
		refByte(want, a, b)
		return bytes.Equal(got, want)
	}
	if err := quick.Check(check, &quick.Config{MaxCount: 10000}); err != nil {
		t.Fatal(err)
	}
}
// TestBlock checks Block against the reference implementation on
// random 16-byte inputs.
func TestBlock(t *testing.T) {
	check := func(a, b [16]byte) bool {
		got := make([]byte, 16)
		want := make([]byte, 16)
		Block(got, a[:], b[:])
		refBlock(want, a[:], b[:])
		return bytes.Equal(got, want)
	}
	if err := quick.Check(check, &quick.Config{MaxCount: 10000}); err != nil {
		t.Fatal(err)
	}
}
// BenchmarkBytes measures Bytes throughput at several buffer sizes.
func BenchmarkBytes(b *testing.B) {
	for _, c := range []struct {
		name string
		size int
	}{
		{"16", 16},
		{"1024", 1024},
		{"65k", 65536},
	} {
		c := c // pin loop variable for the closure
		b.Run(c.name, func(b *testing.B) {
			buf := make([]byte, c.size)
			b.SetBytes(int64(c.size))
			for i := 0; i < b.N; i++ {
				Bytes(buf, buf, buf)
			}
		})
	}
}
// BenchmarkRefBytes measures the reference implementation's throughput
// at several buffer sizes, for comparison with BenchmarkBytes.
func BenchmarkRefBytes(b *testing.B) {
	for _, c := range []struct {
		name string
		size int
	}{
		{"16", 16},
		{"1024", 1024},
		{"65k", 65536},
	} {
		c := c // pin loop variable for the closure
		b.Run(c.name, func(b *testing.B) {
			buf := make([]byte, c.size)
			b.SetBytes(int64(c.size))
			for i := 0; i < b.N; i++ {
				refBytes(buf, buf, buf)
			}
		})
	}
}
// BenchmarkByte measures Byte throughput at several buffer sizes.
func BenchmarkByte(b *testing.B) {
	for _, c := range []struct {
		name string
		size int
	}{
		{"16", 16},
		{"1024", 1024},
		{"65k", 65536},
	} {
		c := c // pin loop variable for the closure
		b.Run(c.name, func(b *testing.B) {
			buf := make([]byte, c.size)
			b.SetBytes(int64(c.size))
			for i := 0; i < b.N; i++ {
				Byte(buf, buf, 'b')
			}
		})
	}
}
// BenchmarkRefByte measures the reference implementation's throughput
// at several buffer sizes, for comparison with BenchmarkByte.
func BenchmarkRefByte(b *testing.B) {
	for _, c := range []struct {
		name string
		size int
	}{
		{"16", 16},
		{"1024", 1024},
		{"65k", 65536},
	} {
		c := c // pin loop variable for the closure
		b.Run(c.name, func(b *testing.B) {
			buf := make([]byte, c.size)
			b.SetBytes(int64(c.size))
			for i := 0; i < b.N; i++ {
				refByte(buf, buf, 'b')
			}
		})
	}
}
// BenchmarkBlock measures Block throughput on a 16-byte buffer.
func BenchmarkBlock(b *testing.B) {
	block := make([]byte, 16)
	b.SetBytes(16)
	for i := 0; i < b.N; i++ {
		Block(block, block, block)
	}
}
// BenchmarkRefBlock measures the reference implementation's throughput
// on a 16-byte buffer, for comparison with BenchmarkBlock.
func BenchmarkRefBlock(b *testing.B) {
	block := make([]byte, 16)
	b.SetBytes(16)
	for i := 0; i < b.N; i++ {
		refBlock(block, block, block)
	}
}
+62
View File
@@ -0,0 +1,62 @@
// +build 386 amd64,!go1.7 ppc64 ppc64le s390x
package fastxor
import (
"unsafe"
)
// wordSize is the size in bytes of a machine word on this platform.
const wordSize = int(unsafe.Sizeof(uintptr(0)))

// Bytes stores (a xor b) in dst, stopping when the end of any slice is
// reached. It returns the number of bytes xor'd.
func Bytes(dst, a, b []byte) int {
	n := len(a)
	if len(b) < n {
		n = len(b)
	}
	if len(dst) < n {
		// Clamp to dst as documented ("stopping when the end of any
		// slice is reached") instead of panicking; this matches the
		// other implementations of Bytes.
		n = len(dst)
	}
	if n == 0 {
		return 0
	}
	// Assert dst has enough space before the unsafe word writes below.
	_ = dst[n-1]
	w := n / wordSize
	if w > 0 {
		// Reinterpret the byte slices as word slices to xor a machine
		// word at a time.
		dw := *(*[]uintptr)(unsafe.Pointer(&dst))
		aw := *(*[]uintptr)(unsafe.Pointer(&a))
		bw := *(*[]uintptr)(unsafe.Pointer(&b))
		for i := 0; i < w; i++ {
			dw[i] = aw[i] ^ bw[i]
		}
	}
	// xor any trailing bytes that don't fill a whole word.
	for i := (n - n%wordSize); i < n; i++ {
		dst[i] = a[i] ^ b[i]
	}
	return n
}
// Byte xors each byte in a with b and stores the result in dst, stopping when
// the end of either dst or a is reached. It returns the number of bytes
// xor'd.
func Byte(dst, a []byte, b byte) int {
	// Truncate a so that ranging over it cannot outrun dst.
	if len(dst) < len(a) {
		a = a[:len(dst)]
	}
	for i, x := range a {
		dst[i] = x ^ b
	}
	return len(a)
}
// Block stores (a xor b) in dst, where a, b, and dst all have length 16.
// It panics if any slice is shorter than 16.
func Block(dst, a, b []byte) {
	// Bounds assertions: without these, the unsafe word writes below
	// would silently read/write out of bounds when handed a slice
	// shorter than 16.
	_ = dst[15]
	_ = a[15]
	_ = b[15]
	// This file builds on both 32- and 64-bit platforms, so the number
	// of words needed to cover 16 bytes depends on the word size. The
	// previous fixed two-word xor covered only 8 bytes when uintptr is
	// 4 bytes (e.g. 386).
	const ws = int(unsafe.Sizeof(uintptr(0)))
	dw := *(*[]uintptr)(unsafe.Pointer(&dst))
	aw := *(*[]uintptr)(unsafe.Pointer(&a))
	bw := *(*[]uintptr)(unsafe.Pointer(&b))
	for i := 0; i < 16/ws; i++ {
		dw[i] = aw[i] ^ bw[i]
	}
}