initial commit
This commit is contained in:
@@ -0,0 +1,21 @@
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2018 Luke Champine
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
@@ -0,0 +1,61 @@
|
||||
fastxor
|
||||
-----
|
||||
|
||||
[GoDoc](https://godoc.org/github.com/lukechampine/fastxor)
|
||||
[Go Report Card](https://goreportcard.com/report/github.com/lukechampine/fastxor)
|
||||
|
||||
```
|
||||
go get github.com/lukechampine/fastxor
|
||||
```
|
||||
|
||||
Is there a gaping hole in your heart that can only be filled by xor'ing byte
|
||||
streams at 20GB/s? If so, you've come to the right place.
|
||||
|
||||
`fastxor` is exactly what it sounds like: a package that xors bytes as fast
|
||||
as your CPU is capable of. For best results, use a CPU that supports a SIMD
|
||||
instruction set like SSE or AVX. On other architectures, performance is much
|
||||
less impressive, but still faster than a naive byte-wise loop.
|
||||
|
||||
I wrote this package to try my hand at writing Go assembly, so please scrutinize
|
||||
my code and let me know how I could make it faster or cleaner!
|
||||
|
||||
|
||||
# Benchmarks
|
||||
|
||||
```
|
||||
AVX:
|
||||
|
||||
BenchmarkBytes/16-4 200000000 8.72 ns/op 1835.82 MB/s
|
||||
BenchmarkBytes/1024-4 50000000 38.1 ns/op 26850.41 MB/s
|
||||
BenchmarkBytes/65k-4 500000 2738 ns/op 23930.93 MB/s
|
||||
|
||||
SSE:
|
||||
|
||||
BenchmarkBytes/16-4 200000000 8.63 ns/op 1852.98 MB/s
|
||||
BenchmarkBytes/1024-4 50000000 39.4 ns/op 25993.00 MB/s
|
||||
BenchmarkBytes/65k-4 500000 2733 ns/op 23975.08 MB/s
|
||||
|
||||
Word-wise:
|
||||
|
||||
BenchmarkBytes/16-4 100000000 10.5 ns/op 1521.66 MB/s
|
||||
BenchmarkBytes/1024-4 10000000 125 ns/op 8163.59 MB/s
|
||||
BenchmarkBytes/65k-4 200000 6895 ns/op 9504.62 MB/s
|
||||
|
||||
Byte-wise:
|
||||
|
||||
BenchmarkBytes/16-4 100000000 17.3 ns/op 925.16 MB/s
|
||||
BenchmarkBytes/1024-4 2000000 841 ns/op 1216.31 MB/s
|
||||
BenchmarkBytes/65k-4 30000 54100 ns/op 1211.38 MB/s
|
||||
```
|
||||
|
||||
Conclusions: `fastxor` is 2-25 times faster than a naive `for` loop. AVX and
|
||||
SSE performance is roughly equivalent, which makes me suspect that I may be
|
||||
doing something wrong. Lastly, for very small slices, the cost of the function
|
||||
call starts to outweigh the benefit of AVX/SSE (the Go compiler never inlines
|
||||
handwritten asm). If you need to xor exactly 16 bytes (common in block
|
||||
ciphers), the specialized `Block` function outperforms the more generic
|
||||
`Bytes`:
|
||||
|
||||
```
|
||||
BenchmarkBlock-4 500000000 3.69 ns/op 4337.88 MB/s
|
||||
```
|
||||
@@ -0,0 +1,57 @@
|
||||
// +build !386,!amd64,!ppc64,!ppc64le,!s390x
|
||||
|
||||
package fastxor
|
||||
|
||||
// Bytes writes a[i] ^ b[i] into dst[i] for every index that exists in all
// three slices, i.e. up to the length of the shortest one. It returns the
// number of bytes written.
func Bytes(dst, a, b []byte) int {
	limit := len(dst)
	if len(a) < limit {
		limit = len(a)
	}
	if len(b) < limit {
		limit = len(b)
	}
	for i := range dst[:limit] {
		dst[i] = a[i] ^ b[i]
	}
	return limit
}
|
||||
|
||||
// Byte writes a[i] ^ b into dst[i] for every index that exists in both dst
// and a, and returns the number of bytes written.
func Byte(dst, a []byte, b byte) int {
	count := len(a)
	if len(dst) < count {
		count = len(dst)
	}
	for i := range dst[:count] {
		dst[i] = a[i] ^ b
	}
	return count
}
|
||||
|
||||
// Block xors the first 16 bytes of a and b into the first 16 bytes of dst.
// All three slices are expected to hold a 16-byte block; a shorter slice
// causes a panic.
func Block(dst, a, b []byte) {
	// Index the last byte of each slice first, so that a short argument
	// panics before anything has been written to dst.
	_, _, _ = dst[15], a[15], b[15]

	for i := 0; i < 16; i++ {
		dst[i] = a[i] ^ b[i]
	}
}
|
||||
+105
@@ -0,0 +1,105 @@
|
||||
// +build go1.7,amd64,!gccgo,!appengine,!nacl
|
||||
|
||||
package fastxor
|
||||
|
||||
import (
|
||||
"unsafe"
|
||||
|
||||
"golang.org/x/sys/cpu"
|
||||
)
|
||||
|
||||
//go:noescape
|
||||
// xorBytesSSE is the SSE assembly routine that stores a[i] ^ b[i] in dst[i]
// for i in [0, n). It has no Go body and does no bounds checking of its own:
// n must be positive and no larger than the shortest of the three slices.
func xorBytesSSE(dst, a, b []byte, n int)
|
||||
|
||||
//go:noescape
|
||||
// xorBytesAVX is the AVX assembly routine that stores a[i] ^ b[i] in dst[i]
// for i in [0, n). It has no Go body and does no bounds checking of its own:
// n must be positive and no larger than the shortest of the three slices.
func xorBytesAVX(dst, a, b []byte, n int)
|
||||
|
||||
// min returns the smallest of its three arguments.
func min(a, b, c int) int {
	least := a
	if b < least {
		least = b
	}
	if c < least {
		least = c
	}
	return least
}
|
||||
|
||||
// Bytes stores (a xor b) in dst, stopping when the end of any slice is
|
||||
// reached. It returns the number of bytes xor'd.
|
||||
func Bytes(dst, a, b []byte) int {
|
||||
n := min(len(dst), len(a), len(b))
|
||||
if n == 0 {
|
||||
return 0
|
||||
}
|
||||
switch {
|
||||
case cpu.X86.HasAVX:
|
||||
xorBytesAVX(dst, a, b, n)
|
||||
case cpu.X86.HasSSE2:
|
||||
xorBytesSSE(dst, a, b, n)
|
||||
default:
|
||||
xorBytesGeneric(dst, a, b, n)
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
const wordSize = int(unsafe.Sizeof(uintptr(0)))
|
||||
|
||||
func xorBytesGeneric(dst, a, b []byte, n int) {
|
||||
// Assert dst has enough space
|
||||
_ = dst[n-1]
|
||||
|
||||
w := n / wordSize
|
||||
if w > 0 {
|
||||
dw := *(*[]uintptr)(unsafe.Pointer(&dst))
|
||||
aw := *(*[]uintptr)(unsafe.Pointer(&a))
|
||||
bw := *(*[]uintptr)(unsafe.Pointer(&b))
|
||||
for i := 0; i < w; i++ {
|
||||
dw[i] = aw[i] ^ bw[i]
|
||||
}
|
||||
}
|
||||
|
||||
for i := (n - n%wordSize); i < n; i++ {
|
||||
dst[i] = a[i] ^ b[i]
|
||||
}
|
||||
}
|
||||
|
||||
// Byte xors each byte in a with b and stores the result in dst, stopping when
|
||||
// the end of either dst or a is reached. It returns the number of bytes
|
||||
// xor'd.
|
||||
func Byte(dst, a []byte, b byte) int {
|
||||
n := len(a)
|
||||
if len(dst) < n {
|
||||
n = len(dst)
|
||||
}
|
||||
|
||||
var bw uintptr
|
||||
for i := 0; i < wordSize; i += 1 {
|
||||
bw |= uintptr(b) << uint(i*8)
|
||||
}
|
||||
|
||||
w := n / wordSize
|
||||
if w > 0 {
|
||||
dw := *(*[]uintptr)(unsafe.Pointer(&dst))
|
||||
aw := *(*[]uintptr)(unsafe.Pointer(&a))
|
||||
for i := 0; i < w; i++ {
|
||||
dw[i] = aw[i] ^ bw
|
||||
}
|
||||
}
|
||||
|
||||
for i := (n - n%wordSize); i < n; i++ {
|
||||
dst[i] = a[i] ^ b
|
||||
}
|
||||
|
||||
return n
|
||||
}
|
||||
|
||||
// Block stores (a xor b) in dst, where a, b, and dst all have length 16. It
// panics if any of the three slices is shorter than 16 bytes.
func Block(dst, a, b []byte) {
	// The reinterpreted slices below keep the byte length of the originals as
	// their element count, so their own bounds checks would accept any slice
	// of two or more bytes. Check the real lengths first so a short argument
	// panics instead of touching memory past its end.
	_ = dst[15]
	_ = a[15]
	_ = b[15]

	// profiling indicates that for 16-byte blocks, the cost of a function
	// call outweighs the SSE/AVX speedup
	dw := *(*[]uintptr)(unsafe.Pointer(&dst))
	aw := *(*[]uintptr)(unsafe.Pointer(&a))
	bw := *(*[]uintptr)(unsafe.Pointer(&b))

	// Two 8-byte words cover the block; this file only builds on amd64.
	dw[0] = aw[0] ^ bw[0]
	dw[1] = aw[1] ^ bw[1]
}
|
||||
+211
@@ -0,0 +1,211 @@
|
||||
// +build amd64,!gccgo,!appengine,!nacl
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
#define Dst DI
|
||||
#define A R8
|
||||
#define B R9
|
||||
#define N R12
|
||||
|
||||
// func xorBytesSSE(dst, a, b []byte, n int)
//
// Stores a[i] ^ b[i] in dst[i] for i in [0, n) using 128-bit SSE2
// instructions. Only the data pointers of the three slices are read; nothing
// here checks lengths, and the final byte loop tests N only after xor'ing a
// byte, so n must be greater than zero and no larger than the shortest slice.
//
// Every load and store goes through MOVOU, so none of the buffers needs any
// particular alignment. PXOR is only used register-to-register: with a memory
// operand it would fault on an address that is not 16-byte aligned.
TEXT ·xorBytesSSE(SB),NOSPLIT,$0-80
	MOVQ dst_base+0(FP), Dst
	MOVQ a_base+24(FP), A
	MOVQ b_base+48(FP), B
	MOVQ n+72(FP), N

// 128 bytes per iteration while at least 128 remain.
XOR_LOOP_128_SSE:
	CMPQ N, $128
	JB   XOR_LOOP_64_SSE

	MOVOU (A), X0
	MOVOU 16(A), X1
	MOVOU 32(A), X2
	MOVOU 48(A), X3
	MOVOU 64(A), X4
	MOVOU 80(A), X5
	MOVOU 96(A), X6
	MOVOU 112(A), X7

	// b is loaded with unaligned moves in two groups of four registers.
	MOVOU (B), X8
	MOVOU 16(B), X9
	MOVOU 32(B), X10
	MOVOU 48(B), X11
	PXOR  X8, X0
	PXOR  X9, X1
	PXOR  X10, X2
	PXOR  X11, X3

	MOVOU 64(B), X8
	MOVOU 80(B), X9
	MOVOU 96(B), X10
	MOVOU 112(B), X11
	PXOR  X8, X4
	PXOR  X9, X5
	PXOR  X10, X6
	PXOR  X11, X7

	MOVOU X0, (Dst)
	MOVOU X1, 16(Dst)
	MOVOU X2, 32(Dst)
	MOVOU X3, 48(Dst)
	MOVOU X4, 64(Dst)
	MOVOU X5, 80(Dst)
	MOVOU X6, 96(Dst)
	MOVOU X7, 112(Dst)

	ADDQ $128, A
	ADDQ $128, B
	ADDQ $128, Dst
	SUBQ $128, N
	JNZ  XOR_LOOP_128_SSE
	RET

// 64 bytes per iteration while at least 64 remain.
XOR_LOOP_64_SSE:
	CMPQ N, $64
	JB   XOR_LOOP_16_SSE

	MOVOU (A), X0
	MOVOU (B), X1
	MOVOU 16(A), X2
	MOVOU 16(B), X3
	MOVOU 32(A), X4
	MOVOU 32(B), X5
	MOVOU 48(A), X6
	MOVOU 48(B), X7

	PXOR X0, X1
	PXOR X2, X3
	PXOR X4, X5
	PXOR X6, X7

	MOVOU X1, (Dst)
	MOVOU X3, 16(Dst)
	MOVOU X5, 32(Dst)
	MOVOU X7, 48(Dst)

	ADDQ $64, A
	ADDQ $64, B
	ADDQ $64, Dst
	SUBQ $64, N
	JNZ  XOR_LOOP_64_SSE
	RET

// 16 bytes per iteration while at least 16 remain.
XOR_LOOP_16_SSE:
	CMPQ  N, $16
	JB    XOR_LOOP_FINAL_SSE
	MOVOU (A), X0
	MOVOU (B), X1
	PXOR  X0, X1
	MOVOU X1, (Dst)
	ADDQ  $16, A
	ADDQ  $16, B
	ADDQ  $16, Dst
	SUBQ  $16, N
	JNZ   XOR_LOOP_16_SSE
	RET

// One byte per iteration for the last 1-15 bytes.
XOR_LOOP_FINAL_SSE:
	MOVB (A), AL
	MOVB (B), BL
	XORB AL, BL
	MOVB BL, (Dst)
	INCQ A
	INCQ B
	INCQ Dst
	DECQ N
	JNZ  XOR_LOOP_FINAL_SSE
	RET
|
||||
|
||||
|
||||
// func xorBytesAVX(dst, a, b []byte, n int)
//
// Stores a[i] ^ b[i] in dst[i] for i in [0, n) using VEX-encoded 128-bit
// instructions. Only the data pointers of the three slices are read; nothing
// here checks lengths, and the final byte loop tests N only after xor'ing a
// byte, so n must be greater than zero and no larger than the shortest slice.
//
// All vector loads and stores use VMOVDQU, so the routine does not mix
// legacy-SSE and VEX encodings.
TEXT ·xorBytesAVX(SB),NOSPLIT,$0-80
	MOVQ dst_base+0(FP), Dst
	MOVQ a_base+24(FP), A
	MOVQ b_base+48(FP), B
	MOVQ n+72(FP), N

// 128 bytes per iteration while at least 128 remain.
XOR_LOOP_128_AVX:
	CMPQ N, $128
	JB   XOR_LOOP_64_AVX

	VMOVDQU (A), X0
	VMOVDQU 16(A), X1
	VMOVDQU 32(A), X2
	VMOVDQU 48(A), X3
	VMOVDQU 64(A), X4
	VMOVDQU 80(A), X5
	VMOVDQU 96(A), X6
	VMOVDQU 112(A), X7

	VPXOR (B), X0, X0
	VPXOR 16(B), X1, X1
	VPXOR 32(B), X2, X2
	VPXOR 48(B), X3, X3
	VPXOR 64(B), X4, X4
	VPXOR 80(B), X5, X5
	VPXOR 96(B), X6, X6
	VPXOR 112(B), X7, X7

	VMOVDQU X0, (Dst)
	VMOVDQU X1, 16(Dst)
	VMOVDQU X2, 32(Dst)
	VMOVDQU X3, 48(Dst)
	VMOVDQU X4, 64(Dst)
	VMOVDQU X5, 80(Dst)
	VMOVDQU X6, 96(Dst)
	VMOVDQU X7, 112(Dst)

	ADDQ $128, A
	ADDQ $128, B
	ADDQ $128, Dst
	SUBQ $128, N
	JNZ  XOR_LOOP_128_AVX
	RET

// 64 bytes per iteration while at least 64 remain.
XOR_LOOP_64_AVX:
	CMPQ N, $64
	JB   XOR_LOOP_16_AVX

	VMOVDQU (A), X0
	VMOVDQU 16(A), X1
	VMOVDQU 32(A), X2
	VMOVDQU 48(A), X3

	VPXOR (B), X0, X4
	VPXOR 16(B), X1, X5
	VPXOR 32(B), X2, X6
	VPXOR 48(B), X3, X7

	VMOVDQU X4, (Dst)
	VMOVDQU X5, 16(Dst)
	VMOVDQU X6, 32(Dst)
	VMOVDQU X7, 48(Dst)

	ADDQ $64, A
	ADDQ $64, B
	ADDQ $64, Dst
	SUBQ $64, N
	JNZ  XOR_LOOP_64_AVX
	RET

// 16 bytes per iteration while at least 16 remain.
XOR_LOOP_16_AVX:
	CMPQ    N, $16
	JB      XOR_LOOP_FINAL_AVX
	VMOVDQU (A), X0
	VPXOR   (B), X0, X1
	VMOVDQU X1, (Dst)
	ADDQ    $16, A
	ADDQ    $16, B
	ADDQ    $16, Dst
	SUBQ    $16, N
	JNZ     XOR_LOOP_16_AVX
	RET

// One byte per iteration for the last 1-15 bytes.
XOR_LOOP_FINAL_AVX:
	MOVB (A), AL
	MOVB (B), BL
	XORB AL, BL
	MOVB BL, (Dst)
	INCQ A
	INCQ B
	INCQ Dst
	DECQ N
	JNZ  XOR_LOOP_FINAL_AVX
	RET
|
||||
|
||||
#undef Dst
|
||||
#undef A
|
||||
#undef B
|
||||
#undef N
|
||||
+185
@@ -0,0 +1,185 @@
|
||||
package fastxor
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"testing"
|
||||
"testing/quick"
|
||||
)
|
||||
|
||||
// refBytes is the byte-at-a-time reference for Bytes: it xors a and b into
// dst up to the length of the shortest slice and returns that length.
func refBytes(dst, a, b []byte) int {
	limit := len(dst)
	if len(a) < limit {
		limit = len(a)
	}
	if len(b) < limit {
		limit = len(b)
	}
	for i := range dst[:limit] {
		dst[i] = a[i] ^ b[i]
	}
	return limit
}
|
||||
|
||||
// refByte is the byte-at-a-time reference for Byte: it xors each byte of a
// with b into dst up to the shorter of the two slices and returns that length.
func refByte(dst, a []byte, b byte) int {
	count := len(a)
	if len(dst) < count {
		count = len(dst)
	}
	for i := range dst[:count] {
		dst[i] = a[i] ^ b
	}
	return count
}
|
||||
|
||||
// refBlock is the byte-at-a-time reference for Block: it xors the first 16
// bytes of a and b into dst and panics if any slice is shorter than that.
func refBlock(dst, a, b []byte) {
	// Index the last byte of each slice first, so that a short argument
	// panics before anything has been written to dst.
	_, _, _ = dst[15], a[15], b[15]

	for i := 0; i < 16; i++ {
		dst[i] = a[i] ^ b[i]
	}
}
|
||||
|
||||
func TestBytes(t *testing.T) {
|
||||
err := quick.Check(func(a, b []byte) bool {
|
||||
// double size to increase chances of reaching 64 bytes
|
||||
a = append(a, a...)
|
||||
b = append(b, b...)
|
||||
if len(a) < 8 {
|
||||
return true
|
||||
}
|
||||
// shift alignment randomly
|
||||
a = a[(a[0] % 8):]
|
||||
|
||||
dst1 := make([]byte, len(a))
|
||||
dst2 := make([]byte, len(a))
|
||||
Bytes(dst1, a, b)
|
||||
refBytes(dst2, a, b)
|
||||
return bytes.Equal(dst1, dst2)
|
||||
}, &quick.Config{MaxCount: 10000})
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestByte(t *testing.T) {
|
||||
err := quick.Check(func(a []byte, b byte) bool {
|
||||
if len(a) < 8 {
|
||||
return true
|
||||
}
|
||||
// shift alignment randomly
|
||||
a = a[(a[0] % 8):]
|
||||
|
||||
dst1 := make([]byte, len(a))
|
||||
dst2 := make([]byte, len(a))
|
||||
Byte(dst1, a, b)
|
||||
refByte(dst2, a, b)
|
||||
return bytes.Equal(dst1, dst2)
|
||||
}, &quick.Config{MaxCount: 10000})
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBlock(t *testing.T) {
|
||||
err := quick.Check(func(a, b [16]byte) bool {
|
||||
dst1 := make([]byte, len(a))
|
||||
dst2 := make([]byte, len(a))
|
||||
Block(dst1, a[:], b[:])
|
||||
refBlock(dst2, a[:], b[:])
|
||||
return bytes.Equal(dst1, dst2)
|
||||
}, &quick.Config{MaxCount: 10000})
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkBytes(b *testing.B) {
|
||||
benchN := func(n int) func(*testing.B) {
|
||||
return func(b *testing.B) {
|
||||
buf := make([]byte, n)
|
||||
b.SetBytes(int64(len(buf)))
|
||||
for i := 0; i < b.N; i++ {
|
||||
Bytes(buf, buf, buf)
|
||||
}
|
||||
}
|
||||
}
|
||||
b.Run("16", benchN(16))
|
||||
b.Run("1024", benchN(1024))
|
||||
b.Run("65k", benchN(65536))
|
||||
}
|
||||
|
||||
func BenchmarkRefBytes(b *testing.B) {
|
||||
benchN := func(n int) func(*testing.B) {
|
||||
return func(b *testing.B) {
|
||||
buf := make([]byte, n)
|
||||
b.SetBytes(int64(len(buf)))
|
||||
for i := 0; i < b.N; i++ {
|
||||
refBytes(buf, buf, buf)
|
||||
}
|
||||
}
|
||||
}
|
||||
b.Run("16", benchN(16))
|
||||
b.Run("1024", benchN(1024))
|
||||
b.Run("65k", benchN(65536))
|
||||
}
|
||||
|
||||
func BenchmarkByte(b *testing.B) {
|
||||
benchN := func(n int) func(*testing.B) {
|
||||
return func(b *testing.B) {
|
||||
buf := make([]byte, n)
|
||||
b.SetBytes(int64(len(buf)))
|
||||
for i := 0; i < b.N; i++ {
|
||||
Byte(buf, buf, 'b')
|
||||
}
|
||||
}
|
||||
}
|
||||
b.Run("16", benchN(16))
|
||||
b.Run("1024", benchN(1024))
|
||||
b.Run("65k", benchN(65536))
|
||||
}
|
||||
|
||||
func BenchmarkRefByte(b *testing.B) {
|
||||
benchN := func(n int) func(*testing.B) {
|
||||
return func(b *testing.B) {
|
||||
buf := make([]byte, n)
|
||||
b.SetBytes(int64(len(buf)))
|
||||
for i := 0; i < b.N; i++ {
|
||||
refByte(buf, buf, 'b')
|
||||
}
|
||||
}
|
||||
}
|
||||
b.Run("16", benchN(16))
|
||||
b.Run("1024", benchN(1024))
|
||||
b.Run("65k", benchN(65536))
|
||||
}
|
||||
|
||||
func BenchmarkBlock(b *testing.B) {
|
||||
buf := make([]byte, 16)
|
||||
b.SetBytes(16)
|
||||
for i := 0; i < b.N; i++ {
|
||||
Block(buf, buf, buf)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkRefBlock(b *testing.B) {
|
||||
buf := make([]byte, 16)
|
||||
b.SetBytes(16)
|
||||
for i := 0; i < b.N; i++ {
|
||||
refBlock(buf, buf, buf)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,62 @@
|
||||
// +build 386 amd64,!go1.7 ppc64 ppc64le s390x
|
||||
|
||||
package fastxor
|
||||
|
||||
import (
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
const wordSize = int(unsafe.Sizeof(uintptr(0)))
|
||||
|
||||
// Bytes stores (a xor b) in dst, stopping when the end of any slice is
|
||||
// reached. It returns the number of bytes xor'd.
|
||||
func Bytes(dst, a, b []byte) int {
|
||||
n := len(a)
|
||||
if len(b) < n {
|
||||
n = len(b)
|
||||
}
|
||||
if n == 0 {
|
||||
return 0
|
||||
}
|
||||
// Assert dst has enough space
|
||||
_ = dst[n-1]
|
||||
|
||||
w := n / wordSize
|
||||
if w > 0 {
|
||||
dw := *(*[]uintptr)(unsafe.Pointer(&dst))
|
||||
aw := *(*[]uintptr)(unsafe.Pointer(&a))
|
||||
bw := *(*[]uintptr)(unsafe.Pointer(&b))
|
||||
for i := 0; i < w; i++ {
|
||||
dw[i] = aw[i] ^ bw[i]
|
||||
}
|
||||
}
|
||||
|
||||
for i := (n - n%wordSize); i < n; i++ {
|
||||
dst[i] = a[i] ^ b[i]
|
||||
}
|
||||
|
||||
return n
|
||||
}
|
||||
|
||||
// Byte writes a[i] ^ b into dst[i] for every index that exists in both dst
// and a, and returns the number of bytes written.
func Byte(dst, a []byte, b byte) int {
	count := len(dst)
	if len(a) < count {
		count = len(a)
	}
	for i := range a[:count] {
		dst[i] = a[i] ^ b
	}
	return count
}
|
||||
|
||||
// Block stores (a xor b) in dst, where a, b, and dst all have length 16.
|
||||
func Block(dst, a, b []byte) {
|
||||
dw := *(*[]uintptr)(unsafe.Pointer(&dst))
|
||||
aw := *(*[]uintptr)(unsafe.Pointer(&a))
|
||||
bw := *(*[]uintptr)(unsafe.Pointer(&b))
|
||||
dw[0] = aw[0] ^ bw[0]
|
||||
dw[1] = aw[1] ^ bw[1]
|
||||
}
|
||||
Reference in New Issue
Block a user