make it twice as fast
This commit is contained in:
@@ -9,7 +9,7 @@ go get github.com/lukechampine/fastxor
|
||||
```
|
||||
|
||||
Is there a gaping hole in your heart that can only be filled by xor'ing byte
|
||||
streams at 20GB/s? If so, you've come to the right place.
|
||||
streams at 60GB/s? If so, you've come to the right place.
|
||||
|
||||
`fastxor` is exactly what it sounds like: a package that xors bytes as fast
|
||||
as your CPU is capable of. For best results, use a CPU that supports a SIMD
|
||||
@@ -25,37 +25,37 @@ my code and let me know how I could make it faster or cleaner!
|
||||
```
|
||||
AVX:
|
||||
|
||||
BenchmarkBytes/16-4 200000000 8.72 ns/op 1835.82 MB/s
|
||||
BenchmarkBytes/1024-4 50000000 38.1 ns/op 26850.41 MB/s
|
||||
BenchmarkBytes/65k-4 500000 2738 ns/op 23930.93 MB/s
|
||||
BenchmarkBytes/16-4 200000000 6.20 ns/op 2579.65 MB/s
|
||||
BenchmarkBytes/1024-4 100000000 15.5 ns/op 66089.39 MB/s
|
||||
BenchmarkBytes/65k-4 2000000 974 ns/op 67217.99 MB/s
|
||||
|
||||
SSE:
|
||||
|
||||
BenchmarkBytes/16-4 200000000 8.63 ns/op 1852.98 MB/s
|
||||
BenchmarkBytes/1024-4 50000000 39.4 ns/op 25993.00 MB/s
|
||||
BenchmarkBytes/65k-4 500000 2733 ns/op 23975.08 MB/s
|
||||
BenchmarkBytes/16-4 200000000 6.31 ns/op 2536.64 MB/s
|
||||
BenchmarkBytes/1024-4 50000000 27.2 ns/op 37609.69 MB/s
|
||||
BenchmarkBytes/65k-4 1000000 2009 ns/op 32619.21 MB/s
|
||||
|
||||
Word-wise:
|
||||
|
||||
BenchmarkBytes/16-4 100000000 10.5 ns/op 1521.66 MB/s
|
||||
BenchmarkBytes/1024-4 10000000 125 ns/op 8163.59 MB/s
|
||||
BenchmarkBytes/65k-4 200000 6895 ns/op 9504.62 MB/s
|
||||
BenchmarkBytes/16-4 200000000 7.37 ns/op 2170.17 MB/s
|
||||
BenchmarkBytes/1024-4 20000000 89.4 ns/op 11455.33 MB/s
|
||||
BenchmarkBytes/65k-4 300000 4963 ns/op 13203.25 MB/s
|
||||
|
||||
Byte-wise:
|
||||
|
||||
BenchmarkBytes/16-4 100000000 17.3 ns/op 925.16 MB/s
|
||||
BenchmarkBytes/1024-4 2000000 841 ns/op 1216.31 MB/s
|
||||
BenchmarkBytes/65k-4 30000 54100 ns/op 1211.38 MB/s
|
||||
BenchmarkBytes/16-4 100000000 12.7 ns/op 1263.77 MB/s
|
||||
BenchmarkBytes/1024-4 2000000 610 ns/op 1677.18 MB/s
|
||||
BenchmarkBytes/65k-4 50000 38906 ns/op 1684.45 MB/s
|
||||
```
|
||||
|
||||
Conclusions: `fastxor` is 2-25 times faster than a naive `for` loop. AVX and
|
||||
SSE performance is roughly equivalent, which makes me suspect that I may be
|
||||
doing something wrong. Lastly, for very small slices, the cost of the function
|
||||
call starts to outweigh the benefit of AVX/SSE (the Go compiler never inlines
|
||||
handwritten asm). If you need to xor exactly 16 bytes (common in block
|
||||
Conclusions: `fastxor` is 2-40 times faster than a naive `for` loop. AVX is
|
||||
roughly twice as fast as SSE, which is unsurprising since it can operate on
|
||||
twice as many bits per cycle. Lastly, for very small slices, the cost of the
|
||||
function call starts to outweigh the benefit of AVX/SSE (the Go compiler never
|
||||
inlines handwritten asm). If you need to xor exactly 16 bytes (common in block
|
||||
ciphers), the specialized `Block` function outperforms the more generic
|
||||
`Bytes`:
|
||||
|
||||
```
|
||||
BenchmarkBlock-4 500000000 3.69 ns/op 4337.88 MB/s
|
||||
BenchmarkBlock-4 1000000000 2.72 ns/op 5888.02 MB/s
|
||||
```
|
||||
+56
-36
@@ -116,36 +116,62 @@ TEXT ·xorBytesAVX(SB),NOSPLIT,$0
|
||||
MOVQ b_data+48(FP), B
|
||||
MOVQ n+72(FP), N
|
||||
|
||||
XOR_LOOP_256_AVX:
|
||||
CMPQ N, $256
|
||||
JB XOR_LOOP_128_AVX
|
||||
|
||||
VMOVDQU (A), Y0
|
||||
VMOVDQU 32(A), Y1
|
||||
VMOVDQU 64(A), Y2
|
||||
VMOVDQU 96(A), Y3
|
||||
VMOVDQU 128(A), Y4
|
||||
VMOVDQU 160(A), Y5
|
||||
VMOVDQU 192(A), Y6
|
||||
VMOVDQU 224(A), Y7
|
||||
|
||||
VPXOR (B), Y0, Y0
|
||||
VPXOR 32(B), Y1, Y1
|
||||
VPXOR 64(B), Y2, Y2
|
||||
VPXOR 96(B), Y3, Y3
|
||||
VPXOR 128(B), Y4, Y4
|
||||
VPXOR 160(B), Y5, Y5
|
||||
VPXOR 192(B), Y6, Y6
|
||||
VPXOR 224(B), Y7, Y7
|
||||
|
||||
VMOVDQU Y0, (Dst)
|
||||
VMOVDQU Y1, 32(Dst)
|
||||
VMOVDQU Y2, 64(Dst)
|
||||
VMOVDQU Y3, 96(Dst)
|
||||
VMOVDQU Y4, 128(Dst)
|
||||
VMOVDQU Y5, 160(Dst)
|
||||
VMOVDQU Y6, 192(Dst)
|
||||
VMOVDQU Y7, 224(Dst)
|
||||
|
||||
ADDQ $256, A
|
||||
ADDQ $256, B
|
||||
ADDQ $256, Dst
|
||||
SUBQ $256, N
|
||||
JNZ XOR_LOOP_256_AVX
|
||||
RET
|
||||
|
||||
XOR_LOOP_128_AVX:
|
||||
CMPQ N, $128
|
||||
JB XOR_LOOP_64_AVX
|
||||
|
||||
VMOVDQU (A), X0
|
||||
VMOVDQU 16(A), X1
|
||||
VMOVDQU 32(A), X2
|
||||
VMOVDQU 48(A), X3
|
||||
VMOVDQU 64(A), X4
|
||||
VMOVDQU 80(A), X5
|
||||
VMOVDQU 96(A), X6
|
||||
VMOVDQU 112(A), X7
|
||||
VMOVDQU (A), Y0
|
||||
VMOVDQU 32(A), Y1
|
||||
VMOVDQU 64(A), Y2
|
||||
VMOVDQU 96(A), Y3
|
||||
|
||||
VPXOR (B), X0, X0
|
||||
VPXOR 16(B), X1, X1
|
||||
VPXOR 32(B), X2, X2
|
||||
VPXOR 48(B), X3, X3
|
||||
VPXOR 64(B), X4, X4
|
||||
VPXOR 80(B), X5, X5
|
||||
VPXOR 96(B), X6, X6
|
||||
VPXOR 112(B), X7, X7
|
||||
VPXOR (B), Y0, Y0
|
||||
VPXOR 32(B), Y1, Y1
|
||||
VPXOR 64(B), Y2, Y2
|
||||
VPXOR 96(B), Y3, Y3
|
||||
|
||||
VMOVDQU X0, (Dst)
|
||||
VMOVDQU X1, 16(Dst)
|
||||
VMOVDQU X2, 32(Dst)
|
||||
VMOVDQU X3, 48(Dst)
|
||||
VMOVDQU X4, 64(Dst)
|
||||
VMOVDQU X5, 80(Dst)
|
||||
VMOVDQU X6, 96(Dst)
|
||||
VMOVDQU X7, 112(Dst)
|
||||
VMOVDQU Y0, (Dst)
|
||||
VMOVDQU Y1, 32(Dst)
|
||||
VMOVDQU Y2, 64(Dst)
|
||||
VMOVDQU Y3, 96(Dst)
|
||||
|
||||
ADDQ $128, A
|
||||
ADDQ $128, B
|
||||
@@ -158,20 +184,14 @@ XOR_LOOP_64_AVX:
|
||||
CMPQ N, $64
|
||||
JB XOR_LOOP_16_AVX
|
||||
|
||||
MOVOU (A), X0
|
||||
MOVOU 16(A), X1
|
||||
MOVOU 32(A), X2
|
||||
MOVOU 48(A), X3
|
||||
VMOVDQU (A), Y0
|
||||
VMOVDQU 32(A), Y1
|
||||
|
||||
VPXOR (B), X0, X4
|
||||
VPXOR 16(B), X1, X5
|
||||
VPXOR 32(B), X2, X6
|
||||
VPXOR 48(B), X3, X7
|
||||
VPXOR (B), Y0, Y2
|
||||
VPXOR 32(B), Y1, Y3
|
||||
|
||||
VMOVDQU X4, (Dst)
|
||||
VMOVDQU X5, 16(Dst)
|
||||
VMOVDQU X6, 32(Dst)
|
||||
VMOVDQU X7, 48(Dst)
|
||||
VMOVDQU Y2, (Dst)
|
||||
VMOVDQU Y3, 32(Dst)
|
||||
|
||||
ADDQ $64, A
|
||||
ADDQ $64, B
|
||||
|
||||
Reference in New Issue
Block a user