make it twice as fast
This commit is contained in:
@@ -9,7 +9,7 @@ go get github.com/lukechampine/fastxor
|
|||||||
```
|
```
|
||||||
|
|
||||||
Is there a gaping hole in your heart that can only be filled by xor'ing byte
|
Is there a gaping hole in your heart that can only be filled by xor'ing byte
|
||||||
streams at 20GB/s? If so, you've come to the right place.
|
streams at 60GB/s? If so, you've come to the right place.
|
||||||
|
|
||||||
`fastxor` is exactly what it sounds like: a package that xors bytes as fast
|
`fastxor` is exactly what it sounds like: a package that xors bytes as fast
|
||||||
as your CPU is capable of. For best results, use a CPU that supports a SIMD
|
as your CPU is capable of. For best results, use a CPU that supports a SIMD
|
||||||
@@ -25,37 +25,37 @@ my code and let me know how I could make it faster or cleaner!
|
|||||||
```
|
```
|
||||||
AVX:
|
AVX:
|
||||||
|
|
||||||
BenchmarkBytes/16-4 200000000 8.72 ns/op 1835.82 MB/s
|
BenchmarkBytes/16-4 200000000 6.20 ns/op 2579.65 MB/s
|
||||||
BenchmarkBytes/1024-4 50000000 38.1 ns/op 26850.41 MB/s
|
BenchmarkBytes/1024-4 100000000 15.5 ns/op 66089.39 MB/s
|
||||||
BenchmarkBytes/65k-4 500000 2738 ns/op 23930.93 MB/s
|
BenchmarkBytes/65k-4 2000000 974 ns/op 67217.99 MB/s
|
||||||
|
|
||||||
SSE:
|
SSE:
|
||||||
|
|
||||||
BenchmarkBytes/16-4 200000000 8.63 ns/op 1852.98 MB/s
|
BenchmarkBytes/16-4 200000000 6.31 ns/op 2536.64 MB/s
|
||||||
BenchmarkBytes/1024-4 50000000 39.4 ns/op 25993.00 MB/s
|
BenchmarkBytes/1024-4 50000000 27.2 ns/op 37609.69 MB/s
|
||||||
BenchmarkBytes/65k-4 500000 2733 ns/op 23975.08 MB/s
|
BenchmarkBytes/65k-4 1000000 2009 ns/op 32619.21 MB/s
|
||||||
|
|
||||||
Word-wise:
|
Word-wise:
|
||||||
|
|
||||||
BenchmarkBytes/16-4 100000000 10.5 ns/op 1521.66 MB/s
|
BenchmarkBytes/16-4 200000000 7.37 ns/op 2170.17 MB/s
|
||||||
BenchmarkBytes/1024-4 10000000 125 ns/op 8163.59 MB/s
|
BenchmarkBytes/1024-4 20000000 89.4 ns/op 11455.33 MB/s
|
||||||
BenchmarkBytes/65k-4 200000 6895 ns/op 9504.62 MB/s
|
BenchmarkBytes/65k-4 300000 4963 ns/op 13203.25 MB/s
|
||||||
|
|
||||||
Byte-wise:
|
Byte-wise:
|
||||||
|
|
||||||
BenchmarkBytes/16-4 100000000 17.3 ns/op 925.16 MB/s
|
BenchmarkBytes/16-4 100000000 12.7 ns/op 1263.77 MB/s
|
||||||
BenchmarkBytes/1024-4 2000000 841 ns/op 1216.31 MB/s
|
BenchmarkBytes/1024-4 2000000 610 ns/op 1677.18 MB/s
|
||||||
BenchmarkBytes/65k-4 30000 54100 ns/op 1211.38 MB/s
|
BenchmarkBytes/65k-4 50000 38906 ns/op 1684.45 MB/s
|
||||||
```
|
```
|
||||||
|
|
||||||
Conclusions: `fastxor` is 2-25 times faster than a naive `for` loop. AVX and
|
Conclusions: `fastxor` is 2-40 times faster than a naive `for` loop. AVX is
|
||||||
SSE performance is roughly equivalent, which makes me suspect that I may be
|
roughly twice as fast as SSE, which is unsurprising since it can operate on
|
||||||
doing something wrong. Lastly, for very small slices, the cost of the function
|
twice as many bits per cycle. Lastly, for very small slices, the cost of the
|
||||||
call starts to outweigh the benefit of AVX/SSE (the Go compiler never inlines
|
function call starts to outweigh the benefit of AVX/SSE (the Go compiler never
|
||||||
handwritten asm). If you need to xor exactly 16 bytes (common in block
|
inlines handwritten asm). If you need to xor exactly 16 bytes (common in block
|
||||||
ciphers), the specialized `Block` function outperforms the more generic
|
ciphers), the specialized `Block` function outperforms the more generic
|
||||||
`Bytes`:
|
`Bytes`:
|
||||||
|
|
||||||
```
|
```
|
||||||
BenchmarkBlock-4 500000000 3.69 ns/op 4337.88 MB/s
|
BenchmarkBlock-4 1000000000 2.72 ns/op 5888.02 MB/s
|
||||||
```
|
```
|
||||||
+56
-36
@@ -116,36 +116,62 @@ TEXT ·xorBytesAVX(SB),NOSPLIT,$0
|
|||||||
MOVQ b_data+48(FP), B
|
MOVQ b_data+48(FP), B
|
||||||
MOVQ n+72(FP), N
|
MOVQ n+72(FP), N
|
||||||
|
|
||||||
|
XOR_LOOP_256_AVX:
|
||||||
|
CMPQ N, $256
|
||||||
|
JB XOR_LOOP_128_AVX
|
||||||
|
|
||||||
|
VMOVDQU (A), Y0
|
||||||
|
VMOVDQU 32(A), Y1
|
||||||
|
VMOVDQU 64(A), Y2
|
||||||
|
VMOVDQU 96(A), Y3
|
||||||
|
VMOVDQU 128(A), Y4
|
||||||
|
VMOVDQU 160(A), Y5
|
||||||
|
VMOVDQU 192(A), Y6
|
||||||
|
VMOVDQU 224(A), Y7
|
||||||
|
|
||||||
|
VPXOR (B), Y0, Y0
|
||||||
|
VPXOR 32(B), Y1, Y1
|
||||||
|
VPXOR 64(B), Y2, Y2
|
||||||
|
VPXOR 96(B), Y3, Y3
|
||||||
|
VPXOR 128(B), Y4, Y4
|
||||||
|
VPXOR 160(B), Y5, Y5
|
||||||
|
VPXOR 192(B), Y6, Y6
|
||||||
|
VPXOR 224(B), Y7, Y7
|
||||||
|
|
||||||
|
VMOVDQU Y0, (Dst)
|
||||||
|
VMOVDQU Y1, 32(Dst)
|
||||||
|
VMOVDQU Y2, 64(Dst)
|
||||||
|
VMOVDQU Y3, 96(Dst)
|
||||||
|
VMOVDQU Y4, 128(Dst)
|
||||||
|
VMOVDQU Y5, 160(Dst)
|
||||||
|
VMOVDQU Y6, 192(Dst)
|
||||||
|
VMOVDQU Y7, 224(Dst)
|
||||||
|
|
||||||
|
ADDQ $256, A
|
||||||
|
ADDQ $256, B
|
||||||
|
ADDQ $256, Dst
|
||||||
|
SUBQ $256, N
|
||||||
|
JNZ XOR_LOOP_256_AVX
|
||||||
|
RET
|
||||||
|
|
||||||
XOR_LOOP_128_AVX:
|
XOR_LOOP_128_AVX:
|
||||||
CMPQ N, $128
|
CMPQ N, $128
|
||||||
JB XOR_LOOP_64_AVX
|
JB XOR_LOOP_64_AVX
|
||||||
|
|
||||||
VMOVDQU (A), X0
|
VMOVDQU (A), Y0
|
||||||
VMOVDQU 16(A), X1
|
VMOVDQU 32(A), Y1
|
||||||
VMOVDQU 32(A), X2
|
VMOVDQU 64(A), Y2
|
||||||
VMOVDQU 48(A), X3
|
VMOVDQU 96(A), Y3
|
||||||
VMOVDQU 64(A), X4
|
|
||||||
VMOVDQU 80(A), X5
|
|
||||||
VMOVDQU 96(A), X6
|
|
||||||
VMOVDQU 112(A), X7
|
|
||||||
|
|
||||||
VPXOR (B), X0, X0
|
VPXOR (B), Y0, Y0
|
||||||
VPXOR 16(B), X1, X1
|
VPXOR 32(B), Y1, Y1
|
||||||
VPXOR 32(B), X2, X2
|
VPXOR 64(B), Y2, Y2
|
||||||
VPXOR 48(B), X3, X3
|
VPXOR 96(B), Y3, Y3
|
||||||
VPXOR 64(B), X4, X4
|
|
||||||
VPXOR 80(B), X5, X5
|
|
||||||
VPXOR 96(B), X6, X6
|
|
||||||
VPXOR 112(B), X7, X7
|
|
||||||
|
|
||||||
VMOVDQU X0, (Dst)
|
VMOVDQU Y0, (Dst)
|
||||||
VMOVDQU X1, 16(Dst)
|
VMOVDQU Y1, 32(Dst)
|
||||||
VMOVDQU X2, 32(Dst)
|
VMOVDQU Y2, 64(Dst)
|
||||||
VMOVDQU X3, 48(Dst)
|
VMOVDQU Y3, 96(Dst)
|
||||||
VMOVDQU X4, 64(Dst)
|
|
||||||
VMOVDQU X5, 80(Dst)
|
|
||||||
VMOVDQU X6, 96(Dst)
|
|
||||||
VMOVDQU X7, 112(Dst)
|
|
||||||
|
|
||||||
ADDQ $128, A
|
ADDQ $128, A
|
||||||
ADDQ $128, B
|
ADDQ $128, B
|
||||||
@@ -158,20 +184,14 @@ XOR_LOOP_64_AVX:
|
|||||||
CMPQ N, $64
|
CMPQ N, $64
|
||||||
JB XOR_LOOP_16_AVX
|
JB XOR_LOOP_16_AVX
|
||||||
|
|
||||||
MOVOU (A), X0
|
VMOVDQU (A), Y0
|
||||||
MOVOU 16(A), X1
|
VMOVDQU 32(A), Y1
|
||||||
MOVOU 32(A), X2
|
|
||||||
MOVOU 48(A), X3
|
|
||||||
|
|
||||||
VPXOR (B), X0, X4
|
VPXOR (B), Y0, Y2
|
||||||
VPXOR 16(B), X1, X5
|
VPXOR 32(B), Y1, Y3
|
||||||
VPXOR 32(B), X2, X6
|
|
||||||
VPXOR 48(B), X3, X7
|
|
||||||
|
|
||||||
VMOVDQU X4, (Dst)
|
VMOVDQU Y2, (Dst)
|
||||||
VMOVDQU X5, 16(Dst)
|
VMOVDQU Y3, 32(Dst)
|
||||||
VMOVDQU X6, 32(Dst)
|
|
||||||
VMOVDQU X7, 48(Dst)
|
|
||||||
|
|
||||||
ADDQ $64, A
|
ADDQ $64, A
|
||||||
ADDQ $64, B
|
ADDQ $64, B
|
||||||
|
|||||||
Reference in New Issue
Block a user