diff --git a/xor_amd64.s b/xor_amd64.s index f641c4a..c137b67 100644 --- a/xor_amd64.s +++ b/xor_amd64.s @@ -115,32 +115,32 @@ XOR_LOOP_256_AVX: CMPQ N, $256 JB XOR_LOOP_128_AVX - VMOVDQU (A), Y0 - VMOVDQU 32(A), Y1 - VMOVDQU 64(A), Y2 - VMOVDQU 96(A), Y3 - VMOVDQU 128(A), Y4 - VMOVDQU 160(A), Y5 - VMOVDQU 192(A), Y6 - VMOVDQU 224(A), Y7 + VMOVDQU 0*32(A), Y0 + VMOVDQU 1*32(A), Y1 + VMOVDQU 2*32(A), Y2 + VMOVDQU 3*32(A), Y3 + VMOVDQU 4*32(A), Y4 + VMOVDQU 5*32(A), Y5 + VMOVDQU 6*32(A), Y6 + VMOVDQU 7*32(A), Y7 - VPXOR (B), Y0, Y0 - VPXOR 32(B), Y1, Y1 - VPXOR 64(B), Y2, Y2 - VPXOR 96(B), Y3, Y3 - VPXOR 128(B), Y4, Y4 - VPXOR 160(B), Y5, Y5 - VPXOR 192(B), Y6, Y6 - VPXOR 224(B), Y7, Y7 + VPXOR 0*32(B), Y0, Y0 + VPXOR 1*32(B), Y1, Y1 + VPXOR 2*32(B), Y2, Y2 + VPXOR 3*32(B), Y3, Y3 + VPXOR 4*32(B), Y4, Y4 + VPXOR 5*32(B), Y5, Y5 + VPXOR 6*32(B), Y6, Y6 + VPXOR 7*32(B), Y7, Y7 - VMOVDQU Y0, (Dst) - VMOVDQU Y1, 32(Dst) - VMOVDQU Y2, 64(Dst) - VMOVDQU Y3, 96(Dst) - VMOVDQU Y4, 128(Dst) - VMOVDQU Y5, 160(Dst) - VMOVDQU Y6, 192(Dst) - VMOVDQU Y7, 224(Dst) + VMOVDQU Y0, 0*32(Dst) + VMOVDQU Y1, 1*32(Dst) + VMOVDQU Y2, 2*32(Dst) + VMOVDQU Y3, 3*32(Dst) + VMOVDQU Y4, 4*32(Dst) + VMOVDQU Y5, 5*32(Dst) + VMOVDQU Y6, 6*32(Dst) + VMOVDQU Y7, 7*32(Dst) ADDQ $256, A ADDQ $256, B @@ -150,23 +150,23 @@ XOR_LOOP_256_AVX: RET XOR_LOOP_128_AVX: - CMPQ N, $128 - JB XOR_LOOP_64_AVX + CMPQ N, $128 + JB XOR_LOOP_64_AVX - VMOVDQU (A), Y0 - VMOVDQU 32(A), Y1 - VMOVDQU 64(A), Y2 - VMOVDQU 96(A), Y3 + VMOVDQU 0*32(A), Y0 + VMOVDQU 1*32(A), Y1 + VMOVDQU 2*32(A), Y2 + VMOVDQU 3*32(A), Y3 - VPXOR (B), Y0, Y0 - VPXOR 32(B), Y1, Y1 - VPXOR 64(B), Y2, Y2 - VPXOR 96(B), Y3, Y3 + VPXOR 0*32(B), Y0, Y0 + VPXOR 1*32(B), Y1, Y1 + VPXOR 2*32(B), Y2, Y2 + VPXOR 3*32(B), Y3, Y3 - VMOVDQU Y0, (Dst) - VMOVDQU Y1, 32(Dst) - VMOVDQU Y2, 64(Dst) - VMOVDQU Y3, 96(Dst) + VMOVDQU Y0, 0*32(Dst) + VMOVDQU Y1, 1*32(Dst) + VMOVDQU Y2, 2*32(Dst) + VMOVDQU Y3, 3*32(Dst) ADDQ $128, A ADDQ $128, B @@ -179,14 +179,14 @@ XOR_LOOP_64_AVX: CMPQ N, $64 JB XOR_LOOP_16_AVX - VMOVDQU (A), Y0 - VMOVDQU 32(A), Y1 + VMOVDQU 0*32(A), Y0 + VMOVDQU 1*32(A), Y1 - VPXOR (B), Y0, Y2 - VPXOR 32(B), Y1, Y3 + VPXOR 0*32(B), Y0, Y2 + VPXOR 1*32(B), Y1, Y3 - VMOVDQU Y2, (Dst) - VMOVDQU Y3, 32(Dst) + VMOVDQU Y2, 0*32(Dst) + VMOVDQU Y3, 1*32(Dst) ADDQ $64, A ADDQ $64, B