author    | H. Peter Anvin <hpa@linux.intel.com> | 2014-02-24 21:49:58 -0800
committer | H. Peter Anvin <hpa@linux.intel.com> | 2014-03-04 14:45:52 -0800
commit    | c851f481cd16b1bd0442c8baa9c4ed12ceb86de3 (patch)
tree      | 8c872f805a516428dfd47f6ac64250b37f3dfb39
parent    | 795666e61962e8d781390713cb9cd39043f5c334 (diff)
download  | rng-tools-c851f481cd16b1bd0442c8baa9c4ed12ceb86de3.tar.gz
rdrand_asm.S: On x86-64 we have enough registers, avoid repeated loads
On x86-64 there are enough registers that there really is no point in
using a repeated memory operand for the key material. Load it into a
register instead; hopefully it will be slightly faster.
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
-rw-r--r-- | rdrand_asm.S | 26
1 file changed, 26 insertions(+), 0 deletions(-)
diff --git a/rdrand_asm.S b/rdrand_asm.S
index 0bd4b04..4b8fdc5 100644
--- a/rdrand_asm.S
+++ b/rdrand_asm.S
@@ -122,7 +122,20 @@ ENTRY(x86_aes_mangle)
 	pxor (6*16)(PTR1), %xmm6
 	pxor (7*16)(PTR1), %xmm7
 
+offset = 0
 	.rept 10
+#ifdef __x86_64__
+	movdqa	offset(PTR2), %xmm8
+offset = offset + 16
+	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc0	/* aesenc %xmm8, %xmm0 */
+	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc8	/* aesenc %xmm8, %xmm1 */
+	.byte 0x66,0x41,0x0f,0x38,0xdc,0xd0	/* aesenc %xmm8, %xmm2 */
+	.byte 0x66,0x41,0x0f,0x38,0xdc,0xd8	/* aesenc %xmm8, %xmm3 */
+	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe0	/* aesenc %xmm8, %xmm4 */
+	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe8	/* aesenc %xmm8, %xmm5 */
+	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf0	/* aesenc %xmm8, %xmm6 */
+	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf8	/* aesenc %xmm8, %xmm7 */
+#else
 	.byte 0x66,0x0f,0x38,0xdc,0x00+NPTR2	/* aesenc (PTR2), %xmm0 */
 	.byte 0x66,0x0f,0x38,0xdc,0x08+NPTR2	/* aesenc (PTR2), %xmm1 */
 	.byte 0x66,0x0f,0x38,0xdc,0x10+NPTR2	/* aesenc (PTR2), %xmm2 */
@@ -132,8 +145,20 @@ ENTRY(x86_aes_mangle)
 	.byte 0x66,0x0f,0x38,0xdc,0x30+NPTR2	/* aesenc (PTR2), %xmm6 */
 	.byte 0x66,0x0f,0x38,0xdc,0x38+NPTR2	/* aesenc (PTR2), %xmm7 */
 	add	$16, PTR2
+#endif
 	.endr
 
+#ifdef __x86_64__
+	movdqa	offset(PTR2), %xmm8
+	.byte 0x66,0x41,0x0f,0x38,0xdd,0xc0	/* aesenclast %xmm8, %xmm0 */
+	.byte 0x66,0x41,0x0f,0x38,0xdd,0xc8	/* aesenclast %xmm8, %xmm1 */
+	.byte 0x66,0x41,0x0f,0x38,0xdd,0xd0	/* aesenclast %xmm8, %xmm2 */
+	.byte 0x66,0x41,0x0f,0x38,0xdd,0xd8	/* aesenclast %xmm8, %xmm3 */
+	.byte 0x66,0x41,0x0f,0x38,0xdd,0xe0	/* aesenclast %xmm8, %xmm4 */
+	.byte 0x66,0x41,0x0f,0x38,0xdd,0xe8	/* aesenclast %xmm8, %xmm5 */
+	.byte 0x66,0x41,0x0f,0x38,0xdd,0xf0	/* aesenclast %xmm8, %xmm6 */
+	.byte 0x66,0x41,0x0f,0x38,0xdd,0xf8	/* aesenclast %xmm8, %xmm7 */
+#else
 	.byte 0x66,0x0f,0x38,0xdd,0x00+NPTR2	/* aesenclast (PTR2), %xmm0 */
 	.byte 0x66,0x0f,0x38,0xdd,0x08+NPTR2	/* aesenclast (PTR2), %xmm1 */
 	.byte 0x66,0x0f,0x38,0xdd,0x10+NPTR2	/* aesenclast (PTR2), %xmm2 */
@@ -142,6 +167,7 @@ ENTRY(x86_aes_mangle)
 	.byte 0x66,0x0f,0x38,0xdd,0x28+NPTR2	/* aesenclast (PTR2), %xmm5 */
 	.byte 0x66,0x0f,0x38,0xdd,0x30+NPTR2	/* aesenclast (PTR2), %xmm6 */
 	.byte 0x66,0x0f,0x38,0xdd,0x38+NPTR2	/* aesenclast (PTR2), %xmm7 */
+#endif
 
 	movdqa	%xmm0, (0*16)(PTR0)
 	movdqa	%xmm1, (1*16)(PTR0)
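For readers who don't follow AT&T-syntax assembly, here is a rough C-with-intrinsics sketch of the change the patch makes on x86-64. It is not code from rng-tools; the function name `aes_mangle8` and its parameters are invented for illustration, and a compiler-chosen register stands in for the explicit %xmm8 of the hand-written assembly. The point is simply that each round key is loaded once per round and then reused for all eight data blocks, instead of being re-read from memory by every `aesenc (PTR2), %xmmN` as the 32-bit path must do.

```c
/* Illustrative sketch only, not the rng-tools implementation.
 * Build with: gcc -O2 -maes -c aes_mangle_sketch.c
 */
#include <immintrin.h>

/* Run 10 AES rounds plus the final round over 8 blocks in parallel.
 * key_schedule[] holds the 11 expanded round keys (hypothetical layout). */
void aes_mangle8(__m128i blk[8], const __m128i key_schedule[11])
{
	for (int round = 0; round < 10; round++) {
		/* One load per round: the key stays in a register (the role
		 * %xmm8 plays on x86-64) and is reused for all eight blocks. */
		__m128i rk = _mm_load_si128(&key_schedule[round]);
		for (int i = 0; i < 8; i++)
			blk[i] = _mm_aesenc_si128(blk[i], rk);     /* reg-reg aesenc */
	}
	__m128i rk = _mm_load_si128(&key_schedule[10]);
	for (int i = 0; i < 8; i++)
		blk[i] = _mm_aesenclast_si128(blk[i], rk);         /* final round */
}
```

On 32-bit x86 only %xmm0-%xmm7 exist and all eight are occupied by the data blocks, so the #else path has to keep the memory-operand form of aesenc/aesenclast; x86-64 adds %xmm8-%xmm15, which is what frees a register for the key material.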