author	H. Peter Anvin <hpa@linux.intel.com>	2014-02-24 21:49:58 -0800
committer	H. Peter Anvin <hpa@linux.intel.com>	2014-03-04 14:45:52 -0800
commit	c851f481cd16b1bd0442c8baa9c4ed12ceb86de3 (patch)
tree	8c872f805a516428dfd47f6ac64250b37f3dfb39
parent	795666e61962e8d781390713cb9cd39043f5c334 (diff)
download	rng-tools-c851f481cd16b1bd0442c8baa9c4ed12ceb86de3.tar.gz
rdrand_asm.S: On x86-64 we have enough registers, avoid repeated loads
On x86-64 there are enough registers that there really is no point in
using a repeated memory operand for the key material. Load it into a
register instead; hopefully it will be slightly faster.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
-rw-r--r--	rdrand_asm.S	26
1 file changed, 26 insertions(+), 0 deletions(-)
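Editorial note, not part of the patch: the .byte sequences in the diff hand-assemble the AES-NI instructions, presumably so the file still builds with assemblers that lack aesenc/aesenclast support. The extra 0x41 byte in the new x86-64 path is the REX.B prefix needed to reach %xmm8. As a minimal sketch, the x86-64 aesenc branch corresponds to the following mnemonics (assuming an AES-NI-aware assembler; PTR2 and offset are the macro and symbol already used in rdrand_asm.S):

	#ifdef __x86_64__
		movdqa	offset(PTR2), %xmm8	/* load the round key once into a spare register */
	offset = offset + 16
		aesenc	%xmm8, %xmm0		/* == .byte 0x66,0x41,0x0f,0x38,0xdc,0xc0 */
		aesenc	%xmm8, %xmm1		/* == ...,0xc8, and so on through %xmm7 */
		aesenc	%xmm8, %xmm2
		aesenc	%xmm8, %xmm3
		aesenc	%xmm8, %xmm4
		aesenc	%xmm8, %xmm5
		aesenc	%xmm8, %xmm6
		aesenc	%xmm8, %xmm7
	#endif

The aesenclast sequence after the round loop follows the same pattern with opcode byte 0xdd instead of 0xdc.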
diff --git a/rdrand_asm.S b/rdrand_asm.S
index 0bd4b04..4b8fdc5 100644
--- a/rdrand_asm.S
+++ b/rdrand_asm.S
@@ -122,7 +122,20 @@ ENTRY(x86_aes_mangle)
pxor (6*16)(PTR1), %xmm6
pxor (7*16)(PTR1), %xmm7
+offset = 0
.rept 10
+#ifdef __x86_64__
+ movdqa offset(PTR2), %xmm8
+offset = offset + 16
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xc0 /* aesenc %xmm8, %xmm0 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xc8 /* aesenc %xmm8, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xd0 /* aesenc %xmm8, %xmm2 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xd8 /* aesenc %xmm8, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xe0 /* aesenc %xmm8, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xe8 /* aesenc %xmm8, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xf0 /* aesenc %xmm8, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xf8 /* aesenc %xmm8, %xmm7 */
+#else
.byte 0x66,0x0f,0x38,0xdc,0x00+NPTR2 /* aesenc (PTR2), %xmm0 */
.byte 0x66,0x0f,0x38,0xdc,0x08+NPTR2 /* aesenc (PTR2), %xmm1 */
.byte 0x66,0x0f,0x38,0xdc,0x10+NPTR2 /* aesenc (PTR2), %xmm2 */
@@ -132,8 +145,20 @@ ENTRY(x86_aes_mangle)
.byte 0x66,0x0f,0x38,0xdc,0x30+NPTR2 /* aesenc (PTR2), %xmm6 */
.byte 0x66,0x0f,0x38,0xdc,0x38+NPTR2 /* aesenc (PTR2), %xmm7 */
add $16, PTR2
+#endif
.endr
+#ifdef __x86_64__
+ movdqa offset(PTR2), %xmm8
+ .byte 0x66,0x41,0x0f,0x38,0xdd,0xc0 /* aesenclast %xmm8, %xmm0 */
+ .byte 0x66,0x41,0x0f,0x38,0xdd,0xc8 /* aesenclast %xmm8, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdd,0xd0 /* aesenclast %xmm8, %xmm2 */
+ .byte 0x66,0x41,0x0f,0x38,0xdd,0xd8 /* aesenclast %xmm8, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xdd,0xe0 /* aesenclast %xmm8, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xdd,0xe8 /* aesenclast %xmm8, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xdd,0xf0 /* aesenclast %xmm8, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xdd,0xf8 /* aesenclast %xmm8, %xmm7 */
+#else
.byte 0x66,0x0f,0x38,0xdd,0x00+NPTR2 /* aesenclast (PTR2), %xmm0 */
.byte 0x66,0x0f,0x38,0xdd,0x08+NPTR2 /* aesenclast (PTR2), %xmm1 */
.byte 0x66,0x0f,0x38,0xdd,0x10+NPTR2 /* aesenclast (PTR2), %xmm2 */
@@ -142,6 +167,7 @@ ENTRY(x86_aes_mangle)
.byte 0x66,0x0f,0x38,0xdd,0x28+NPTR2 /* aesenclast (PTR2), %xmm5 */
.byte 0x66,0x0f,0x38,0xdd,0x30+NPTR2 /* aesenclast (PTR2), %xmm6 */
.byte 0x66,0x0f,0x38,0xdd,0x38+NPTR2 /* aesenclast (PTR2), %xmm7 */
+#endif
movdqa %xmm0, (0*16)(PTR0)
movdqa %xmm1, (1*16)(PTR0)