author	H. Peter Anvin <hpa@linux.intel.com>	2014-02-24 21:49:58 -0800
committer	H. Peter Anvin <hpa@linux.intel.com>	2014-03-04 14:45:52 -0800
commit	c851f481cd16b1bd0442c8baa9c4ed12ceb86de3 (patch)
tree	8c872f805a516428dfd47f6ac64250b37f3dfb39
parent	795666e61962e8d781390713cb9cd39043f5c334 (diff)
download	rng-tools-c851f481cd16b1bd0442c8baa9c4ed12ceb86de3.tar.gz
rdrand_asm.S: On x86-64 we have enough registers, avoid repeated loads
On x86-64 there are enough registers that there really is no point in
using a repeated memory operand for the key material. Load it into a
register instead; hopefully it will be slightly faster.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
-rw-r--r--	rdrand_asm.S	26
1 file changed, 26 insertions(+), 0 deletions(-)
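Editorial note, not part of the patch: the .byte sequences in the diff hand-assemble the AES-NI instructions, presumably so the file still builds with assemblers that lack aesenc/aesenclast support. The extra 0x41 byte in the new x86-64 path is the REX.B prefix needed to reach %xmm8. As a minimal sketch, the x86-64 aesenc branch corresponds to the following mnemonics (assuming an AES-NI-aware assembler; PTR2 and offset are the macro and symbol already used in rdrand_asm.S):

	#ifdef __x86_64__
		movdqa	offset(PTR2), %xmm8	/* load the round key once into a spare register */
	offset = offset + 16
		aesenc	%xmm8, %xmm0		/* == .byte 0x66,0x41,0x0f,0x38,0xdc,0xc0 */
		aesenc	%xmm8, %xmm1		/* == ...,0xc8, and so on through %xmm7 */
		aesenc	%xmm8, %xmm2
		aesenc	%xmm8, %xmm3
		aesenc	%xmm8, %xmm4
		aesenc	%xmm8, %xmm5
		aesenc	%xmm8, %xmm6
		aesenc	%xmm8, %xmm7
	#endif

The aesenclast sequence after the round loop follows the same pattern with opcode byte 0xdd instead of 0xdc.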
diff --git a/rdrand_asm.S b/rdrand_asm.S
index 0bd4b04..4b8fdc5 100644
--- a/rdrand_asm.S
+++ b/rdrand_asm.S
@@ -122,7 +122,20 @@ ENTRY(x86_aes_mangle)
pxor (6*16)(PTR1), %xmm6
pxor (7*16)(PTR1), %xmm7
+offset = 0
.rept 10
+#ifdef __x86_64__
+ movdqa offset(PTR2), %xmm8
+offset = offset + 16
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xc0 /* aesenc %xmm8, %xmm0 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xc8 /* aesenc %xmm8, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xd0 /* aesenc %xmm8, %xmm2 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xd8 /* aesenc %xmm8, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xe0 /* aesenc %xmm8, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xe8 /* aesenc %xmm8, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xf0 /* aesenc %xmm8, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xdc,0xf8 /* aesenc %xmm8, %xmm7 */
+#else
.byte 0x66,0x0f,0x38,0xdc,0x00+NPTR2 /* aesenc (PTR2), %xmm0 */
.byte 0x66,0x0f,0x38,0xdc,0x08+NPTR2 /* aesenc (PTR2), %xmm1 */
.byte 0x66,0x0f,0x38,0xdc,0x10+NPTR2 /* aesenc (PTR2), %xmm2 */
@@ -132,8 +145,20 @@ ENTRY(x86_aes_mangle)
.byte 0x66,0x0f,0x38,0xdc,0x30+NPTR2 /* aesenc (PTR2), %xmm6 */
.byte 0x66,0x0f,0x38,0xdc,0x38+NPTR2 /* aesenc (PTR2), %xmm7 */
add $16, PTR2
+#endif
.endr
+#ifdef __x86_64__
+ movdqa offset(PTR2), %xmm8
+ .byte 0x66,0x41,0x0f,0x38,0xdd,0xc0 /* aesenclast %xmm8, %xmm0 */
+ .byte 0x66,0x41,0x0f,0x38,0xdd,0xc8 /* aesenclast %xmm8, %xmm1 */
+ .byte 0x66,0x41,0x0f,0x38,0xdd,0xd0 /* aesenclast %xmm8, %xmm2 */
+ .byte 0x66,0x41,0x0f,0x38,0xdd,0xd8 /* aesenclast %xmm8, %xmm3 */
+ .byte 0x66,0x41,0x0f,0x38,0xdd,0xe0 /* aesenclast %xmm8, %xmm4 */
+ .byte 0x66,0x41,0x0f,0x38,0xdd,0xe8 /* aesenclast %xmm8, %xmm5 */
+ .byte 0x66,0x41,0x0f,0x38,0xdd,0xf0 /* aesenclast %xmm8, %xmm6 */
+ .byte 0x66,0x41,0x0f,0x38,0xdd,0xf8 /* aesenclast %xmm8, %xmm7 */
+#else
.byte 0x66,0x0f,0x38,0xdd,0x00+NPTR2 /* aesenclast (PTR2), %xmm0 */
.byte 0x66,0x0f,0x38,0xdd,0x08+NPTR2 /* aesenclast (PTR2), %xmm1 */
.byte 0x66,0x0f,0x38,0xdd,0x10+NPTR2 /* aesenclast (PTR2), %xmm2 */
@@ -142,6 +167,7 @@ ENTRY(x86_aes_mangle)
.byte 0x66,0x0f,0x38,0xdd,0x28+NPTR2 /* aesenclast (PTR2), %xmm5 */
.byte 0x66,0x0f,0x38,0xdd,0x30+NPTR2 /* aesenclast (PTR2), %xmm6 */
.byte 0x66,0x0f,0x38,0xdd,0x38+NPTR2 /* aesenclast (PTR2), %xmm7 */
+#endif
movdqa %xmm0, (0*16)(PTR0)
movdqa %xmm1, (1*16)(PTR0)