From: Denis Vlasenko

Looks like open-coded be_to_cpu.  GCC produces rather poor code for this.
be_to_cpu produces asm()s which are ~4 times shorter.

Compile-tested only.

I am not sure whether input can be 64bit-unaligned.  If it indeed can be,
replace:

	((u64*)(input))[I]  ->  get_unaligned( ((u64*)(input))+I )

Signed-off-by: Andrew Morton
---

 25-akpm/crypto/sha512.c |   19 ++-----------------
 1 files changed, 2 insertions(+), 17 deletions(-)

diff -puN crypto/sha512.c~small-sha512-cleanup crypto/sha512.c
--- 25/crypto/sha512.c~small-sha512-cleanup	2004-10-01 21:20:42.100900176 -0700
+++ 25-akpm/crypto/sha512.c	2004-10-01 21:20:42.104899568 -0700
@@ -104,27 +104,12 @@ const u64 sha512_K[80] = {
 
 static inline void LOAD_OP(int I, u64 *W, const u8 *input)
 {
-	u64 t1 = input[(8*I) ] & 0xff;
-	t1 <<= 8;
-	t1 |= input[(8*I)+1] & 0xff;
-	t1 <<= 8;
-	t1 |= input[(8*I)+2] & 0xff;
-	t1 <<= 8;
-	t1 |= input[(8*I)+3] & 0xff;
-	t1 <<= 8;
-	t1 |= input[(8*I)+4] & 0xff;
-	t1 <<= 8;
-	t1 |= input[(8*I)+5] & 0xff;
-	t1 <<= 8;
-	t1 |= input[(8*I)+6] & 0xff;
-	t1 <<= 8;
-	t1 |= input[(8*I)+7] & 0xff;
-	W[I] = t1;
+	W[I] = __be64_to_cpu( ((u64*)(input))[I] );
 }
 
 static inline void BLEND_OP(int I, u64 *W)
 {
-	W[I] = s1(W[I-2]) + W[I-7] + s0(W[I-15]) + W[I-16];
+	W[I] = s1(W[I-2]) + W[I-7] + s0(W[I-15]) + W[I-16];
 }
 
 static void _
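
For anyone who wants to check the equivalence outside the kernel, here is a
minimal userspace sketch (not part of the patch).  be64toh() from <endian.h>
stands in for the kernel's __be64_to_cpu(), memcpy() stands in for
get_unaligned() and is the safe variant if input really can be
64bit-unaligned; the function names are invented for the example.

#include <endian.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef uint64_t u64;
typedef uint8_t u8;

/* The removed open-coded big-endian load, folded into a loop. */
static u64 load_be64_bytewise(int I, const u8 *input)
{
	u64 t1 = 0;
	int i;

	for (i = 0; i < 8; i++)
		t1 = (t1 << 8) | input[(8*I)+i];
	return t1;
}

/* Alignment-safe form of the patch's one-liner: memcpy() plays the
 * role of get_unaligned(), be64toh() the role of __be64_to_cpu(). */
static u64 load_be64_safe(int I, const u8 *input)
{
	u64 v;

	memcpy(&v, input + 8*I, sizeof(v));
	return be64toh(v);
}

int main(void)
{
	const u8 buf[16] = {  1,  2,  3,  4,  5,  6,  7,  8,
			      9, 10, 11, 12, 13, 14, 15, 16 };

	/* Prints 1: both loads agree on an arbitrary 8-byte word. */
	printf("%d\n", load_be64_bytewise(1, buf) == load_be64_safe(1, buf));
	return 0;
}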