1 // sha.cpp - modified by Wei Dai from Steve Reid's public domain sha1.c
3 // Steve Reid implemented SHA-1. Wei Dai implemented SHA-2.
4 // Both are in the public domain.
6 // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM sha.cpp" to generate MASM code
10 #ifndef CRYPTOPP_IMPORTS
11 #ifndef CRYPTOPP_GENERATE_X64_MASM
17 NAMESPACE_BEGIN(CryptoPP)
19 // start of Steve Reid's code
// Message-schedule helpers. W[] is a 16-word circular buffer: blk0 copies
// input word i into the schedule, blk1 performs the SHA-1 expansion
// W[t] = rotl1(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]) in place (indices mod 16).
#define blk0(i) (W[i] = data[i])
#define blk1(i) (W[i&15] = rotlFixed(W[(i+13)&15]^W[(i+8)&15]^W[(i+2)&15]^W[i&15],1))
24 void SHA1::InitState(HashWordType *state)
26 state[0] = 0x67452301L;
27 state[1] = 0xEFCDAB89L;
28 state[2] = 0x98BADCFEL;
29 state[3] = 0x10325476L;
30 state[4] = 0xC3D2E1F0L;
// SHA-1 round functions (FIPS 180): f1 = Ch, f2 = f4 = Parity, f3 = Maj,
// each written in a reduced-operation form.
#define f1(x,y,z) (z^(x&(y^z)))
#define f2(x,y,z) (x^y^z)
#define f3(x,y,z) ((x&y)|(z&(x|y)))
#define f4(x,y,z) (x^y^z)

/* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */
// Each Rn performs one round: z += f(w,x,y) + W_t + K_stage + rotl5(v), then
// w = rotl30(w). R0 consumes raw input words via blk0 (rounds 0-15);
// R1-R4 run the in-place schedule expansion via blk1.
#define R0(v,w,x,y,z,i) z+=f1(w,x,y)+blk0(i)+0x5A827999+rotlFixed(v,5);w=rotlFixed(w,30);
#define R1(v,w,x,y,z,i) z+=f1(w,x,y)+blk1(i)+0x5A827999+rotlFixed(v,5);w=rotlFixed(w,30);
#define R2(v,w,x,y,z,i) z+=f2(w,x,y)+blk1(i)+0x6ED9EBA1+rotlFixed(v,5);w=rotlFixed(w,30);
#define R3(v,w,x,y,z,i) z+=f3(w,x,y)+blk1(i)+0x8F1BBCDC+rotlFixed(v,5);w=rotlFixed(w,30);
#define R4(v,w,x,y,z,i) z+=f4(w,x,y)+blk1(i)+0xCA62C1D6+rotlFixed(v,5);w=rotlFixed(w,30);
45 void SHA1::Transform(word32 *state, const word32 *data)
48 /* Copy context->state[] to working vars */
54 /* 4 rounds of 20 operations each. Loop unrolled. */
55 R0(a,b,c,d,e, 0); R0(e,a,b,c,d, 1); R0(d,e,a,b,c, 2); R0(c,d,e,a,b, 3);
56 R0(b,c,d,e,a, 4); R0(a,b,c,d,e, 5); R0(e,a,b,c,d, 6); R0(d,e,a,b,c, 7);
57 R0(c,d,e,a,b, 8); R0(b,c,d,e,a, 9); R0(a,b,c,d,e,10); R0(e,a,b,c,d,11);
58 R0(d,e,a,b,c,12); R0(c,d,e,a,b,13); R0(b,c,d,e,a,14); R0(a,b,c,d,e,15);
59 R1(e,a,b,c,d,16); R1(d,e,a,b,c,17); R1(c,d,e,a,b,18); R1(b,c,d,e,a,19);
60 R2(a,b,c,d,e,20); R2(e,a,b,c,d,21); R2(d,e,a,b,c,22); R2(c,d,e,a,b,23);
61 R2(b,c,d,e,a,24); R2(a,b,c,d,e,25); R2(e,a,b,c,d,26); R2(d,e,a,b,c,27);
62 R2(c,d,e,a,b,28); R2(b,c,d,e,a,29); R2(a,b,c,d,e,30); R2(e,a,b,c,d,31);
63 R2(d,e,a,b,c,32); R2(c,d,e,a,b,33); R2(b,c,d,e,a,34); R2(a,b,c,d,e,35);
64 R2(e,a,b,c,d,36); R2(d,e,a,b,c,37); R2(c,d,e,a,b,38); R2(b,c,d,e,a,39);
65 R3(a,b,c,d,e,40); R3(e,a,b,c,d,41); R3(d,e,a,b,c,42); R3(c,d,e,a,b,43);
66 R3(b,c,d,e,a,44); R3(a,b,c,d,e,45); R3(e,a,b,c,d,46); R3(d,e,a,b,c,47);
67 R3(c,d,e,a,b,48); R3(b,c,d,e,a,49); R3(a,b,c,d,e,50); R3(e,a,b,c,d,51);
68 R3(d,e,a,b,c,52); R3(c,d,e,a,b,53); R3(b,c,d,e,a,54); R3(a,b,c,d,e,55);
69 R3(e,a,b,c,d,56); R3(d,e,a,b,c,57); R3(c,d,e,a,b,58); R3(b,c,d,e,a,59);
70 R4(a,b,c,d,e,60); R4(e,a,b,c,d,61); R4(d,e,a,b,c,62); R4(c,d,e,a,b,63);
71 R4(b,c,d,e,a,64); R4(a,b,c,d,e,65); R4(e,a,b,c,d,66); R4(d,e,a,b,c,67);
72 R4(c,d,e,a,b,68); R4(b,c,d,e,a,69); R4(a,b,c,d,e,70); R4(e,a,b,c,d,71);
73 R4(d,e,a,b,c,72); R4(c,d,e,a,b,73); R4(b,c,d,e,a,74); R4(a,b,c,d,e,75);
74 R4(e,a,b,c,d,76); R4(d,e,a,b,c,77); R4(c,d,e,a,b,78); R4(b,c,d,e,a,79);
75 /* Add the working vars back into context.state[] */
83 // end of Steve Reid's code
85 // *************************************************************
87 void SHA224::InitState(HashWordType *state)
89 static const word32 s[8] = {0xc1059ed8, 0x367cd507, 0x3070dd17, 0xf70e5939, 0xffc00b31, 0x68581511, 0x64f98fa7, 0xbefa4fa4};
90 memcpy(state, s, sizeof(s));
93 void SHA256::InitState(HashWordType *state)
95 static const word32 s[8] = {0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};
96 memcpy(state, s, sizeof(s));
// SHA-256 round constants K[0..63] (FIPS 180). The 16-byte-aligned variant is
// used when the SSE2 assembly is available.
// NOTE(review): the two declarations below appear back-to-back without an
// intervening #else, and the table's closing brace is absent in this copy —
// confirm against the upstream file.
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
CRYPTOPP_ALIGN_DATA(16) extern const word32 SHA256_K[64] CRYPTOPP_SECTION_ALIGN16 = {
extern const word32 SHA256_K[64] = {
	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
#if defined(CRYPTOPP_X86_ASM_AVAILABLE) || defined(CRYPTOPP_GENERATE_X64_MASM)

#pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code

// Assembly SHA-256 inner loop: hashes whole 64-byte blocks of 'data' into
// 'state'. The same text is compiled as inline assembly on x86 and emitted
// as MASM source for x64 under CRYPTOPP_GENERATE_X64_MASM.
static void CRYPTOPP_FASTCALL X86_SHA256_HashBlocks(word32 *state, const word32 *data, size_t len
#if defined(_MSC_VER) && (_MSC_VER == 1200)
, ... // VC60 workaround: prevent VC 6 from inlining this function
#if defined(_MSC_VER) && (_MSC_VER == 1200)
AS2(mov ecx, [state])
// Stack-frame layout at [BASE]: 8 state words, then the 16-word circular
// schedule, then 4 saved machine words (K end, state ptr, data ptr, data end).
#define LOCALS_SIZE 8*4 + 16*4 + 4*WORD_SZ
// H(i) names the rotating working-variable slots; Wt(i) the schedule slots
// (both addressed mod their size so the registers never have to shuffle).
#define H(i) [BASE+ASM_MOD(1024+7-(i),8)*4]
#define Wt(i) BASE+8*4+ASM_MOD(1024+15-(i),16)*4
#define Wt_2(i) Wt((i)-2)
#define Wt_15(i) Wt((i)-15)
#define Wt_7(i) Wt((i)-7)
#define K_END [BASE+8*4+16*4+0*WORD_SZ]
#define STATE_SAVE [BASE+8*4+16*4+1*WORD_SZ]
#define DATA_SAVE [BASE+8*4+16*4+2*WORD_SZ]
#define DATA_END [BASE+8*4+16*4+3*WORD_SZ]
// Kt(i): address of round constant i relative to the K pointer in si.
#define Kt(i) WORD_REG(si)+(i)*4
#if CRYPTOPP_BOOL_X86
#elif defined(__GNUC__)
// One SHA-256 round in assembly. ROUND(i, r, ...) computes
// T1 = H + S1(E) + Ch(E,F,G) + K_t + W_t, then the D and H (i.e. new E and
// new A) updates. The RA*/RB* helpers supply the W_t term: r=0 rounds read a
// precomputed W_t (RA0, first 16 rounds of each pass), r=1 rounds also run
// the message-schedule expansion (RB1) and store the new W_t back in place.
#define RA0(i, edx, edi) \
AS2( add edx, [Kt(i)] )\
AS2( add edx, [Wt(i)] )\
AS2( add edx, H(i) )\
#define RA1(i, edx, edi)
#define RB0(i, edx, edi)
#define RB1(i, edx, edi) \
AS2( mov AS_REG_7d, [Wt_2(i)] )\
AS2( mov edi, [Wt_15(i)])\
AS2( mov ebx, AS_REG_7d )\
AS2( shr AS_REG_7d, 10 )\
AS2( xor AS_REG_7d, ebx )\
AS2( xor ebx, AS_REG_7d )/* s1(W_t-2) */\
AS2( add ebx, [Wt_7(i)])\
AS2( mov AS_REG_7d, edi )\
AS2( shr AS_REG_7d, 3 )\
AS2( add ebx, [Wt(i)])/* s1(W_t-2) + W_t-7 + W_t-16 */\
AS2( xor AS_REG_7d, edi )\
AS2( add edx, [Kt(i)])\
AS2( add edx, H(i) )\
AS2( xor AS_REG_7d, edi )/* s0(W_t-15) */\
AS2( add AS_REG_7d, ebx )/* W_t = s1(W_t-2) + W_t-7 + s0(W_t-15) W_t-16*/\
AS2( mov [Wt(i)], AS_REG_7d)\
AS2( add edx, AS_REG_7d )\
#define ROUND(i, r, eax, ecx, edi, edx)\
/* unused: eax, ecx, temp: ebx, AS_REG_7d, out: edx = T1 */\
AS2( mov edx, F(i) )\
AS2( xor edx, G(i) )\
AS2( xor edx, G(i) )/* Ch(E,F,G) = (G^(E&(F^G))) */\
AS2( mov AS_REG_7d, edi )\
AS2( ror AS_REG_7d, 25 )\
RA##r(i, edx, edi )/* H + Wt + Kt + Ch(E,F,G) */\
AS2( xor AS_REG_7d, edi )\
AS2( xor AS_REG_7d, edi )/* S1(E) */\
AS2( add edx, AS_REG_7d )/* T1 = S1(E) + Ch(E,F,G) + H + Wt + Kt */\
RB##r(i, edx, edi )/* H + Wt + Kt + Ch(E,F,G) */\
/* in: ecx = A, eax = B^C, edx = T1 */\
/* unused: edx, temp: ebx, AS_REG_7d, out: eax = A, ecx = B^C, edx = E */\
AS2( xor ecx, B(i) )/* A^B */\
AS2( xor eax, B(i) )/* Maj(A,B,C) = B^((A^B)&(B^C) */\
AS2( mov AS_REG_7d, ebx )\
AS2( add eax, edx )/* T1 + Maj(A,B,C) */\
AS2( add edx, D(i) )\
AS2( mov D(i), edx )\
AS2( ror AS_REG_7d, 22 )\
AS2( xor AS_REG_7d, ebx )\
AS2( xor AS_REG_7d, ebx )\
AS2( add eax, AS_REG_7d )/* T1 + S0(A) + Maj(A,B,C) */\
AS2( mov H(i), eax )\
#define SWAP_COPY(i) \
AS2( mov WORD_REG(bx), [WORD_REG(dx)+i*WORD_SZ])\
AS1( bswap WORD_REG(bx))\
AS2( mov [Wt(i*(1+CRYPTOPP_BOOL_X64)+CRYPTOPP_BOOL_X64)], WORD_REG(bx))
// Body of the assembly hash loop. NOTE(review): this copy of the file is
// missing many lines (labels, #else branches, epilogue) — the structure below
// is only a partial view of the upstream function.
#if defined(__GNUC__)
#if CRYPTOPP_BOOL_X64
FixedSizeAlignedSecBlock<byte, LOCALS_SIZE> workspace;
#if CRYPTOPP_BOOL_X64
".intel_syntax noprefix;"
#elif defined(CRYPTOPP_GENERATE_X64_MASM)
X86_SHA256_HashBlocks PROC FRAME
alloc_stack(LOCALS_SIZE+8)
lea rsi, [?SHA256_K@CryptoPP@@3QBIB + 48*4]
#if CRYPTOPP_BOOL_X86
AS2( lea WORD_REG(si), [SHA256_K+48*4])
#if !defined(_MSC_VER) || (_MSC_VER < 1400)
AS2( sub WORD_REG(sp), LOCALS_SIZE)
// Save the incoming pointers into the frame; DATA_END = data + len.
AS2( mov STATE_SAVE, WORD_REG(cx))
AS2( mov DATA_SAVE, WORD_REG(dx))
AS2( add WORD_REG(di), WORD_REG(dx))
AS2( mov DATA_END, WORD_REG(di))
AS2( mov K_END, WORD_REG(si))
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#if CRYPTOPP_BOOL_X86
// SSE2 path: load the 8-word state as two 16-byte vectors.
AS2( movdqa xmm0, XMMWORD_PTR [WORD_REG(cx)+0*16])
AS2( movdqa xmm1, XMMWORD_PTR [WORD_REG(cx)+1*16])
#if CRYPTOPP_BOOL_X86
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
AS2( movdqa E(0), xmm1)
AS2( movdqa A(0), xmm0)
#if CRYPTOPP_BOOL_X86
AS2( sub WORD_REG(si), 48*4)
// Byte-swap the next input block into the schedule slots.
SWAP_COPY(0) SWAP_COPY(1) SWAP_COPY(2) SWAP_COPY(3)
SWAP_COPY(4) SWAP_COPY(5) SWAP_COPY(6) SWAP_COPY(7)
#if CRYPTOPP_BOOL_X86
SWAP_COPY(8) SWAP_COPY(9) SWAP_COPY(10) SWAP_COPY(11)
SWAP_COPY(12) SWAP_COPY(13) SWAP_COPY(14) SWAP_COPY(15)
// Prime the registers per the ROUND macro's register contract.
AS2( mov edi, E(0)) // E
AS2( mov eax, B(0)) // B
AS2( xor eax, C(0)) // B^C
AS2( mov ecx, A(0)) // A
// First 16 rounds of a pass use precomputed W (r=0)...
ROUND(0, 0, eax, ecx, edi, edx)
ROUND(1, 0, ecx, eax, edx, edi)
ROUND(2, 0, eax, ecx, edi, edx)
ROUND(3, 0, ecx, eax, edx, edi)
ROUND(4, 0, eax, ecx, edi, edx)
ROUND(5, 0, ecx, eax, edx, edi)
ROUND(6, 0, eax, ecx, edi, edx)
ROUND(7, 0, ecx, eax, edx, edi)
ROUND(8, 0, eax, ecx, edi, edx)
ROUND(9, 0, ecx, eax, edx, edi)
ROUND(10, 0, eax, ecx, edi, edx)
ROUND(11, 0, ecx, eax, edx, edi)
ROUND(12, 0, eax, ecx, edi, edx)
ROUND(13, 0, ecx, eax, edx, edi)
ROUND(14, 0, eax, ecx, edi, edx)
ROUND(15, 0, ecx, eax, edx, edi)
AS2(add WORD_REG(si), 4*16)
// ...later rounds interleave the schedule expansion (r=1).
ROUND(0, 1, eax, ecx, edi, edx)
ROUND(1, 1, ecx, eax, edx, edi)
ROUND(2, 1, eax, ecx, edi, edx)
ROUND(3, 1, ecx, eax, edx, edi)
ROUND(4, 1, eax, ecx, edi, edx)
ROUND(5, 1, ecx, eax, edx, edi)
ROUND(6, 1, eax, ecx, edi, edx)
ROUND(7, 1, ecx, eax, edx, edi)
ROUND(8, 1, eax, ecx, edi, edx)
ROUND(9, 1, ecx, eax, edx, edi)
ROUND(10, 1, eax, ecx, edi, edx)
ROUND(11, 1, ecx, eax, edx, edi)
ROUND(12, 1, eax, ecx, edi, edx)
ROUND(13, 1, ecx, eax, edx, edi)
ROUND(14, 1, eax, ecx, edi, edx)
ROUND(15, 1, ecx, eax, edx, edi)
AS2( cmp WORD_REG(si), K_END)
// Block done: advance the data pointer, reload the state pointer.
AS2( mov WORD_REG(dx), DATA_SAVE)
AS2( add WORD_REG(dx), 64)
AS2( mov AS_REG_7, STATE_SAVE)
AS2( mov DATA_SAVE, WORD_REG(dx))
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#if CRYPTOPP_BOOL_X86
// Bit 0 of the byte count (see HashMultipleBlocks) selects the SSE2 vs
// scalar feed-forward below.
AS2( test DWORD PTR DATA_END, 1)
// SSE2 feed-forward: state += working vars, two vectors at a time.
AS2( movdqa xmm1, XMMWORD_PTR [AS_REG_7+1*16])
AS2( movdqa xmm0, XMMWORD_PTR [AS_REG_7+0*16])
AS2( paddd xmm1, E(0))
AS2( paddd xmm0, A(0))
AS2( movdqa [AS_REG_7+1*16], xmm1)
AS2( movdqa [AS_REG_7+0*16], xmm0)
AS2( cmp WORD_REG(dx), DATA_END)
#if CRYPTOPP_BOOL_X86
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
// Scalar feed-forward: add working vars back into state[] word by word.
AS2( add [AS_REG_7+0*4], ecx) // A
AS2( add [AS_REG_7+4*4], edi) // E
AS2( add [AS_REG_7+1*4], eax)
AS2( add [AS_REG_7+2*4], ebx)
AS2( add [AS_REG_7+3*4], ecx)
AS2( add [AS_REG_7+5*4], eax)
AS2( add [AS_REG_7+6*4], ebx)
AS2( add [AS_REG_7+7*4], ecx)
AS2( mov ecx, AS_REG_7d)
AS2( cmp WORD_REG(dx), DATA_END)
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#if !defined(_MSC_VER) || (_MSC_VER < 1400)
#ifdef CRYPTOPP_GENERATE_X64_MASM
add rsp, LOCALS_SIZE+8
X86_SHA256_HashBlocks ENDP
".att_syntax prefix;"
// GCC inline-asm operand lists: inputs in ecx/edx/esi/edi, clobbers below.
: "c" (state), "d" (data), "S" (SHA256_K+48), "D" (len)
#if CRYPTOPP_BOOL_X64
: "memory", "cc", "%eax"
#if CRYPTOPP_BOOL_X64
#endif // #if defined(CRYPTOPP_X86_ASM_AVAILABLE) || defined(CRYPTOPP_GENERATE_X64_MASM)
#ifndef CRYPTOPP_GENERATE_X64_MASM
#ifdef CRYPTOPP_X64_MASM_AVAILABLE
// Prototype for the MASM-assembled x64 implementation linked separately.
void CRYPTOPP_FASTCALL X86_SHA256_HashBlocks(word32 *state, const word32 *data, size_t len);
448 #if defined(CRYPTOPP_X86_ASM_AVAILABLE) || defined(CRYPTOPP_X64_MASM_AVAILABLE)
450 size_t SHA256::HashMultipleBlocks(const word32 *input, size_t length)
452 X86_SHA256_HashBlocks(m_state, input, (length&(size_t(0)-BLOCKSIZE)) - !HasSSE2());
453 return length % BLOCKSIZE;
456 size_t SHA224::HashMultipleBlocks(const word32 *input, size_t length)
458 X86_SHA256_HashBlocks(m_state, input, (length&(size_t(0)-BLOCKSIZE)) - !HasSSE2());
459 return length % BLOCKSIZE;
// Portable C helpers for SHA-256 (reused by the SHA-512 code below).
// blk2 expands the 16-word circular schedule in place:
// W[t] += s1(W[t-2]) + W[t-7] + s0(W[t-15]).
#define blk2(i) (W[i&15]+=s1(W[(i-2)&15])+W[(i-7)&15]+s0(W[(i-15)&15]))

// Reduced-operation forms of the FIPS 180 Ch and Maj functions.
#define Ch(x,y,z) (z^(x&(y^z)))
#define Maj(x,y,z) (y^((x^y)&(y^z)))

// a(i)..h(i) rotate through T[] so the eight working variables never need to
// be shuffled between rounds; only the index changes.
#define a(i) T[(0-i)&7]
#define b(i) T[(1-i)&7]
#define c(i) T[(2-i)&7]
#define d(i) T[(3-i)&7]
#define e(i) T[(4-i)&7]
#define f(i) T[(5-i)&7]
#define g(i) T[(6-i)&7]
#define h(i) T[(7-i)&7]

// One round: h += S1(e) + Ch(e,f,g) + K + W; d += h; h += S0(a) + Maj(a,b,c).
// j (the outer loop counter) selects raw input (first 16 rounds, blk0) vs the
// expanded schedule (blk2).
#define R(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+SHA256_K[i+j]+(j?blk2(i):blk0(i));\
d(i)+=h(i);h(i)+=S0(a(i))+Maj(a(i),b(i),c(i))

// SHA-256 Sigma/sigma functions (FIPS 180).
#define S0(x) (rotrFixed(x,2)^rotrFixed(x,13)^rotrFixed(x,22))
#define S1(x) (rotrFixed(x,6)^rotrFixed(x,11)^rotrFixed(x,25))
#define s0(x) (rotrFixed(x,7)^rotrFixed(x,18)^(x>>3))
#define s1(x) (rotrFixed(x,17)^rotrFixed(x,19)^(x>>10))
// SHA-256 compression function. With assembly available it delegates a single
// block to X86_SHA256_HashBlocks (byte-reversing first, since the asm expects
// raw big-endian input); otherwise it runs the portable 64-round loop using
// the R/a..h macros above. NOTE(review): this copy is missing the function's
// braces, the W[]/T[] declarations, the #else/#endif split, and the final
// feed-forward into state[] — confirm against the upstream file.
void SHA256::Transform(word32 *state, const word32 *data)
#if defined(CRYPTOPP_X86_ASM_AVAILABLE) || defined(CRYPTOPP_X64_MASM_AVAILABLE)
// this byte reverse is a waste of time, but this function is only called by MDC
ByteReverse(W, data, BLOCKSIZE);
// BLOCKSIZE - !HasSSE2(): bit 0 of the length steers the asm's scalar path.
X86_SHA256_HashBlocks(state, W, BLOCKSIZE - !HasSSE2());
/* Copy context->state[] to working vars */
memcpy(T, state, sizeof(T));
/* 64 operations, partially loop unrolled */
for (unsigned int j=0; j<64; j+=16)
R( 0); R( 1); R( 2); R( 3);
R( 4); R( 5); R( 6); R( 7);
R( 8); R( 9); R(10); R(11);
R(12); R(13); R(14); R(15);
/* Add the working vars back into context.state[] */
// smaller but slower
// Alternate size-optimized SHA-256 compression function (apparently the
// branch of a preprocessor conditional whose #if/#else lines are missing in
// this copy). It keeps the working variables in a sliding window t[] that is
// walked downward instead of rotating macro indices. Only fragments of the
// body are visible here — confirm against the upstream file before editing.
void SHA256::Transform(word32 *state, const word32 *data)
unsigned int i = 0, j = 0;
memcpy(t, state, 8*4);
word32 e = t[4], a = t[0];
w += Ch(e, t[5], t[6]);
a = w + Maj(a, t[1], t[2]);
// Expanded-schedule rounds: compute W[t] from the circular buffer.
word32 w = s1(W[i+16-2]) + s0(W[i+16-15]) + W[i] + W[i+16-7];
w += Ch(e, t[5], t[6]);
a = w + Maj(a, t[1], t[2]);
w = s1(W[(i+1)+16-2]) + s0(W[(i+1)+16-15]) + W[(i+1)] + W[(i+1)+16-7];
W[(i+1)+16] = W[(i+1)] = w;
w += Ch(e, (t-1)[5], (t-1)[6]);
// The window pointer t has been decremented; (t-1)[k] writes both copies of
// the duplicated state so the window can keep sliding.
(t-1)[3] = (t-1)[3+8] = e;
a = w + Maj(a, (t-1)[1], (t-1)[2]);
(t-1)[-1] = (t-1)[7] = a;
599 // *************************************************************
601 void SHA384::InitState(HashWordType *state)
603 static const word64 s[8] = {
604 W64LIT(0xcbbb9d5dc1059ed8), W64LIT(0x629a292a367cd507),
605 W64LIT(0x9159015a3070dd17), W64LIT(0x152fecd8f70e5939),
606 W64LIT(0x67332667ffc00b31), W64LIT(0x8eb44a8768581511),
607 W64LIT(0xdb0c2e0d64f98fa7), W64LIT(0x47b5481dbefa4fa4)};
608 memcpy(state, s, sizeof(s));
611 void SHA512::InitState(HashWordType *state)
613 static const word64 s[8] = {
614 W64LIT(0x6a09e667f3bcc908), W64LIT(0xbb67ae8584caa73b),
615 W64LIT(0x3c6ef372fe94f82b), W64LIT(0xa54ff53a5f1d36f1),
616 W64LIT(0x510e527fade682d1), W64LIT(0x9b05688c2b3e6c1f),
617 W64LIT(0x1f83d9abfb41bd6b), W64LIT(0x5be0cd19137e2179)};
618 memcpy(state, s, sizeof(s));
// SHA-512 round constants K[0..79] (FIPS 180). The 16-byte-aligned variant is
// required by the x86 SSE2 transform below.
// NOTE(review): the two declarations appear back-to-back without an
// intervening #else, and the table's closing brace is absent in this copy —
// confirm against the upstream file.
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86
CRYPTOPP_ALIGN_DATA(16) static const word64 SHA512_K[80] CRYPTOPP_SECTION_ALIGN16 = {
static const word64 SHA512_K[80] = {
	W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd),
	W64LIT(0xb5c0fbcfec4d3b2f), W64LIT(0xe9b5dba58189dbbc),
	W64LIT(0x3956c25bf348b538), W64LIT(0x59f111f1b605d019),
	W64LIT(0x923f82a4af194f9b), W64LIT(0xab1c5ed5da6d8118),
	W64LIT(0xd807aa98a3030242), W64LIT(0x12835b0145706fbe),
	W64LIT(0x243185be4ee4b28c), W64LIT(0x550c7dc3d5ffb4e2),
	W64LIT(0x72be5d74f27b896f), W64LIT(0x80deb1fe3b1696b1),
	W64LIT(0x9bdc06a725c71235), W64LIT(0xc19bf174cf692694),
	W64LIT(0xe49b69c19ef14ad2), W64LIT(0xefbe4786384f25e3),
	W64LIT(0x0fc19dc68b8cd5b5), W64LIT(0x240ca1cc77ac9c65),
	W64LIT(0x2de92c6f592b0275), W64LIT(0x4a7484aa6ea6e483),
	W64LIT(0x5cb0a9dcbd41fbd4), W64LIT(0x76f988da831153b5),
	W64LIT(0x983e5152ee66dfab), W64LIT(0xa831c66d2db43210),
	W64LIT(0xb00327c898fb213f), W64LIT(0xbf597fc7beef0ee4),
	W64LIT(0xc6e00bf33da88fc2), W64LIT(0xd5a79147930aa725),
	W64LIT(0x06ca6351e003826f), W64LIT(0x142929670a0e6e70),
	W64LIT(0x27b70a8546d22ffc), W64LIT(0x2e1b21385c26c926),
	W64LIT(0x4d2c6dfc5ac42aed), W64LIT(0x53380d139d95b3df),
	W64LIT(0x650a73548baf63de), W64LIT(0x766a0abb3c77b2a8),
	W64LIT(0x81c2c92e47edaee6), W64LIT(0x92722c851482353b),
	W64LIT(0xa2bfe8a14cf10364), W64LIT(0xa81a664bbc423001),
	W64LIT(0xc24b8b70d0f89791), W64LIT(0xc76c51a30654be30),
	W64LIT(0xd192e819d6ef5218), W64LIT(0xd69906245565a910),
	W64LIT(0xf40e35855771202a), W64LIT(0x106aa07032bbd1b8),
	W64LIT(0x19a4c116b8d2d0c8), W64LIT(0x1e376c085141ab53),
	W64LIT(0x2748774cdf8eeb99), W64LIT(0x34b0bcb5e19b48a8),
	W64LIT(0x391c0cb3c5c95a63), W64LIT(0x4ed8aa4ae3418acb),
	W64LIT(0x5b9cca4f7763e373), W64LIT(0x682e6ff3d6b2b8a3),
	W64LIT(0x748f82ee5defb2fc), W64LIT(0x78a5636f43172f60),
	W64LIT(0x84c87814a1f0ab72), W64LIT(0x8cc702081a6439ec),
	W64LIT(0x90befffa23631e28), W64LIT(0xa4506cebde82bde9),
	W64LIT(0xbef9a3f7b2c67915), W64LIT(0xc67178f2e372532b),
	W64LIT(0xca273eceea26619c), W64LIT(0xd186b8c721c0c207),
	W64LIT(0xeada7dd6cde0eb1e), W64LIT(0xf57d4f7fee6ed178),
	W64LIT(0x06f067aa72176fba), W64LIT(0x0a637dc5a2c898a6),
	W64LIT(0x113f9804bef90dae), W64LIT(0x1b710b35131c471b),
	W64LIT(0x28db77f523047d84), W64LIT(0x32caab7b40c72493),
	W64LIT(0x3c9ebe0a15c9bebc), W64LIT(0x431d67c49c100d4c),
	W64LIT(0x4cc5d4becb3e42b6), W64LIT(0x597f299cfc657e2a),
	W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817)
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86
// put assembly version in separate function, otherwise MSVC 2005 SP1 doesn't generate correct code for the non-assembly version
// x86 SSE2/MMX SHA-512 compression function: one 128-byte block from 'data'
// into the 8x64-bit chaining state. Keeps the state in a sliding stack window
// addressed by edi and the 16-entry schedule (stored twice) at esi.
// NOTE(review): many lines (labels, loop control, epilogue) are missing from
// this copy — what follows is only a partial view of the upstream function.
CRYPTOPP_NAKED static void CRYPTOPP_FASTCALL SHA512_SSE2_Transform(word64 *state, const word64 *data)
".intel_syntax noprefix;"
AS2( lea ebx, SHA512_K)
AS2( and esp, 0xfffffff0)
AS2( sub esp, 27*16) // 17*16 for expanded data, 20*8 for state
AS2( lea edi, [esp+4+8*8]) // start at middle of state buffer. will decrement pointer each round to avoid copying
AS2( lea esi, [esp+4+20*8+8]) // 16-byte alignment, then add 8
// Copy the incoming state (at ecx) into the stack window; a and e are also
// kept live in mm4/mm5 for the round code.
AS2( movdqa xmm0, [ecx+0*16])
AS2( movdq2q mm4, xmm0)
AS2( movdqa [edi+0*16], xmm0)
AS2( movdqa xmm0, [ecx+1*16])
AS2( movdqa [edi+1*16], xmm0)
AS2( movdqa xmm0, [ecx+2*16])
AS2( movdq2q mm5, xmm0)
AS2( movdqa [edi+2*16], xmm0)
AS2( movdqa xmm0, [ecx+3*16])
AS2( movdqa [edi+3*16], xmm0)
// Rotate-and-xor helpers built from shifts (no 64-bit rotate in MMX/SSE2):
// SSE2_S0_S1 computes a Sigma on an MMX register, SSE2_s0/s1 compute the
// small sigmas on two schedule words at once in an XMM register.
#define SSE2_S0_S1(r, a, b, c) \
AS2( psllq mm6, 64-c)\
AS2( psllq mm6, c-b)\
AS2( psllq mm6, b-a)\
#define SSE2_s0(r, a, b, c) \
AS2( movdqa xmm6, r)\
AS2( movdqa xmm7, r)\
AS2( psllq xmm6, 64-c)\
AS2( pxor xmm7, xmm6)\
AS2( psllq xmm6, c-a)\
#define SSE2_s1(r, a, b, c) \
AS2( movdqa xmm6, r)\
AS2( movdqa xmm7, r)\
AS2( psllq xmm6, 64-c)\
AS2( pxor xmm7, xmm6)\
AS2( psllq xmm6, c-b)\
AS2( pxor xmm7, xmm6)\
// k + w is in mm0, a is in mm4, e is in mm5
// Round subroutine body (called via SHA512_Round below): T1/T2 computation
// against the sliding window at edi.
AS2( paddq mm0, [edi+7*8]) // h
AS2( movq mm2, [edi+5*8]) // f
AS2( movq mm3, [edi+6*8]) // g
SSE2_S0_S1(mm5,14,18,41)
AS2( paddq mm0, mm2) // h += Ch(e,f,g)
AS2( paddq mm5, mm0) // h += S1(e)
AS2( movq mm2, [edi+1*8]) // b
AS2( pand mm2, [edi+2*8]) // c
AS2( paddq mm1, mm5) // temp = h + Maj(a,b,c)
AS2( paddq mm5, [edi+3*8]) // e = d + h
AS2( movq [edi+3*8], mm5)
AS2( movq [edi+11*8], mm5)
SSE2_S0_S1(mm4,28,34,39) // S0(a)
AS2( paddq mm4, mm1) // a = temp + S0(a)
AS2( movq [edi-8], mm4)
AS2( movq [edi+7*8], mm4)
// First 16 rounds: load input words directly, storing each twice in the
// doubled schedule buffer, and pre-add the round constant from ebx.
AS2( movq mm0, [edx+eax*8])
AS2( movq [esi+eax*8], mm0)
AS2( movq [esi+eax*8+16*8], mm0)
AS2( paddq mm0, [ebx+eax*8])
ASC( call, SHA512_Round)
// rest of the rounds
AS2( movdqu xmm0, [esi+(16-2)*8])
// data expansion, W[i-2] already in xmm0
AS2( movdqu xmm3, [esi])
AS2( paddq xmm3, [esi+(16-7)*8])
AS2( movdqa xmm2, [esi+(16-15)*8])
SSE2_s1(xmm0, 6, 19, 61)
AS2( paddq xmm0, xmm3)
SSE2_s0(xmm2, 1, 7, 8)
AS2( paddq xmm0, xmm2)
AS2( movdq2q mm0, xmm0)
AS2( movhlps xmm1, xmm0)
AS2( paddq mm0, [ebx+eax*8])
AS2( movlps [esi], xmm0)
AS2( movlps [esi+8], xmm1)
AS2( movlps [esi+8*16], xmm0)
AS2( movlps [esi+8*17], xmm1)
ASC( call, SHA512_Round)
AS2( movdq2q mm0, xmm1)
AS2( paddq mm0, [ebx+eax*8+8])
ASC( call, SHA512_Round)
// update indices and loop
// do housekeeping every 8 rounds
AS2( lea esi, [esp+4+20*8+8+esi*8])
// Feed-forward: add the final working variables (window at edi) back into
// the caller's state at ecx, two words per XMM add.
#define SSE2_CombineState(i) \
AS2( movdqa xmm0, [edi+i*16])\
AS2( paddq xmm0, [ecx+i*16])\
AS2( movdqa [ecx+i*16], xmm0)
#if defined(__GNUC__)
".att_syntax prefix;"
// GCC inline-asm operand lists: K table in eax, state in ecx, data in edx.
: "a" (SHA512_K), "c" (state), "d" (data)
: "%esi", "%edi", "memory", "cc"
#endif // #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
// SHA-512 compression function: delegates to the SSE2 assembly on x86 when
// available, otherwise runs the portable 80-round loop, reusing the a..h/Ch/
// Maj/blk macros defined for SHA-256 with the SHA-512 Sigma functions below.
// NOTE(review): this copy is missing the function's braces, the W[]/T[]
// declarations, the #else/#endif split, and the final feed-forward into
// state[] — confirm against the upstream file.
void SHA512::Transform(word64 *state, const word64 *data)
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86
SHA512_SSE2_Transform(state, data);
// SHA-512 Sigma/sigma functions (FIPS 180); these redefine the SHA-256
// versions of the same macro names for the 64-bit rotation amounts.
#define S0(x) (rotrFixed(x,28)^rotrFixed(x,34)^rotrFixed(x,39))
#define S1(x) (rotrFixed(x,14)^rotrFixed(x,18)^rotrFixed(x,41))
#define s0(x) (rotrFixed(x,1)^rotrFixed(x,8)^(x>>7))
#define s1(x) (rotrFixed(x,19)^rotrFixed(x,61)^(x>>6))

// One SHA-512 round; identical structure to the SHA-256 R macro but indexed
// into SHA512_K.
#define R(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+SHA512_K[i+j]+(j?blk2(i):blk0(i));\
d(i)+=h(i);h(i)+=S0(a(i))+Maj(a(i),b(i),c(i))

/* Copy context->state[] to working vars */
memcpy(T, state, sizeof(T));
/* 80 operations, partially loop unrolled */
for (unsigned int j=0; j<80; j+=16)
R( 0); R( 1); R( 2); R( 3);
R( 4); R( 5); R( 6); R( 7);
R( 8); R( 9); R(10); R(11);
R(12); R(13); R(14); R(15);
/* Add the working vars back into context.state[] */
898 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
899 #endif // #ifndef CRYPTOPP_IMPORTS