2 * Copyright 2012 pooler@litecoinpool.org
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of the GNU General Public License as published by the Free
6 * Software Foundation; either version 2 of the License, or (at your option)
7 * any later version. See COPYING for more details.
10 #if defined(__linux__) && defined(__ELF__)
/* On Linux/ELF builds, emit an empty .note.GNU-stack section. GNU toolchains
 * treat this as a declaration that the object does not need an executable
 * stack. NOTE(review): the matching #endif is not visible in this view. */
11 .section .note.GNU-stack,"",%progbits
/* SHA-256 initial hash values H0..H7 (FIPS 180-4), one 16-byte row per word.
 * Each row repeats the same 32-bit value four times, so a single 128-bit load
 * yields that word for four independent lanes. The rows are read elsewhere in
 * this file as sha256_4h+0 .. sha256_4h+112 using movdqa, which requires
 * 16-byte alignment.
 * NOTE(review): the label and any alignment/section directives for this table
 * are not visible in this view - confirm they precede these rows. */
19 .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667
20 .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85
21 .long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372
22 .long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a
23 .long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f
24 .long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c
25 .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab
26 .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19
/* SHA-256 round constants K[0..63] (FIPS 180-4), one 16-byte row per round.
 * Each row repeats the constant four times so that one packed add applies it
 * to four lanes at once. Row i sits at byte offset 16*i; the round code in
 * this file addresses it as 16*(i)+sha256_4k or sha256_4k(%eax).
 * NOTE(review): the label and any alignment/section directives for this table
 * are not visible in this view - confirm they precede these rows. */
31 .long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98
32 .long 0x71374491, 0x71374491, 0x71374491, 0x71374491
33 .long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf
34 .long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5
35 .long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b
36 .long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1
37 .long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4
38 .long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5
/* K[8..15] */
39 .long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98
40 .long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01
41 .long 0x243185be, 0x243185be, 0x243185be, 0x243185be
42 .long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3
43 .long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74
44 .long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe
45 .long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7
46 .long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174
/* K[16..23] */
47 .long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1
48 .long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786
49 .long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6
50 .long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc
51 .long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f
52 .long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa
53 .long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc
54 .long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da
/* K[24..31] */
55 .long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152
56 .long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d
57 .long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8
58 .long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7
59 .long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3
60 .long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147
61 .long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351
62 .long 0x14292967, 0x14292967, 0x14292967, 0x14292967
/* K[32..39] */
63 .long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85
64 .long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138
65 .long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc
66 .long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13
67 .long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354
68 .long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb
69 .long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e
70 .long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85
/* K[40..47] */
71 .long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1
72 .long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b
73 .long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70
74 .long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3
75 .long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819
76 .long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624
77 .long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585
78 .long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070
/* K[48..55] */
79 .long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116
80 .long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08
81 .long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c
82 .long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5
83 .long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3
84 .long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a
85 .long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f
86 .long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3
/* K[56..63] */
87 .long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee
88 .long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f
89 .long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814
90 .long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208
91 .long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa
92 .long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb
93 .long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7
94 .long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2
/* sha256_init_4way: write the 4-lane initial hash state to the buffer at %edx.
 *
 * Copies the 128 bytes at sha256_4h (eight 16-byte rows) to 0..127(%edx) in
 * two batches of four vectors, using xmm0-xmm3 as scratch. The symbol is
 * exported both plain and with a leading underscore.
 * NOTE(review): the entry labels, the instruction that loads %edx, and the
 * return are not visible in this view; %edx presumably holds the first stack
 * argument (cdecl) - confirm. */
98 .globl sha256_init_4way
99 .globl _sha256_init_4way
/* Rows 0..3. Table reads use movdqa, so sha256_4h must be 16-byte aligned. */
103 movdqa sha256_4h+0, %xmm0
104 movdqa sha256_4h+16, %xmm1
105 movdqa sha256_4h+32, %xmm2
106 movdqa sha256_4h+48, %xmm3
/* Destination stores use movdqu, so the buffer need not be 16-byte aligned. */
107 movdqu %xmm0, 0(%edx)
108 movdqu %xmm1, 16(%edx)
109 movdqu %xmm2, 32(%edx)
110 movdqu %xmm3, 48(%edx)
/* Rows 4..7, reusing the same four scratch registers. */
111 movdqa sha256_4h+64, %xmm0
112 movdqa sha256_4h+80, %xmm1
113 movdqa sha256_4h+96, %xmm2
114 movdqa sha256_4h+112, %xmm3
115 movdqu %xmm0, 64(%edx)
116 movdqu %xmm1, 80(%edx)
117 movdqu %xmm2, 96(%edx)
118 movdqu %xmm3, 112(%edx)
/* sha256_sse2_extend_round i: produce one 16-byte message-schedule slot.
 *
 * %eax is the base of an array of 16-byte slots (aligned, since movdqa is
 * used). The visible instructions read slot i-15 into xmm0, add slots i-16
 * and i-7 into xmm0, and store xmm3 to slot i. That access pattern matches
 * the SHA-256 schedule recurrence W[i] = s1(W[i-2]) + W[i-7] + s0(W[i-15])
 * + W[i-16].
 * NOTE(review): the instructions between these lines (presumably the sigma
 * rotations and the accumulation into xmm3) and the closing .endm are not
 * visible in this view. */
122 .macro sha256_sse2_extend_round i
123 movdqa (\i-15)*16(%eax), %xmm0
135 paddd (\i-16)*16(%eax), %xmm0
136 paddd (\i-7)*16(%eax), %xmm0
150 movdqa %xmm3, \i*16(%eax)
/* sha256_sse2_extend_doubleround i: produce schedule slots i and i+1 together.
 *
 * The two slots are computed in interleaved fashion: xmm0 is the accumulator
 * for slot i and xmm4 for slot i+1. Each accumulator loads its (index-15)
 * slot, then adds its (index-16) and (index-7) slots. Finally xmm3 is stored
 * to slot i and xmm7 to slot i+1. %eax is the base of the 16-byte-slot array
 * and must be 16-byte aligned for movdqa.
 * NOTE(review): the instructions between these lines (presumably the sigma
 * computations feeding xmm3/xmm7) and the closing .endm are not visible in
 * this view. */
153 .macro sha256_sse2_extend_doubleround i
154 movdqa (\i-15)*16(%eax), %xmm0
155 movdqa (\i-14)*16(%eax), %xmm4
179 paddd (\i-16)*16(%eax), %xmm0
180 paddd (\i-15)*16(%eax), %xmm4
193 paddd (\i-7)*16(%eax), %xmm0
194 paddd (\i-6)*16(%eax), %xmm4
211 movdqa %xmm3, \i*16(%eax)
212 movdqa %xmm7, (\i+1)*16(%eax)
/* sha256_sse2_main_round i: one compression round over four lanes.
 *
 * Visible data flow:
 *   - xmm6 starts as slot i of the array at %eax, then accumulates the value
 *     held at 32(%esp) and row i of sha256_4k.
 *   - The three stack slots are shifted down by one position through xmm2:
 *     16(%esp) -> 32(%esp), 0(%esp) -> 16(%esp), and xmm0 -> 0(%esp).
 * The stack slots therefore act as spill space for three working variables
 * (presumably f, g and h of the SHA-256 state - confirm). movdqa on %esp
 * offsets requires a 16-byte aligned %esp.
 * NOTE(review): the remaining instructions of the round and the closing .endm
 * are not visible in this view. */
215 .macro sha256_sse2_main_round i
216 movdqa 16*(\i)(%eax), %xmm6
/* Old 16(%esp) is read before 32(%esp) is consumed and overwritten. */
219 movdqa 16(%esp), %xmm2
221 paddd 32(%esp), %xmm6
223 movdqa %xmm2, 32(%esp)
224 movdqa 0(%esp), %xmm2
225 movdqa %xmm2, 16(%esp)
229 movdqa %xmm0, 0(%esp)
/* Add the round constant row for round i. */
235 paddd 16*(\i)+sha256_4k, %xmm6
/* sha256_sse2_main_quadround i: expand four consecutive compression rounds,
 * i+0 through i+3, by invoking sha256_sse2_main_round once per round.
 * NOTE(review): the closing .endm is not visible in this view. */
279 .macro sha256_sse2_main_quadround i
280 sha256_sse2_main_round \i+0
281 sha256_sse2_main_round \i+1
282 sha256_sse2_main_round \i+2
283 sha256_sse2_main_round \i+3
/* p2bswap_esi_esp i: load two 16-byte vectors from slots i and i+1 at %esi,
 * reorder them, and store them to stack slots i+3 and i+4.
 *
 * Loads use movdqu (source may be unaligned); stores use movdqa (stack slots
 * must be 16-byte aligned). The shuffle immediate 0xb1 selects 16-bit words
 * in the order 1,0,3,2, i.e. it swaps the two halves of every 32-bit lane;
 * pshuflw covers the low 64 bits and pshufhw the high 64 bits.
 * NOTE(review): the instructions between the shuffles and the stores
 * (presumably the byte swap within each 16-bit word that completes a 32-bit
 * byte reversal, as the macro name suggests) and the closing .endm are not
 * visible in this view. */
287 .macro p2bswap_esi_esp i
288 movdqu \i*16(%esi), %xmm0
289 movdqu (\i+1)*16(%esi), %xmm2
290 pshuflw $0xb1, %xmm0, %xmm0
291 pshuflw $0xb1, %xmm2, %xmm2
292 pshufhw $0xb1, %xmm0, %xmm0
293 pshufhw $0xb1, %xmm2, %xmm2
302 movdqa %xmm0, (\i+3)*16(%esp)
303 movdqa %xmm2, (\i+4)*16(%esp)
/* sha256_transform_4way: SHA-256 block transform over four lanes at once.
 *
 * Register roles visible in this routine:
 *   %edi - state buffer: eight 16-byte vectors, read and written with movdqu
 *   %esi - input block: sixteen 16-byte vectors, read with movdqu
 *   %esp - aligned scratch area; slots 0..2 hold three spilled working
 *          variables and the message schedule starts at slot 3 (3*16(%esp))
 * The symbol is exported both plain and with a leading underscore.
 * NOTE(review): the prologue (register saves, argument loads, stack
 * reservation and alignment), the test that feeds the jnz below, the body of
 * the swap path, the loop counter updates, most of the round arithmetic and
 * the epilogue are not visible in this view. */
308 .globl sha256_transform_4way
309 .globl _sha256_transform_4way
310 sha256_transform_4way:
311 _sha256_transform_4way:
/* Non-zero selects the byte-swapping input path (condition not visible). */
322 jnz sha256_transform_4way_swap
/* No-swap path: copy the 16 input vectors unchanged to stack slots 3..18,
 * eight at a time through xmm0-xmm7. */
324 movdqu 0*16(%esi), %xmm0
325 movdqu 1*16(%esi), %xmm1
326 movdqu 2*16(%esi), %xmm2
327 movdqu 3*16(%esi), %xmm3
328 movdqu 4*16(%esi), %xmm4
329 movdqu 5*16(%esi), %xmm5
330 movdqu 6*16(%esi), %xmm6
331 movdqu 7*16(%esi), %xmm7
332 movdqa %xmm0, 3*16(%esp)
333 movdqa %xmm1, 4*16(%esp)
334 movdqa %xmm2, 5*16(%esp)
335 movdqa %xmm3, 6*16(%esp)
336 movdqa %xmm4, 7*16(%esp)
337 movdqa %xmm5, 8*16(%esp)
338 movdqa %xmm6, 9*16(%esp)
339 movdqa %xmm7, 10*16(%esp)
340 movdqu 8*16(%esi), %xmm0
341 movdqu 9*16(%esi), %xmm1
342 movdqu 10*16(%esi), %xmm2
343 movdqu 11*16(%esi), %xmm3
344 movdqu 12*16(%esi), %xmm4
345 movdqu 13*16(%esi), %xmm5
346 movdqu 14*16(%esi), %xmm6
347 movdqu 15*16(%esi), %xmm7
348 movdqa %xmm0, 11*16(%esp)
349 movdqa %xmm1, 12*16(%esp)
350 movdqa %xmm2, 13*16(%esp)
351 movdqa %xmm3, 14*16(%esp)
352 movdqa %xmm4, 15*16(%esp)
353 movdqa %xmm5, 16*16(%esp)
354 movdqa %xmm6, 17*16(%esp)
355 movdqa %xmm7, 18*16(%esp)
356 jmp sha256_transform_4way_extend
/* Swap path. NOTE(review): body not visible; presumably fills the same stack
 * slots via p2bswap_esi_esp - confirm. */
359 sha256_transform_4way_swap:
/* Message-schedule extension. %ecx points at slot 19, the first slot after
 * the 16 copied input vectors, so -16*16(%ecx) is the first input vector.
 * %eax is set 48 slots beyond %ecx (presumably the loop end bound - the
 * compare is not visible). xmm3 and xmm7 are preloaded with the two slots
 * immediately preceding %ecx. */
369 sha256_transform_4way_extend:
370 leal 19*16(%esp), %ecx
371 leal 48*16(%ecx), %eax
372 movdqa -2*16(%ecx), %xmm3
373 movdqa -1*16(%ecx), %xmm7
/* Each iteration handles two slots: xmm0 accumulates for the slot at %ecx and
 * xmm4 for the slot at 16(%ecx), using the slots 15, 16 and 7 positions
 * back from each. */
374 sha256_transform_4way_extend_loop:
375 movdqa -15*16(%ecx), %xmm0
376 movdqa -14*16(%ecx), %xmm4
400 paddd -16*16(%ecx), %xmm0
401 paddd -15*16(%ecx), %xmm4
414 paddd -7*16(%ecx), %xmm0
415 paddd -6*16(%ecx), %xmm4
/* Store the second of the two new slots. */
433 movdqa %xmm7, 16(%ecx)
436 jne sha256_transform_4way_extend_loop
/* Load the eight state vectors from %edi. Vectors 0..4 stay in xmm7, xmm5,
 * xmm4, xmm3 and xmm0; vectors 5..7 pass through xmm1, xmm2 and xmm6 and are
 * spilled to stack slots 0, 16 and 32. */
438 movdqu 0(%edi), %xmm7
439 movdqu 16(%edi), %xmm5
440 movdqu 32(%edi), %xmm4
441 movdqu 48(%edi), %xmm3
442 movdqu 64(%edi), %xmm0
443 movdqu 80(%edi), %xmm1
444 movdqu 96(%edi), %xmm2
445 movdqu 112(%edi), %xmm6
446 movdqa %xmm1, 0(%esp)
447 movdqa %xmm2, 16(%esp)
448 movdqa %xmm6, 32(%esp)
/* Compression loop. %eax is a byte offset used for both the schedule slot
 * (3*16(%esp,%eax)) and the matching sha256_4k row. xmm6 collects schedule
 * word + round constant + the value spilled at 32(%esp). */
451 sha256_transform_4way_main_loop:
452 movdqa 3*16(%esp, %eax), %xmm6
453 paddd sha256_4k(%eax), %xmm6
454 paddd 32(%esp), %xmm6
/* Shift the spilled variables down one slot: 16 -> 32, 0 -> 16, xmm0 -> 0. */
457 movdqa 16(%esp), %xmm2
460 movdqa %xmm2, 32(%esp)
461 movdqa 0(%esp), %xmm2
462 movdqa %xmm2, 16(%esp)
466 movdqa %xmm0, 0(%esp)
517 jne sha256_transform_4way_main_loop
/* Write-back of state vectors 0..3. The previous state is reloaded two
 * vectors at a time into xmm1/xmm2 (the additions into xmm7, xmm5, xmm4 and
 * xmm3 are not visible), then the results are stored. */
519 movdqu 0(%edi), %xmm1
520 movdqu 16(%edi), %xmm2
523 movdqu 32(%edi), %xmm1
524 movdqu 48(%edi), %xmm2
528 movdqu %xmm7, 0(%edi)
529 movdqu %xmm5, 16(%edi)
530 movdqu %xmm4, 32(%edi)
531 movdqu %xmm3, 48(%edi)
/* Write-back of state vectors 4..7. The previous state is loaded into xmm1,
 * xmm2, xmm6 and xmm7; the spilled values at 16(%esp) and 32(%esp) are added
 * to vectors 6 and 7 (the additions for vectors 4 and 5 are not visible). */
533 movdqu 64(%edi), %xmm1
534 movdqu 80(%edi), %xmm2
535 movdqu 96(%edi), %xmm6
536 movdqu 112(%edi), %xmm7
539 paddd 16(%esp), %xmm6
540 paddd 32(%esp), %xmm7
542 movdqu %xmm0, 64(%edi)
543 movdqu %xmm2, 80(%edi)
544 movdqu %xmm6, 96(%edi)
545 movdqu %xmm7, 112(%edi)
/* sha256_sse2_main_round_red i, r7: a variant of the compression round.
 *
 * Visible data flow: xmm6 starts as slot i of the array at %eax, then
 * accumulates row i of sha256_4k and the value at 32(%esp). The three stack
 * slots are then shifted down one position through xmm2 (16 -> 32, 0 -> 16)
 * and xmm0 is stored to 0(%esp).
 * NOTE(review): the use of the r7 parameter, the rest of the round and the
 * closing .endm are not visible in this view; the _red suffix presumably
 * means a reduced round that skips work not needed for the final output -
 * confirm. */
552 .macro sha256_sse2_main_round_red i, r7
553 movdqa 16*(\i)(%eax), %xmm6
554 paddd 16*(\i)+sha256_4k, %xmm6
555 paddd 32(%esp), %xmm6
/* Old 16(%esp) is fetched before 32(%esp) is overwritten. */
557 movdqa 16(%esp), %xmm2
560 movdqa %xmm2, 32(%esp)
561 movdqa 0(%esp), %xmm2
562 movdqa %xmm2, 16(%esp)
565 movdqa %xmm0, 0(%esp)
/* sha256_use_4way: runtime check for SSE2 support.
 * Exported both plain and with a leading underscore.
 * NOTE(review): the entry labels, the cpuid invocation that fills %edx, the
 * return-value setup and the ret are not visible in this view. */
585 .globl sha256_use_4way
586 .globl _sha256_use_4way
591 /* Check for SSE2 availability */
/* 0x04000000 isolates bit 26, the SSE2 flag in EDX of CPUID leaf 1. */
594 andl $0x04000000, %edx
595 jnz sha256_use_4way_sse2
/* Branch target taken when the SSE2 bit is set. */
600 sha256_use_4way_sse2:
/* sha256_use_ssse3: runtime check for SSSE3 support.
 * Exported both plain and with a leading underscore.
 * NOTE(review): the entry labels, the cpuid invocation that fills %ecx, the
 * return-value setup and the ret are not visible in this view. */
607 .globl sha256_use_ssse3
608 .globl _sha256_use_ssse3
/* 0x00000200 isolates bit 9, the SSSE3 flag in ECX of CPUID leaf 1. */
615 andl $0x00000200, %ecx
616 jnz sha256_use_ssse3_done
/* Branch target taken when the SSSE3 bit is set. */
621 sha256_use_ssse3_done: