/*
 * Copyright 2012-2015 pooler@litecoinpool.org
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version. See COPYING for more details.
 */
#if defined(__linux__) && defined(__ELF__)
	.section .note.GNU-stack,"",%progbits
#endif

#if defined(__x86_64__)

sha256_h:
	.long 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a
	.long 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19

sha256_k:
	.long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	.long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	.long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	.long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	.long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	.long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	.long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	.long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	.long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	.long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	.long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	.long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	.long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	.long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	.long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	.long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2

bswap_xmm_mask:
	.long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f
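
/*
 * Perform four SHA-256 rounds on the scalar state in \ra..\rh while
 * extending four message-schedule words in the SSE vector unit.  The
 * sigma rotations are composed from shifts and xors, as SSE2 has no
 * 32-bit vector rotate.
 */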
.macro sha256_mixed_quadround ra, rb, rc, rd, re, rf, rg, rh, x0, x1, x2, x3
	pslld $(32-18), %xmm7
	pshufd $0xfa, \x3, %xmm6
	pshufd $0x8f, %xmm8, %xmm8
	pshufd $0x50, %xmm4, %xmm6
	pshufd $0xf8, \x0, \x0
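
/* One scalar SHA-256 round; the precomputed W[i]+K[i] words are read from the stack at \i*4(%rsp). */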
.macro sha256_main_round i, ra, rb, rc, rd, re, rf, rg, rh
	addl \i*4(%rsp), %ecx

sha256_transform_sse2:
#if defined(_WIN64) || defined(__CYGWIN__)
	movdqa %xmm6, 1*16(%rsp)
	movdqa %xmm7, 2*16(%rsp)
	movdqa %xmm8, 3*16(%rsp)
	movdqa %xmm9, 4*16(%rsp)
	movl 2*4(%rdi), %r10d
	movl 3*4(%rdi), %r11d
	movl 4*4(%rdi), %r12d
	movl 5*4(%rdi), %r13d
	movl 6*4(%rdi), %r14d
	movl 7*4(%rdi), %r15d
	jnz sha256_transform_sse2_swap
	movdqu 0*16(%rsi), %xmm0
	movdqu 1*16(%rsi), %xmm1
	movdqu 2*16(%rsi), %xmm2
	movdqu 3*16(%rsi), %xmm3
	jmp sha256_transform_sse2_core
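
/*
 * Big-endian input: byte-swap each 32-bit word.  pshuflw/pshufhw exchange
 * the 16-bit halves of every word; word-internal byte swaps are then done
 * with shifts and xors.
 */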
sha256_transform_sse2_swap:
	movdqu 0*16(%rsi), %xmm0
	movdqu 1*16(%rsi), %xmm1
	movdqu 2*16(%rsi), %xmm2
	movdqu 3*16(%rsi), %xmm3
	pshuflw $0xb1, %xmm0, %xmm0
	pshuflw $0xb1, %xmm1, %xmm1
	pshuflw $0xb1, %xmm2, %xmm2
	pshuflw $0xb1, %xmm3, %xmm3
	pshufhw $0xb1, %xmm0, %xmm0
	pshufhw $0xb1, %xmm1, %xmm1
	pshufhw $0xb1, %xmm2, %xmm2
	pshufhw $0xb1, %xmm3, %xmm3

sha256_transform_sse2_core:
	leaq sha256_k(%rip), %rdx
sha256_transform_sse2_loop:
	movdqa 0*16(%rdx), %xmm9
	sha256_mixed_quadround %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %xmm0, %xmm1, %xmm2, %xmm3
	movdqa 1*16(%rdx), %xmm9
	sha256_mixed_quadround %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %xmm1, %xmm2, %xmm3, %xmm0
	movdqa 2*16(%rdx), %xmm9
	sha256_mixed_quadround %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %xmm2, %xmm3, %xmm0, %xmm1
	movdqa 3*16(%rdx), %xmm9
	sha256_mixed_quadround %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %xmm3, %xmm0, %xmm1, %xmm2
	jne sha256_transform_sse2_loop
	paddd 0*16(%rdx), %xmm0
	sha256_main_round 0, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d
	sha256_main_round 1, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d
	sha256_main_round 2, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d
	sha256_main_round 3, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d
	paddd 1*16(%rdx), %xmm1
	sha256_main_round 0, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d
	sha256_main_round 1, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d
	sha256_main_round 2, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d
	sha256_main_round 3, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d
	paddd 2*16(%rdx), %xmm2
	sha256_main_round 0, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d
	sha256_main_round 1, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d
	sha256_main_round 2, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d
	sha256_main_round 3, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d
	paddd 3*16(%rdx), %xmm3
	sha256_main_round 0, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d
	sha256_main_round 1, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d
	sha256_main_round 2, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d
	sha256_main_round 3, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d
	addl %r10d, 2*4(%rdi)
	addl %r11d, 3*4(%rdi)
	addl %r12d, 4*4(%rdi)
	addl %r13d, 5*4(%rdi)
	addl %r14d, 6*4(%rdi)
	addl %r15d, 7*4(%rdi)
#if defined(_WIN64) || defined(__CYGWIN__)
	movdqa 1*16(%rsp), %xmm6
	movdqa 2*16(%rsp), %xmm7
	movdqa 3*16(%rsp), %xmm8
	movdqa 4*16(%rsp), %xmm9
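
/* Block transform using the VIA PadLock Hash Engine (PHE). */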
sha256_transform_phe:
#if defined(_WIN64) || defined(__CYGWIN__)
	jnz sha256_transform_phe_noswap
	movdqu 2*16(%rsi), %xmm0
	movdqu 3*16(%rsi), %xmm2
	pshuflw $0xb1, %xmm0, %xmm0
	pshuflw $0xb1, %xmm2, %xmm2
	pshufhw $0xb1, %xmm0, %xmm0
	pshufhw $0xb1, %xmm2, %xmm2
	movdqa %xmm0, 2*16(%rsp)
	movdqa %xmm2, 3*16(%rsp)
	jmp sha256_transform_phe_core
sha256_transform_phe_noswap:
	movdqu 0*16(%rsi), %xmm0
	movdqu 1*16(%rsi), %xmm1
	movdqu 2*16(%rsi), %xmm2
	movdqu 3*16(%rsi), %xmm3
	movdqa %xmm0, 0*16(%rsp)
	movdqa %xmm1, 1*16(%rsp)
	movdqa %xmm2, 2*16(%rsp)
	movdqa %xmm3, 3*16(%rsp)
sha256_transform_phe_core:
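	/* rep xsha256 */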
	.byte 0xf3, 0x0f, 0xa6, 0xd0
#if defined(_WIN64) || defined(__CYGWIN__)
sha256_transform_addr:
	.quad sha256_transform_sse2
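
/*
 * Public entry point.  It jumps through sha256_transform_addr, which is
 * set at run time to the fastest transform available on the host CPU
 * (the SSE2 version by default).
 */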
	.globl sha256_transform
	.globl _sha256_transform
sha256_transform:
_sha256_transform:
	jmp *sha256_transform_addr(%rip)

#if defined(_WIN64) || defined(__CYGWIN__)
	movdqa 0*16(%rdx), %xmm0
	movdqa 1*16(%rdx), %xmm1
	movdqa %xmm0, 0*16(%rdi)
	movdqa %xmm1, 1*16(%rdi)
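	/* rep xsha256 */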
	.byte 0xf3, 0x0f, 0xa6, 0xd0
	movdqa bswap_xmm_mask(%rip), %xmm1
	movdqa 0*16(%rdi), %xmm0
	movdqa 1*16(%rdi), %xmm2
	movdqa %xmm0, 0*16(%rsp)
	movdqa %xmm2, 1*16(%rsp)
	movdqa sha256_h+0*16(%rip), %xmm0
	movdqa sha256_h+1*16(%rip), %xmm1
	movdqa %xmm0, 0*16(%rdi)
	movdqa %xmm1, 1*16(%rdi)
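	/* rep xsha256 */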
	.byte 0xf3, 0x0f, 0xa6, 0xd0
#if defined(_WIN64) || defined(__CYGWIN__)

sha256_4h:
	.long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667
	.long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85
	.long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372
	.long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a
	.long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f
	.long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c
	.long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab
	.long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19

sha256_4k:
	.long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98
	.long 0x71374491, 0x71374491, 0x71374491, 0x71374491
	.long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf
	.long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5
	.long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b
	.long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1
	.long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4
	.long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5
	.long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98
	.long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01
	.long 0x243185be, 0x243185be, 0x243185be, 0x243185be
	.long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3
	.long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74
	.long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe
	.long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7
	.long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174
	.long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1
	.long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786
	.long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6
	.long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc
	.long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f
	.long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa
	.long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc
	.long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da
	.long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152
	.long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d
	.long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8
	.long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7
	.long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3
	.long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147
	.long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351
	.long 0x14292967, 0x14292967, 0x14292967, 0x14292967
	.long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85
	.long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138
	.long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc
	.long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13
	.long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354
	.long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb
	.long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e
	.long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85
	.long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1
	.long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b
	.long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70
	.long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3
	.long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819
	.long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624
	.long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585
	.long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070
	.long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116
	.long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08
	.long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c
	.long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5
	.long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3
	.long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a
	.long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f
	.long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3
	.long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee
	.long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f
	.long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814
	.long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208
	.long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa
	.long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb
	.long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7
	.long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2
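
/*
 * Precomputed message-schedule terms (sigma values of the fixed padding
 * words) used by the specialized sha256d routines below.
 */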
sha256d_4preext2_17:
	.long 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000
sha256d_4preext2_23:
	.long 0x11002000, 0x11002000, 0x11002000, 0x11002000
sha256d_4preext2_24:
	.long 0x80000000, 0x80000000, 0x80000000, 0x80000000
sha256d_4preext2_30:
	.long 0x00400022, 0x00400022, 0x00400022, 0x00400022

sha256_8h:
	.long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667
	.long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85
	.long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372
	.long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a
	.long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f
	.long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c
	.long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab
	.long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19

sha256_8k:
	.long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98
	.long 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491
	.long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf
	.long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5
	.long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b
	.long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1
	.long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4
	.long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5
	.long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98
	.long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01
	.long 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be
	.long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3
	.long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74
	.long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe
	.long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7
	.long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174
	.long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1
	.long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786
	.long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6
	.long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc
	.long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f
	.long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa
	.long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc
	.long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da
	.long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152
	.long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d
	.long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8
	.long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7
	.long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3
	.long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147
	.long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351
	.long 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967
	.long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85
	.long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138
	.long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc
	.long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13
	.long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354
	.long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb
	.long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e
	.long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85
	.long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1
	.long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b
	.long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70
	.long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3
	.long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819
	.long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624
	.long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585
	.long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070
	.long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116
	.long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08
	.long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c
	.long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5
	.long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3
	.long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a
	.long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f
	.long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3
	.long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee
	.long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f
	.long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814
	.long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208
	.long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa
	.long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb
	.long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7
	.long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2
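
/* 8-way copies of the precomputed padding schedule terms. */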
	.long 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000
	.long 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000
	.long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
	.long 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022
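
/* Initialize four interleaved SHA-256 states from sha256_4h. */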
	.globl sha256_init_4way
	.globl _sha256_init_4way
sha256_init_4way:
_sha256_init_4way:
#if defined(_WIN64) || defined(__CYGWIN__)
	movdqa sha256_4h+0(%rip), %xmm0
	movdqa sha256_4h+16(%rip), %xmm1
	movdqa sha256_4h+32(%rip), %xmm2
	movdqa sha256_4h+48(%rip), %xmm3
	movdqu %xmm0, 0(%rdi)
	movdqu %xmm1, 16(%rdi)
	movdqu %xmm2, 32(%rdi)
	movdqu %xmm3, 48(%rdi)
	movdqa sha256_4h+64(%rip), %xmm0
	movdqa sha256_4h+80(%rip), %xmm1
	movdqa sha256_4h+96(%rip), %xmm2
	movdqa sha256_4h+112(%rip), %xmm3
	movdqu %xmm0, 64(%rdi)
	movdqu %xmm1, 80(%rdi)
	movdqu %xmm2, 96(%rdi)
	movdqu %xmm3, 112(%rdi)
#if defined(_WIN64) || defined(__CYGWIN__)
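
/* Initialize eight interleaved states by broadcasting each sha256_4h word across a ymm register. */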
	.globl sha256_init_8way
	.globl _sha256_init_8way
sha256_init_8way:
_sha256_init_8way:
#if defined(_WIN64) || defined(__CYGWIN__)
	vpbroadcastd sha256_4h+0(%rip), %ymm0
	vpbroadcastd sha256_4h+16(%rip), %ymm1
	vpbroadcastd sha256_4h+32(%rip), %ymm2
	vpbroadcastd sha256_4h+48(%rip), %ymm3
	vmovdqu %ymm0, 0*32(%rdi)
	vmovdqu %ymm1, 1*32(%rdi)
	vmovdqu %ymm2, 2*32(%rdi)
	vmovdqu %ymm3, 3*32(%rdi)
	vpbroadcastd sha256_4h+64(%rip), %ymm0
	vpbroadcastd sha256_4h+80(%rip), %ymm1
	vpbroadcastd sha256_4h+96(%rip), %ymm2
	vpbroadcastd sha256_4h+112(%rip), %ymm3
	vmovdqu %ymm0, 4*32(%rdi)
	vmovdqu %ymm1, 5*32(%rdi)
	vmovdqu %ymm2, 6*32(%rdi)
	vmovdqu %ymm3, 7*32(%rdi)
#if defined(_WIN64) || defined(__CYGWIN__)
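
/*
 * 4-way message-schedule extension:
 *   W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
 * with the rotations inside sigma0/sigma1 built from shift/xor sequences.
 * The doubleround variant produces W[i] and W[i+1] per invocation to hide
 * instruction latency.
 */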
.macro sha256_sse2_extend_round i
	movdqa (\i-15)*16(%rax), %xmm0
	paddd (\i-16)*16(%rax), %xmm0
	paddd (\i-7)*16(%rax), %xmm0
	movdqa %xmm3, \i*16(%rax)

.macro sha256_sse2_extend_doubleround i
	movdqa (\i-15)*16(%rax), %xmm0
	movdqa (\i-14)*16(%rax), %xmm4
	paddd (\i-16)*16(%rax), %xmm0
	paddd (\i-15)*16(%rax), %xmm4
	paddd (\i-7)*16(%rax), %xmm0
	paddd (\i-6)*16(%rax), %xmm4
	movdqa %xmm3, \i*16(%rax)
	movdqa %xmm7, (\i+1)*16(%rax)
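
/*
 * One 4-way main round.  Three of the eight state words are kept at
 * 0/16/32(%rsp) and rotated through the stack each round; the rest stay
 * in registers.
 */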
.macro sha256_sse2_main_round i
	movdqa 16*(\i)(%rax), %xmm6
	movdqa 16(%rsp), %xmm2
	paddd 32(%rsp), %xmm6
	movdqa %xmm2, 32(%rsp)
	movdqa 0(%rsp), %xmm2
	movdqa %xmm2, 16(%rsp)
	movdqa %xmm0, 0(%rsp)
	paddd 16*(\i)(%rcx), %xmm6

.macro sha256_sse2_main_quadround i
	sha256_sse2_main_round \i+0
	sha256_sse2_main_round \i+1
	sha256_sse2_main_round \i+2
	sha256_sse2_main_round \i+3
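
/*
 * AVX versions of the schedule extension and main round: the same
 * algorithm as the SSE2 code, with three-operand instructions removing
 * most of the register-copy traffic.
 */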
.macro sha256_avx_extend_round i
	vmovdqa (\i-15)*16(%rax), %xmm0
	vpslld $14, %xmm0, %xmm2
	vpsrld $3, %xmm0, %xmm0
	vpsrld $4, %xmm0, %xmm1
	vpxor %xmm1, %xmm0, %xmm0
	vpxor %xmm2, %xmm0, %xmm0
	vpsrld $11, %xmm1, %xmm1
	vpslld $11, %xmm2, %xmm2
	vpxor %xmm1, %xmm0, %xmm0
	vpxor %xmm2, %xmm0, %xmm0
	vpaddd (\i-16)*16(%rax), %xmm0, %xmm0
	vpaddd (\i-7)*16(%rax), %xmm0, %xmm0
	vpslld $13, %xmm3, %xmm2
	vpsrld $10, %xmm3, %xmm3
	vpsrld $7, %xmm3, %xmm1
	vpxor %xmm1, %xmm3, %xmm3
	vpxor %xmm2, %xmm3, %xmm3
	vpsrld $2, %xmm1, %xmm1
	vpslld $2, %xmm2, %xmm2
	vpxor %xmm1, %xmm3, %xmm3
	vpxor %xmm2, %xmm3, %xmm3
	vpaddd %xmm0, %xmm3, %xmm3
	vmovdqa %xmm3, \i*16(%rax)

.macro sha256_avx_extend_doubleround i
	vmovdqa (\i-15)*16(%rax), %xmm0
	vmovdqa (\i-14)*16(%rax), %xmm4
	vpslld $14, %xmm0, %xmm2
	vpslld $14, %xmm4, %xmm6
	vpsrld $3, %xmm0, %xmm8
	vpsrld $3, %xmm4, %xmm4
	vpsrld $7, %xmm0, %xmm1
	vpsrld $4, %xmm4, %xmm5
	vpxor %xmm1, %xmm8, %xmm8
	vpxor %xmm5, %xmm4, %xmm4
	vpsrld $11, %xmm1, %xmm1
	vpsrld $11, %xmm5, %xmm5
	vpxor %xmm2, %xmm8, %xmm8
	vpxor %xmm6, %xmm4, %xmm4
	vpslld $11, %xmm2, %xmm2
	vpslld $11, %xmm6, %xmm6
	vpxor %xmm1, %xmm8, %xmm8
	vpxor %xmm5, %xmm4, %xmm4
	vpxor %xmm2, %xmm8, %xmm8
	vpxor %xmm6, %xmm4, %xmm4
	vpaddd %xmm0, %xmm4, %xmm4
	vpaddd (\i-16)*16(%rax), %xmm8, %xmm0
	vpslld $13, %xmm3, %xmm2
	vpslld $13, %xmm7, %xmm6
	vpsrld $10, %xmm3, %xmm3
	vpsrld $10, %xmm7, %xmm7
	vpaddd (\i-7)*16(%rax), %xmm0, %xmm0
	vpaddd (\i-6)*16(%rax), %xmm4, %xmm4
	vpsrld $7, %xmm3, %xmm1
	vpsrld $7, %xmm7, %xmm5
	vpxor %xmm1, %xmm3, %xmm3
	vpxor %xmm5, %xmm7, %xmm7
	vpsrld $2, %xmm1, %xmm1
	vpsrld $2, %xmm5, %xmm5
	vpxor %xmm2, %xmm3, %xmm3
	vpxor %xmm6, %xmm7, %xmm7
	vpslld $2, %xmm2, %xmm2
	vpslld $2, %xmm6, %xmm6
	vpxor %xmm1, %xmm3, %xmm3
	vpxor %xmm5, %xmm7, %xmm7
	vpxor %xmm2, %xmm3, %xmm3
	vpxor %xmm6, %xmm7, %xmm7
	vpaddd %xmm0, %xmm3, %xmm3
	vpaddd %xmm4, %xmm7, %xmm7
	vmovdqa %xmm3, \i*16(%rax)
	vmovdqa %xmm7, (\i+1)*16(%rax)
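
/*
 * AVX main round: the full eight-word state lives in registers, and the
 * quadround macro rotates it by permuting the macro arguments.
 */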
.macro sha256_avx_main_round i, r0, r1, r2, r3, r4, r5, r6, r7
	vpaddd 16*(\i)(%rax), \r0, %xmm6
	vpaddd 16*(\i)(%rcx), %xmm6, %xmm6
	vpandn \r1, \r3, %xmm1
	vpand \r3, \r2, %xmm2
	vpxor %xmm2, %xmm1, %xmm1
	vpaddd %xmm1, %xmm6, %xmm6
	vpslld $7, \r3, %xmm1
	vpsrld $5, \r0, %xmm2
	vpxor %xmm1, \r0, \r0
	vpxor %xmm2, \r0, \r0
	vpslld $14, %xmm1, %xmm1
	vpsrld $14, %xmm2, %xmm2
	vpxor %xmm1, \r0, \r0
	vpxor %xmm2, \r0, \r0
	vpslld $5, %xmm1, %xmm1
	vpxor %xmm1, \r0, \r0
	vpaddd \r0, %xmm6, %xmm6
	vpaddd %xmm6, \r4, \r0
	vpand \r6, \r5, %xmm2
	vpand \r7, \r6, %xmm1
	vpxor \r4, %xmm1, %xmm1
	vpxor %xmm2, %xmm1, %xmm1
	vpaddd %xmm1, %xmm6, %xmm6
	vpslld $10, \r7, %xmm2
	vpsrld $11, \r4, %xmm1
	vpxor %xmm2, \r4, \r4
	vpxor %xmm1, \r4, \r4
	vpslld $9, %xmm2, %xmm2
	vpsrld $9, %xmm1, %xmm1
	vpxor %xmm2, \r4, \r4
	vpxor %xmm1, \r4, \r4
	vpslld $11, %xmm2, %xmm2
	vpxor %xmm2, \r4, \r4
	vpaddd %xmm6, \r4, \r4

.macro sha256_avx_main_quadround i
	sha256_avx_main_round \i+0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
	sha256_avx_main_round \i+1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
	sha256_avx_main_round \i+2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
	sha256_avx_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
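
/* AVX2 versions: the same round structure widened to eight lanes in ymm registers. */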
.macro sha256_avx2_extend_round i
	vmovdqa (\i-15)*32(%rax), %ymm0
	vpslld $14, %ymm0, %ymm2
	vpsrld $3, %ymm0, %ymm0
	vpsrld $4, %ymm0, %ymm1
	vpxor %ymm1, %ymm0, %ymm0
	vpxor %ymm2, %ymm0, %ymm0
	vpsrld $11, %ymm1, %ymm1
	vpslld $11, %ymm2, %ymm2
	vpxor %ymm1, %ymm0, %ymm0
	vpxor %ymm2, %ymm0, %ymm0
	vpaddd (\i-16)*32(%rax), %ymm0, %ymm0
	vpaddd (\i-7)*32(%rax), %ymm0, %ymm0
	vpslld $13, %ymm3, %ymm2
	vpsrld $10, %ymm3, %ymm3
	vpsrld $7, %ymm3, %ymm1
	vpxor %ymm1, %ymm3, %ymm3
	vpxor %ymm2, %ymm3, %ymm3
	vpsrld $2, %ymm1, %ymm1
	vpslld $2, %ymm2, %ymm2
	vpxor %ymm1, %ymm3, %ymm3
	vpxor %ymm2, %ymm3, %ymm3
	vpaddd %ymm0, %ymm3, %ymm3
	vmovdqa %ymm3, \i*32(%rax)

.macro sha256_avx2_extend_doubleround i
	vmovdqa (\i-15)*32(%rax), %ymm0
	vmovdqa (\i-14)*32(%rax), %ymm4
	vpslld $14, %ymm0, %ymm2
	vpslld $14, %ymm4, %ymm6
	vpsrld $3, %ymm0, %ymm8
	vpsrld $3, %ymm4, %ymm4
	vpsrld $7, %ymm0, %ymm1
	vpsrld $4, %ymm4, %ymm5
	vpxor %ymm1, %ymm8, %ymm8
	vpxor %ymm5, %ymm4, %ymm4
	vpsrld $11, %ymm1, %ymm1
	vpsrld $11, %ymm5, %ymm5
	vpxor %ymm2, %ymm8, %ymm8
	vpxor %ymm6, %ymm4, %ymm4
	vpslld $11, %ymm2, %ymm2
	vpslld $11, %ymm6, %ymm6
	vpxor %ymm1, %ymm8, %ymm8
	vpxor %ymm5, %ymm4, %ymm4
	vpxor %ymm2, %ymm8, %ymm8
	vpxor %ymm6, %ymm4, %ymm4
	vpaddd %ymm0, %ymm4, %ymm4
	vpaddd (\i-16)*32(%rax), %ymm8, %ymm0
	vpslld $13, %ymm3, %ymm2
	vpslld $13, %ymm7, %ymm6
	vpsrld $10, %ymm3, %ymm3
	vpsrld $10, %ymm7, %ymm7
	vpaddd (\i-7)*32(%rax), %ymm0, %ymm0
	vpaddd (\i-6)*32(%rax), %ymm4, %ymm4
	vpsrld $7, %ymm3, %ymm1
	vpsrld $7, %ymm7, %ymm5
	vpxor %ymm1, %ymm3, %ymm3
	vpxor %ymm5, %ymm7, %ymm7
	vpsrld $2, %ymm1, %ymm1
	vpsrld $2, %ymm5, %ymm5
	vpxor %ymm2, %ymm3, %ymm3
	vpxor %ymm6, %ymm7, %ymm7
	vpslld $2, %ymm2, %ymm2
	vpslld $2, %ymm6, %ymm6
	vpxor %ymm1, %ymm3, %ymm3
	vpxor %ymm5, %ymm7, %ymm7
	vpxor %ymm2, %ymm3, %ymm3
	vpxor %ymm6, %ymm7, %ymm7
	vpaddd %ymm0, %ymm3, %ymm3
	vpaddd %ymm4, %ymm7, %ymm7
	vmovdqa %ymm3, \i*32(%rax)
	vmovdqa %ymm7, (\i+1)*32(%rax)

.macro sha256_avx2_main_round i, r0, r1, r2, r3, r4, r5, r6, r7
	vpaddd 32*(\i)(%rax), \r0, %ymm6
	vpaddd 32*(\i)(%rcx), %ymm6, %ymm6
	vpandn \r1, \r3, %ymm1
	vpand \r3, \r2, %ymm2
	vpxor %ymm2, %ymm1, %ymm1
	vpaddd %ymm1, %ymm6, %ymm6
	vpslld $7, \r3, %ymm1
	vpsrld $5, \r0, %ymm2
	vpxor %ymm1, \r0, \r0
	vpxor %ymm2, \r0, \r0
	vpslld $14, %ymm1, %ymm1
	vpsrld $14, %ymm2, %ymm2
	vpxor %ymm1, \r0, \r0
	vpxor %ymm2, \r0, \r0
	vpslld $5, %ymm1, %ymm1
	vpxor %ymm1, \r0, \r0
	vpaddd \r0, %ymm6, %ymm6
	vpaddd %ymm6, \r4, \r0
	vpand \r6, \r5, %ymm2
	vpand \r7, \r6, %ymm1
	vpxor \r4, %ymm1, %ymm1
	vpxor %ymm2, %ymm1, %ymm1
	vpaddd %ymm1, %ymm6, %ymm6
	vpslld $10, \r7, %ymm2
	vpsrld $11, \r4, %ymm1
	vpxor %ymm2, \r4, \r4
	vpxor %ymm1, \r4, \r4
	vpslld $9, %ymm2, %ymm2
	vpsrld $9, %ymm1, %ymm1
	vpxor %ymm2, \r4, \r4
	vpxor %ymm1, \r4, \r4
	vpslld $11, %ymm2, %ymm2
	vpxor %ymm2, \r4, \r4
	vpaddd %ymm6, \r4, \r4

.macro sha256_avx2_main_quadround i
	sha256_avx2_main_round \i+0, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7
	sha256_avx2_main_round \i+1, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3
	sha256_avx2_main_round \i+2, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4
	sha256_avx2_main_round \i+3, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5
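
/*
 * XOP versions: vprotd provides a true vector rotate, so each sigma
 * collapses to two rotates, one shift and two xors (vprotd $25 and $14
 * are rotr 7 and rotr 18 for sigma0), and the main-round Sigma functions
 * become three rotates and two xors.
 */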
.macro sha256_xop_extend_round i
	vmovdqa (\i-15)*16(%rax), %xmm0
	vprotd $25, %xmm0, %xmm1
	vprotd $14, %xmm0, %xmm2
	vpsrld $3, %xmm0, %xmm0
	vpxor %xmm1, %xmm2, %xmm2
	vpxor %xmm2, %xmm0, %xmm0
	vpaddd (\i-16)*16(%rax), %xmm0, %xmm0
	vpaddd (\i-7)*16(%rax), %xmm0, %xmm0
	vprotd $15, %xmm3, %xmm1
	vprotd $13, %xmm3, %xmm2
	vpsrld $10, %xmm3, %xmm3
	vpxor %xmm1, %xmm2, %xmm2
	vpxor %xmm2, %xmm3, %xmm3
	vpaddd %xmm0, %xmm3, %xmm3
	vmovdqa %xmm3, \i*16(%rax)

.macro sha256_xop_extend_doubleround i
	vmovdqa (\i-15)*16(%rax), %xmm0
	vmovdqa (\i-14)*16(%rax), %xmm4
	vprotd $25, %xmm0, %xmm1
	vprotd $25, %xmm4, %xmm5
	vprotd $14, %xmm0, %xmm2
	vprotd $14, %xmm4, %xmm6
	vpxor %xmm1, %xmm2, %xmm2
	vpxor %xmm5, %xmm6, %xmm6
	vpsrld $3, %xmm0, %xmm0
	vpsrld $3, %xmm4, %xmm4
	vpxor %xmm2, %xmm0, %xmm0
	vpxor %xmm6, %xmm4, %xmm4
	vpaddd (\i-16)*16(%rax), %xmm0, %xmm0
	vpaddd (\i-15)*16(%rax), %xmm4, %xmm4
	vprotd $15, %xmm3, %xmm1
	vprotd $15, %xmm7, %xmm5
	vprotd $13, %xmm3, %xmm2
	vprotd $13, %xmm7, %xmm6
	vpxor %xmm1, %xmm2, %xmm2
	vpxor %xmm5, %xmm6, %xmm6
	vpaddd (\i-7)*16(%rax), %xmm0, %xmm0
	vpaddd (\i-6)*16(%rax), %xmm4, %xmm4
	vpsrld $10, %xmm3, %xmm3
	vpsrld $10, %xmm7, %xmm7
	vpxor %xmm2, %xmm3, %xmm3
	vpxor %xmm6, %xmm7, %xmm7
	vpaddd %xmm0, %xmm3, %xmm3
	vpaddd %xmm4, %xmm7, %xmm7
	vmovdqa %xmm3, \i*16(%rax)
	vmovdqa %xmm7, (\i+1)*16(%rax)

.macro sha256_xop_main_round i, r0, r1, r2, r3, r4, r5, r6, r7
	vpaddd 16*(\i)(%rax), \r0, %xmm6
	vpaddd 16*(\i)(%rcx), %xmm6, %xmm6
	vpandn \r1, \r3, %xmm1
	vpand \r3, \r2, %xmm2
	vpxor %xmm2, %xmm1, %xmm1
	vpaddd %xmm1, %xmm6, %xmm6
	vprotd $26, \r3, %xmm1
	vprotd $21, \r3, %xmm2
	vpxor %xmm1, %xmm2, %xmm2
	vpxor %xmm2, \r0, \r0
	vpaddd \r0, %xmm6, %xmm6
	vpaddd %xmm6, \r4, \r0
	vpand \r6, \r5, %xmm2
	vpand \r7, \r6, %xmm1
	vpxor \r4, %xmm1, %xmm1
	vpxor %xmm2, %xmm1, %xmm1
	vpaddd %xmm1, %xmm6, %xmm6
	vprotd $30, \r7, %xmm1
	vprotd $19, \r7, %xmm2
	vpxor %xmm1, %xmm2, %xmm2
	vprotd $10, \r7, \r4
	vpxor %xmm2, \r4, \r4
	vpaddd %xmm6, \r4, \r4

.macro sha256_xop_main_quadround i
	sha256_xop_main_round \i+0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
	sha256_xop_main_round \i+1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
	sha256_xop_main_round \i+2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
	sha256_xop_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
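
/*
 * 4-way block-transform cores.  The 64 extended schedule words occupy a
 * 16-byte-strided array on the stack; the SSE2 core loops over the round
 * macros, while the AVX and XOP cores below are fully unrolled.
 */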
sha256_transform_4way_core_sse2:
	leaq 256(%rsp), %rcx
	leaq 48*16(%rcx), %rax
	movdqa -2*16(%rcx), %xmm3
	movdqa -1*16(%rcx), %xmm7
sha256_transform_4way_sse2_extend_loop:
	movdqa -15*16(%rcx), %xmm0
	movdqa -14*16(%rcx), %xmm4
	paddd -16*16(%rcx), %xmm0
	paddd -15*16(%rcx), %xmm4
	paddd -7*16(%rcx), %xmm0
	paddd -6*16(%rcx), %xmm4
	movdqa %xmm3, (%rcx)
	movdqa %xmm7, 16(%rcx)
	jne sha256_transform_4way_sse2_extend_loop
	movdqu 0(%rdi), %xmm7
	movdqu 16(%rdi), %xmm5
	movdqu 32(%rdi), %xmm4
	movdqu 48(%rdi), %xmm3
	movdqu 64(%rdi), %xmm0
	movdqu 80(%rdi), %xmm8
	movdqu 96(%rdi), %xmm9
	movdqu 112(%rdi), %xmm10
	leaq sha256_4k(%rip), %rcx
sha256_transform_4way_sse2_main_loop:
	movdqa (%rsp, %rax), %xmm6
	paddd (%rcx, %rax), %xmm6
	movdqa %xmm2, %xmm10
	jne sha256_transform_4way_sse2_main_loop
	jmp sha256_transform_4way_finish

sha256_transform_4way_core_avx:
	leaq 256(%rsp), %rax
	movdqa -2*16(%rax), %xmm3
	movdqa -1*16(%rax), %xmm7
	sha256_avx_extend_doubleround 0
	sha256_avx_extend_doubleround 2
	sha256_avx_extend_doubleround 4
	sha256_avx_extend_doubleround 6
	sha256_avx_extend_doubleround 8
	sha256_avx_extend_doubleround 10
	sha256_avx_extend_doubleround 12
	sha256_avx_extend_doubleround 14
	sha256_avx_extend_doubleround 16
	sha256_avx_extend_doubleround 18
	sha256_avx_extend_doubleround 20
	sha256_avx_extend_doubleround 22
	sha256_avx_extend_doubleround 24
	sha256_avx_extend_doubleround 26
	sha256_avx_extend_doubleround 28
	sha256_avx_extend_doubleround 30
	sha256_avx_extend_doubleround 32
	sha256_avx_extend_doubleround 34
	sha256_avx_extend_doubleround 36
	sha256_avx_extend_doubleround 38
	sha256_avx_extend_doubleround 40
	sha256_avx_extend_doubleround 42
	sha256_avx_extend_doubleround 44
	sha256_avx_extend_doubleround 46
	movdqu 0(%rdi), %xmm7
	movdqu 16(%rdi), %xmm5
	movdqu 32(%rdi), %xmm4
	movdqu 48(%rdi), %xmm3
	movdqu 64(%rdi), %xmm0
	movdqu 80(%rdi), %xmm8
	movdqu 96(%rdi), %xmm9
	movdqu 112(%rdi), %xmm10
	leaq sha256_4k(%rip), %rcx
	sha256_avx_main_quadround 0
	sha256_avx_main_quadround 4
	sha256_avx_main_quadround 8
	sha256_avx_main_quadround 12
	sha256_avx_main_quadround 16
	sha256_avx_main_quadround 20
	sha256_avx_main_quadround 24
	sha256_avx_main_quadround 28
	sha256_avx_main_quadround 32
	sha256_avx_main_quadround 36
	sha256_avx_main_quadround 40
	sha256_avx_main_quadround 44
	sha256_avx_main_quadround 48
	sha256_avx_main_quadround 52
	sha256_avx_main_quadround 56
	sha256_avx_main_quadround 60
	jmp sha256_transform_4way_finish

sha256_transform_4way_core_xop:
	leaq 256(%rsp), %rax
	movdqa -2*16(%rax), %xmm3
	movdqa -1*16(%rax), %xmm7
	sha256_xop_extend_doubleround 0
	sha256_xop_extend_doubleround 2
	sha256_xop_extend_doubleround 4
	sha256_xop_extend_doubleround 6
	sha256_xop_extend_doubleround 8
	sha256_xop_extend_doubleround 10
	sha256_xop_extend_doubleround 12
	sha256_xop_extend_doubleround 14
	sha256_xop_extend_doubleround 16
	sha256_xop_extend_doubleround 18
	sha256_xop_extend_doubleround 20
	sha256_xop_extend_doubleround 22
	sha256_xop_extend_doubleround 24
	sha256_xop_extend_doubleround 26
	sha256_xop_extend_doubleround 28
	sha256_xop_extend_doubleround 30
	sha256_xop_extend_doubleround 32
	sha256_xop_extend_doubleround 34
	sha256_xop_extend_doubleround 36
	sha256_xop_extend_doubleround 38
	sha256_xop_extend_doubleround 40
	sha256_xop_extend_doubleround 42
	sha256_xop_extend_doubleround 44
	sha256_xop_extend_doubleround 46
	movdqu 0(%rdi), %xmm7
	movdqu 16(%rdi), %xmm5
	movdqu 32(%rdi), %xmm4
	movdqu 48(%rdi), %xmm3
	movdqu 64(%rdi), %xmm0
	movdqu 80(%rdi), %xmm8
	movdqu 96(%rdi), %xmm9
	movdqu 112(%rdi), %xmm10
	leaq sha256_4k(%rip), %rcx
	sha256_xop_main_quadround 0
	sha256_xop_main_quadround 4
	sha256_xop_main_quadround 8
	sha256_xop_main_quadround 12
	sha256_xop_main_quadround 16
	sha256_xop_main_quadround 20
	sha256_xop_main_quadround 24
	sha256_xop_main_quadround 28
	sha256_xop_main_quadround 32
	sha256_xop_main_quadround 36
	sha256_xop_main_quadround 40
	sha256_xop_main_quadround 44
	sha256_xop_main_quadround 48
	sha256_xop_main_quadround 52
	sha256_xop_main_quadround 56
	sha256_xop_main_quadround 60
	jmp sha256_transform_4way_finish

sha256_transform_4way_core_addr:
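
/* Byte-swap two groups of 4-way words from (%rsi) and store them to (%rsp). */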
.macro p2bswap_rsi_rsp i
	movdqu \i*16(%rsi), %xmm0
	movdqu (\i+1)*16(%rsi), %xmm2
	pshuflw $0xb1, %xmm0, %xmm0
	pshuflw $0xb1, %xmm2, %xmm2
	pshufhw $0xb1, %xmm0, %xmm0
	pshufhw $0xb1, %xmm2, %xmm2
	movdqa %xmm0, \i*16(%rsp)
	movdqa %xmm2, (\i+1)*16(%rsp)
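
/*
 * sha256_transform_4way(state, block, swap):
 * rdi = state (4 interleaved lanes), rsi = input block; a nonzero third
 * argument selects the byte-swapping input path.
 */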
	.globl sha256_transform_4way
	.globl _sha256_transform_4way
sha256_transform_4way:
_sha256_transform_4way:
#if defined(_WIN64) || defined(__CYGWIN__)
	movdqa %xmm6, 0(%rsp)
	movdqa %xmm7, 16(%rsp)
	movdqa %xmm8, 32(%rsp)
	movdqa %xmm9, 48(%rsp)
	movdqa %xmm10, 64(%rsp)
	movdqa %xmm11, 80(%rsp)
	jnz sha256_transform_4way_swap
	movdqu 0*16(%rsi), %xmm0
	movdqu 1*16(%rsi), %xmm1
	movdqu 2*16(%rsi), %xmm2
	movdqu 3*16(%rsi), %xmm3
	movdqu 4*16(%rsi), %xmm4
	movdqu 5*16(%rsi), %xmm5
	movdqu 6*16(%rsi), %xmm6
	movdqu 7*16(%rsi), %xmm7
	movdqa %xmm0, 0*16(%rsp)
	movdqa %xmm1, 1*16(%rsp)
	movdqa %xmm2, 2*16(%rsp)
	movdqa %xmm3, 3*16(%rsp)
	movdqa %xmm4, 4*16(%rsp)
	movdqa %xmm5, 5*16(%rsp)
	movdqa %xmm6, 6*16(%rsp)
	movdqa %xmm7, 7*16(%rsp)
	movdqu 8*16(%rsi), %xmm0
	movdqu 9*16(%rsi), %xmm1
	movdqu 10*16(%rsi), %xmm2
	movdqu 11*16(%rsi), %xmm3
	movdqu 12*16(%rsi), %xmm4
	movdqu 13*16(%rsi), %xmm5
	movdqu 14*16(%rsi), %xmm6
	movdqu 15*16(%rsi), %xmm7
	movdqa %xmm0, 8*16(%rsp)
	movdqa %xmm1, 9*16(%rsp)
	movdqa %xmm2, 10*16(%rsp)
	movdqa %xmm3, 11*16(%rsp)
	movdqa %xmm4, 12*16(%rsp)
	movdqa %xmm5, 13*16(%rsp)
	movdqa %xmm6, 14*16(%rsp)
	movdqa %xmm7, 15*16(%rsp)
	jmp *sha256_transform_4way_core_addr(%rip)

sha256_transform_4way_swap:
	jmp *sha256_transform_4way_core_addr(%rip)

sha256_transform_4way_finish:
	movdqu 0(%rdi), %xmm2
	movdqu 16(%rdi), %xmm6
	movdqu 32(%rdi), %xmm11
	movdqu 48(%rdi), %xmm1
	movdqu 64(%rdi), %xmm2
	movdqu 80(%rdi), %xmm6
	movdqu 96(%rdi), %xmm11
	movdqu 112(%rdi), %xmm1
	movdqu %xmm7, 0(%rdi)
	movdqu %xmm5, 16(%rdi)
	movdqu %xmm4, 32(%rdi)
	movdqu %xmm3, 48(%rdi)
	movdqu %xmm0, 64(%rdi)
	movdqu %xmm8, 80(%rdi)
	movdqu %xmm9, 96(%rdi)
	movdqu %xmm10, 112(%rdi)
#if defined(_WIN64) || defined(__CYGWIN__)
	movdqa 0(%rsp), %xmm6
	movdqa 16(%rsp), %xmm7
	movdqa 32(%rsp), %xmm8
	movdqa 48(%rsp), %xmm9
	movdqa 64(%rsp), %xmm10
	movdqa 80(%rsp), %xmm11

sha256_transform_8way_core_avx2:
	leaq 8*64(%rsp), %rax
	vmovdqa -2*32(%rax), %ymm3
	vmovdqa -1*32(%rax), %ymm7
	sha256_avx2_extend_doubleround 0
	sha256_avx2_extend_doubleround 2
	sha256_avx2_extend_doubleround 4
	sha256_avx2_extend_doubleround 6
	sha256_avx2_extend_doubleround 8
	sha256_avx2_extend_doubleround 10
	sha256_avx2_extend_doubleround 12
	sha256_avx2_extend_doubleround 14
	sha256_avx2_extend_doubleround 16
	sha256_avx2_extend_doubleround 18
	sha256_avx2_extend_doubleround 20
	sha256_avx2_extend_doubleround 22
	sha256_avx2_extend_doubleround 24
	sha256_avx2_extend_doubleround 26
	sha256_avx2_extend_doubleround 28
	sha256_avx2_extend_doubleround 30
	sha256_avx2_extend_doubleround 32
	sha256_avx2_extend_doubleround 34
	sha256_avx2_extend_doubleround 36
	sha256_avx2_extend_doubleround 38
	sha256_avx2_extend_doubleround 40
	sha256_avx2_extend_doubleround 42
	sha256_avx2_extend_doubleround 44
	sha256_avx2_extend_doubleround 46
	vmovdqu 0*32(%rdi), %ymm7
	vmovdqu 1*32(%rdi), %ymm5
	vmovdqu 2*32(%rdi), %ymm4
	vmovdqu 3*32(%rdi), %ymm3
	vmovdqu 4*32(%rdi), %ymm0
	vmovdqu 5*32(%rdi), %ymm8
	vmovdqu 6*32(%rdi), %ymm9
	vmovdqu 7*32(%rdi), %ymm10
	leaq sha256_8k(%rip), %rcx
	sha256_avx2_main_quadround 0
	sha256_avx2_main_quadround 4
	sha256_avx2_main_quadround 8
	sha256_avx2_main_quadround 12
	sha256_avx2_main_quadround 16
	sha256_avx2_main_quadround 20
	sha256_avx2_main_quadround 24
	sha256_avx2_main_quadround 28
	sha256_avx2_main_quadround 32
	sha256_avx2_main_quadround 36
	sha256_avx2_main_quadround 40
	sha256_avx2_main_quadround 44
	sha256_avx2_main_quadround 48
	sha256_avx2_main_quadround 52
	sha256_avx2_main_quadround 56
	sha256_avx2_main_quadround 60
	jmp sha256_transform_8way_finish
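
/* AVX2 byte swap of two groups of 8-way words from (%rsi) into (%rsp). */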
.macro p2bswap_avx2_rsi_rsp i
	vmovdqu \i*32(%rsi), %ymm0
	vmovdqu (\i+1)*32(%rsi), %ymm2
	vpshuflw $0xb1, %ymm0, %ymm0
	vpshuflw $0xb1, %ymm2, %ymm2
	vpshufhw $0xb1, %ymm0, %ymm0
	vpshufhw $0xb1, %ymm2, %ymm2
	vpsrlw $8, %ymm0, %ymm1
	vpsrlw $8, %ymm2, %ymm3
	vpsllw $8, %ymm0, %ymm0
	vpsllw $8, %ymm2, %ymm2
	vpxor %ymm1, %ymm0, %ymm0
	vpxor %ymm3, %ymm2, %ymm2
	vmovdqa %ymm0, \i*32(%rsp)
	vmovdqa %ymm2, (\i+1)*32(%rsp)
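
/* 8-way AVX2 entry point; same interface as the 4-way transform, with eight interleaved lanes. */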
	.globl sha256_transform_8way
	.globl _sha256_transform_8way
sha256_transform_8way:
_sha256_transform_8way:
#if defined(_WIN64) || defined(__CYGWIN__)
	vmovdqa %xmm6, 0(%rsp)
	vmovdqa %xmm7, 16(%rsp)
	vmovdqa %xmm8, 32(%rsp)
	vmovdqa %xmm9, 48(%rsp)
	vmovdqa %xmm10, 64(%rsp)
	vmovdqa %xmm11, 80(%rsp)
	jnz sha256_transform_8way_swap
	vmovdqu 0*32(%rsi), %ymm0
	vmovdqu 1*32(%rsi), %ymm1
	vmovdqu 2*32(%rsi), %ymm2
	vmovdqu 3*32(%rsi), %ymm3
	vmovdqu 4*32(%rsi), %ymm4
	vmovdqu 5*32(%rsi), %ymm5
	vmovdqu 6*32(%rsi), %ymm6
	vmovdqu 7*32(%rsi), %ymm7
	vmovdqa %ymm0, 0*32(%rsp)
	vmovdqa %ymm1, 1*32(%rsp)
	vmovdqa %ymm2, 2*32(%rsp)
	vmovdqa %ymm3, 3*32(%rsp)
	vmovdqa %ymm4, 4*32(%rsp)
	vmovdqa %ymm5, 5*32(%rsp)
	vmovdqa %ymm6, 6*32(%rsp)
	vmovdqa %ymm7, 7*32(%rsp)
	vmovdqu 8*32(%rsi), %ymm0
	vmovdqu 9*32(%rsi), %ymm1
	vmovdqu 10*32(%rsi), %ymm2
	vmovdqu 11*32(%rsi), %ymm3
	vmovdqu 12*32(%rsi), %ymm4
	vmovdqu 13*32(%rsi), %ymm5
	vmovdqu 14*32(%rsi), %ymm6
	vmovdqu 15*32(%rsi), %ymm7
	vmovdqa %ymm0, 8*32(%rsp)
	vmovdqa %ymm1, 9*32(%rsp)
	vmovdqa %ymm2, 10*32(%rsp)
	vmovdqa %ymm3, 11*32(%rsp)
	vmovdqa %ymm4, 12*32(%rsp)
	vmovdqa %ymm5, 13*32(%rsp)
	vmovdqa %ymm6, 14*32(%rsp)
	vmovdqa %ymm7, 15*32(%rsp)
	jmp sha256_transform_8way_core_avx2

sha256_transform_8way_swap:
	p2bswap_avx2_rsi_rsp 0
	p2bswap_avx2_rsi_rsp 2
	p2bswap_avx2_rsi_rsp 4
	p2bswap_avx2_rsi_rsp 6
	p2bswap_avx2_rsi_rsp 8
	p2bswap_avx2_rsi_rsp 10
	p2bswap_avx2_rsi_rsp 12
	p2bswap_avx2_rsi_rsp 14
	jmp sha256_transform_8way_core_avx2

sha256_transform_8way_finish:
	vmovdqu 0*32(%rdi), %ymm2
	vmovdqu 1*32(%rdi), %ymm6
	vmovdqu 2*32(%rdi), %ymm11
	vmovdqu 3*32(%rdi), %ymm1
	vpaddd %ymm2, %ymm7, %ymm7
	vpaddd %ymm6, %ymm5, %ymm5
	vpaddd %ymm11, %ymm4, %ymm4
	vpaddd %ymm1, %ymm3, %ymm3
	vmovdqu 4*32(%rdi), %ymm2
	vmovdqu 5*32(%rdi), %ymm6
	vmovdqu 6*32(%rdi), %ymm11
	vmovdqu 7*32(%rdi), %ymm1
	vpaddd %ymm2, %ymm0, %ymm0
	vpaddd %ymm6, %ymm8, %ymm8
	vpaddd %ymm11, %ymm9, %ymm9
	vpaddd %ymm1, %ymm10, %ymm10
	vmovdqu %ymm7, 0*32(%rdi)
	vmovdqu %ymm5, 1*32(%rdi)
	vmovdqu %ymm4, 2*32(%rdi)
	vmovdqu %ymm3, 3*32(%rdi)
	vmovdqu %ymm0, 4*32(%rdi)
	vmovdqu %ymm8, 5*32(%rdi)
	vmovdqu %ymm9, 6*32(%rdi)
	vmovdqu %ymm10, 7*32(%rdi)
#if defined(_WIN64) || defined(__CYGWIN__)
	vmovdqa 0(%rsp), %xmm6
	vmovdqa 16(%rsp), %xmm7
	vmovdqa 32(%rsp), %xmm8
	vmovdqa 48(%rsp), %xmm9
	vmovdqa 64(%rsp), %xmm10
	vmovdqa 80(%rsp), %xmm11
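
/*
 * Specialized 4-way double-SHA256 for mining: rdi = output hash,
 * rsi = data, rdx = midstate (added after the main rounds), rcx = a
 * prehashed state used as the initial working state.  Schedule terms
 * that do not depend on the nonce are cached in the data buffer
 * between calls.
 */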
sha256d_ms_4way_addr:

	.globl sha256d_ms_4way
	.globl _sha256d_ms_4way
sha256d_ms_4way:
_sha256d_ms_4way:
	jmp *sha256d_ms_4way_addr(%rip)

sha256d_ms_4way_sse2:
#if defined(_WIN64) || defined(__CYGWIN__)
	movdqa %xmm6, 0(%rsp)
	movdqa %xmm7, 16(%rsp)
	leaq 256(%rsi), %rax
sha256d_ms_4way_sse2_extend_loop1:
	movdqa 3*16(%rsi), %xmm0
	movdqa 2*16(%rax), %xmm3
	movdqa 3*16(%rax), %xmm7
	movdqa %xmm3, 5*16(%rsp)
	movdqa %xmm7, 6*16(%rsp)
	movdqa %xmm3, 2*16(%rax)
	movdqa %xmm7, 3*16(%rax)
	movdqa 4*16(%rax), %xmm0
	movdqa %xmm0, 7*16(%rsp)
	movdqa %xmm3, 4*16(%rax)
	movdqa %xmm7, 5*16(%rax)
	movdqa 6*16(%rax), %xmm0
	movdqa 7*16(%rax), %xmm4
	movdqa %xmm0, 9*16(%rsp)
	movdqa %xmm4, 10*16(%rsp)
	movdqa %xmm3, 6*16(%rax)
	movdqa %xmm7, 7*16(%rax)
	movdqa 8*16(%rax), %xmm0
	movdqa 2*16(%rax), %xmm4
	movdqa %xmm0, 11*16(%rsp)
	movdqa %xmm3, 8*16(%rax)
	movdqa %xmm7, 9*16(%rax)
	paddd 3*16(%rax), %xmm3
	paddd 4*16(%rax), %xmm7
	movdqa %xmm3, 10*16(%rax)
	movdqa %xmm7, 11*16(%rax)
	paddd 5*16(%rax), %xmm3
	paddd 6*16(%rax), %xmm7
	movdqa %xmm3, 12*16(%rax)
	movdqa %xmm7, 13*16(%rax)
	movdqa 14*16(%rax), %xmm0
	movdqa 15*16(%rax), %xmm4
	movdqa %xmm0, 17*16(%rsp)
	movdqa %xmm4, 18*16(%rsp)
	paddd 7*16(%rax), %xmm0
	paddd 8*16(%rax), %xmm4
	movdqa %xmm3, 14*16(%rax)
	movdqa %xmm7, 15*16(%rax)
sha256d_ms_4way_sse2_extend_loop2:
	sha256_sse2_extend_doubleround 16
	sha256_sse2_extend_doubleround 18
	sha256_sse2_extend_doubleround 20
	sha256_sse2_extend_doubleround 22
	sha256_sse2_extend_doubleround 24
	sha256_sse2_extend_doubleround 26
	sha256_sse2_extend_doubleround 28
	sha256_sse2_extend_doubleround 30
	sha256_sse2_extend_doubleround 32
	sha256_sse2_extend_doubleround 34
	sha256_sse2_extend_doubleround 36
	sha256_sse2_extend_doubleround 38
	sha256_sse2_extend_doubleround 40
	sha256_sse2_extend_doubleround 42
	jz sha256d_ms_4way_sse2_extend_coda2
	sha256_sse2_extend_doubleround 44
	sha256_sse2_extend_doubleround 46
	movdqa 0(%rcx), %xmm3
	movdqa 16(%rcx), %xmm0
	movdqa 32(%rcx), %xmm1
	movdqa 48(%rcx), %xmm2
	movdqa 64(%rcx), %xmm6
	movdqa 80(%rcx), %xmm7
	movdqa 96(%rcx), %xmm5
	movdqa 112(%rcx), %xmm4
	movdqa %xmm1, 0(%rsp)
	movdqa %xmm2, 16(%rsp)
	movdqa %xmm6, 32(%rsp)
	leaq sha256_4k(%rip), %rcx
	jmp sha256d_ms_4way_sse2_main_loop1

sha256d_ms_4way_sse2_main_loop2:
	sha256_sse2_main_round 0
	sha256_sse2_main_round 1
	sha256_sse2_main_round 2
sha256d_ms_4way_sse2_main_loop1:
	sha256_sse2_main_round 3
	sha256_sse2_main_quadround 4
	sha256_sse2_main_quadround 8
	sha256_sse2_main_quadround 12
	sha256_sse2_main_quadround 16
	sha256_sse2_main_quadround 20
	sha256_sse2_main_quadround 24
	sha256_sse2_main_quadround 28
	sha256_sse2_main_quadround 32
	sha256_sse2_main_quadround 36
	sha256_sse2_main_quadround 40
	sha256_sse2_main_quadround 44
	sha256_sse2_main_quadround 48
	sha256_sse2_main_quadround 52
	sha256_sse2_main_round 56
	jz sha256d_ms_4way_sse2_finish
	sha256_sse2_main_round 57
	sha256_sse2_main_round 58
	sha256_sse2_main_round 59
	sha256_sse2_main_quadround 60
	movdqa 5*16(%rsp), %xmm1
	movdqa 6*16(%rsp), %xmm2
	movdqa 7*16(%rsp), %xmm6
	movdqa %xmm1, 18*16(%rsi)
	movdqa %xmm2, 19*16(%rsi)
	movdqa %xmm6, 20*16(%rsi)
	movdqa 9*16(%rsp), %xmm1
	movdqa 10*16(%rsp), %xmm2
	movdqa 11*16(%rsp), %xmm6
	movdqa %xmm1, 22*16(%rsi)
	movdqa %xmm2, 23*16(%rsi)
	movdqa %xmm6, 24*16(%rsi)
	movdqa 17*16(%rsp), %xmm1
	movdqa 18*16(%rsp), %xmm2
	movdqa %xmm1, 30*16(%rsi)
	movdqa %xmm2, 31*16(%rsi)
	movdqa 0(%rsp), %xmm1
	movdqa 16(%rsp), %xmm2
	movdqa 32(%rsp), %xmm6
	paddd 0(%rdx), %xmm7
	paddd 16(%rdx), %xmm5
	paddd 32(%rdx), %xmm4
	paddd 48(%rdx), %xmm3
	paddd 64(%rdx), %xmm0
	paddd 80(%rdx), %xmm1
	paddd 96(%rdx), %xmm2
	paddd 112(%rdx), %xmm6
	movdqa %xmm7, 48+0(%rsp)
	movdqa %xmm5, 48+16(%rsp)
	movdqa %xmm4, 48+32(%rsp)
	movdqa %xmm3, 48+48(%rsp)
	movdqa %xmm0, 48+64(%rsp)
	movdqa %xmm1, 48+80(%rsp)
	movdqa %xmm2, 48+96(%rsp)
	movdqa %xmm6, 48+112(%rsp)
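	/* Pad the second 64-byte block: the 0x80000000 terminator word and a 256-bit message length. */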
	movq $0x8000000000000100, %rax
	pshufd $0x55, %xmm1, %xmm2
	pshufd $0x00, %xmm1, %xmm1
	movdqa %xmm2, 48+128(%rsp)
	movdqa %xmm0, 48+144(%rsp)
	movdqa %xmm0, 48+160(%rsp)
	movdqa %xmm0, 48+176(%rsp)
	movdqa %xmm0, 48+192(%rsp)
	movdqa %xmm0, 48+208(%rsp)
	movdqa %xmm0, 48+224(%rsp)
	movdqa %xmm1, 48+240(%rsp)
	leaq 19*16(%rsp), %rax
	movdqa -15*16(%rax), %xmm0
	movdqa -14*16(%rax), %xmm4
	paddd -16*16(%rax), %xmm0
	paddd -15*16(%rax), %xmm4
	paddd sha256d_4preext2_17(%rip), %xmm4
	movdqa %xmm3, 0*16(%rax)
	movdqa %xmm7, 1*16(%rax)
	sha256_sse2_extend_doubleround 2
	sha256_sse2_extend_doubleround 4
	movdqa -9*16(%rax), %xmm0
	movdqa sha256d_4preext2_23(%rip), %xmm4
	paddd -10*16(%rax), %xmm0
	paddd -9*16(%rax), %xmm4
	paddd -1*16(%rax), %xmm0
	paddd 0*16(%rax), %xmm4
	movdqa %xmm3, 6*16(%rax)
	movdqa %xmm7, 7*16(%rax)
	movdqa sha256d_4preext2_24(%rip), %xmm0
	paddd 1*16(%rax), %xmm0
	paddd 2*16(%rax), %xmm7
	movdqa %xmm3, 8*16(%rax)
	movdqa %xmm7, 9*16(%rax)
	paddd 3*16(%rax), %xmm3
	paddd 4*16(%rax), %xmm7
	movdqa %xmm3, 10*16(%rax)
	movdqa %xmm7, 11*16(%rax)
	paddd 5*16(%rax), %xmm3
	paddd 6*16(%rax), %xmm7
	movdqa %xmm3, 12*16(%rax)
	movdqa %xmm7, 13*16(%rax)
	movdqa sha256d_4preext2_30(%rip), %xmm0
	movdqa 0*16(%rax), %xmm4
	paddd -1*16(%rax), %xmm4
	paddd 7*16(%rax), %xmm0
	paddd 8*16(%rax), %xmm4
	movdqa %xmm3, 14*16(%rax)
	movdqa %xmm7, 15*16(%rax)
	jmp sha256d_ms_4way_sse2_extend_loop2

sha256d_ms_4way_sse2_extend_coda2:
	sha256_sse2_extend_round 44
	movdqa sha256_4h+0(%rip), %xmm7
	movdqa sha256_4h+16(%rip), %xmm5
	movdqa sha256_4h+32(%rip), %xmm4
	movdqa sha256_4h+48(%rip), %xmm3
	movdqa sha256_4h+64(%rip), %xmm0
	movdqa sha256_4h+80(%rip), %xmm1
	movdqa sha256_4h+96(%rip), %xmm2
	movdqa sha256_4h+112(%rip), %xmm6
	movdqa %xmm1, 0(%rsp)
	movdqa %xmm2, 16(%rsp)
	movdqa %xmm6, 32(%rsp)
	leaq sha256_4k(%rip), %rcx
	jmp sha256d_ms_4way_sse2_main_loop2
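
/*
 * Reduced final rounds: only the additions that feed the last state word
 * are computed, which suffices to test the high word of the double hash.
 */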
.macro sha256_sse2_main_round_red i, r7
	movdqa 16*\i(%rax), %xmm6
	paddd 16*\i(%rcx), %xmm6
	paddd 32(%rsp), %xmm6
	movdqa 16(%rsp), %xmm2
	movdqa %xmm2, 32(%rsp)
	movdqa 0(%rsp), %xmm2
	movdqa %xmm2, 16(%rsp)
	movdqa %xmm0, 0(%rsp)

sha256d_ms_4way_sse2_finish:
	sha256_sse2_main_round_red 57, %xmm3
	sha256_sse2_main_round_red 58, %xmm4
	sha256_sse2_main_round_red 59, %xmm5
	sha256_sse2_main_round_red 60, %xmm7
	paddd sha256_4h+112(%rip), %xmm0
	movdqa %xmm0, 112(%rdi)
#if defined(_WIN64) || defined(__CYGWIN__)
	movdqa 0(%rsp), %xmm6
	movdqa 16(%rsp), %xmm7
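
/* AVX variant of the specialized double-SHA256 routine. */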
2551 sha256d_ms_4way_avx:
2552 #if defined(_WIN64) || defined(__CYGWIN__)
2555 movdqa %xmm6, 0(%rsp)
2556 movdqa %xmm7, 16(%rsp)
2557 movdqa %xmm8, 32(%rsp)
2558 movdqa %xmm9, 48(%rsp)
2559 movdqa %xmm10, 64(%rsp)
2568 leaq 256(%rsi), %rax
2570 sha256d_ms_4way_avx_extend_loop1:
2571 vmovdqa 3*16(%rsi), %xmm0
2572 vmovdqa 2*16(%rax), %xmm3
2573 vmovdqa 3*16(%rax), %xmm7
2574 vmovdqa %xmm3, 2*16(%rsp)
2575 vmovdqa %xmm7, 3*16(%rsp)
2576 vpaddd %xmm0, %xmm7, %xmm7
2577 vpslld $14, %xmm0, %xmm2
2578 vpsrld $3, %xmm0, %xmm0
2579 vpsrld $4, %xmm0, %xmm1
2580 vpxor %xmm1, %xmm0, %xmm0
2581 vpxor %xmm2, %xmm0, %xmm0
2582 vpsrld $11, %xmm1, %xmm1
2583 vpslld $11, %xmm2, %xmm2
2584 vpxor %xmm1, %xmm0, %xmm0
2585 vpxor %xmm2, %xmm0, %xmm0
2586 vpaddd %xmm0, %xmm3, %xmm3
2587 vmovdqa %xmm3, 2*16(%rax)
2588 vmovdqa %xmm7, 3*16(%rax)
2590 vmovdqa 4*16(%rax), %xmm0
2591 vmovdqa %xmm0, 4*16(%rsp)
2592 vpslld $13, %xmm3, %xmm2
2593 vpslld $13, %xmm7, %xmm6
2594 vpsrld $10, %xmm3, %xmm3
2595 vpsrld $10, %xmm7, %xmm7
2596 vpsrld $7, %xmm3, %xmm1
2597 vpsrld $7, %xmm7, %xmm5
2598 vpxor %xmm1, %xmm3, %xmm3
2599 vpxor %xmm5, %xmm7, %xmm7
2600 vpsrld $2, %xmm1, %xmm1
2601 vpsrld $2, %xmm5, %xmm5
2602 vpxor %xmm2, %xmm3, %xmm3
2603 vpxor %xmm6, %xmm7, %xmm7
2604 vpslld $2, %xmm2, %xmm2
2605 vpslld $2, %xmm6, %xmm6
2606 vpxor %xmm1, %xmm3, %xmm3
2607 vpxor %xmm5, %xmm7, %xmm7
2608 vpxor %xmm2, %xmm3, %xmm3
2609 vpxor %xmm6, %xmm7, %xmm7
2610 vpaddd %xmm0, %xmm3, %xmm3
2611 vmovdqa %xmm3, 4*16(%rax)
2612 vmovdqa %xmm7, 5*16(%rax)
2614 vmovdqa 6*16(%rax), %xmm0
2615 vmovdqa 7*16(%rax), %xmm4
2616 vmovdqa %xmm0, 6*16(%rsp)
2617 vmovdqa %xmm4, 7*16(%rsp)
2618 vpslld $13, %xmm3, %xmm2
2619 vpslld $13, %xmm7, %xmm6
2620 vpsrld $10, %xmm3, %xmm3
2621 vpsrld $10, %xmm7, %xmm7
2622 vpsrld $7, %xmm3, %xmm1
2623 vpsrld $7, %xmm7, %xmm5
2624 vpxor %xmm1, %xmm3, %xmm3
2625 vpxor %xmm5, %xmm7, %xmm7
2626 vpsrld $2, %xmm1, %xmm1
2627 vpsrld $2, %xmm5, %xmm5
2628 vpxor %xmm2, %xmm3, %xmm3
2629 vpxor %xmm6, %xmm7, %xmm7
2630 vpslld $2, %xmm2, %xmm2
2631 vpslld $2, %xmm6, %xmm6
2632 vpxor %xmm1, %xmm3, %xmm3
2633 vpxor %xmm5, %xmm7, %xmm7
2634 vpxor %xmm2, %xmm3, %xmm3
2635 vpxor %xmm6, %xmm7, %xmm7
2636 vpaddd %xmm0, %xmm3, %xmm3
2637 vpaddd %xmm4, %xmm7, %xmm7
2638 vmovdqa %xmm3, 6*16(%rax)
2639 vmovdqa %xmm7, 7*16(%rax)
2641 vmovdqa 8*16(%rax), %xmm0
2642 vmovdqa 2*16(%rax), %xmm4
2643 vmovdqa %xmm0, 8*16(%rsp)
2644 vpslld $13, %xmm3, %xmm2
2645 vpslld $13, %xmm7, %xmm6
2646 vpsrld $10, %xmm3, %xmm3
2647 vpsrld $10, %xmm7, %xmm7
2648 vpsrld $7, %xmm3, %xmm1
2649 vpsrld $7, %xmm7, %xmm5
2650 vpxor %xmm1, %xmm3, %xmm3
2651 vpxor %xmm5, %xmm7, %xmm7
2652 vpsrld $2, %xmm1, %xmm1
2653 vpsrld $2, %xmm5, %xmm5
2654 vpxor %xmm2, %xmm3, %xmm3
2655 vpxor %xmm6, %xmm7, %xmm7
2656 vpslld $2, %xmm2, %xmm2
2657 vpslld $2, %xmm6, %xmm6
2658 vpxor %xmm1, %xmm3, %xmm3
2659 vpxor %xmm5, %xmm7, %xmm7
2660 vpxor %xmm2, %xmm3, %xmm3
2661 vpxor %xmm6, %xmm7, %xmm7
2662 vpaddd %xmm0, %xmm3, %xmm3
2663 vpaddd %xmm4, %xmm7, %xmm7
2664 vmovdqa %xmm3, 8*16(%rax)
2665 vmovdqa %xmm7, 9*16(%rax)
2667 vpslld $13, %xmm3, %xmm2
2668 vpslld $13, %xmm7, %xmm6
2669 vpsrld $10, %xmm3, %xmm3
2670 vpsrld $10, %xmm7, %xmm7
2671 vpsrld $7, %xmm3, %xmm1
2672 vpsrld $7, %xmm7, %xmm5
2673 vpxor %xmm1, %xmm3, %xmm3
2674 vpxor %xmm5, %xmm7, %xmm7
2675 vpsrld $2, %xmm1, %xmm1
2676 vpsrld $2, %xmm5, %xmm5
2677 vpxor %xmm2, %xmm3, %xmm3
2678 vpxor %xmm6, %xmm7, %xmm7
2679 vpslld $2, %xmm2, %xmm2
2680 vpslld $2, %xmm6, %xmm6
2681 vpxor %xmm1, %xmm3, %xmm3
2682 vpxor %xmm5, %xmm7, %xmm7
2683 vpxor %xmm2, %xmm3, %xmm3
2684 vpxor %xmm6, %xmm7, %xmm7
2685 vpaddd 3*16(%rax), %xmm3, %xmm3
2686 vpaddd 4*16(%rax), %xmm7, %xmm7
2687 vmovdqa %xmm3, 10*16(%rax)
2688 vmovdqa %xmm7, 11*16(%rax)
2690 vpslld $13, %xmm3, %xmm2
2691 vpslld $13, %xmm7, %xmm6
2692 vpsrld $10, %xmm3, %xmm3
2693 vpsrld $10, %xmm7, %xmm7
2694 vpsrld $7, %xmm3, %xmm1
2695 vpsrld $7, %xmm7, %xmm5
2696 vpxor %xmm1, %xmm3, %xmm3
2697 vpxor %xmm5, %xmm7, %xmm7
2698 vpsrld $2, %xmm1, %xmm1
2699 vpsrld $2, %xmm5, %xmm5
2700 vpxor %xmm2, %xmm3, %xmm3
2701 vpxor %xmm6, %xmm7, %xmm7
2702 vpslld $2, %xmm2, %xmm2
2703 vpslld $2, %xmm6, %xmm6
2704 vpxor %xmm1, %xmm3, %xmm3
2705 vpxor %xmm5, %xmm7, %xmm7
2706 vpxor %xmm2, %xmm3, %xmm3
2707 vpxor %xmm6, %xmm7, %xmm7
2708 vpaddd 5*16(%rax), %xmm3, %xmm3
2709 vpaddd 6*16(%rax), %xmm7, %xmm7
2710 vmovdqa %xmm3, 12*16(%rax)
2711 vmovdqa %xmm7, 13*16(%rax)
2713 vmovdqa 14*16(%rax), %xmm0
2714 vmovdqa 15*16(%rax), %xmm4
2715 vmovdqa %xmm0, 14*16(%rsp)
2716 vmovdqa %xmm4, 15*16(%rsp)
2717 vpslld $13, %xmm3, %xmm2
2718 vpslld $13, %xmm7, %xmm6
2719 vpsrld $10, %xmm3, %xmm3
2720 vpsrld $10, %xmm7, %xmm7
2721 vpaddd 7*16(%rax), %xmm0, %xmm0
2722 vpaddd 8*16(%rax), %xmm4, %xmm4
2723 vpsrld $7, %xmm3, %xmm1
2724 vpsrld $7, %xmm7, %xmm5
2725 vpxor %xmm1, %xmm3, %xmm3
2726 vpxor %xmm5, %xmm7, %xmm7
2727 vpsrld $2, %xmm1, %xmm1
2728 vpsrld $2, %xmm5, %xmm5
2729 vpxor %xmm2, %xmm3, %xmm3
2730 vpxor %xmm6, %xmm7, %xmm7
2731 vpslld $2, %xmm2, %xmm2
2732 vpslld $2, %xmm6, %xmm6
2733 vpxor %xmm1, %xmm3, %xmm3
2734 vpxor %xmm5, %xmm7, %xmm7
2735 vpxor %xmm2, %xmm3, %xmm3
2736 vpxor %xmm6, %xmm7, %xmm7
2737 vpaddd %xmm0, %xmm3, %xmm3
2738 vpaddd %xmm4, %xmm7, %xmm7
2739 vmovdqa %xmm3, 14*16(%rax)
2740 vmovdqa %xmm7, 15*16(%rax)
2742 sha256d_ms_4way_avx_extend_loop2:
2743 sha256_avx_extend_doubleround 16
2744 sha256_avx_extend_doubleround 18
2745 sha256_avx_extend_doubleround 20
2746 sha256_avx_extend_doubleround 22
2747 sha256_avx_extend_doubleround 24
2748 sha256_avx_extend_doubleround 26
2749 sha256_avx_extend_doubleround 28
2750 sha256_avx_extend_doubleround 30
2751 sha256_avx_extend_doubleround 32
2752 sha256_avx_extend_doubleround 34
2753 sha256_avx_extend_doubleround 36
2754 sha256_avx_extend_doubleround 38
2755 sha256_avx_extend_doubleround 40
2756 sha256_avx_extend_doubleround 42
2757 jz sha256d_ms_4way_avx_extend_coda2
2758 sha256_avx_extend_doubleround 44
2759 sha256_avx_extend_doubleround 46
movdqa 0(%rcx), %xmm7
movdqa 16(%rcx), %xmm8
movdqa 32(%rcx), %xmm9
movdqa 48(%rcx), %xmm10
movdqa 64(%rcx), %xmm0
movdqa 80(%rcx), %xmm5
movdqa 96(%rcx), %xmm4
movdqa 112(%rcx), %xmm3

movq %rsi, %rax
leaq sha256_4k(%rip), %rcx
jmp sha256d_ms_4way_avx_main_loop1
sha256d_ms_4way_avx_main_loop2:
sha256_avx_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
sha256_avx_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
sha256_avx_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
sha256d_ms_4way_avx_main_loop1:
sha256_avx_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
sha256_avx_main_quadround 4
sha256_avx_main_quadround 8
sha256_avx_main_quadround 12
sha256_avx_main_quadround 16
sha256_avx_main_quadround 20
sha256_avx_main_quadround 24
sha256_avx_main_quadround 28
sha256_avx_main_quadround 32
sha256_avx_main_quadround 36
sha256_avx_main_quadround 40
sha256_avx_main_quadround 44
sha256_avx_main_quadround 48
sha256_avx_main_quadround 52
sha256_avx_main_round 56, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
jz sha256d_ms_4way_avx_finish
sha256_avx_main_round 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
sha256_avx_main_round 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
sha256_avx_main_round 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
sha256_avx_main_quadround 60
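/*
 * Restore the schedule entries that the extension pass consumed
 * destructively.  Slots 18-20, 22-24 and 30-31 of the caller's W
 * array hold partially precomputed values that remain valid across
 * nonces; they were saved to the stack on the way in and are copied
 * back here so the next call finds them intact.
 */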
movdqa 2*16(%rsp), %xmm1
movdqa 3*16(%rsp), %xmm2
movdqa 4*16(%rsp), %xmm6
movdqa %xmm1, 18*16(%rsi)
movdqa %xmm2, 19*16(%rsi)
movdqa %xmm6, 20*16(%rsi)
movdqa 6*16(%rsp), %xmm1
movdqa 7*16(%rsp), %xmm2
movdqa 8*16(%rsp), %xmm6
movdqa %xmm1, 22*16(%rsi)
movdqa %xmm2, 23*16(%rsi)
movdqa %xmm6, 24*16(%rsi)
movdqa 14*16(%rsp), %xmm1
movdqa 15*16(%rsp), %xmm2
movdqa %xmm1, 30*16(%rsi)
movdqa %xmm2, 31*16(%rsi)

paddd 0(%rdx), %xmm7
paddd 16(%rdx), %xmm5
paddd 32(%rdx), %xmm4
paddd 48(%rdx), %xmm3
paddd 64(%rdx), %xmm0
paddd 80(%rdx), %xmm8
paddd 96(%rdx), %xmm9
paddd 112(%rdx), %xmm10

movdqa %xmm7, 0(%rsp)
movdqa %xmm5, 16(%rsp)
movdqa %xmm4, 32(%rsp)
movdqa %xmm3, 48(%rsp)
movdqa %xmm0, 64(%rsp)
movdqa %xmm8, 80(%rsp)
movdqa %xmm9, 96(%rsp)
movdqa %xmm10, 112(%rsp)
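/*
 * Set up the 16-word message block for the second hash: the first
 * eight words (stored just above) are the first hash, followed by
 * standard SHA-256 padding.  0x8000000000000100 packs both non-zero
 * padding words: 0x80000000 (the appended 1 bit, word 8) and
 * 0x00000100 = 256 (the bit length, word 15); words 9-14 are zero.
 */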
pxor %xmm0, %xmm0
movq $0x8000000000000100, %rax
movd %rax, %xmm1
pshufd $0x55, %xmm1, %xmm2
pshufd $0x00, %xmm1, %xmm1
movdqa %xmm2, 128(%rsp)
movdqa %xmm0, 144(%rsp)
movdqa %xmm0, 160(%rsp)
movdqa %xmm0, 176(%rsp)
movdqa %xmm0, 192(%rsp)
movdqa %xmm0, 208(%rsp)
movdqa %xmm0, 224(%rsp)
movdqa %xmm1, 240(%rsp)
leaq 256(%rsp), %rax
cmpq %rax, %rax
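/*
 * The cmpq above sets ZF, so the shared jz exits
 * (..._extend_coda2, ..._finish) are taken on this second-hash pass.
 * The extension below is specialized for the fixed padding block:
 * words 9-14 are zero, so many sigma terms vanish, and the
 * sha256d_4preext2_* constants carry the parts that involve only the
 * padding words.
 */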
vmovdqa -15*16(%rax), %xmm0
vmovdqa -14*16(%rax), %xmm4
vpslld $14, %xmm0, %xmm2
vpslld $14, %xmm4, %xmm6
vpsrld $3, %xmm0, %xmm8
vpsrld $3, %xmm4, %xmm4
vpsrld $7, %xmm0, %xmm1
vpsrld $4, %xmm4, %xmm5
vpxor %xmm1, %xmm8, %xmm8
vpxor %xmm5, %xmm4, %xmm4
vpsrld $11, %xmm1, %xmm1
vpsrld $11, %xmm5, %xmm5
vpxor %xmm2, %xmm8, %xmm8
vpxor %xmm6, %xmm4, %xmm4
vpslld $11, %xmm2, %xmm2
vpslld $11, %xmm6, %xmm6
vpxor %xmm1, %xmm8, %xmm8
vpxor %xmm5, %xmm4, %xmm4
vpxor %xmm2, %xmm8, %xmm8
vpxor %xmm6, %xmm4, %xmm4
vpaddd %xmm0, %xmm4, %xmm4
vpaddd -16*16(%rax), %xmm8, %xmm3
vpaddd sha256d_4preext2_17(%rip), %xmm4, %xmm7
vmovdqa %xmm3, 0*16(%rax)
vmovdqa %xmm7, 1*16(%rax)
sha256_avx_extend_doubleround 2
sha256_avx_extend_doubleround 4

vmovdqa -9*16(%rax), %xmm0
vpslld $14, %xmm0, %xmm2
vpsrld $3, %xmm0, %xmm8
vpsrld $7, %xmm0, %xmm1
vpxor %xmm1, %xmm8, %xmm8
vpxor %xmm2, %xmm8, %xmm8
vpsrld $11, %xmm1, %xmm1
vpslld $11, %xmm2, %xmm2
vpxor %xmm1, %xmm8, %xmm8
vpxor %xmm2, %xmm8, %xmm8
vpaddd sha256d_4preext2_23(%rip), %xmm0, %xmm4
vpaddd -10*16(%rax), %xmm8, %xmm0
vpslld $13, %xmm3, %xmm2
vpslld $13, %xmm7, %xmm6
vpsrld $10, %xmm3, %xmm3
vpsrld $10, %xmm7, %xmm7
vpaddd -1*16(%rax), %xmm0, %xmm0
vpaddd 0*16(%rax), %xmm4, %xmm4
vpsrld $7, %xmm3, %xmm1
vpsrld $7, %xmm7, %xmm5
vpxor %xmm1, %xmm3, %xmm3
vpxor %xmm5, %xmm7, %xmm7
vpsrld $2, %xmm1, %xmm1
vpsrld $2, %xmm5, %xmm5
vpxor %xmm2, %xmm3, %xmm3
vpxor %xmm6, %xmm7, %xmm7
vpslld $2, %xmm2, %xmm2
vpslld $2, %xmm6, %xmm6
vpxor %xmm1, %xmm3, %xmm3
vpxor %xmm5, %xmm7, %xmm7
vpxor %xmm2, %xmm3, %xmm3
vpxor %xmm6, %xmm7, %xmm7
vpaddd %xmm0, %xmm3, %xmm3
vpaddd %xmm4, %xmm7, %xmm7
vmovdqa %xmm3, 6*16(%rax)
vmovdqa %xmm7, 7*16(%rax)

vpslld $13, %xmm3, %xmm2
vpslld $13, %xmm7, %xmm6
vpsrld $10, %xmm3, %xmm3
vpsrld $10, %xmm7, %xmm7
vpsrld $7, %xmm3, %xmm1
vpsrld $7, %xmm7, %xmm5
vpxor %xmm1, %xmm3, %xmm3
vpxor %xmm5, %xmm7, %xmm7
vpsrld $2, %xmm1, %xmm1
vpsrld $2, %xmm5, %xmm5
vpxor %xmm2, %xmm3, %xmm3
vpxor %xmm6, %xmm7, %xmm7
vpslld $2, %xmm2, %xmm2
vpslld $2, %xmm6, %xmm6
vpxor %xmm1, %xmm3, %xmm3
vpxor %xmm5, %xmm7, %xmm7
vpxor %xmm2, %xmm3, %xmm3
vpxor %xmm6, %xmm7, %xmm7
vpaddd sha256d_4preext2_24(%rip), %xmm3, %xmm3
vpaddd 1*16(%rax), %xmm3, %xmm3
vpaddd 2*16(%rax), %xmm7, %xmm7
vmovdqa %xmm3, 8*16(%rax)
vmovdqa %xmm7, 9*16(%rax)

vpslld $13, %xmm3, %xmm2
vpslld $13, %xmm7, %xmm6
vpsrld $10, %xmm3, %xmm3
vpsrld $10, %xmm7, %xmm7
vpsrld $7, %xmm3, %xmm1
vpsrld $7, %xmm7, %xmm5
vpxor %xmm1, %xmm3, %xmm3
vpxor %xmm5, %xmm7, %xmm7
vpsrld $2, %xmm1, %xmm1
vpsrld $2, %xmm5, %xmm5
vpxor %xmm2, %xmm3, %xmm3
vpxor %xmm6, %xmm7, %xmm7
vpslld $2, %xmm2, %xmm2
vpslld $2, %xmm6, %xmm6
vpxor %xmm1, %xmm3, %xmm3
vpxor %xmm5, %xmm7, %xmm7
vpxor %xmm2, %xmm3, %xmm3
vpxor %xmm6, %xmm7, %xmm7
vpaddd 3*16(%rax), %xmm3, %xmm3
vpaddd 4*16(%rax), %xmm7, %xmm7
vmovdqa %xmm3, 10*16(%rax)
vmovdqa %xmm7, 11*16(%rax)

vpslld $13, %xmm3, %xmm2
vpslld $13, %xmm7, %xmm6
vpsrld $10, %xmm3, %xmm3
vpsrld $10, %xmm7, %xmm7
vpsrld $7, %xmm3, %xmm1
vpsrld $7, %xmm7, %xmm5
vpxor %xmm1, %xmm3, %xmm3
vpxor %xmm5, %xmm7, %xmm7
vpsrld $2, %xmm1, %xmm1
vpsrld $2, %xmm5, %xmm5
vpxor %xmm2, %xmm3, %xmm3
vpxor %xmm6, %xmm7, %xmm7
vpslld $2, %xmm2, %xmm2
vpslld $2, %xmm6, %xmm6
vpxor %xmm1, %xmm3, %xmm3
vpxor %xmm5, %xmm7, %xmm7
vpxor %xmm2, %xmm3, %xmm3
vpxor %xmm6, %xmm7, %xmm7
vpaddd 5*16(%rax), %xmm3, %xmm3
vpaddd 6*16(%rax), %xmm7, %xmm7
vmovdqa %xmm3, 12*16(%rax)
vmovdqa %xmm7, 13*16(%rax)

vmovdqa sha256d_4preext2_30(%rip), %xmm0
vmovdqa 0*16(%rax), %xmm4
vpslld $14, %xmm4, %xmm6
vpsrld $3, %xmm4, %xmm4
vpsrld $4, %xmm4, %xmm5
vpxor %xmm5, %xmm4, %xmm4
vpxor %xmm6, %xmm4, %xmm4
vpsrld $11, %xmm5, %xmm5
vpslld $11, %xmm6, %xmm6
vpxor %xmm5, %xmm4, %xmm4
vpxor %xmm6, %xmm4, %xmm4
vpaddd -1*16(%rax), %xmm4, %xmm4
vpslld $13, %xmm3, %xmm2
vpslld $13, %xmm7, %xmm6
vpsrld $10, %xmm3, %xmm3
vpsrld $10, %xmm7, %xmm7
vpaddd 7*16(%rax), %xmm0, %xmm0
vpaddd 8*16(%rax), %xmm4, %xmm4
vpsrld $7, %xmm3, %xmm1
vpsrld $7, %xmm7, %xmm5
vpxor %xmm1, %xmm3, %xmm3
vpxor %xmm5, %xmm7, %xmm7
vpsrld $2, %xmm1, %xmm1
vpsrld $2, %xmm5, %xmm5
vpxor %xmm2, %xmm3, %xmm3
vpxor %xmm6, %xmm7, %xmm7
vpslld $2, %xmm2, %xmm2
vpslld $2, %xmm6, %xmm6
vpxor %xmm1, %xmm3, %xmm3
vpxor %xmm5, %xmm7, %xmm7
vpxor %xmm2, %xmm3, %xmm3
vpxor %xmm6, %xmm7, %xmm7
vpaddd %xmm0, %xmm3, %xmm3
vpaddd %xmm4, %xmm7, %xmm7
vmovdqa %xmm3, 14*16(%rax)
vmovdqa %xmm7, 15*16(%rax)

jmp sha256d_ms_4way_avx_extend_loop2
sha256d_ms_4way_avx_extend_coda2:
sha256_avx_extend_round 44

movdqa sha256_4h+0(%rip), %xmm7
movdqa sha256_4h+16(%rip), %xmm5
movdqa sha256_4h+32(%rip), %xmm4
movdqa sha256_4h+48(%rip), %xmm3
movdqa sha256_4h+64(%rip), %xmm0
movdqa sha256_4h+80(%rip), %xmm8
movdqa sha256_4h+96(%rip), %xmm9
movdqa sha256_4h+112(%rip), %xmm10

movq %rsp, %rax
leaq sha256_4k(%rip), %rcx
jmp sha256d_ms_4way_avx_main_loop2
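/*
 * Reduced final rounds: sha256d_ms only examines one word of the
 * final state, so rounds 57-60 below compute just the e-chain
 * (h + S1(e) + Ch(e,f,g) + K[i] + W[i], added into d) and skip the
 * Maj/S0 half that would update the a-chain.
 */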
.macro sha256_avx_main_round_red i, r0, r1, r2, r3, r4
vpaddd 16*\i(%rax), \r0, %xmm6
vpaddd 16*\i(%rcx), %xmm6, %xmm6
vpandn \r1, \r3, %xmm1
vpand \r3, \r2, %xmm2
vpxor %xmm2, %xmm1, %xmm1
vpaddd %xmm1, %xmm6, %xmm6
vpslld $7, \r3, %xmm1
vpsrld $6, \r3, \r0
vpsrld $5, \r0, %xmm2
vpxor %xmm1, \r0, \r0
vpxor %xmm2, \r0, \r0
vpslld $14, %xmm1, %xmm1
vpsrld $14, %xmm2, %xmm2
vpxor %xmm1, \r0, \r0
vpxor %xmm2, \r0, \r0
vpslld $5, %xmm1, %xmm1
vpxor %xmm1, \r0, \r0
vpaddd \r0, %xmm6, %xmm6
vpaddd %xmm6, \r4, \r0
.endm
sha256d_ms_4way_avx_finish:
sha256_avx_main_round_red 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4
sha256_avx_main_round_red 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5
sha256_avx_main_round_red 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7
sha256_avx_main_round_red 60, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3

paddd sha256_4h+112(%rip), %xmm10
movdqa %xmm10, 112(%rdi)

addq $1032, %rsp
#if defined(_WIN64) || defined(__CYGWIN__)
popq %rsi
movdqa 0(%rsp), %xmm6
movdqa 16(%rsp), %xmm7
movdqa 32(%rsp), %xmm8
movdqa 48(%rsp), %xmm9
movdqa 64(%rsp), %xmm10
addq $80, %rsp
popq %rdi
#endif
ret
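/*
 * XOP variant: identical in structure to the AVX version above, but
 * vprotd gives a one-instruction rotate, so e.g. s1(x) collapses to
 * "vprotd $15 / vprotd $13 / vpsrld $10" plus two xors instead of
 * the longer shift-and-xor chains (ROTR(x,17) == ROTL(x,15), etc.).
 */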
.p2align 6
sha256d_ms_4way_xop:
#if defined(_WIN64) || defined(__CYGWIN__)
pushq %rdi
subq $80, %rsp
movdqa %xmm6, 0(%rsp)
movdqa %xmm7, 16(%rsp)
movdqa %xmm8, 32(%rsp)
movdqa %xmm9, 48(%rsp)
movdqa %xmm10, 64(%rsp)
pushq %rsi
movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
movq %r9, %rcx
#endif
subq $1032, %rsp

leaq 256(%rsi), %rax
sha256d_ms_4way_xop_extend_loop1:
vmovdqa 3*16(%rsi), %xmm0
vmovdqa 2*16(%rax), %xmm3
vmovdqa 3*16(%rax), %xmm7
vmovdqa %xmm3, 2*16(%rsp)
vmovdqa %xmm7, 3*16(%rsp)
vpaddd %xmm0, %xmm7, %xmm7
vprotd $25, %xmm0, %xmm1
vprotd $14, %xmm0, %xmm2
vpsrld $3, %xmm0, %xmm0
vpxor %xmm1, %xmm2, %xmm2
vpxor %xmm2, %xmm0, %xmm0
vpaddd %xmm0, %xmm3, %xmm3
vmovdqa %xmm3, 2*16(%rax)
vmovdqa %xmm7, 3*16(%rax)

vmovdqa 4*16(%rax), %xmm0
vmovdqa %xmm0, 4*16(%rsp)
vprotd $15, %xmm3, %xmm1
vprotd $15, %xmm7, %xmm5
vprotd $13, %xmm3, %xmm2
vprotd $13, %xmm7, %xmm6
vpxor %xmm1, %xmm2, %xmm2
vpxor %xmm5, %xmm6, %xmm6
vpsrld $10, %xmm3, %xmm3
vpsrld $10, %xmm7, %xmm7
vpxor %xmm2, %xmm3, %xmm3
vpxor %xmm6, %xmm7, %xmm7
vpaddd %xmm0, %xmm3, %xmm3
vmovdqa %xmm3, 4*16(%rax)
vmovdqa %xmm7, 5*16(%rax)

vmovdqa 6*16(%rax), %xmm0
vmovdqa 7*16(%rax), %xmm4
vmovdqa %xmm0, 6*16(%rsp)
vmovdqa %xmm4, 7*16(%rsp)
vprotd $15, %xmm3, %xmm1
vprotd $15, %xmm7, %xmm5
vprotd $13, %xmm3, %xmm2
vprotd $13, %xmm7, %xmm6
vpxor %xmm1, %xmm2, %xmm2
vpxor %xmm5, %xmm6, %xmm6
vpsrld $10, %xmm3, %xmm3
vpsrld $10, %xmm7, %xmm7
vpxor %xmm2, %xmm3, %xmm3
vpxor %xmm6, %xmm7, %xmm7
vpaddd %xmm0, %xmm3, %xmm3
vpaddd %xmm4, %xmm7, %xmm7
vmovdqa %xmm3, 6*16(%rax)
vmovdqa %xmm7, 7*16(%rax)

vmovdqa 8*16(%rax), %xmm0
vmovdqa 2*16(%rax), %xmm4
vmovdqa %xmm0, 8*16(%rsp)
vprotd $15, %xmm3, %xmm1
vprotd $15, %xmm7, %xmm5
vprotd $13, %xmm3, %xmm2
vprotd $13, %xmm7, %xmm6
vpxor %xmm1, %xmm2, %xmm2
vpxor %xmm5, %xmm6, %xmm6
vpsrld $10, %xmm3, %xmm3
vpsrld $10, %xmm7, %xmm7
vpxor %xmm2, %xmm3, %xmm3
vpxor %xmm6, %xmm7, %xmm7
vpaddd %xmm0, %xmm3, %xmm3
vpaddd %xmm4, %xmm7, %xmm7
vmovdqa %xmm3, 8*16(%rax)
vmovdqa %xmm7, 9*16(%rax)

vprotd $15, %xmm3, %xmm1
vprotd $15, %xmm7, %xmm5
vprotd $13, %xmm3, %xmm2
vprotd $13, %xmm7, %xmm6
vpxor %xmm1, %xmm2, %xmm2
vpxor %xmm5, %xmm6, %xmm6
vpsrld $10, %xmm3, %xmm3
vpsrld $10, %xmm7, %xmm7
vpxor %xmm2, %xmm3, %xmm3
vpxor %xmm6, %xmm7, %xmm7
vpaddd 3*16(%rax), %xmm3, %xmm3
vpaddd 4*16(%rax), %xmm7, %xmm7
vmovdqa %xmm3, 10*16(%rax)
vmovdqa %xmm7, 11*16(%rax)

vprotd $15, %xmm3, %xmm1
vprotd $15, %xmm7, %xmm5
vprotd $13, %xmm3, %xmm2
vprotd $13, %xmm7, %xmm6
vpxor %xmm1, %xmm2, %xmm2
vpxor %xmm5, %xmm6, %xmm6
vpsrld $10, %xmm3, %xmm3
vpsrld $10, %xmm7, %xmm7
vpxor %xmm2, %xmm3, %xmm3
vpxor %xmm6, %xmm7, %xmm7
vpaddd 5*16(%rax), %xmm3, %xmm3
vpaddd 6*16(%rax), %xmm7, %xmm7
vmovdqa %xmm3, 12*16(%rax)
vmovdqa %xmm7, 13*16(%rax)

vmovdqa 14*16(%rax), %xmm0
vmovdqa 15*16(%rax), %xmm4
vmovdqa %xmm0, 14*16(%rsp)
vmovdqa %xmm4, 15*16(%rsp)
vprotd $15, %xmm3, %xmm1
vprotd $15, %xmm7, %xmm5
vprotd $13, %xmm3, %xmm2
vprotd $13, %xmm7, %xmm6
vpxor %xmm1, %xmm2, %xmm2
vpxor %xmm5, %xmm6, %xmm6
vpaddd 7*16(%rax), %xmm0, %xmm0
vpaddd 8*16(%rax), %xmm4, %xmm4
vpsrld $10, %xmm3, %xmm3
vpsrld $10, %xmm7, %xmm7
vpxor %xmm2, %xmm3, %xmm3
vpxor %xmm6, %xmm7, %xmm7
vpaddd %xmm0, %xmm3, %xmm3
vpaddd %xmm4, %xmm7, %xmm7
vmovdqa %xmm3, 14*16(%rax)
vmovdqa %xmm7, 15*16(%rax)

sha256d_ms_4way_xop_extend_loop2:
sha256_xop_extend_doubleround 16
sha256_xop_extend_doubleround 18
sha256_xop_extend_doubleround 20
sha256_xop_extend_doubleround 22
sha256_xop_extend_doubleround 24
sha256_xop_extend_doubleround 26
sha256_xop_extend_doubleround 28
sha256_xop_extend_doubleround 30
sha256_xop_extend_doubleround 32
sha256_xop_extend_doubleround 34
sha256_xop_extend_doubleround 36
sha256_xop_extend_doubleround 38
sha256_xop_extend_doubleround 40
sha256_xop_extend_doubleround 42
jz sha256d_ms_4way_xop_extend_coda2
sha256_xop_extend_doubleround 44
sha256_xop_extend_doubleround 46

movdqa 0(%rcx), %xmm7
movdqa 16(%rcx), %xmm8
movdqa 32(%rcx), %xmm9
movdqa 48(%rcx), %xmm10
movdqa 64(%rcx), %xmm0
movdqa 80(%rcx), %xmm5
movdqa 96(%rcx), %xmm4
movdqa 112(%rcx), %xmm3
movq %rsi, %rax
leaq sha256_4k(%rip), %rcx
jmp sha256d_ms_4way_xop_main_loop1
sha256d_ms_4way_xop_main_loop2:
sha256_xop_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
sha256_xop_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
sha256_xop_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
sha256d_ms_4way_xop_main_loop1:
sha256_xop_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
sha256_xop_main_quadround 4
sha256_xop_main_quadround 8
sha256_xop_main_quadround 12
sha256_xop_main_quadround 16
sha256_xop_main_quadround 20
sha256_xop_main_quadround 24
sha256_xop_main_quadround 28
sha256_xop_main_quadround 32
sha256_xop_main_quadround 36
sha256_xop_main_quadround 40
sha256_xop_main_quadround 44
sha256_xop_main_quadround 48
sha256_xop_main_quadround 52
sha256_xop_main_round 56, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
jz sha256d_ms_4way_xop_finish
sha256_xop_main_round 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
sha256_xop_main_round 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
sha256_xop_main_round 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
sha256_xop_main_quadround 60
movdqa 2*16(%rsp), %xmm1
movdqa 3*16(%rsp), %xmm2
movdqa 4*16(%rsp), %xmm6
movdqa %xmm1, 18*16(%rsi)
movdqa %xmm2, 19*16(%rsi)
movdqa %xmm6, 20*16(%rsi)
movdqa 6*16(%rsp), %xmm1
movdqa 7*16(%rsp), %xmm2
movdqa 8*16(%rsp), %xmm6
movdqa %xmm1, 22*16(%rsi)
movdqa %xmm2, 23*16(%rsi)
movdqa %xmm6, 24*16(%rsi)
movdqa 14*16(%rsp), %xmm1
movdqa 15*16(%rsp), %xmm2
movdqa %xmm1, 30*16(%rsi)
movdqa %xmm2, 31*16(%rsi)

paddd 0(%rdx), %xmm7
paddd 16(%rdx), %xmm5
paddd 32(%rdx), %xmm4
paddd 48(%rdx), %xmm3
paddd 64(%rdx), %xmm0
paddd 80(%rdx), %xmm8
paddd 96(%rdx), %xmm9
paddd 112(%rdx), %xmm10

movdqa %xmm7, 0(%rsp)
movdqa %xmm5, 16(%rsp)
movdqa %xmm4, 32(%rsp)
movdqa %xmm3, 48(%rsp)
movdqa %xmm0, 64(%rsp)
movdqa %xmm8, 80(%rsp)
movdqa %xmm9, 96(%rsp)
movdqa %xmm10, 112(%rsp)
pxor %xmm0, %xmm0
movq $0x8000000000000100, %rax
movd %rax, %xmm1
pshufd $0x55, %xmm1, %xmm2
pshufd $0x00, %xmm1, %xmm1
movdqa %xmm2, 128(%rsp)
movdqa %xmm0, 144(%rsp)
movdqa %xmm0, 160(%rsp)
movdqa %xmm0, 176(%rsp)
movdqa %xmm0, 192(%rsp)
movdqa %xmm0, 208(%rsp)
movdqa %xmm0, 224(%rsp)
movdqa %xmm1, 240(%rsp)
leaq 256(%rsp), %rax
cmpq %rax, %rax
vmovdqa -15*16(%rax), %xmm0
vmovdqa -14*16(%rax), %xmm4
vprotd $25, %xmm0, %xmm1
vprotd $25, %xmm4, %xmm5
vprotd $14, %xmm0, %xmm2
vprotd $14, %xmm4, %xmm6
vpxor %xmm1, %xmm2, %xmm2
vpxor %xmm5, %xmm6, %xmm6
vpsrld $3, %xmm0, %xmm8
vpsrld $3, %xmm4, %xmm4
vpxor %xmm2, %xmm8, %xmm8
vpxor %xmm6, %xmm4, %xmm4
vpaddd %xmm0, %xmm4, %xmm4
vpaddd -16*16(%rax), %xmm8, %xmm3
vpaddd sha256d_4preext2_17(%rip), %xmm4, %xmm7
vmovdqa %xmm3, 0*16(%rax)
vmovdqa %xmm7, 1*16(%rax)

sha256_xop_extend_doubleround 2
sha256_xop_extend_doubleround 4

vmovdqa -9*16(%rax), %xmm0
vprotd $25, %xmm0, %xmm1
vprotd $14, %xmm0, %xmm2
vpsrld $3, %xmm0, %xmm8
vpxor %xmm1, %xmm2, %xmm2
vpxor %xmm2, %xmm8, %xmm8
vpaddd sha256d_4preext2_23(%rip), %xmm0, %xmm4
vpaddd -10*16(%rax), %xmm8, %xmm0
vprotd $15, %xmm3, %xmm1
vprotd $15, %xmm7, %xmm5
vprotd $13, %xmm3, %xmm2
vprotd $13, %xmm7, %xmm6
vpxor %xmm1, %xmm2, %xmm2
vpxor %xmm5, %xmm6, %xmm6
vpaddd -1*16(%rax), %xmm0, %xmm0
vpaddd 0*16(%rax), %xmm4, %xmm4
vpsrld $10, %xmm3, %xmm3
vpsrld $10, %xmm7, %xmm7
vpxor %xmm2, %xmm3, %xmm3
vpxor %xmm6, %xmm7, %xmm7
vpaddd %xmm0, %xmm3, %xmm3
vpaddd %xmm4, %xmm7, %xmm7
vmovdqa %xmm3, 6*16(%rax)
vmovdqa %xmm7, 7*16(%rax)

vprotd $15, %xmm3, %xmm1
vprotd $15, %xmm7, %xmm5
vprotd $13, %xmm3, %xmm2
vprotd $13, %xmm7, %xmm6
vpxor %xmm1, %xmm2, %xmm2
vpxor %xmm5, %xmm6, %xmm6
vpsrld $10, %xmm3, %xmm3
vpsrld $10, %xmm7, %xmm7
vpxor %xmm2, %xmm3, %xmm3
vpxor %xmm6, %xmm7, %xmm7
vpaddd sha256d_4preext2_24(%rip), %xmm3, %xmm3
vpaddd 1*16(%rax), %xmm3, %xmm3
vpaddd 2*16(%rax), %xmm7, %xmm7
vmovdqa %xmm3, 8*16(%rax)
vmovdqa %xmm7, 9*16(%rax)

vprotd $15, %xmm3, %xmm1
vprotd $15, %xmm7, %xmm5
vprotd $13, %xmm3, %xmm2
vprotd $13, %xmm7, %xmm6
vpxor %xmm1, %xmm2, %xmm2
vpxor %xmm5, %xmm6, %xmm6
vpsrld $10, %xmm3, %xmm3
vpsrld $10, %xmm7, %xmm7
vpxor %xmm2, %xmm3, %xmm3
vpxor %xmm6, %xmm7, %xmm7
vpaddd 3*16(%rax), %xmm3, %xmm3
vpaddd 4*16(%rax), %xmm7, %xmm7
vmovdqa %xmm3, 10*16(%rax)
vmovdqa %xmm7, 11*16(%rax)

vprotd $15, %xmm3, %xmm1
vprotd $15, %xmm7, %xmm5
vprotd $13, %xmm3, %xmm2
vprotd $13, %xmm7, %xmm6
vpxor %xmm1, %xmm2, %xmm2
vpxor %xmm5, %xmm6, %xmm6
vpsrld $10, %xmm3, %xmm3
vpsrld $10, %xmm7, %xmm7
vpxor %xmm2, %xmm3, %xmm3
vpxor %xmm6, %xmm7, %xmm7
vpaddd 5*16(%rax), %xmm3, %xmm3
vpaddd 6*16(%rax), %xmm7, %xmm7
vmovdqa %xmm3, 12*16(%rax)
vmovdqa %xmm7, 13*16(%rax)

vmovdqa sha256d_4preext2_30(%rip), %xmm0
vmovdqa 0*16(%rax), %xmm4
vprotd $25, %xmm4, %xmm5
vprotd $14, %xmm4, %xmm6
vpxor %xmm5, %xmm6, %xmm6
vpsrld $3, %xmm4, %xmm4
vpxor %xmm6, %xmm4, %xmm4
vpaddd -1*16(%rax), %xmm4, %xmm4
vprotd $15, %xmm3, %xmm1
vprotd $15, %xmm7, %xmm5
vprotd $13, %xmm3, %xmm2
vprotd $13, %xmm7, %xmm6
vpxor %xmm1, %xmm2, %xmm2
vpxor %xmm5, %xmm6, %xmm6
vpaddd 7*16(%rax), %xmm0, %xmm0
vpaddd 8*16(%rax), %xmm4, %xmm4
vpsrld $10, %xmm3, %xmm3
vpsrld $10, %xmm7, %xmm7
vpxor %xmm2, %xmm3, %xmm3
vpxor %xmm6, %xmm7, %xmm7
vpaddd %xmm0, %xmm3, %xmm3
vpaddd %xmm4, %xmm7, %xmm7
vmovdqa %xmm3, 14*16(%rax)
vmovdqa %xmm7, 15*16(%rax)

jmp sha256d_ms_4way_xop_extend_loop2

sha256d_ms_4way_xop_extend_coda2:
sha256_xop_extend_round 44

movdqa sha256_4h+0(%rip), %xmm7
movdqa sha256_4h+16(%rip), %xmm5
movdqa sha256_4h+32(%rip), %xmm4
movdqa sha256_4h+48(%rip), %xmm3
movdqa sha256_4h+64(%rip), %xmm0
movdqa sha256_4h+80(%rip), %xmm8
movdqa sha256_4h+96(%rip), %xmm9
movdqa sha256_4h+112(%rip), %xmm10
movq %rsp, %rax
leaq sha256_4k(%rip), %rcx
jmp sha256d_ms_4way_xop_main_loop2
.macro sha256_xop_main_round_red i, r0, r1, r2, r3, r4
vpaddd 16*\i(%rax), \r0, %xmm6
vpaddd 16*\i(%rcx), %xmm6, %xmm6
vpandn \r1, \r3, %xmm1
vpand \r3, \r2, %xmm2
vpxor %xmm2, %xmm1, %xmm1
vpaddd %xmm1, %xmm6, %xmm6
vprotd $26, \r3, %xmm1
vprotd $21, \r3, %xmm2
vpxor %xmm1, %xmm2, %xmm2
vprotd $7, \r3, \r0
vpxor %xmm2, \r0, \r0
vpaddd \r0, %xmm6, %xmm6
vpaddd %xmm6, \r4, \r0
.endm
sha256d_ms_4way_xop_finish:
sha256_xop_main_round_red 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4
sha256_xop_main_round_red 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5
sha256_xop_main_round_red 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7
sha256_xop_main_round_red 60, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3

paddd sha256_4h+112(%rip), %xmm10
movdqa %xmm10, 112(%rdi)
addq $1032, %rsp
#if defined(_WIN64) || defined(__CYGWIN__)
popq %rsi
movdqa 0(%rsp), %xmm6
movdqa 16(%rsp), %xmm7
movdqa 32(%rsp), %xmm8
movdqa 48(%rsp), %xmm9
movdqa 64(%rsp), %xmm10
addq $80, %rsp
popq %rdi
#endif
ret
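/*
 * Runtime dispatch: sha256_use_4way probes CPUID once and patches the
 * sha256d_ms_4way_addr / sha256_transform_4way_core_addr cells used
 * below to the best available implementation (PadLock PHE, XOP, AVX,
 * or the SSE2 baseline).  A hedged sketch of a caller in C, assuming
 * only what the returns below imply (nonzero when the 4-way SIMD path
 * was wired up):
 *
 *     if (sha256_use_4way())
 *             scan_with_4way_kernels();   // hypothetical caller
 */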
.p2align 6
.globl sha256_use_4way
.globl _sha256_use_4way
sha256_use_4way:
_sha256_use_4way:
pushq %rbx

/* Check for VIA PadLock Hash Engine */
movl $0xc0000000, %eax
cpuid
cmpl $0xc0000001, %eax
jb sha256_use_4way_no_phe
movl $0xc0000001, %eax
cpuid
andl $0x00000c00, %edx
cmpl $0x00000c00, %edx
jne sha256_use_4way_no_phe
leaq sha256_transform_phe(%rip), %rdx
movq %rdx, sha256_transform_addr(%rip)
xorl %eax, %eax
jmp sha256_use_4way_exit
sha256_use_4way_no_phe:
/* Check for AVX and OSXSAVE support */
movl $1, %eax
cpuid
andl $0x18000000, %ecx
cmpl $0x18000000, %ecx
jne sha256_use_4way_base
/* Check for XMM and YMM state support */
xorl %ecx, %ecx
xgetbv
andl $0x00000006, %eax
cmpl $0x00000006, %eax
jne sha256_use_4way_base
/* Check for XOP support */
movl $0x80000001, %eax
cpuid
andl $0x00000800, %ecx
jz sha256_use_4way_avx

sha256_use_4way_xop:
leaq sha256d_ms_4way_xop(%rip), %rcx
leaq sha256_transform_4way_core_xop(%rip), %rdx
jmp sha256_use_4way_done

sha256_use_4way_avx:
leaq sha256d_ms_4way_avx(%rip), %rcx
leaq sha256_transform_4way_core_avx(%rip), %rdx
jmp sha256_use_4way_done

sha256_use_4way_base:
leaq sha256d_ms_4way_sse2(%rip), %rcx
leaq sha256_transform_4way_core_sse2(%rip), %rdx

sha256_use_4way_done:
movq %rcx, sha256d_ms_4way_addr(%rip)
movq %rdx, sha256_transform_4way_core_addr(%rip)
movl $1, %eax
sha256_use_4way_exit:
popq %rbx
ret
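/*
 * 8-way AVX2 variant of sha256d_ms: same algorithm as the 4-way code
 * above, with eight lanes per ymm register and all array strides
 * doubled from 16 to 32 bytes.
 */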
.p2align 6
.globl sha256d_ms_8way
.globl _sha256d_ms_8way
sha256d_ms_8way:
_sha256d_ms_8way:
sha256d_ms_8way_avx2:
#if defined(_WIN64) || defined(__CYGWIN__)
pushq %rdi
subq $80, %rsp
vmovdqa %xmm6, 0(%rsp)
vmovdqa %xmm7, 16(%rsp)
vmovdqa %xmm8, 32(%rsp)
vmovdqa %xmm9, 48(%rsp)
vmovdqa %xmm10, 64(%rsp)
pushq %rsi
movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
movq %r9, %rcx
#endif
pushq %rbp
movq %rsp, %rbp
subq $64*32, %rsp
andq $-128, %rsp

leaq 16*32(%rsi), %rax
sha256d_ms_8way_avx2_extend_loop1:
vmovdqa 3*32(%rsi), %ymm0
vmovdqa 2*32(%rax), %ymm3
vmovdqa 3*32(%rax), %ymm7
vmovdqa %ymm3, 2*32(%rsp)
vmovdqa %ymm7, 3*32(%rsp)
vpaddd %ymm0, %ymm7, %ymm7
vpslld $14, %ymm0, %ymm2
vpsrld $3, %ymm0, %ymm0
vpsrld $4, %ymm0, %ymm1
vpxor %ymm1, %ymm0, %ymm0
vpxor %ymm2, %ymm0, %ymm0
vpsrld $11, %ymm1, %ymm1
vpslld $11, %ymm2, %ymm2
vpxor %ymm1, %ymm0, %ymm0
vpxor %ymm2, %ymm0, %ymm0
vpaddd %ymm0, %ymm3, %ymm3
vmovdqa %ymm3, 2*32(%rax)
vmovdqa %ymm7, 3*32(%rax)

vmovdqa 4*32(%rax), %ymm0
vmovdqa %ymm0, 4*32(%rsp)
vpslld $13, %ymm3, %ymm2
vpslld $13, %ymm7, %ymm6
vpsrld $10, %ymm3, %ymm3
vpsrld $10, %ymm7, %ymm7
vpsrld $7, %ymm3, %ymm1
vpsrld $7, %ymm7, %ymm5
vpxor %ymm1, %ymm3, %ymm3
vpxor %ymm5, %ymm7, %ymm7
vpsrld $2, %ymm1, %ymm1
vpsrld $2, %ymm5, %ymm5
vpxor %ymm2, %ymm3, %ymm3
vpxor %ymm6, %ymm7, %ymm7
vpslld $2, %ymm2, %ymm2
vpslld $2, %ymm6, %ymm6
vpxor %ymm1, %ymm3, %ymm3
vpxor %ymm5, %ymm7, %ymm7
vpxor %ymm2, %ymm3, %ymm3
vpxor %ymm6, %ymm7, %ymm7
vpaddd %ymm0, %ymm3, %ymm3
vmovdqa %ymm3, 4*32(%rax)
vmovdqa %ymm7, 5*32(%rax)

vmovdqa 6*32(%rax), %ymm0
vmovdqa 7*32(%rax), %ymm4
vmovdqa %ymm0, 6*32(%rsp)
vmovdqa %ymm4, 7*32(%rsp)
vpslld $13, %ymm3, %ymm2
vpslld $13, %ymm7, %ymm6
vpsrld $10, %ymm3, %ymm3
vpsrld $10, %ymm7, %ymm7
vpsrld $7, %ymm3, %ymm1
vpsrld $7, %ymm7, %ymm5
vpxor %ymm1, %ymm3, %ymm3
vpxor %ymm5, %ymm7, %ymm7
vpsrld $2, %ymm1, %ymm1
vpsrld $2, %ymm5, %ymm5
vpxor %ymm2, %ymm3, %ymm3
vpxor %ymm6, %ymm7, %ymm7
vpslld $2, %ymm2, %ymm2
vpslld $2, %ymm6, %ymm6
vpxor %ymm1, %ymm3, %ymm3
vpxor %ymm5, %ymm7, %ymm7
vpxor %ymm2, %ymm3, %ymm3
vpxor %ymm6, %ymm7, %ymm7
vpaddd %ymm0, %ymm3, %ymm3
vpaddd %ymm4, %ymm7, %ymm7
vmovdqa %ymm3, 6*32(%rax)
vmovdqa %ymm7, 7*32(%rax)

vmovdqa 8*32(%rax), %ymm0
vmovdqa 2*32(%rax), %ymm4
vmovdqa %ymm0, 8*32(%rsp)
vpslld $13, %ymm3, %ymm2
vpslld $13, %ymm7, %ymm6
vpsrld $10, %ymm3, %ymm3
vpsrld $10, %ymm7, %ymm7
vpsrld $7, %ymm3, %ymm1
vpsrld $7, %ymm7, %ymm5
vpxor %ymm1, %ymm3, %ymm3
vpxor %ymm5, %ymm7, %ymm7
vpsrld $2, %ymm1, %ymm1
vpsrld $2, %ymm5, %ymm5
vpxor %ymm2, %ymm3, %ymm3
vpxor %ymm6, %ymm7, %ymm7
vpslld $2, %ymm2, %ymm2
vpslld $2, %ymm6, %ymm6
vpxor %ymm1, %ymm3, %ymm3
vpxor %ymm5, %ymm7, %ymm7
vpxor %ymm2, %ymm3, %ymm3
vpxor %ymm6, %ymm7, %ymm7
vpaddd %ymm0, %ymm3, %ymm3
vpaddd %ymm4, %ymm7, %ymm7
vmovdqa %ymm3, 8*32(%rax)
vmovdqa %ymm7, 9*32(%rax)

vpslld $13, %ymm3, %ymm2
vpslld $13, %ymm7, %ymm6
vpsrld $10, %ymm3, %ymm3
vpsrld $10, %ymm7, %ymm7
vpsrld $7, %ymm3, %ymm1
vpsrld $7, %ymm7, %ymm5
vpxor %ymm1, %ymm3, %ymm3
vpxor %ymm5, %ymm7, %ymm7
vpsrld $2, %ymm1, %ymm1
vpsrld $2, %ymm5, %ymm5
vpxor %ymm2, %ymm3, %ymm3
vpxor %ymm6, %ymm7, %ymm7
vpslld $2, %ymm2, %ymm2
vpslld $2, %ymm6, %ymm6
vpxor %ymm1, %ymm3, %ymm3
vpxor %ymm5, %ymm7, %ymm7
vpxor %ymm2, %ymm3, %ymm3
vpxor %ymm6, %ymm7, %ymm7
vpaddd 3*32(%rax), %ymm3, %ymm3
vpaddd 4*32(%rax), %ymm7, %ymm7
vmovdqa %ymm3, 10*32(%rax)
vmovdqa %ymm7, 11*32(%rax)

vpslld $13, %ymm3, %ymm2
vpslld $13, %ymm7, %ymm6
vpsrld $10, %ymm3, %ymm3
vpsrld $10, %ymm7, %ymm7
vpsrld $7, %ymm3, %ymm1
vpsrld $7, %ymm7, %ymm5
vpxor %ymm1, %ymm3, %ymm3
vpxor %ymm5, %ymm7, %ymm7
vpsrld $2, %ymm1, %ymm1
vpsrld $2, %ymm5, %ymm5
vpxor %ymm2, %ymm3, %ymm3
vpxor %ymm6, %ymm7, %ymm7
vpslld $2, %ymm2, %ymm2
vpslld $2, %ymm6, %ymm6
vpxor %ymm1, %ymm3, %ymm3
vpxor %ymm5, %ymm7, %ymm7
vpxor %ymm2, %ymm3, %ymm3
vpxor %ymm6, %ymm7, %ymm7
vpaddd 5*32(%rax), %ymm3, %ymm3
vpaddd 6*32(%rax), %ymm7, %ymm7
vmovdqa %ymm3, 12*32(%rax)
vmovdqa %ymm7, 13*32(%rax)

vmovdqa 14*32(%rax), %ymm0
vmovdqa 15*32(%rax), %ymm4
vmovdqa %ymm0, 14*32(%rsp)
vmovdqa %ymm4, 15*32(%rsp)
vpslld $13, %ymm3, %ymm2
vpslld $13, %ymm7, %ymm6
vpsrld $10, %ymm3, %ymm3
vpsrld $10, %ymm7, %ymm7
vpaddd 7*32(%rax), %ymm0, %ymm0
vpaddd 8*32(%rax), %ymm4, %ymm4
vpsrld $7, %ymm3, %ymm1
vpsrld $7, %ymm7, %ymm5
vpxor %ymm1, %ymm3, %ymm3
vpxor %ymm5, %ymm7, %ymm7
vpsrld $2, %ymm1, %ymm1
vpsrld $2, %ymm5, %ymm5
vpxor %ymm2, %ymm3, %ymm3
vpxor %ymm6, %ymm7, %ymm7
vpslld $2, %ymm2, %ymm2
vpslld $2, %ymm6, %ymm6
vpxor %ymm1, %ymm3, %ymm3
vpxor %ymm5, %ymm7, %ymm7
vpxor %ymm2, %ymm3, %ymm3
vpxor %ymm6, %ymm7, %ymm7
vpaddd %ymm0, %ymm3, %ymm3
vpaddd %ymm4, %ymm7, %ymm7
vmovdqa %ymm3, 14*32(%rax)
vmovdqa %ymm7, 15*32(%rax)

sha256d_ms_8way_avx2_extend_loop2:
sha256_avx2_extend_doubleround 16
sha256_avx2_extend_doubleround 18
sha256_avx2_extend_doubleround 20
sha256_avx2_extend_doubleround 22
sha256_avx2_extend_doubleround 24
sha256_avx2_extend_doubleround 26
sha256_avx2_extend_doubleround 28
sha256_avx2_extend_doubleround 30
sha256_avx2_extend_doubleround 32
sha256_avx2_extend_doubleround 34
sha256_avx2_extend_doubleround 36
sha256_avx2_extend_doubleround 38
sha256_avx2_extend_doubleround 40
sha256_avx2_extend_doubleround 42
jz sha256d_ms_8way_avx2_extend_coda2
sha256_avx2_extend_doubleround 44
sha256_avx2_extend_doubleround 46

vmovdqa 0(%rcx), %ymm7
vmovdqa 32(%rcx), %ymm8
vmovdqa 64(%rcx), %ymm9
vmovdqa 96(%rcx), %ymm10
vmovdqa 128(%rcx), %ymm0
vmovdqa 160(%rcx), %ymm5
vmovdqa 192(%rcx), %ymm4
vmovdqa 224(%rcx), %ymm3
movq %rsi, %rax
leaq sha256_8k(%rip), %rcx
jmp sha256d_ms_8way_avx2_main_loop1
sha256d_ms_8way_avx2_main_loop2:
sha256_avx2_main_round 0, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7
sha256_avx2_main_round 1, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3
sha256_avx2_main_round 2, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4
sha256d_ms_8way_avx2_main_loop1:
sha256_avx2_main_round 3, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5
sha256_avx2_main_quadround 4
sha256_avx2_main_quadround 8
sha256_avx2_main_quadround 12
sha256_avx2_main_quadround 16
sha256_avx2_main_quadround 20
sha256_avx2_main_quadround 24
sha256_avx2_main_quadround 28
sha256_avx2_main_quadround 32
sha256_avx2_main_quadround 36
sha256_avx2_main_quadround 40
sha256_avx2_main_quadround 44
sha256_avx2_main_quadround 48
sha256_avx2_main_quadround 52
sha256_avx2_main_round 56, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7
jz sha256d_ms_8way_avx2_finish
sha256_avx2_main_round 57, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3
sha256_avx2_main_round 58, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4
sha256_avx2_main_round 59, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5
sha256_avx2_main_quadround 60
vmovdqa 2*32(%rsp), %ymm1
vmovdqa 3*32(%rsp), %ymm2
vmovdqa 4*32(%rsp), %ymm6
vmovdqa %ymm1, 18*32(%rsi)
vmovdqa %ymm2, 19*32(%rsi)
vmovdqa %ymm6, 20*32(%rsi)
vmovdqa 6*32(%rsp), %ymm1
vmovdqa 7*32(%rsp), %ymm2
vmovdqa 8*32(%rsp), %ymm6
vmovdqa %ymm1, 22*32(%rsi)
vmovdqa %ymm2, 23*32(%rsi)
vmovdqa %ymm6, 24*32(%rsi)
vmovdqa 14*32(%rsp), %ymm1
vmovdqa 15*32(%rsp), %ymm2
vmovdqa %ymm1, 30*32(%rsi)
vmovdqa %ymm2, 31*32(%rsi)

vpaddd 0(%rdx), %ymm7, %ymm7
vpaddd 32(%rdx), %ymm5, %ymm5
vpaddd 64(%rdx), %ymm4, %ymm4
vpaddd 96(%rdx), %ymm3, %ymm3
vpaddd 128(%rdx), %ymm0, %ymm0
vpaddd 160(%rdx), %ymm8, %ymm8
vpaddd 192(%rdx), %ymm9, %ymm9
vpaddd 224(%rdx), %ymm10, %ymm10

vmovdqa %ymm7, 0(%rsp)
vmovdqa %ymm5, 32(%rsp)
vmovdqa %ymm4, 64(%rsp)
vmovdqa %ymm3, 96(%rsp)
vmovdqa %ymm0, 128(%rsp)
vmovdqa %ymm8, 160(%rsp)
vmovdqa %ymm9, 192(%rsp)
vmovdqa %ymm10, 224(%rsp)
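/*
 * Second-block padding, 8-way: the packed padding constant is moved
 * into xmm1 and replicated into both 128-bit halves with vinserti128,
 * then vpshufd broadcasts word 8 (0x80000000, the appended 1 bit) and
 * word 15 (0x00000100 = 256, the bit length) across all eight lanes.
 */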
vpxor %ymm0, %ymm0, %ymm0
movq $0x8000000000000100, %rax
vmovq %rax, %xmm1
vinserti128 $1, %xmm1, %ymm1, %ymm1
vpshufd $0x55, %ymm1, %ymm2
vpshufd $0x00, %ymm1, %ymm1
vmovdqa %ymm2, 8*32(%rsp)
vmovdqa %ymm0, 9*32(%rsp)
vmovdqa %ymm0, 10*32(%rsp)
vmovdqa %ymm0, 11*32(%rsp)
vmovdqa %ymm0, 12*32(%rsp)
vmovdqa %ymm0, 13*32(%rsp)
vmovdqa %ymm0, 14*32(%rsp)
vmovdqa %ymm1, 15*32(%rsp)
leaq 16*32(%rsp), %rax
cmpq %rax, %rax
vmovdqa -15*32(%rax), %ymm0
vmovdqa -14*32(%rax), %ymm4
vpslld $14, %ymm0, %ymm2
vpslld $14, %ymm4, %ymm6
vpsrld $3, %ymm0, %ymm8
vpsrld $3, %ymm4, %ymm4
vpsrld $7, %ymm0, %ymm1
vpsrld $4, %ymm4, %ymm5
vpxor %ymm1, %ymm8, %ymm8
vpxor %ymm5, %ymm4, %ymm4
vpsrld $11, %ymm1, %ymm1
vpsrld $11, %ymm5, %ymm5
vpxor %ymm2, %ymm8, %ymm8
vpxor %ymm6, %ymm4, %ymm4
vpslld $11, %ymm2, %ymm2
vpslld $11, %ymm6, %ymm6
vpxor %ymm1, %ymm8, %ymm8
vpxor %ymm5, %ymm4, %ymm4
vpxor %ymm2, %ymm8, %ymm8
vpxor %ymm6, %ymm4, %ymm4
vpaddd %ymm0, %ymm4, %ymm4
vpaddd -16*32(%rax), %ymm8, %ymm3
vpaddd sha256d_8preext2_17(%rip), %ymm4, %ymm7
vmovdqa %ymm3, 0*32(%rax)
vmovdqa %ymm7, 1*32(%rax)

sha256_avx2_extend_doubleround 2
sha256_avx2_extend_doubleround 4

vmovdqa -9*32(%rax), %ymm0
vpslld $14, %ymm0, %ymm2
vpsrld $3, %ymm0, %ymm8
vpsrld $7, %ymm0, %ymm1
vpxor %ymm1, %ymm8, %ymm8
vpxor %ymm2, %ymm8, %ymm8
vpsrld $11, %ymm1, %ymm1
vpslld $11, %ymm2, %ymm2
vpxor %ymm1, %ymm8, %ymm8
vpxor %ymm2, %ymm8, %ymm8
vpaddd sha256d_8preext2_23(%rip), %ymm0, %ymm4
vpaddd -10*32(%rax), %ymm8, %ymm0
vpslld $13, %ymm3, %ymm2
vpslld $13, %ymm7, %ymm6
vpsrld $10, %ymm3, %ymm3
vpsrld $10, %ymm7, %ymm7
vpaddd -1*32(%rax), %ymm0, %ymm0
vpaddd 0*32(%rax), %ymm4, %ymm4
vpsrld $7, %ymm3, %ymm1
vpsrld $7, %ymm7, %ymm5
vpxor %ymm1, %ymm3, %ymm3
vpxor %ymm5, %ymm7, %ymm7
vpsrld $2, %ymm1, %ymm1
vpsrld $2, %ymm5, %ymm5
vpxor %ymm2, %ymm3, %ymm3
vpxor %ymm6, %ymm7, %ymm7
vpslld $2, %ymm2, %ymm2
vpslld $2, %ymm6, %ymm6
vpxor %ymm1, %ymm3, %ymm3
vpxor %ymm5, %ymm7, %ymm7
vpxor %ymm2, %ymm3, %ymm3
vpxor %ymm6, %ymm7, %ymm7
vpaddd %ymm0, %ymm3, %ymm3
vpaddd %ymm4, %ymm7, %ymm7
vmovdqa %ymm3, 6*32(%rax)
vmovdqa %ymm7, 7*32(%rax)

vpslld $13, %ymm3, %ymm2
vpslld $13, %ymm7, %ymm6
vpsrld $10, %ymm3, %ymm3
vpsrld $10, %ymm7, %ymm7
vpsrld $7, %ymm3, %ymm1
vpsrld $7, %ymm7, %ymm5
vpxor %ymm1, %ymm3, %ymm3
vpxor %ymm5, %ymm7, %ymm7
vpsrld $2, %ymm1, %ymm1
vpsrld $2, %ymm5, %ymm5
vpxor %ymm2, %ymm3, %ymm3
vpxor %ymm6, %ymm7, %ymm7
vpslld $2, %ymm2, %ymm2
vpslld $2, %ymm6, %ymm6
vpxor %ymm1, %ymm3, %ymm3
vpxor %ymm5, %ymm7, %ymm7
vpxor %ymm2, %ymm3, %ymm3
vpxor %ymm6, %ymm7, %ymm7
vpaddd sha256d_8preext2_24(%rip), %ymm3, %ymm3
vpaddd 1*32(%rax), %ymm3, %ymm3
vpaddd 2*32(%rax), %ymm7, %ymm7
vmovdqa %ymm3, 8*32(%rax)
vmovdqa %ymm7, 9*32(%rax)

vpslld $13, %ymm3, %ymm2
vpslld $13, %ymm7, %ymm6
vpsrld $10, %ymm3, %ymm3
vpsrld $10, %ymm7, %ymm7
vpsrld $7, %ymm3, %ymm1
vpsrld $7, %ymm7, %ymm5
vpxor %ymm1, %ymm3, %ymm3
vpxor %ymm5, %ymm7, %ymm7
vpsrld $2, %ymm1, %ymm1
vpsrld $2, %ymm5, %ymm5
vpxor %ymm2, %ymm3, %ymm3
vpxor %ymm6, %ymm7, %ymm7
vpslld $2, %ymm2, %ymm2
vpslld $2, %ymm6, %ymm6
vpxor %ymm1, %ymm3, %ymm3
vpxor %ymm5, %ymm7, %ymm7
vpxor %ymm2, %ymm3, %ymm3
vpxor %ymm6, %ymm7, %ymm7
vpaddd 3*32(%rax), %ymm3, %ymm3
vpaddd 4*32(%rax), %ymm7, %ymm7
vmovdqa %ymm3, 10*32(%rax)
vmovdqa %ymm7, 11*32(%rax)

vpslld $13, %ymm3, %ymm2
vpslld $13, %ymm7, %ymm6
vpsrld $10, %ymm3, %ymm3
vpsrld $10, %ymm7, %ymm7
vpsrld $7, %ymm3, %ymm1
vpsrld $7, %ymm7, %ymm5
vpxor %ymm1, %ymm3, %ymm3
vpxor %ymm5, %ymm7, %ymm7
vpsrld $2, %ymm1, %ymm1
vpsrld $2, %ymm5, %ymm5
vpxor %ymm2, %ymm3, %ymm3
vpxor %ymm6, %ymm7, %ymm7
vpslld $2, %ymm2, %ymm2
vpslld $2, %ymm6, %ymm6
vpxor %ymm1, %ymm3, %ymm3
vpxor %ymm5, %ymm7, %ymm7
vpxor %ymm2, %ymm3, %ymm3
vpxor %ymm6, %ymm7, %ymm7
vpaddd 5*32(%rax), %ymm3, %ymm3
vpaddd 6*32(%rax), %ymm7, %ymm7
vmovdqa %ymm3, 12*32(%rax)
vmovdqa %ymm7, 13*32(%rax)

vmovdqa sha256d_8preext2_30(%rip), %ymm0
vmovdqa 0*32(%rax), %ymm4
vpslld $14, %ymm4, %ymm6
vpsrld $3, %ymm4, %ymm4
vpsrld $4, %ymm4, %ymm5
vpxor %ymm5, %ymm4, %ymm4
vpxor %ymm6, %ymm4, %ymm4
vpsrld $11, %ymm5, %ymm5
vpslld $11, %ymm6, %ymm6
vpxor %ymm5, %ymm4, %ymm4
vpxor %ymm6, %ymm4, %ymm4
vpaddd -1*32(%rax), %ymm4, %ymm4
vpslld $13, %ymm3, %ymm2
vpslld $13, %ymm7, %ymm6
vpsrld $10, %ymm3, %ymm3
vpsrld $10, %ymm7, %ymm7
vpaddd 7*32(%rax), %ymm0, %ymm0
vpaddd 8*32(%rax), %ymm4, %ymm4
vpsrld $7, %ymm3, %ymm1
vpsrld $7, %ymm7, %ymm5
vpxor %ymm1, %ymm3, %ymm3
vpxor %ymm5, %ymm7, %ymm7
vpsrld $2, %ymm1, %ymm1
vpsrld $2, %ymm5, %ymm5
vpxor %ymm2, %ymm3, %ymm3
vpxor %ymm6, %ymm7, %ymm7
vpslld $2, %ymm2, %ymm2
vpslld $2, %ymm6, %ymm6
vpxor %ymm1, %ymm3, %ymm3
vpxor %ymm5, %ymm7, %ymm7
vpxor %ymm2, %ymm3, %ymm3
vpxor %ymm6, %ymm7, %ymm7
vpaddd %ymm0, %ymm3, %ymm3
vpaddd %ymm4, %ymm7, %ymm7
vmovdqa %ymm3, 14*32(%rax)
vmovdqa %ymm7, 15*32(%rax)

jmp sha256d_ms_8way_avx2_extend_loop2

sha256d_ms_8way_avx2_extend_coda2:
sha256_avx2_extend_round 44

vmovdqa sha256_8h+0(%rip), %ymm7
vmovdqa sha256_8h+32(%rip), %ymm5
vmovdqa sha256_8h+64(%rip), %ymm4
vmovdqa sha256_8h+96(%rip), %ymm3
vmovdqa sha256_8h+128(%rip), %ymm0
vmovdqa sha256_8h+160(%rip), %ymm8
vmovdqa sha256_8h+192(%rip), %ymm9
vmovdqa sha256_8h+224(%rip), %ymm10
movq %rsp, %rax
leaq sha256_8k(%rip), %rcx
jmp sha256d_ms_8way_avx2_main_loop2
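/*
 * Reduced final rounds, as in the 4-way versions: only the e-chain is
 * computed for rounds 57-60, since sha256d_ms needs just one word of
 * the final state.
 */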
.macro sha256_avx2_main_round_red i, r0, r1, r2, r3, r4
vpaddd 32*\i(%rax), \r0, %ymm6
vpaddd 32*\i(%rcx), %ymm6, %ymm6
vpandn \r1, \r3, %ymm1
vpand \r3, \r2, %ymm2
vpxor %ymm2, %ymm1, %ymm1
vpaddd %ymm1, %ymm6, %ymm6
vpslld $7, \r3, %ymm1
vpsrld $6, \r3, \r0
vpsrld $5, \r0, %ymm2
vpxor %ymm1, \r0, \r0
vpxor %ymm2, \r0, \r0
vpslld $14, %ymm1, %ymm1
vpsrld $14, %ymm2, %ymm2
vpxor %ymm1, \r0, \r0
vpxor %ymm2, \r0, \r0
vpslld $5, %ymm1, %ymm1
vpxor %ymm1, \r0, \r0
vpaddd \r0, %ymm6, %ymm6
vpaddd %ymm6, \r4, \r0
.endm
sha256d_ms_8way_avx2_finish:
sha256_avx2_main_round_red 57, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4
sha256_avx2_main_round_red 58, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5
sha256_avx2_main_round_red 59, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7
sha256_avx2_main_round_red 60, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3

vpaddd sha256_8h+224(%rip), %ymm10, %ymm10
vmovdqa %ymm10, 224(%rdi)
movq %rbp, %rsp
popq %rbp
#if defined(_WIN64) || defined(__CYGWIN__)
popq %rsi
vmovdqa 0(%rsp), %xmm6
vmovdqa 16(%rsp), %xmm7
vmovdqa 32(%rsp), %xmm8
vmovdqa 48(%rsp), %xmm9
vmovdqa 64(%rsp), %xmm10
addq $80, %rsp
popq %rdi
#endif
ret
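/*
 * Runtime check for the 8-way path: requires AVX and OSXSAVE (CPUID
 * leaf 1, ECX bits 27-28), AVX2 (leaf 7, EBX bit 5), and OS-enabled
 * XMM/YMM state (XCR0 bits 1-2 via xgetbv).
 */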
.p2align 6
.globl sha256_use_8way
.globl _sha256_use_8way
sha256_use_8way:
_sha256_use_8way:
pushq %rbx

/* Check for AVX and OSXSAVE support */
movl $1, %eax
cpuid
andl $0x18000000, %ecx
cmpl $0x18000000, %ecx
jne sha256_use_8way_no
/* Check for AVX2 support */
movl $7, %eax
xorl %ecx, %ecx
cpuid
andl $0x00000020, %ebx
cmpl $0x00000020, %ebx
jne sha256_use_8way_no
/* Check for XMM and YMM state support */
xorl %ecx, %ecx
xgetbv
andl $0x00000006, %eax
cmpl $0x00000006, %eax
jne sha256_use_8way_no

sha256_use_8way_yes:
movl $1, %eax
jmp sha256_use_8way_done

sha256_use_8way_no:
xorl %eax, %eax

sha256_use_8way_done: