2 * Copyright 2012-2015 pooler@litecoinpool.org
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of the GNU General Public License as published by the Free
6 * Software Foundation; either version 2 of the License, or (at your option)
7 * any later version. See COPYING for more details.
/* On Linux/ELF builds, emit an empty .note.GNU-stack section so the linker
 * marks the stack non-executable (avoids an executable-stack warning/mapping). */
10 #if defined(__linux__) && defined(__ELF__)
11 .section .note.GNU-stack,"",%progbits
/* Everything below this guard is x86-64-only code. */
14 #if defined(__x86_64__)
/* SHA-256 initial hash state H0..H7 (FIPS 180-4, sec. 5.3.3). */
18 .long 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a
19 .long 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
/* SHA-256 round constants K[0..63] (FIPS 180-4, sec. 4.2.2),
 * 16 bytes (4 constants) per row so the quadround code can step
 * through them one aligned 128-bit load at a time. */
24 .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
25 .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
26 .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
27 .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
28 .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
29 .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
30 .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
31 .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
32 .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
33 .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
34 .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
35 .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
36 .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
37 .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
38 .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
39 .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
/* Per-dword byte-reversal shuffle mask (index bytes 3,2,1,0 in each lane),
 * presumably consumed by a pshufb elsewhere to byte-swap input words —
 * the using instruction is not visible in this extract. */
42 .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f
/*
 * sha256_mixed_quadround ra..rh, x0..x3
 * Four SHA-256 rounds with the scalar working state in \ra..\rh,
 * interleaved with SSE2 message-schedule extension of the 16-word
 * window held in \x0..\x3 (one 128-bit register per 4 words).
 * NOTE(review): only a fragment of the macro body is visible in this
 * extract; the shuffle/shift lines below are isolated pieces of the
 * sigma0/sigma1 computations.
 */
45 .macro sha256_mixed_quadround ra, rb, rc, rd, re, rf, rg, rh, x0, x1, x2, x3
/* Left shift by (32-18) pairs with a right shift by 18 elsewhere to
 * build a 32-bit rotate-right by 18 (part of sigma0). */
100 pslld $(32-18), %xmm7
123 pshufd $0xfa, \x3, %xmm6
153 pshufd $0x8f, %xmm8, %xmm8
165 pshufd $0x50, %xmm4, %xmm6
197 pshufd $0xf8, \x0, \x0
/*
 * sha256_main_round i, ra..rh
 * One scalar SHA-256 compression round over the state in \ra..\rh.
 * The visible addl folds in a 32-bit word spilled at \i*4(%rsp) —
 * presumably the precomputed W[i]+K[i]; confirm against the lines
 * missing from this extract.
 */
210 .macro sha256_main_round i, ra, rb, rc, rd, re, rf, rg, rh
228 addl \i*4(%rsp), %ecx
/*
 * sha256_transform_sse2 — one SHA-256 compression of a 64-byte block.
 * SysV-style register use visible here: rdi = 8-dword hash state,
 * rsi = 64-byte input block.  A flag (test not visible in this extract)
 * selects the byte-swapping input path via the jnz below.
 * On Win64/Cygwin, xmm6-xmm9 are callee-saved and are spilled to the
 * local frame on entry and restored before return.
 */
244 sha256_transform_sse2:
250 #if defined(_WIN64) || defined(__CYGWIN__)
/* Win64 ABI: preserve the callee-saved xmm6-xmm9 in stack slots. */
254 movdqa %xmm6, 1*16(%rsp)
255 movdqa %xmm7, 2*16(%rsp)
256 movdqa %xmm8, 3*16(%rsp)
257 movdqa %xmm9, 4*16(%rsp)
/* Load state words 2..7 into r10d-r15d (words 0..1 loaded in lines not
 * visible here, presumably into r8d/r9d given the round macros below). */
267 movl 2*4(%rdi), %r10d
268 movl 3*4(%rdi), %r11d
269 movl 4*4(%rdi), %r12d
270 movl 5*4(%rdi), %r13d
271 movl 6*4(%rdi), %r14d
272 movl 7*4(%rdi), %r15d
275 jnz sha256_transform_sse2_swap
/* Non-swapping path: load the four 16-byte message quarters as-is. */
277 movdqu 0*16(%rsi), %xmm0
278 movdqu 1*16(%rsi), %xmm1
279 movdqu 2*16(%rsi), %xmm2
280 movdqu 3*16(%rsi), %xmm3
281 jmp sha256_transform_sse2_core
283 sha256_transform_sse2_swap:
284 movdqu 0*16(%rsi), %xmm0
285 movdqu 1*16(%rsi), %xmm1
286 movdqu 2*16(%rsi), %xmm2
287 movdqu 3*16(%rsi), %xmm3
/* pshuflw+pshufhw with 0xb1 swap adjacent 16-bit words in every dword;
 * the byte swap within each word is presumably completed by instructions
 * in the lines missing from this extract. */
288 pshuflw $0xb1, %xmm0, %xmm0
289 pshuflw $0xb1, %xmm1, %xmm1
290 pshuflw $0xb1, %xmm2, %xmm2
291 pshuflw $0xb1, %xmm3, %xmm3
292 pshufhw $0xb1, %xmm0, %xmm0
293 pshufhw $0xb1, %xmm1, %xmm1
294 pshufhw $0xb1, %xmm2, %xmm2
295 pshufhw $0xb1, %xmm3, %xmm3
313 sha256_transform_sse2_core:
/* rdx walks the sha256_k round-constant table (RIP-relative for PIC). */
314 leaq sha256_k(%rip), %rdx
317 sha256_transform_sse2_loop:
/* Each loop iteration: 16 rounds (4 mixed quadrounds), consuming four
 * 16-byte rows of K while extending the message schedule in xmm0-xmm3. */
318 movdqa 0*16(%rdx), %xmm9
321 sha256_mixed_quadround %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %xmm0, %xmm1, %xmm2, %xmm3
322 movdqa 1*16(%rdx), %xmm9
325 sha256_mixed_quadround %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %xmm1, %xmm2, %xmm3, %xmm0
326 movdqa 2*16(%rdx), %xmm9
329 sha256_mixed_quadround %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %xmm2, %xmm3, %xmm0, %xmm1
330 movdqa 3*16(%rdx), %xmm9
334 sha256_mixed_quadround %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %xmm3, %xmm0, %xmm1, %xmm2
337 jne sha256_transform_sse2_loop
/* Final 16 rounds: add the remaining K rows to the last schedule words
 * (spilled to the stack in lines not visible here) and run the plain
 * scalar main rounds, rotating the register assignment each round. */
339 paddd 0*16(%rdx), %xmm0
341 sha256_main_round 0, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d
342 sha256_main_round 1, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d
343 sha256_main_round 2, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d
344 sha256_main_round 3, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d
345 paddd 1*16(%rdx), %xmm1
347 sha256_main_round 0, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d
348 sha256_main_round 1, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d
349 sha256_main_round 2, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d
350 sha256_main_round 3, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d
351 paddd 2*16(%rdx), %xmm2
353 sha256_main_round 0, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d
354 sha256_main_round 1, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d
355 sha256_main_round 2, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d
356 sha256_main_round 3, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d
357 paddd 3*16(%rdx), %xmm3
359 sha256_main_round 0, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d
360 sha256_main_round 1, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d
361 sha256_main_round 2, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d
362 sha256_main_round 3, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d
/* Feed-forward: add the round output back into the caller's state. */
366 addl %r10d, 2*4(%rdi)
367 addl %r11d, 3*4(%rdi)
368 addl %r12d, 4*4(%rdi)
369 addl %r13d, 5*4(%rdi)
370 addl %r14d, 6*4(%rdi)
371 addl %r15d, 7*4(%rdi)
373 #if defined(_WIN64) || defined(__CYGWIN__)
/* Restore the Win64 callee-saved xmm registers spilled at entry. */
374 movdqa 1*16(%rsp), %xmm6
375 movdqa 2*16(%rsp), %xmm7
376 movdqa 3*16(%rsp), %xmm8
377 movdqa 4*16(%rsp), %xmm9
/*
 * sha256_transform_phe — SHA-256 transform using the VIA PadLock Hash
 * Engine.  Copies the (optionally byte-swapped) 64-byte input block into
 * an aligned stack buffer, then invokes the hardware XSHA256 instruction.
 */
394 sha256_transform_phe:
395 #if defined(_WIN64) || defined(__CYGWIN__)
407 jnz sha256_transform_phe_noswap
/* Swapping path (only the second half of the copy is visible in this
 * extract): 16-bit word swap via pshuflw/pshufhw 0xb1; the remaining
 * byte swap presumably happens in the missing lines. */
434 movdqu 2*16(%rsi), %xmm0
435 movdqu 3*16(%rsi), %xmm2
436 pshuflw $0xb1, %xmm0, %xmm0
437 pshuflw $0xb1, %xmm2, %xmm2
438 pshufhw $0xb1, %xmm0, %xmm0
439 pshufhw $0xb1, %xmm2, %xmm2
448 movdqa %xmm0, 2*16(%rsp)
449 movdqa %xmm2, 3*16(%rsp)
451 jmp sha256_transform_phe_core
453 sha256_transform_phe_noswap:
/* Straight copy of the four input quarters into the aligned buffer. */
454 movdqu 0*16(%rsi), %xmm0
455 movdqu 1*16(%rsi), %xmm1
456 movdqu 2*16(%rsi), %xmm2
457 movdqu 3*16(%rsi), %xmm3
458 movdqa %xmm0, 0*16(%rsp)
459 movdqa %xmm1, 1*16(%rsp)
460 movdqa %xmm2, 2*16(%rsp)
461 movdqa %xmm3, 3*16(%rsp)
463 sha256_transform_phe_core:
/* Raw encoding of VIA PadLock "rep xsha256" (F3 0F A6 D0), emitted as
 * bytes so older assemblers without PadLock support can build this. */
468 .byte 0xf3, 0x0f, 0xa6, 0xd0
471 #if defined(_WIN64) || defined(__CYGWIN__)
/* Dispatch pointer for the public entry point; initialized to the SSE2
 * implementation and presumably rewritten by runtime CPU detection code
 * not visible in this extract. */
480 sha256_transform_addr:
481 .quad sha256_transform_sse2
/* Public entry: tail-jump through the dispatch slot (RIP-relative
 * indirect jump keeps this PIC-safe).  The underscore alias serves
 * platforms whose ABI prefixes C symbols. */
485 .globl sha256_transform
486 .globl _sha256_transform
489 jmp *sha256_transform_addr(%rip)
/* 4-way interleaved initial hash state: each H constant replicated
 * across the 4 dword lanes of a 128-bit row (label itself is in a line
 * not visible here; referenced elsewhere as sha256_4h). */
495 .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667
496 .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85
497 .long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372
498 .long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a
499 .long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f
500 .long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c
501 .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab
502 .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19
/* 4-way interleaved round constants K[0..63], one 128-bit row per round
 * (all 4 lanes equal), consumed by the 4-way SSE2/AVX/XOP main rounds. */
507 .long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98
508 .long 0x71374491, 0x71374491, 0x71374491, 0x71374491
509 .long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf
510 .long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5
511 .long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b
512 .long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1
513 .long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4
514 .long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5
515 .long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98
516 .long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01
517 .long 0x243185be, 0x243185be, 0x243185be, 0x243185be
518 .long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3
519 .long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74
520 .long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe
521 .long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7
522 .long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174
523 .long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1
524 .long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786
525 .long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6
526 .long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc
527 .long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f
528 .long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa
529 .long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc
530 .long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da
531 .long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152
532 .long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d
533 .long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8
534 .long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7
535 .long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3
536 .long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147
537 .long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351
538 .long 0x14292967, 0x14292967, 0x14292967, 0x14292967
539 .long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85
540 .long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138
541 .long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc
542 .long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13
543 .long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354
544 .long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb
545 .long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e
546 .long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85
547 .long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1
548 .long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b
549 .long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70
550 .long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3
551 .long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819
552 .long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624
553 .long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585
554 .long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070
555 .long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116
556 .long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08
557 .long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c
558 .long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5
559 .long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3
560 .long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a
561 .long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f
562 .long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3
563 .long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee
564 .long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f
565 .long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814
566 .long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208
567 .long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa
568 .long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb
569 .long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7
570 .long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2
/* 8-way interleaved initial hash state for the AVX2 path: each H
 * constant replicated across the 8 dword lanes of a 256-bit row
 * (label is in a line not visible here). */
575 .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667
576 .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85
577 .long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372
578 .long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a
579 .long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f
580 .long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c
581 .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab
582 .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19
/* 8-way interleaved round constants K[0..63], one 256-bit row per
 * round, for the AVX2 8-way main rounds. */
587 .long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98
588 .long 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491
589 .long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf
590 .long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5
591 .long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b
592 .long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1
593 .long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4
594 .long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5
595 .long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98
596 .long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01
597 .long 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be
598 .long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3
599 .long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74
600 .long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe
601 .long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7
602 .long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174
603 .long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1
604 .long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786
605 .long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6
606 .long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc
607 .long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f
608 .long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa
609 .long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc
610 .long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da
611 .long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152
612 .long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d
613 .long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8
614 .long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7
615 .long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3
616 .long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147
617 .long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351
618 .long 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967
619 .long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85
620 .long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138
621 .long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc
622 .long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13
623 .long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354
624 .long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb
625 .long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e
626 .long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85
627 .long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1
628 .long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b
629 .long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70
630 .long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3
631 .long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819
632 .long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624
633 .long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585
634 .long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070
635 .long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116
636 .long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08
637 .long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c
638 .long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5
639 .long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3
640 .long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a
641 .long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f
642 .long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3
643 .long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee
644 .long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f
645 .long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814
646 .long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208
647 .long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa
648 .long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb
649 .long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7
650 .long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2
/*
 * sha256_init_4way — copy the 4-way interleaved initial SHA-256 state
 * (sha256_4h, 128 bytes) into the caller's buffer at rdi.
 * Aligned loads from .rodata (movdqa), unaligned stores to the caller
 * buffer (movdqu) since its alignment is not guaranteed.
 */
654 .globl sha256_init_4way
655 .globl _sha256_init_4way
658 #if defined(_WIN64) || defined(__CYGWIN__)
662 movdqa sha256_4h+0(%rip), %xmm0
663 movdqa sha256_4h+16(%rip), %xmm1
664 movdqa sha256_4h+32(%rip), %xmm2
665 movdqa sha256_4h+48(%rip), %xmm3
666 movdqu %xmm0, 0(%rdi)
667 movdqu %xmm1, 16(%rdi)
668 movdqu %xmm2, 32(%rdi)
669 movdqu %xmm3, 48(%rdi)
670 movdqa sha256_4h+64(%rip), %xmm0
671 movdqa sha256_4h+80(%rip), %xmm1
672 movdqa sha256_4h+96(%rip), %xmm2
673 movdqa sha256_4h+112(%rip), %xmm3
674 movdqu %xmm0, 64(%rdi)
675 movdqu %xmm1, 80(%rdi)
676 movdqu %xmm2, 96(%rdi)
677 movdqu %xmm3, 112(%rdi)
678 #if defined(_WIN64) || defined(__CYGWIN__)
/*
 * sha256_init_8way — fill the caller's 8-way state buffer (rdi, 256
 * bytes) with the initial SHA-256 constants.  Reuses the 4-way table:
 * vpbroadcastd replicates the first lane of each 16-byte sha256_4h row
 * across all 8 lanes of a ymm register.
 */
685 .globl sha256_init_8way
686 .globl _sha256_init_8way
689 #if defined(_WIN64) || defined(__CYGWIN__)
693 vpbroadcastd sha256_4h+0(%rip), %ymm0
694 vpbroadcastd sha256_4h+16(%rip), %ymm1
695 vpbroadcastd sha256_4h+32(%rip), %ymm2
696 vpbroadcastd sha256_4h+48(%rip), %ymm3
697 vmovdqu %ymm0, 0*32(%rdi)
698 vmovdqu %ymm1, 1*32(%rdi)
699 vmovdqu %ymm2, 2*32(%rdi)
700 vmovdqu %ymm3, 3*32(%rdi)
701 vpbroadcastd sha256_4h+64(%rip), %ymm0
702 vpbroadcastd sha256_4h+80(%rip), %ymm1
703 vpbroadcastd sha256_4h+96(%rip), %ymm2
704 vpbroadcastd sha256_4h+112(%rip), %ymm3
705 vmovdqu %ymm0, 4*32(%rdi)
706 vmovdqu %ymm1, 5*32(%rdi)
707 vmovdqu %ymm2, 6*32(%rdi)
708 vmovdqu %ymm3, 7*32(%rdi)
709 #if defined(_WIN64) || defined(__CYGWIN__)
/*
 * sha256_sse2_extend_round i
 * Extend one 4-lane message-schedule word:
 *   W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
 * with the interleaved schedule array based at %rax (16 bytes per word).
 * NOTE(review): the sigma0/sigma1 shift-xor arithmetic and the W[i-2]
 * load are in lines not visible in this extract.
 */
714 .macro sha256_sse2_extend_round i
715 movdqa (\i-15)*16(%rax), %xmm0
727 paddd (\i-16)*16(%rax), %xmm0
728 paddd (\i-7)*16(%rax), %xmm0
742 movdqa %xmm3, \i*16(%rax)
/*
 * sha256_sse2_extend_doubleround i
 * Extend two consecutive 4-lane schedule words W[i] and W[i+1] per
 * invocation (two parallel dependency chains, results in xmm3/xmm7).
 * NOTE(review): only the loads, adds and stores are visible here; the
 * sigma0/sigma1 arithmetic is in lines missing from this extract.
 */
745 .macro sha256_sse2_extend_doubleround i
746 movdqa (\i-15)*16(%rax), %xmm0
747 movdqa (\i-14)*16(%rax), %xmm4
771 paddd (\i-16)*16(%rax), %xmm0
772 paddd (\i-15)*16(%rax), %xmm4
785 paddd (\i-7)*16(%rax), %xmm0
786 paddd (\i-6)*16(%rax), %xmm4
803 movdqa %xmm3, \i*16(%rax)
804 movdqa %xmm7, (\i+1)*16(%rax)
/*
 * sha256_sse2_main_round i
 * One 4-lane SHA-256 compression round: xmm6 accumulates W[i] (from the
 * schedule at %rax) plus K[i] (from the 4-way constant table at %rcx).
 * Three state vectors are rotated through the stack slots
 * 0/16/32(%rsp) each round.  (Fragment: the sigma/choose/majority
 * arithmetic is in lines not visible in this extract.)
 */
807 .macro sha256_sse2_main_round i
808 movdqa 16*(\i)(%rax), %xmm6
/* Rotate the spilled state: slot32 <- slot16 <- slot0 <- xmm0. */
811 movdqa 16(%rsp), %xmm2
813 paddd 32(%rsp), %xmm6
815 movdqa %xmm2, 32(%rsp)
816 movdqa 0(%rsp), %xmm2
817 movdqa %xmm2, 16(%rsp)
821 movdqa %xmm0, 0(%rsp)
827 paddd 16*(\i)(%rcx), %xmm6
/* Four consecutive 4-lane main rounds (rounds i .. i+3). */
871 .macro sha256_sse2_main_quadround i
872 sha256_sse2_main_round \i+0
873 sha256_sse2_main_round \i+1
874 sha256_sse2_main_round \i+2
875 sha256_sse2_main_round \i+3
/*
 * sha256_avx_extend_round i
 * AVX (3-operand) 4-lane schedule extension:
 *   W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16].
 * sigma0(x) = rotr7 ^ rotr18 ^ shr3 is built from the shift terms
 * x>>3, x>>7, x>>18, x<<14, x<<25; sigma1(x) = rotr17 ^ rotr19 ^ shr10
 * from x>>10, x>>17, x>>19, x<<13, x<<15.
 * NOTE(review): the load of W[i-2] into xmm3 sits in a line not
 * visible in this extract (between the two halves below).
 */
879 .macro sha256_avx_extend_round i
880 vmovdqa (\i-15)*16(%rax), %xmm0
881 vpslld $14, %xmm0, %xmm2
882 vpsrld $3, %xmm0, %xmm0
883 vpsrld $4, %xmm0, %xmm1
884 vpxor %xmm1, %xmm0, %xmm0
885 vpxor %xmm2, %xmm0, %xmm0
886 vpsrld $11, %xmm1, %xmm1
887 vpslld $11, %xmm2, %xmm2
888 vpxor %xmm1, %xmm0, %xmm0
889 vpxor %xmm2, %xmm0, %xmm0
/* xmm0 = sigma0(W[i-15]); fold in W[i-16] and W[i-7]. */
890 vpaddd (\i-16)*16(%rax), %xmm0, %xmm0
891 vpaddd (\i-7)*16(%rax), %xmm0, %xmm0
/* sigma1 over xmm3 (= W[i-2], loaded in a missing line). */
893 vpslld $13, %xmm3, %xmm2
894 vpsrld $10, %xmm3, %xmm3
895 vpsrld $7, %xmm3, %xmm1
896 vpxor %xmm1, %xmm3, %xmm3
897 vpxor %xmm2, %xmm3, %xmm3
898 vpsrld $2, %xmm1, %xmm1
899 vpslld $2, %xmm2, %xmm2
900 vpxor %xmm1, %xmm3, %xmm3
901 vpxor %xmm2, %xmm3, %xmm3
902 vpaddd %xmm0, %xmm3, %xmm3
903 vmovdqa %xmm3, \i*16(%rax)
/*
 * sha256_avx_extend_doubleround i
 * Two schedule words W[i], W[i+1] per invocation: the sigma0 chain for
 * W[i] runs in xmm0/xmm1/xmm2/xmm8, the one for W[i+1] in
 * xmm4/xmm5/xmm6; sigma1 operates on xmm3/xmm7, which carry
 * W[i-2]/W[i-1] in from the previous doubleround and carry
 * W[i]/W[i+1] out.  Same rotate-by-shift-pairs decomposition as
 * sha256_avx_extend_round.
 */
906 .macro sha256_avx_extend_doubleround i
907 vmovdqa (\i-15)*16(%rax), %xmm0
908 vmovdqa (\i-14)*16(%rax), %xmm4
909 vpslld $14, %xmm0, %xmm2
910 vpslld $14, %xmm4, %xmm6
911 vpsrld $3, %xmm0, %xmm8
912 vpsrld $3, %xmm4, %xmm4
913 vpsrld $7, %xmm0, %xmm1
914 vpsrld $4, %xmm4, %xmm5
915 vpxor %xmm1, %xmm8, %xmm8
916 vpxor %xmm5, %xmm4, %xmm4
917 vpsrld $11, %xmm1, %xmm1
918 vpsrld $11, %xmm5, %xmm5
919 vpxor %xmm2, %xmm8, %xmm8
920 vpxor %xmm6, %xmm4, %xmm4
921 vpslld $11, %xmm2, %xmm2
922 vpslld $11, %xmm6, %xmm6
923 vpxor %xmm1, %xmm8, %xmm8
924 vpxor %xmm5, %xmm4, %xmm4
925 vpxor %xmm2, %xmm8, %xmm8
926 vpxor %xmm6, %xmm4, %xmm4
/* Fold W[i-16]/W[i-15] and W[i-7]/W[i-6] into the two chains. */
928 vpaddd %xmm0, %xmm4, %xmm4
929 vpaddd (\i-16)*16(%rax), %xmm8, %xmm0
/* sigma1 over the carried-in xmm3/xmm7. */
931 vpslld $13, %xmm3, %xmm2
932 vpslld $13, %xmm7, %xmm6
933 vpsrld $10, %xmm3, %xmm3
934 vpsrld $10, %xmm7, %xmm7
936 vpaddd (\i-7)*16(%rax), %xmm0, %xmm0
937 vpaddd (\i-6)*16(%rax), %xmm4, %xmm4
939 vpsrld $7, %xmm3, %xmm1
940 vpsrld $7, %xmm7, %xmm5
941 vpxor %xmm1, %xmm3, %xmm3
942 vpxor %xmm5, %xmm7, %xmm7
943 vpsrld $2, %xmm1, %xmm1
944 vpsrld $2, %xmm5, %xmm5
945 vpxor %xmm2, %xmm3, %xmm3
946 vpxor %xmm6, %xmm7, %xmm7
947 vpslld $2, %xmm2, %xmm2
948 vpslld $2, %xmm6, %xmm6
949 vpxor %xmm1, %xmm3, %xmm3
950 vpxor %xmm5, %xmm7, %xmm7
951 vpxor %xmm2, %xmm3, %xmm3
952 vpxor %xmm6, %xmm7, %xmm7
/* Store the two new schedule words (also left in xmm3/xmm7 as carry). */
954 vpaddd %xmm0, %xmm3, %xmm3
955 vpaddd %xmm4, %xmm7, %xmm7
956 vmovdqa %xmm3, \i*16(%rax)
957 vmovdqa %xmm7, (\i+1)*16(%rax)
/*
 * sha256_avx_main_round i, r0..r7
 * One 4-lane SHA-256 compression round (AVX 3-operand form).
 * xmm6 accumulates W[i] (schedule at %rax) + K[i] (constants at %rcx)
 * + Ch (vpandn/vpand/vpxor) + Sigma1, then updates the two affected
 * state registers.  Sigma1/Sigma0 rotates are decomposed into paired
 * shifts and xors.  NOTE(review): at least one instruction of the
 * Sigma1 chain sits in a line not visible in this extract.
 */
960 .macro sha256_avx_main_round i, r0, r1, r2, r3, r4, r5, r6, r7
961 vpaddd 16*(\i)(%rax), \r0, %xmm6
962 vpaddd 16*(\i)(%rcx), %xmm6, %xmm6
/* Ch(x,y,z) = (x & y) ^ (~x & z) via andn/and/xor. */
964 vpandn \r1, \r3, %xmm1
965 vpand \r3, \r2, %xmm2
966 vpxor %xmm2, %xmm1, %xmm1
967 vpaddd %xmm1, %xmm6, %xmm6
/* Sigma1 via shift/xor pairs. */
969 vpslld $7, \r3, %xmm1
971 vpsrld $5, \r0, %xmm2
972 vpxor %xmm1, \r0, \r0
973 vpxor %xmm2, \r0, \r0
974 vpslld $14, %xmm1, %xmm1
975 vpsrld $14, %xmm2, %xmm2
976 vpxor %xmm1, \r0, \r0
977 vpxor %xmm2, \r0, \r0
978 vpslld $5, %xmm1, %xmm1
979 vpxor %xmm1, \r0, \r0
980 vpaddd \r0, %xmm6, %xmm6
/* d += T1 */
981 vpaddd %xmm6, \r4, \r0
/* Maj(a,b,c) via and/xor combination. */
983 vpand \r6, \r5, %xmm2
985 vpand \r7, \r6, %xmm1
986 vpxor \r4, %xmm1, %xmm1
987 vpxor %xmm2, %xmm1, %xmm1
988 vpaddd %xmm1, %xmm6, %xmm6
/* Sigma0 via shift/xor pairs; result folded into the new 'a'. */
990 vpslld $10, \r7, %xmm2
992 vpsrld $11, \r4, %xmm1
993 vpxor %xmm2, \r4, \r4
994 vpxor %xmm1, \r4, \r4
995 vpslld $9, %xmm2, %xmm2
996 vpsrld $9, %xmm1, %xmm1
997 vpxor %xmm2, \r4, \r4
998 vpxor %xmm1, \r4, \r4
999 vpslld $11, %xmm2, %xmm2
1000 vpxor %xmm2, \r4, \r4
1001 vpaddd %xmm6, \r4, \r4
/* Four AVX main rounds with the state registers rotated one position
 * per round, so no register moves are needed between rounds. */
1004 .macro sha256_avx_main_quadround i
1005 sha256_avx_main_round \i+0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
1006 sha256_avx_main_round \i+1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
1007 sha256_avx_main_round \i+2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
1008 sha256_avx_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
/*
 * sha256_avx2_extend_round i
 * 8-lane (ymm) variant of sha256_avx_extend_round: schedule entries are
 * 32 bytes apart.  Same sigma0/sigma1 shift-xor decomposition.
 * NOTE(review): the load of W[i-2] into ymm3 is in a line not visible
 * in this extract.
 */
1012 .macro sha256_avx2_extend_round i
1013 vmovdqa (\i-15)*32(%rax), %ymm0
1014 vpslld $14, %ymm0, %ymm2
1015 vpsrld $3, %ymm0, %ymm0
1016 vpsrld $4, %ymm0, %ymm1
1017 vpxor %ymm1, %ymm0, %ymm0
1018 vpxor %ymm2, %ymm0, %ymm0
1019 vpsrld $11, %ymm1, %ymm1
1020 vpslld $11, %ymm2, %ymm2
1021 vpxor %ymm1, %ymm0, %ymm0
1022 vpxor %ymm2, %ymm0, %ymm0
/* ymm0 = sigma0(W[i-15]); fold in W[i-16] and W[i-7]. */
1023 vpaddd (\i-16)*32(%rax), %ymm0, %ymm0
1024 vpaddd (\i-7)*32(%rax), %ymm0, %ymm0
/* sigma1 over ymm3 (= W[i-2], loaded in a missing line). */
1026 vpslld $13, %ymm3, %ymm2
1027 vpsrld $10, %ymm3, %ymm3
1028 vpsrld $7, %ymm3, %ymm1
1029 vpxor %ymm1, %ymm3, %ymm3
1030 vpxor %ymm2, %ymm3, %ymm3
1031 vpsrld $2, %ymm1, %ymm1
1032 vpslld $2, %ymm2, %ymm2
1033 vpxor %ymm1, %ymm3, %ymm3
1034 vpxor %ymm2, %ymm3, %ymm3
1035 vpaddd %ymm0, %ymm3, %ymm3
1036 vmovdqa %ymm3, \i*32(%rax)
/*
 * sha256_avx2_extend_doubleround i
 * 8-lane (ymm) variant of sha256_avx_extend_doubleround: extends two
 * schedule words per invocation, with W[i-2]/W[i-1] carried in through
 * ymm3/ymm7 and W[i]/W[i+1] carried out in the same registers.
 */
1039 .macro sha256_avx2_extend_doubleround i
1040 vmovdqa (\i-15)*32(%rax), %ymm0
1041 vmovdqa (\i-14)*32(%rax), %ymm4
1042 vpslld $14, %ymm0, %ymm2
1043 vpslld $14, %ymm4, %ymm6
1044 vpsrld $3, %ymm0, %ymm8
1045 vpsrld $3, %ymm4, %ymm4
1046 vpsrld $7, %ymm0, %ymm1
1047 vpsrld $4, %ymm4, %ymm5
1048 vpxor %ymm1, %ymm8, %ymm8
1049 vpxor %ymm5, %ymm4, %ymm4
1050 vpsrld $11, %ymm1, %ymm1
1051 vpsrld $11, %ymm5, %ymm5
1052 vpxor %ymm2, %ymm8, %ymm8
1053 vpxor %ymm6, %ymm4, %ymm4
1054 vpslld $11, %ymm2, %ymm2
1055 vpslld $11, %ymm6, %ymm6
1056 vpxor %ymm1, %ymm8, %ymm8
1057 vpxor %ymm5, %ymm4, %ymm4
1058 vpxor %ymm2, %ymm8, %ymm8
1059 vpxor %ymm6, %ymm4, %ymm4
/* Fold the W[i-16]/W[i-15] and W[i-7]/W[i-6] terms in. */
1061 vpaddd %ymm0, %ymm4, %ymm4
1062 vpaddd (\i-16)*32(%rax), %ymm8, %ymm0
/* sigma1 over the carried-in ymm3/ymm7. */
1064 vpslld $13, %ymm3, %ymm2
1065 vpslld $13, %ymm7, %ymm6
1066 vpsrld $10, %ymm3, %ymm3
1067 vpsrld $10, %ymm7, %ymm7
1069 vpaddd (\i-7)*32(%rax), %ymm0, %ymm0
1070 vpaddd (\i-6)*32(%rax), %ymm4, %ymm4
1072 vpsrld $7, %ymm3, %ymm1
1073 vpsrld $7, %ymm7, %ymm5
1074 vpxor %ymm1, %ymm3, %ymm3
1075 vpxor %ymm5, %ymm7, %ymm7
1076 vpsrld $2, %ymm1, %ymm1
1077 vpsrld $2, %ymm5, %ymm5
1078 vpxor %ymm2, %ymm3, %ymm3
1079 vpxor %ymm6, %ymm7, %ymm7
1080 vpslld $2, %ymm2, %ymm2
1081 vpslld $2, %ymm6, %ymm6
1082 vpxor %ymm1, %ymm3, %ymm3
1083 vpxor %ymm5, %ymm7, %ymm7
1084 vpxor %ymm2, %ymm3, %ymm3
1085 vpxor %ymm6, %ymm7, %ymm7
/* Store the two new words; they remain in ymm3/ymm7 as carry. */
1087 vpaddd %ymm0, %ymm3, %ymm3
1088 vpaddd %ymm4, %ymm7, %ymm7
1089 vmovdqa %ymm3, \i*32(%rax)
1090 vmovdqa %ymm7, (\i+1)*32(%rax)
/*
 * sha256_avx2_main_round i, r0..r7
 * 8-lane (ymm) variant of sha256_avx_main_round: one compression round
 * over 8 interleaved SHA-256 instances.  W comes from the 32-byte-
 * strided schedule at %rax, K from the 8-way table at %rcx.
 * NOTE(review): at least one instruction of the Sigma1 chain sits in a
 * line not visible in this extract.
 */
1093 .macro sha256_avx2_main_round i, r0, r1, r2, r3, r4, r5, r6, r7
1094 vpaddd 32*(\i)(%rax), \r0, %ymm6
1095 vpaddd 32*(\i)(%rcx), %ymm6, %ymm6
/* Ch(x,y,z) via andn/and/xor. */
1097 vpandn \r1, \r3, %ymm1
1098 vpand \r3, \r2, %ymm2
1099 vpxor %ymm2, %ymm1, %ymm1
1100 vpaddd %ymm1, %ymm6, %ymm6
/* Sigma1 via shift/xor pairs. */
1102 vpslld $7, \r3, %ymm1
1104 vpsrld $5, \r0, %ymm2
1105 vpxor %ymm1, \r0, \r0
1106 vpxor %ymm2, \r0, \r0
1107 vpslld $14, %ymm1, %ymm1
1108 vpsrld $14, %ymm2, %ymm2
1109 vpxor %ymm1, \r0, \r0
1110 vpxor %ymm2, \r0, \r0
1111 vpslld $5, %ymm1, %ymm1
1112 vpxor %ymm1, \r0, \r0
1113 vpaddd \r0, %ymm6, %ymm6
/* d += T1 */
1114 vpaddd %ymm6, \r4, \r0
/* Maj(a,b,c) via and/xor combination. */
1116 vpand \r6, \r5, %ymm2
1118 vpand \r7, \r6, %ymm1
1119 vpxor \r4, %ymm1, %ymm1
1120 vpxor %ymm2, %ymm1, %ymm1
1121 vpaddd %ymm1, %ymm6, %ymm6
/* Sigma0 via shift/xor pairs; folded into the new 'a'. */
1123 vpslld $10, \r7, %ymm2
1125 vpsrld $11, \r4, %ymm1
1126 vpxor %ymm2, \r4, \r4
1127 vpxor %ymm1, \r4, \r4
1128 vpslld $9, %ymm2, %ymm2
1129 vpsrld $9, %ymm1, %ymm1
1130 vpxor %ymm2, \r4, \r4
1131 vpxor %ymm1, \r4, \r4
1132 vpslld $11, %ymm2, %ymm2
1133 vpxor %ymm2, \r4, \r4
1134 vpaddd %ymm6, \r4, \r4
/* Four AVX2 main rounds, rotating the state-register assignment one
 * position per round (same pattern as the AVX quadround). */
1137 .macro sha256_avx2_main_quadround i
1138 sha256_avx2_main_round \i+0, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7
1139 sha256_avx2_main_round \i+1, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3
1140 sha256_avx2_main_round \i+2, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4
1141 sha256_avx2_main_round \i+3, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5
/*
 * sha256_xop_extend_round i
 * AMD XOP variant: vprotd supplies 32-bit rotates directly, so
 * sigma0(x) = rotl25(x) ^ rotl14(x) ^ (x >> 3)   (rotl25 == rotr7,
 * rotl14 == rotr18) and sigma1(x) = rotl15 ^ rotl13 ^ shr10
 * (== rotr17 ^ rotr19 ^ shr10), far fewer instructions than the
 * shift-pair decomposition.  NOTE(review): the W[i-2] load into xmm3
 * is in a line not visible in this extract.
 */
1144 .macro sha256_xop_extend_round i
1145 vmovdqa (\i-15)*16(%rax), %xmm0
1146 vprotd $25, %xmm0, %xmm1
1147 vprotd $14, %xmm0, %xmm2
1148 vpsrld $3, %xmm0, %xmm0
1149 vpxor %xmm1, %xmm2, %xmm2
1150 vpxor %xmm2, %xmm0, %xmm0
1152 vpaddd (\i-16)*16(%rax), %xmm0, %xmm0
1153 vpaddd (\i-7)*16(%rax), %xmm0, %xmm0
1155 vprotd $15, %xmm3, %xmm1
1156 vprotd $13, %xmm3, %xmm2
1157 vpsrld $10, %xmm3, %xmm3
1158 vpxor %xmm1, %xmm2, %xmm2
1159 vpxor %xmm2, %xmm3, %xmm3
1160 vpaddd %xmm0, %xmm3, %xmm3
1161 vmovdqa %xmm3, \i*16(%rax)
/*
 * sha256_xop_extend_doubleround i
 * Two schedule words per invocation using XOP vprotd rotates;
 * W[i-2]/W[i-1] are carried in through xmm3/xmm7 and W[i]/W[i+1]
 * carried out in the same registers.
 */
1164 .macro sha256_xop_extend_doubleround i
1165 vmovdqa (\i-15)*16(%rax), %xmm0
1166 vmovdqa (\i-14)*16(%rax), %xmm4
/* sigma0 of both words: rotl25 ^ rotl14 ^ shr3. */
1167 vprotd $25, %xmm0, %xmm1
1168 vprotd $25, %xmm4, %xmm5
1169 vprotd $14, %xmm0, %xmm2
1170 vprotd $14, %xmm4, %xmm6
1171 vpxor %xmm1, %xmm2, %xmm2
1172 vpxor %xmm5, %xmm6, %xmm6
1173 vpsrld $3, %xmm0, %xmm0
1174 vpsrld $3, %xmm4, %xmm4
1175 vpxor %xmm2, %xmm0, %xmm0
1176 vpxor %xmm6, %xmm4, %xmm4
1178 vpaddd (\i-16)*16(%rax), %xmm0, %xmm0
1179 vpaddd (\i-15)*16(%rax), %xmm4, %xmm4
/* sigma1 of the carried-in xmm3/xmm7: rotl15 ^ rotl13 ^ shr10. */
1181 vprotd $15, %xmm3, %xmm1
1182 vprotd $15, %xmm7, %xmm5
1183 vprotd $13, %xmm3, %xmm2
1184 vprotd $13, %xmm7, %xmm6
1185 vpxor %xmm1, %xmm2, %xmm2
1186 vpxor %xmm5, %xmm6, %xmm6
1188 vpaddd (\i-7)*16(%rax), %xmm0, %xmm0
1189 vpaddd (\i-6)*16(%rax), %xmm4, %xmm4
1191 vpsrld $10, %xmm3, %xmm3
1192 vpsrld $10, %xmm7, %xmm7
1193 vpxor %xmm2, %xmm3, %xmm3
1194 vpxor %xmm6, %xmm7, %xmm7
/* Store the two new words; they remain in xmm3/xmm7 as carry. */
1196 vpaddd %xmm0, %xmm3, %xmm3
1197 vpaddd %xmm4, %xmm7, %xmm7
1198 vmovdqa %xmm3, \i*16(%rax)
1199 vmovdqa %xmm7, (\i+1)*16(%rax)
/*
 * sha256_xop_main_round i, r0..r7
 * One 4-lane compression round using XOP vprotd:
 * Sigma1 from rotl26/rotl21 (== rotr6/rotr11; the third rotate term is
 * in a line not visible in this extract), Sigma0 from
 * rotl30/rotl19/rotl10 (== rotr2/rotr13/rotr22).
 */
1202 .macro sha256_xop_main_round i, r0, r1, r2, r3, r4, r5, r6, r7
1203 vpaddd 16*(\i)(%rax), \r0, %xmm6
1204 vpaddd 16*(\i)(%rcx), %xmm6, %xmm6
/* Ch(x,y,z) via andn/and/xor. */
1206 vpandn \r1, \r3, %xmm1
1207 vpand \r3, \r2, %xmm2
1208 vpxor %xmm2, %xmm1, %xmm1
1209 vpaddd %xmm1, %xmm6, %xmm6
/* Sigma1 rotates. */
1211 vprotd $26, \r3, %xmm1
1212 vprotd $21, \r3, %xmm2
1213 vpxor %xmm1, %xmm2, %xmm2
1215 vpxor %xmm2, \r0, \r0
1216 vpaddd \r0, %xmm6, %xmm6
/* d += T1 */
1217 vpaddd %xmm6, \r4, \r0
/* Maj(a,b,c) via and/xor combination. */
1219 vpand \r6, \r5, %xmm2
1221 vpand \r7, \r6, %xmm1
1222 vpxor \r4, %xmm1, %xmm1
1223 vpxor %xmm2, %xmm1, %xmm1
1224 vpaddd %xmm1, %xmm6, %xmm6
/* Sigma0 rotates, folded into the new 'a'. */
1226 vprotd $30, \r7, %xmm1
1227 vprotd $19, \r7, %xmm2
1228 vpxor %xmm1, %xmm2, %xmm2
1229 vprotd $10, \r7, \r4
1230 vpxor %xmm2, \r4, \r4
1231 vpaddd %xmm6, \r4, \r4
/* Four XOP main rounds, rotating the state-register assignment one
 * position per round (same pattern as the AVX quadround). */
1234 .macro sha256_xop_main_quadround i
1235 sha256_xop_main_round \i+0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
1236 sha256_xop_main_round \i+1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
1237 sha256_xop_main_round \i+2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
1238 sha256_xop_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
/*
 * sha256_transform_4way_core_sse2 — looped (not unrolled) 4-way core:
 * first a loop extending the message schedule in place (rcx walks the
 * W array on the stack, rax marks the end), then the 8-word state is
 * loaded from rdi and a 64-round main loop runs with W indexed off
 * rax and the sha256_4k constants off rcx.  Ends by jumping to the
 * shared sha256_transform_4way_finish epilogue.
 */
1243 sha256_transform_4way_core_sse2:
1244 leaq 256(%rsp), %rcx
1245 leaq 48*16(%rcx), %rax
/* Seed the sigma1 carry registers with W[14]/W[15]. */
1246 movdqa -2*16(%rcx), %xmm3
1247 movdqa -1*16(%rcx), %xmm7
1248 sha256_transform_4way_sse2_extend_loop:
/* Schedule extension body (fragment — sigma arithmetic lines are not
 * visible in this extract): W[i] and W[i+1] from W[i-16..], stored at
 * (%rcx) and 16(%rcx) each iteration. */
1249 movdqa -15*16(%rcx), %xmm0
1250 movdqa -14*16(%rcx), %xmm4
1274 paddd -16*16(%rcx), %xmm0
1275 paddd -15*16(%rcx), %xmm4
1288 paddd -7*16(%rcx), %xmm0
1289 paddd -6*16(%rcx), %xmm4
1306 movdqa %xmm3, (%rcx)
1307 movdqa %xmm7, 16(%rcx)
1310 jne sha256_transform_4way_sse2_extend_loop
/* Load the 4-way state (a..h) from the caller's buffer. */
1312 movdqu 0(%rdi), %xmm7
1313 movdqu 16(%rdi), %xmm5
1314 movdqu 32(%rdi), %xmm4
1315 movdqu 48(%rdi), %xmm3
1316 movdqu 64(%rdi), %xmm0
1317 movdqu 80(%rdi), %xmm8
1318 movdqu 96(%rdi), %xmm9
1319 movdqu 112(%rdi), %xmm10
/* rcx = 4-way round-constant table; rax indexes the current round. */
1321 leaq sha256_4k(%rip), %rcx
1323 sha256_transform_4way_sse2_main_loop:
1324 movdqa (%rsp, %rax), %xmm6
1325 paddd (%rcx, %rax), %xmm6
1332 movdqa %xmm2, %xmm10
1389 jne sha256_transform_4way_sse2_main_loop
1390 jmp sha256_transform_4way_finish
/*
 * sha256_transform_4way_core_avx — fully unrolled AVX 4-way core:
 * 24 extend-doublerounds build W[16..63] in place on the stack, then
 * the 8-word state is loaded from rdi and 16 main quadrounds (64
 * rounds) run against the sha256_4k constants.  Ends by jumping to the
 * shared sha256_transform_4way_finish epilogue.
 */
1394 sha256_transform_4way_core_avx:
1395 leaq 256(%rsp), %rax
/* Seed the sigma1 carry registers with W[14]/W[15]. */
1396 movdqa -2*16(%rax), %xmm3
1397 movdqa -1*16(%rax), %xmm7
1398 sha256_avx_extend_doubleround 0
1399 sha256_avx_extend_doubleround 2
1400 sha256_avx_extend_doubleround 4
1401 sha256_avx_extend_doubleround 6
1402 sha256_avx_extend_doubleround 8
1403 sha256_avx_extend_doubleround 10
1404 sha256_avx_extend_doubleround 12
1405 sha256_avx_extend_doubleround 14
1406 sha256_avx_extend_doubleround 16
1407 sha256_avx_extend_doubleround 18
1408 sha256_avx_extend_doubleround 20
1409 sha256_avx_extend_doubleround 22
1410 sha256_avx_extend_doubleround 24
1411 sha256_avx_extend_doubleround 26
1412 sha256_avx_extend_doubleround 28
1413 sha256_avx_extend_doubleround 30
1414 sha256_avx_extend_doubleround 32
1415 sha256_avx_extend_doubleround 34
1416 sha256_avx_extend_doubleround 36
1417 sha256_avx_extend_doubleround 38
1418 sha256_avx_extend_doubleround 40
1419 sha256_avx_extend_doubleround 42
1420 sha256_avx_extend_doubleround 44
1421 sha256_avx_extend_doubleround 46
/* Load the 4-way state (a..h) from the caller's buffer. */
1422 movdqu 0(%rdi), %xmm7
1423 movdqu 16(%rdi), %xmm5
1424 movdqu 32(%rdi), %xmm4
1425 movdqu 48(%rdi), %xmm3
1426 movdqu 64(%rdi), %xmm0
1427 movdqu 80(%rdi), %xmm8
1428 movdqu 96(%rdi), %xmm9
1429 movdqu 112(%rdi), %xmm10
1431 leaq sha256_4k(%rip), %rcx
1432 sha256_avx_main_quadround 0
1433 sha256_avx_main_quadround 4
1434 sha256_avx_main_quadround 8
1435 sha256_avx_main_quadround 12
1436 sha256_avx_main_quadround 16
1437 sha256_avx_main_quadround 20
1438 sha256_avx_main_quadround 24
1439 sha256_avx_main_quadround 28
1440 sha256_avx_main_quadround 32
1441 sha256_avx_main_quadround 36
1442 sha256_avx_main_quadround 40
1443 sha256_avx_main_quadround 44
1444 sha256_avx_main_quadround 48
1445 sha256_avx_main_quadround 52
1446 sha256_avx_main_quadround 56
1447 sha256_avx_main_quadround 60
1448 jmp sha256_transform_4way_finish
/*
 * 4-way SHA-256 transform core, AMD XOP variant.
 * Identical structure to the AVX core; the per-round macros use XOP
 * vprotd rotates instead of shift/xor sequences for sigma/Sigma terms.
 */
1452 sha256_transform_4way_core_xop:
	/* rax -> W[16] slot in the stack scratch area */
1453 leaq 256(%rsp), %rax
	/* preload W[14]/W[15] for the first sigma1 term */
1454 movdqa -2*16(%rax), %xmm3
1455 movdqa -1*16(%rax), %xmm7
	/* message-schedule extension to W[63] */
1456 sha256_xop_extend_doubleround 0
1457 sha256_xop_extend_doubleround 2
1458 sha256_xop_extend_doubleround 4
1459 sha256_xop_extend_doubleround 6
1460 sha256_xop_extend_doubleround 8
1461 sha256_xop_extend_doubleround 10
1462 sha256_xop_extend_doubleround 12
1463 sha256_xop_extend_doubleround 14
1464 sha256_xop_extend_doubleround 16
1465 sha256_xop_extend_doubleround 18
1466 sha256_xop_extend_doubleround 20
1467 sha256_xop_extend_doubleround 22
1468 sha256_xop_extend_doubleround 24
1469 sha256_xop_extend_doubleround 26
1470 sha256_xop_extend_doubleround 28
1471 sha256_xop_extend_doubleround 30
1472 sha256_xop_extend_doubleround 32
1473 sha256_xop_extend_doubleround 34
1474 sha256_xop_extend_doubleround 36
1475 sha256_xop_extend_doubleround 38
1476 sha256_xop_extend_doubleround 40
1477 sha256_xop_extend_doubleround 42
1478 sha256_xop_extend_doubleround 44
1479 sha256_xop_extend_doubleround 46
	/* load state a..h from the context at rdi */
1480 movdqu 0(%rdi), %xmm7
1481 movdqu 16(%rdi), %xmm5
1482 movdqu 32(%rdi), %xmm4
1483 movdqu 48(%rdi), %xmm3
1484 movdqu 64(%rdi), %xmm0
1485 movdqu 80(%rdi), %xmm8
1486 movdqu 96(%rdi), %xmm9
1487 movdqu 112(%rdi), %xmm10
	/* 64 compression rounds, 4 per macro call */
1489 leaq sha256_4k(%rip), %rcx
1490 sha256_xop_main_quadround 0
1491 sha256_xop_main_quadround 4
1492 sha256_xop_main_quadround 8
1493 sha256_xop_main_quadround 12
1494 sha256_xop_main_quadround 16
1495 sha256_xop_main_quadround 20
1496 sha256_xop_main_quadround 24
1497 sha256_xop_main_quadround 28
1498 sha256_xop_main_quadround 32
1499 sha256_xop_main_quadround 36
1500 sha256_xop_main_quadround 40
1501 sha256_xop_main_quadround 44
1502 sha256_xop_main_quadround 48
1503 sha256_xop_main_quadround 52
1504 sha256_xop_main_quadround 56
1505 sha256_xop_main_quadround 60
	/* fold new state into the context and return */
1506 jmp sha256_transform_4way_finish
/*
 * Dispatch pointer: patched once by sha256_use_4way with the address
 * of the selected 4-way core (sse2, avx, or xop); the transform entry
 * does an indirect jmp through it.  (The data directive that reserves
 * the quad is elided in this excerpt.)
 */
1510 sha256_transform_4way_core_addr:
/*
 * p2bswap_rsi_rsp i — byte-swap 16-byte message blocks i and i+1 from
 * the input at rsi into the aligned scratch area at rsp, converting
 * each 32-bit word from big-endian to host order.
 * pshuflw/pshufhw with 0xb1 swap adjacent 16-bit words; the byte swap
 * inside each 16-bit word (psrlw/psllw/pxor, elided in this excerpt)
 * completes the 32-bit bswap.  NOTE(review): the closing .endm is also
 * elided here.
 */
1513 .macro p2bswap_rsi_rsp i
1514 movdqu \i*16(%rsi), %xmm0
1515 movdqu (\i+1)*16(%rsi), %xmm2
1516 pshuflw $0xb1, %xmm0, %xmm0
1517 pshuflw $0xb1, %xmm2, %xmm2
1518 pshufhw $0xb1, %xmm0, %xmm0
1519 pshufhw $0xb1, %xmm2, %xmm2
1528 movdqa %xmm0, \i*16(%rsp)
1529 movdqa %xmm2, (\i+1)*16(%rsp)
/*
 * void sha256_transform_4way(uint32_t *state, const uint32_t *data, int swap)
 * SysV: rdi = state (8 x 4 lanes), rsi = data (16 x 4 lanes).
 * Stages the 64-byte-per-lane message block on the stack (byte-swapping
 * it first when the swap argument is nonzero), then tail-jumps through
 * the runtime-selected core.  The underscore alias serves Mach-O name
 * mangling.  NOTE(review): prologue/stack setup, the swap-flag test,
 * the Win64 #endif, and the swap-path body are elided in this excerpt.
 */
1534 .globl sha256_transform_4way
1535 .globl _sha256_transform_4way
1536 sha256_transform_4way:
1537 _sha256_transform_4way:
1538 #if defined(_WIN64) || defined(__CYGWIN__)
	/* MS x64 ABI: xmm6-xmm15 are callee-saved; preserve the ones used */
1541 movdqa %xmm6, 0(%rsp)
1542 movdqa %xmm7, 16(%rsp)
1543 movdqa %xmm8, 32(%rsp)
1544 movdqa %xmm9, 48(%rsp)
1545 movdqa %xmm10, 64(%rsp)
1546 movdqa %xmm11, 80(%rsp)
	/* branch taken when the (elided) test of the swap argument is set */
1557 jnz sha256_transform_4way_swap
	/* no-swap path: copy 256 bytes of input to the aligned scratch area */
1559 movdqu 0*16(%rsi), %xmm0
1560 movdqu 1*16(%rsi), %xmm1
1561 movdqu 2*16(%rsi), %xmm2
1562 movdqu 3*16(%rsi), %xmm3
1563 movdqu 4*16(%rsi), %xmm4
1564 movdqu 5*16(%rsi), %xmm5
1565 movdqu 6*16(%rsi), %xmm6
1566 movdqu 7*16(%rsi), %xmm7
1567 movdqa %xmm0, 0*16(%rsp)
1568 movdqa %xmm1, 1*16(%rsp)
1569 movdqa %xmm2, 2*16(%rsp)
1570 movdqa %xmm3, 3*16(%rsp)
1571 movdqa %xmm4, 4*16(%rsp)
1572 movdqa %xmm5, 5*16(%rsp)
1573 movdqa %xmm6, 6*16(%rsp)
1574 movdqa %xmm7, 7*16(%rsp)
1575 movdqu 8*16(%rsi), %xmm0
1576 movdqu 9*16(%rsi), %xmm1
1577 movdqu 10*16(%rsi), %xmm2
1578 movdqu 11*16(%rsi), %xmm3
1579 movdqu 12*16(%rsi), %xmm4
1580 movdqu 13*16(%rsi), %xmm5
1581 movdqu 14*16(%rsi), %xmm6
1582 movdqu 15*16(%rsi), %xmm7
1583 movdqa %xmm0, 8*16(%rsp)
1584 movdqa %xmm1, 9*16(%rsp)
1585 movdqa %xmm2, 10*16(%rsp)
1586 movdqa %xmm3, 11*16(%rsp)
1587 movdqa %xmm4, 12*16(%rsp)
1588 movdqa %xmm5, 13*16(%rsp)
1589 movdqa %xmm6, 14*16(%rsp)
1590 movdqa %xmm7, 15*16(%rsp)
	/* dispatch to the CPU-specific core chosen by sha256_use_4way */
1591 jmp *sha256_transform_4way_core_addr(%rip)
1594 sha256_transform_4way_swap:
	/* swap path: p2bswap_rsi_rsp invocations (elided) byte-swap while copying */
1603 jmp *sha256_transform_4way_core_addr(%rip)
/*
 * Common tail for all 4-way cores: add the original state words to the
 * round output held in xmm7,5,4,3,0,8,9,10 (a..h) and store the new
 * state back to the context at rdi.
 * NOTE(review): the paddd instructions between the loads and the
 * stores, and the stack-restore/ret epilogue, are elided here.
 */
1606 sha256_transform_4way_finish:
	/* reload old a..d, fold into new values (adds elided) */
1607 movdqu 0(%rdi), %xmm2
1608 movdqu 16(%rdi), %xmm6
1609 movdqu 32(%rdi), %xmm11
1610 movdqu 48(%rdi), %xmm1
	/* reload old e..h, fold into new values (adds elided) */
1615 movdqu 64(%rdi), %xmm2
1616 movdqu 80(%rdi), %xmm6
1617 movdqu 96(%rdi), %xmm11
1618 movdqu 112(%rdi), %xmm1
	/* write the updated eight state words */
1624 movdqu %xmm7, 0(%rdi)
1625 movdqu %xmm5, 16(%rdi)
1626 movdqu %xmm4, 32(%rdi)
1627 movdqu %xmm3, 48(%rdi)
1628 movdqu %xmm0, 64(%rdi)
1629 movdqu %xmm8, 80(%rdi)
1630 movdqu %xmm9, 96(%rdi)
1631 movdqu %xmm10, 112(%rdi)
1634 #if defined(_WIN64) || defined(__CYGWIN__)
	/* restore the callee-saved XMM registers saved at entry (MS x64 ABI) */
1636 movdqa 0(%rsp), %xmm6
1637 movdqa 16(%rsp), %xmm7
1638 movdqa 32(%rsp), %xmm8
1639 movdqa 48(%rsp), %xmm9
1640 movdqa 64(%rsp), %xmm10
1641 movdqa 80(%rsp), %xmm11
/*
 * 8-way SHA-256 transform core, AVX2 variant.
 * One hash lane per 32-bit slot of each YMM register (8 lanes).
 * Same shape as the 4-way AVX core with doubled strides: 24 extend
 * double-rounds, then 16 main quad-rounds against the 8-way constant
 * table sha256_8k.
 */
1649 sha256_transform_8way_core_avx2:
	/* rax -> W[16] slot (32 bytes per schedule word, 8 lanes) */
1650 leaq 8*64(%rsp), %rax
	/* preload W[14]/W[15] for the first sigma1 term */
1651 vmovdqa -2*32(%rax), %ymm3
1652 vmovdqa -1*32(%rax), %ymm7
	/* message-schedule extension to W[63] */
1653 sha256_avx2_extend_doubleround 0
1654 sha256_avx2_extend_doubleround 2
1655 sha256_avx2_extend_doubleround 4
1656 sha256_avx2_extend_doubleround 6
1657 sha256_avx2_extend_doubleround 8
1658 sha256_avx2_extend_doubleround 10
1659 sha256_avx2_extend_doubleround 12
1660 sha256_avx2_extend_doubleround 14
1661 sha256_avx2_extend_doubleround 16
1662 sha256_avx2_extend_doubleround 18
1663 sha256_avx2_extend_doubleround 20
1664 sha256_avx2_extend_doubleround 22
1665 sha256_avx2_extend_doubleround 24
1666 sha256_avx2_extend_doubleround 26
1667 sha256_avx2_extend_doubleround 28
1668 sha256_avx2_extend_doubleround 30
1669 sha256_avx2_extend_doubleround 32
1670 sha256_avx2_extend_doubleround 34
1671 sha256_avx2_extend_doubleround 36
1672 sha256_avx2_extend_doubleround 38
1673 sha256_avx2_extend_doubleround 40
1674 sha256_avx2_extend_doubleround 42
1675 sha256_avx2_extend_doubleround 44
1676 sha256_avx2_extend_doubleround 46
	/* load state a..h from the context at rdi */
1677 vmovdqu 0*32(%rdi), %ymm7
1678 vmovdqu 1*32(%rdi), %ymm5
1679 vmovdqu 2*32(%rdi), %ymm4
1680 vmovdqu 3*32(%rdi), %ymm3
1681 vmovdqu 4*32(%rdi), %ymm0
1682 vmovdqu 5*32(%rdi), %ymm8
1683 vmovdqu 6*32(%rdi), %ymm9
1684 vmovdqu 7*32(%rdi), %ymm10
	/* 64 compression rounds, 4 per macro call */
1686 leaq sha256_8k(%rip), %rcx
1687 sha256_avx2_main_quadround 0
1688 sha256_avx2_main_quadround 4
1689 sha256_avx2_main_quadround 8
1690 sha256_avx2_main_quadround 12
1691 sha256_avx2_main_quadround 16
1692 sha256_avx2_main_quadround 20
1693 sha256_avx2_main_quadround 24
1694 sha256_avx2_main_quadround 28
1695 sha256_avx2_main_quadround 32
1696 sha256_avx2_main_quadround 36
1697 sha256_avx2_main_quadround 40
1698 sha256_avx2_main_quadround 44
1699 sha256_avx2_main_quadround 48
1700 sha256_avx2_main_quadround 52
1701 sha256_avx2_main_quadround 56
1702 sha256_avx2_main_quadround 60
	/* fold new state into the context and return */
1703 jmp sha256_transform_8way_finish
/*
 * p2bswap_avx2_rsi_rsp i — byte-swap 32-byte message blocks i and i+1
 * from the input at rsi into the aligned scratch area at rsp,
 * converting each 32-bit word from big-endian to host order.
 * Step 1: vpshuflw/vpshufhw with 0xb1 swap adjacent 16-bit words.
 * Step 2: shift-by-8 left/right and xor swap the bytes inside each
 * 16-bit word, completing the 32-bit bswap.
 * NOTE(review): the closing .endm is elided in this excerpt.
 */
1705 .macro p2bswap_avx2_rsi_rsp i
1706 vmovdqu \i*32(%rsi), %ymm0
1707 vmovdqu (\i+1)*32(%rsi), %ymm2
1708 vpshuflw $0xb1, %ymm0, %ymm0
1709 vpshuflw $0xb1, %ymm2, %ymm2
1710 vpshufhw $0xb1, %ymm0, %ymm0
1711 vpshufhw $0xb1, %ymm2, %ymm2
1712 vpsrlw $8, %ymm0, %ymm1
1713 vpsrlw $8, %ymm2, %ymm3
1714 vpsllw $8, %ymm0, %ymm0
1715 vpsllw $8, %ymm2, %ymm2
1716 vpxor %ymm1, %ymm0, %ymm0
1717 vpxor %ymm3, %ymm2, %ymm2
1718 vmovdqa %ymm0, \i*32(%rsp)
1719 vmovdqa %ymm2, (\i+1)*32(%rsp)
/*
 * void sha256_transform_8way(uint32_t *state, const uint32_t *data, int swap)
 * SysV: rdi = state (8 x 8 lanes), rsi = data (16 x 8 lanes).
 * Stages the message block on the stack (byte-swapping when the swap
 * argument is nonzero) and jumps to the AVX2 core.  The underscore
 * alias serves Mach-O name mangling.  NOTE(review): prologue/stack
 * setup, the swap-flag test, and the Win64 #endif are elided here.
 */
1724 .globl sha256_transform_8way
1725 .globl _sha256_transform_8way
1726 sha256_transform_8way:
1727 _sha256_transform_8way:
1728 #if defined(_WIN64) || defined(__CYGWIN__)
	/* MS x64 ABI: preserve callee-saved xmm6-xmm11 */
1731 vmovdqa %xmm6, 0(%rsp)
1732 vmovdqa %xmm7, 16(%rsp)
1733 vmovdqa %xmm8, 32(%rsp)
1734 vmovdqa %xmm9, 48(%rsp)
1735 vmovdqa %xmm10, 64(%rsp)
1736 vmovdqa %xmm11, 80(%rsp)
	/* branch taken when the (elided) test of the swap argument is set */
1747 jnz sha256_transform_8way_swap
	/* no-swap path: copy 512 bytes of input to the aligned scratch area */
1749 vmovdqu 0*32(%rsi), %ymm0
1750 vmovdqu 1*32(%rsi), %ymm1
1751 vmovdqu 2*32(%rsi), %ymm2
1752 vmovdqu 3*32(%rsi), %ymm3
1753 vmovdqu 4*32(%rsi), %ymm4
1754 vmovdqu 5*32(%rsi), %ymm5
1755 vmovdqu 6*32(%rsi), %ymm6
1756 vmovdqu 7*32(%rsi), %ymm7
1757 vmovdqa %ymm0, 0*32(%rsp)
1758 vmovdqa %ymm1, 1*32(%rsp)
1759 vmovdqa %ymm2, 2*32(%rsp)
1760 vmovdqa %ymm3, 3*32(%rsp)
1761 vmovdqa %ymm4, 4*32(%rsp)
1762 vmovdqa %ymm5, 5*32(%rsp)
1763 vmovdqa %ymm6, 6*32(%rsp)
1764 vmovdqa %ymm7, 7*32(%rsp)
1765 vmovdqu 8*32(%rsi), %ymm0
1766 vmovdqu 9*32(%rsi), %ymm1
1767 vmovdqu 10*32(%rsi), %ymm2
1768 vmovdqu 11*32(%rsi), %ymm3
1769 vmovdqu 12*32(%rsi), %ymm4
1770 vmovdqu 13*32(%rsi), %ymm5
1771 vmovdqu 14*32(%rsi), %ymm6
1772 vmovdqu 15*32(%rsi), %ymm7
1773 vmovdqa %ymm0, 8*32(%rsp)
1774 vmovdqa %ymm1, 9*32(%rsp)
1775 vmovdqa %ymm2, 10*32(%rsp)
1776 vmovdqa %ymm3, 11*32(%rsp)
1777 vmovdqa %ymm4, 12*32(%rsp)
1778 vmovdqa %ymm5, 13*32(%rsp)
1779 vmovdqa %ymm6, 14*32(%rsp)
1780 vmovdqa %ymm7, 15*32(%rsp)
1781 jmp sha256_transform_8way_core_avx2
1784 sha256_transform_8way_swap:
	/* swap path: byte-swap each 32-bit word while copying */
1785 p2bswap_avx2_rsi_rsp 0
1786 p2bswap_avx2_rsi_rsp 2
1787 p2bswap_avx2_rsi_rsp 4
1788 p2bswap_avx2_rsi_rsp 6
1789 p2bswap_avx2_rsi_rsp 8
1790 p2bswap_avx2_rsi_rsp 10
1791 p2bswap_avx2_rsi_rsp 12
1792 p2bswap_avx2_rsi_rsp 14
1793 jmp sha256_transform_8way_core_avx2
/*
 * Common tail for the 8-way core: add the original state words to the
 * round output in ymm7,5,4,3,0,8,9,10 (a..h) and store the new state
 * back to the context at rdi.
 * NOTE(review): the stack-restore/ret epilogue (and any vzeroupper)
 * is elided in this excerpt.
 */
1796 sha256_transform_8way_finish:
	/* new a..d += old a..d */
1797 vmovdqu 0*32(%rdi), %ymm2
1798 vmovdqu 1*32(%rdi), %ymm6
1799 vmovdqu 2*32(%rdi), %ymm11
1800 vmovdqu 3*32(%rdi), %ymm1
1801 vpaddd %ymm2, %ymm7, %ymm7
1802 vpaddd %ymm6, %ymm5, %ymm5
1803 vpaddd %ymm11, %ymm4, %ymm4
1804 vpaddd %ymm1, %ymm3, %ymm3
	/* new e..h += old e..h */
1805 vmovdqu 4*32(%rdi), %ymm2
1806 vmovdqu 5*32(%rdi), %ymm6
1807 vmovdqu 6*32(%rdi), %ymm11
1808 vmovdqu 7*32(%rdi), %ymm1
1809 vpaddd %ymm2, %ymm0, %ymm0
1810 vpaddd %ymm6, %ymm8, %ymm8
1811 vpaddd %ymm11, %ymm9, %ymm9
1812 vpaddd %ymm1, %ymm10, %ymm10
	/* write the updated eight state words */
1814 vmovdqu %ymm7, 0*32(%rdi)
1815 vmovdqu %ymm5, 1*32(%rdi)
1816 vmovdqu %ymm4, 2*32(%rdi)
1817 vmovdqu %ymm3, 3*32(%rdi)
1818 vmovdqu %ymm0, 4*32(%rdi)
1819 vmovdqu %ymm8, 5*32(%rdi)
1820 vmovdqu %ymm9, 6*32(%rdi)
1821 vmovdqu %ymm10, 7*32(%rdi)
1824 #if defined(_WIN64) || defined(__CYGWIN__)
	/* restore callee-saved XMM registers saved at entry (MS x64 ABI) */
1826 vmovdqa 0(%rsp), %xmm6
1827 vmovdqa 16(%rsp), %xmm7
1828 vmovdqa 32(%rsp), %xmm8
1829 vmovdqa 48(%rsp), %xmm9
1830 vmovdqa 64(%rsp), %xmm10
1831 vmovdqa 80(%rsp), %xmm11
/*
 * sha256_sse2_main_round_red i, r7 — reduced SHA-256 main round (SSE2)
 * used where only part of the state is needed (the sha256d tail).
 * rax = message-schedule base, rcx = round-constant base; a three-slot
 * rotating queue lives at 0/16/32(%rsp) and is shifted down one slot
 * per round.  NOTE(review): the Ch/Sigma1 computation and the .endm
 * are elided in this excerpt; roles above are inferred from the
 * visible loads/stores — confirm against the full source.
 */
1838 .macro sha256_sse2_main_round_red i, r7
	/* xmm6 = W[i] + K[i] + queue tail */
1839 movdqa 16*\i(%rax), %xmm6
1840 paddd 16*\i(%rcx), %xmm6
1841 paddd 32(%rsp), %xmm6
	/* shift the rotating queue: 16(%rsp) -> 32(%rsp), 0(%rsp) -> 16(%rsp) */
1843 movdqa 16(%rsp), %xmm2
1846 movdqa %xmm2, 32(%rsp)
1847 movdqa 0(%rsp), %xmm2
1848 movdqa %xmm2, 16(%rsp)
	/* enqueue the newly computed word at the queue head */
1851 movdqa %xmm0, 0(%rsp)
/*
 * sha256_avx_main_round_red i, r0..r4 — reduced SHA-256 main round
 * (AVX) computing only the next e value:
 *   r0' = r4 + r0 + W[i] + K[i] + Ch(r3, r2, r1) + Sigma1(r3)
 * with rax = schedule base and rcx = constant base; presumably
 * r0 = h, r1 = g, r2 = f, r3 = e, r4 = d — TODO confirm against the
 * non-reduced round macro (one Sigma1 line, vpsrld into \r0, is
 * elided in this excerpt).
 */
1869 .macro sha256_avx_main_round_red i, r0, r1, r2, r3, r4
	/* xmm6 = W[i] + K[i] + r0 */
1870 vpaddd 16*\i(%rax), \r0, %xmm6
1871 vpaddd 16*\i(%rcx), %xmm6, %xmm6
	/* Ch(r3, r2, r1) = (r3 & r2) ^ (~r3 & r1) */
1872 vpandn \r1, \r3, %xmm1
1873 vpand \r3, \r2, %xmm2
1874 vpxor %xmm2, %xmm1, %xmm1
1875 vpaddd %xmm1, %xmm6, %xmm6
	/* Sigma1(r3) built from composed shifts/xors accumulated in \r0 */
1876 vpslld $7, \r3, %xmm1
1878 vpsrld $5, \r0, %xmm2
1879 vpxor %xmm1, \r0, \r0
1880 vpxor %xmm2, \r0, \r0
1881 vpslld $14, %xmm1, %xmm1
1882 vpsrld $14, %xmm2, %xmm2
1883 vpxor %xmm1, \r0, \r0
1884 vpxor %xmm2, \r0, \r0
1885 vpslld $5, %xmm1, %xmm1
1886 vpxor %xmm1, \r0, \r0
	/* r0' = r4 + (sum so far) */
1887 vpaddd \r0, %xmm6, %xmm6
1888 vpaddd %xmm6, \r4, \r0
/*
 * sha256_xop_main_round_red i, r0..r4 — XOP version of the reduced
 * main round; same contract as sha256_avx_main_round_red but Sigma1
 * is built from vprotd rotates: rotl 26 == rotr 6 and rotl 21 ==
 * rotr 11 (the third rotate, line elided here, presumably supplies
 * rotr 25 — TODO confirm).
 */
1891 .macro sha256_xop_main_round_red i, r0, r1, r2, r3, r4
	/* xmm6 = W[i] + K[i] + r0 */
1892 vpaddd 16*\i(%rax), \r0, %xmm6
1893 vpaddd 16*\i(%rcx), %xmm6, %xmm6
	/* Ch(r3, r2, r1) = (r3 & r2) ^ (~r3 & r1) */
1894 vpandn \r1, \r3, %xmm1
1895 vpand \r3, \r2, %xmm2
1896 vpxor %xmm2, %xmm1, %xmm1
1897 vpaddd %xmm1, %xmm6, %xmm6
	/* Sigma1(r3) from rotates */
1898 vprotd $26, \r3, %xmm1
1899 vprotd $21, \r3, %xmm2
1900 vpxor %xmm1, %xmm2, %xmm2
1902 vpxor %xmm2, \r0, \r0
	/* r0' = r4 + (sum so far) */
1903 vpaddd \r0, %xmm6, %xmm6
1904 vpaddd %xmm6, \r4, \r0
/*
 * int sha256_use_4way(void) — runtime CPU-feature dispatch.
 * Probes, in order: VIA PadLock Hash Engine (Centaur CPUID leaves
 * 0xC0000000/0xC0000001, EDX bits 10-11 = PHE present+enabled), then
 * AVX+OSXSAVE (CPUID.1 ECX bits 27-28) with XMM+YMM state enabled in
 * XCR0 (bits 1-2), then XOP (CPUID.80000001 ECX bit 11), and patches
 * the transform dispatch pointers accordingly.
 * NOTE(review): the cpuid/xgetbv instructions, register save/restore,
 * and return-value setup are elided in this excerpt.
 */
1909 .globl sha256_use_4way
1910 .globl _sha256_use_4way
1917 /* Check for VIA PadLock Hash Engine */
1918 movl $0xc0000000, %eax
	/* max Centaur leaf must reach 0xc0000001 (cpuid elided) */
1920 cmpl $0xc0000001, %eax
1921 jb sha256_use_4way_no_phe
1922 movl $0xc0000001, %eax
	/* EDX bits 10-11: PHE present and enabled */
1924 andl $0x00000c00, %edx
1925 cmpl $0x00000c00, %edx
1926 jne sha256_use_4way_no_phe
	/* PHE available: route scalar transform through the PadLock path */
1927 leaq sha256_transform_phe(%rip), %rdx
1928 movq %rdx, sha256_transform_addr(%rip)
1930 jmp sha256_use_4way_exit
1931 sha256_use_4way_no_phe:
1932 /* Check for AVX and OSXSAVE support */
	/* CPUID.1:ECX bit 27 (OSXSAVE) and bit 28 (AVX) must both be set */
1935 andl $0x18000000, %ecx
1936 cmpl $0x18000000, %ecx
1937 jne sha256_use_4way_base
1938 /* Check for XMM and YMM state support */
	/* XCR0 bits 1-2: OS saves XMM and YMM state (xgetbv elided) */
1941 andl $0x00000006, %eax
1942 cmpl $0x00000006, %eax
1943 jne sha256_use_4way_base
1944 /* Check for XOP support */
1945 movl $0x80000001, %eax
	/* CPUID.80000001:ECX bit 11 = XOP */
1947 andl $0x00000800, %ecx
1948 jz sha256_use_4way_avx
1950 sha256_use_4way_xop:
1951 leaq sha256_transform_4way_core_xop(%rip), %rdx
1952 jmp sha256_use_4way_done
1954 sha256_use_4way_avx:
1955 leaq sha256_transform_4way_core_avx(%rip), %rdx
1956 jmp sha256_use_4way_done
1958 sha256_use_4way_base:
	/* SSE2 fallback — always available on x86-64 */
1959 leaq sha256_transform_4way_core_sse2(%rip), %rdx
1961 sha256_use_4way_done:
	/* patch the dispatch pointer used by sha256_transform_4way */
1962 movq %rdx, sha256_transform_4way_core_addr(%rip)
1964 sha256_use_4way_exit:
/*
 * int sha256_use_ssse3(void) — reports SSSE3 availability via
 * CPUID.1:ECX bit 9.  NOTE(review): the cpuid instruction, result
 * setup, and return are elided in this excerpt.
 */
1972 .globl sha256_use_ssse3
1973 .globl _sha256_use_ssse3
	/* CPUID.1:ECX bit 9 = SSSE3 */
1980 andl $0x00000200, %ecx
1981 jz sha256_use_ssse3_done
1988 sha256_use_ssse3_done:
/*
 * sha256_avx2_main_round_red i, r0..r4 — 8-way (YMM) version of the
 * reduced main round; same contract as sha256_avx_main_round_red with
 * 32-byte strides:
 *   r0' = r4 + r0 + W[i] + K[i] + Ch(r3, r2, r1) + Sigma1(r3)
 * (one Sigma1 line, vpsrld into \r0, is elided in this excerpt).
 */
1995 .macro sha256_avx2_main_round_red i, r0, r1, r2, r3, r4
	/* ymm6 = W[i] + K[i] + r0 */
1996 vpaddd 32*\i(%rax), \r0, %ymm6
1997 vpaddd 32*\i(%rcx), %ymm6, %ymm6
	/* Ch(r3, r2, r1) = (r3 & r2) ^ (~r3 & r1) */
1998 vpandn \r1, \r3, %ymm1
1999 vpand \r3, \r2, %ymm2
2000 vpxor %ymm2, %ymm1, %ymm1
2001 vpaddd %ymm1, %ymm6, %ymm6
	/* Sigma1(r3) built from composed shifts/xors accumulated in \r0 */
2002 vpslld $7, \r3, %ymm1
2004 vpsrld $5, \r0, %ymm2
2005 vpxor %ymm1, \r0, \r0
2006 vpxor %ymm2, \r0, \r0
2007 vpslld $14, %ymm1, %ymm1
2008 vpsrld $14, %ymm2, %ymm2
2009 vpxor %ymm1, \r0, \r0
2010 vpxor %ymm2, \r0, \r0
2011 vpslld $5, %ymm1, %ymm1
2012 vpxor %ymm1, \r0, \r0
	/* r0' = r4 + (sum so far) */
2013 vpaddd \r0, %ymm6, %ymm6
2014 vpaddd %ymm6, \r4, \r0
2019 .globl sha256_use_8way
2020 .globl _sha256_use_8way
2025 /* Check for AVX and OSXSAVE support */
2028 andl $0x18000000, %ecx
2029 cmpl $0x18000000, %ecx
2030 jne sha256_use_8way_no
2031 /* Check for AVX2 support */
2035 andl $0x00000020, %ebx
2036 cmpl $0x00000020, %ebx
2037 jne sha256_use_8way_no
2038 /* Check for XMM and YMM state support */
2041 andl $0x00000006, %eax
2042 cmpl $0x00000006, %eax
2043 jne sha256_use_8way_no
2045 sha256_use_8way_yes:
2047 jmp sha256_use_8way_done
2052 sha256_use_8way_done: