--- /dev/null
+/*
+ * Copyright 2012 pooler@litecoinpool.org
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version. See COPYING for more details.
+ */
+
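+/* SHA-256 block transform and mining-oriented double-SHA-256 helpers for
+ * 32-bit ARM, with 4-way NEON variants when __ARM_NEON__ is defined. */
+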
+#if defined(__arm__) && defined(__APCS_32__)
+
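+/* The 64 SHA-256 round constants K[0..63], emitted wherever a PC-relative
+ * literal table is needed. */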
+.macro sha256_k
+ .align 2
+ .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+ .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+ .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+ .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+ .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+ .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+ .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+ .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+ .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+ .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+ .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+ .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+ .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+ .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+ .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+ .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+.endm
+
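+/*
+ * Message-schedule extension, two words at a time:
+ *   W[i+16] = sigma1(W[i+14]) + W[i+9]  + sigma0(W[i+1]) + W[i]
+ *   W[i+17] = sigma1(W[i+15]) + W[i+10] + sigma0(W[i+2]) + W[i+1]
+ * with sigma0(x) = (x ror 7) ^ (x ror 18) ^ (x >> 3) and
+ *      sigma1(x) = (x ror 17) ^ (x ror 19) ^ (x >> 10).
+ * rw points to W; ra/rb carry W[i+9]/W[i+10] in and the two new words out;
+ * ry/rz hold W[i+14]/W[i+15]; r11 and lr stage W[i], W[i+1] and W[i+2].
+ * The head/body/foot variants differ only in how the surrounding loads and
+ * stores are scheduled.
+ */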
+.macro sha256_extend_doubleround_core i, rw, ra, rb, ry, rz
+ mov r12, \ry, ror #17
+ add r11, r11, \ra
+ eor r12, r12, \ry, ror #19
+ mov \ra, lr, ror #7
+ eor r12, r12, \ry, lsr #10
+ eor \ra, \ra, lr, ror #18
+ add r12, r12, r11
+ ldr r11, [\rw, #(\i+2)*4]
+ eor \ra, \ra, lr, lsr #3
+ add \ra, \ra, r12
+
+ mov r12, \rz, ror #17
+ str \ra, [\rw, #(\i+16)*4]
+ add lr, lr, \rb
+ eor r12, r12, \rz, ror #19
+ mov \rb, r11, ror #7
+ eor r12, r12, \rz, lsr #10
+ eor \rb, \rb, r11, ror #18
+ add lr, lr, r12
+ eor \rb, \rb, r11, lsr #3
+ add \rb, \rb, lr
+.endm
+
+.macro sha256_extend_doubleround_head i, rw, ra, rb, ry, rz
+ ldr lr, [\rw, #(\i+1)*4]
+ sha256_extend_doubleround_core \i, \rw, \ra, \rb, \ry, \rz
+ ldr lr, [\rw, #(\i+3)*4]
+.endm
+
+.macro sha256_extend_doubleround_body i, rw, ra, rb, ry, rz
+ str \rz, [\rw, #(\i+15)*4]
+ sha256_extend_doubleround_core \i, \rw, \ra, \rb, \ry, \rz
+ ldr lr, [\rw, #(\i+3)*4]
+.endm
+
+.macro sha256_extend_doubleround_foot i, rw, ra, rb, ry, rz
+ str \rz, [\rw, #(\i+15)*4]
+ sha256_extend_doubleround_core \i, \rw, \ra, \rb, \ry, \rz
+ str \rb, [\rw, #(\i+17)*4]
+.endm
+
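+/*
+ * One compression round.  ra..rh are the working variables a..h for this
+ * round, rw points to the extended message and ka to the round constants.
+ * Sigma1 and Sigma0 are built as ((e ^ (e ror 5) ^ (e ror 19)) ror 6) and
+ * ((a ^ (a ror 11) ^ (a ror 20)) ror 2), which equal the canonical
+ * (ror 6 ^ ror 11 ^ ror 25) and (ror 2 ^ ror 13 ^ ror 22) forms.  The
+ * quadround macro below rotates the register assignments so no working
+ * variable ever has to be moved between rounds.
+ */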
+.macro sha256_main_round i, ka, rw, ra, rb, rc, rd, re, rf, rg, rh
+ ldr r12, [\rw, #(\i)*4]
+ and r3, \rf, \re
+ bic lr, \rg, \re
+ orr lr, lr, r3
+ ldr r3, \ka + (\i)*4
+ add \rh, \rh, lr
+ eor lr, \re, \re, ror #5
+ add \rh, \rh, r12
+ eor lr, lr, \re, ror #19
+ add \rh, \rh, r3
+ eor r3, \ra, \rb
+ add \rh, \rh, lr, ror #6
+
+ and r3, r3, \rc
+ eor r12, \ra, \ra, ror #11
+ and lr, \ra, \rb
+ eor r12, r12, \ra, ror #20
+ eor lr, lr, r3
+ add r3, \rh, lr
+ add \rh, \rh, \rd
+ add \rd, r3, r12, ror #2
+.endm
+
+.macro sha256_main_quadround i, ka, rw
+ sha256_main_round \i+0, \ka, \rw, r4, r5, r6, r7, r8, r9, r10, r11
+ sha256_main_round \i+1, \ka, \rw, r7, r4, r5, r6, r11, r8, r9, r10
+ sha256_main_round \i+2, \ka, \rw, r6, r7, r4, r5, r10, r11, r8, r9
+ sha256_main_round \i+3, \ka, \rw, r5, r6, r7, r4, r9, r10, r11, r8
+.endm
+
+
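+/*
+ * sha256_transform(state, block, swap)
+ *   r0 = pointer to the 8-word hash state
+ *   r1 = pointer to the 16-word message block
+ *   r2 = nonzero if the block must be byte-swapped first
+ * Copies the block onto the stack (byte-swapping it if requested), extends
+ * it to 64 words, runs the 64 rounds and adds the result into the state.
+ */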
+ .text
+ .code 32
+ .align 2
+ .globl sha256_transform
+ .globl _sha256_transform
+#ifdef __ELF__
+ .type sha256_transform, %function
+#endif
+sha256_transform:
+_sha256_transform:
+ stmfd sp!, {r4-r11, lr}
+ cmp r2, #0
+ sub sp, sp, #64*4
+ bne sha256_transform_swap
+
+ ldmia r1!, {r4-r11}
+ stmia sp, {r4-r11}
+ add r3, sp, #8*4
+ ldmia r1, {r4-r11}
+ stmia r3, {r4-r11}
+ b sha256_transform_extend
+
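+/* Byte-swap a 32-bit word without the rev instruction, so the code also
+ * runs on pre-ARMv6 cores. */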
+.macro bswap rd, rn
+ eor r12, \rn, \rn, ror #16
+ bic r12, r12, #0x00ff0000
+ mov \rd, \rn, ror #8
+ eor \rd, \rd, r12, lsr #8
+.endm
+
+sha256_transform_swap:
+ ldmia r1!, {r4-r11}
+ bswap r4, r4
+ bswap r5, r5
+ bswap r6, r6
+ bswap r7, r7
+ bswap r8, r8
+ bswap r9, r9
+ bswap r10, r10
+ bswap r11, r11
+ stmia sp, {r4-r11}
+ add r3, sp, #8*4
+ ldmia r1, {r4-r11}
+ bswap r4, r4
+ bswap r5, r5
+ bswap r6, r6
+ bswap r7, r7
+ bswap r8, r8
+ bswap r9, r9
+ bswap r10, r10
+ bswap r11, r11
+ stmia r3, {r4-r11}
+
+sha256_transform_extend:
+ add r12, sp, #9*4
+ ldr r11, [sp, #0*4]
+ ldmia r12, {r4-r10}
+ sha256_extend_doubleround_head 0, sp, r4, r5, r9, r10
+ sha256_extend_doubleround_body 2, sp, r6, r7, r4, r5
+ sha256_extend_doubleround_body 4, sp, r8, r9, r6, r7
+ sha256_extend_doubleround_body 6, sp, r10, r4, r8, r9
+ sha256_extend_doubleround_body 8, sp, r5, r6, r10, r4
+ sha256_extend_doubleround_body 10, sp, r7, r8, r5, r6
+ sha256_extend_doubleround_body 12, sp, r9, r10, r7, r8
+ sha256_extend_doubleround_body 14, sp, r4, r5, r9, r10
+ sha256_extend_doubleround_body 16, sp, r6, r7, r4, r5
+ sha256_extend_doubleround_body 18, sp, r8, r9, r6, r7
+ sha256_extend_doubleround_body 20, sp, r10, r4, r8, r9
+ sha256_extend_doubleround_body 22, sp, r5, r6, r10, r4
+ sha256_extend_doubleround_body 24, sp, r7, r8, r5, r6
+ sha256_extend_doubleround_body 26, sp, r9, r10, r7, r8
+ sha256_extend_doubleround_body 28, sp, r4, r5, r9, r10
+ sha256_extend_doubleround_body 30, sp, r6, r7, r4, r5
+ sha256_extend_doubleround_body 32, sp, r8, r9, r6, r7
+ sha256_extend_doubleround_body 34, sp, r10, r4, r8, r9
+ sha256_extend_doubleround_body 36, sp, r5, r6, r10, r4
+ sha256_extend_doubleround_body 38, sp, r7, r8, r5, r6
+ sha256_extend_doubleround_body 40, sp, r9, r10, r7, r8
+ sha256_extend_doubleround_body 42, sp, r4, r5, r9, r10
+ sha256_extend_doubleround_body 44, sp, r6, r7, r4, r5
+ sha256_extend_doubleround_foot 46, sp, r8, r9, r6, r7
+
+ ldmia r0, {r4-r11}
+ sha256_main_quadround 0, sha256_transform_k, sp
+ sha256_main_quadround 4, sha256_transform_k, sp
+ sha256_main_quadround 8, sha256_transform_k, sp
+ sha256_main_quadround 12, sha256_transform_k, sp
+ sha256_main_quadround 16, sha256_transform_k, sp
+ sha256_main_quadround 20, sha256_transform_k, sp
+ sha256_main_quadround 24, sha256_transform_k, sp
+ sha256_main_quadround 28, sha256_transform_k, sp
+ b sha256_transform_k_over
+sha256_transform_k:
+ sha256_k
+sha256_transform_k_over:
+ sha256_main_quadround 32, sha256_transform_k, sp
+ sha256_main_quadround 36, sha256_transform_k, sp
+ sha256_main_quadround 40, sha256_transform_k, sp
+ sha256_main_quadround 44, sha256_transform_k, sp
+ sha256_main_quadround 48, sha256_transform_k, sp
+ sha256_main_quadround 52, sha256_transform_k, sp
+ sha256_main_quadround 56, sha256_transform_k, sp
+ sha256_main_quadround 60, sha256_transform_k, sp
+
+ ldmia r0, {r1, r2, r3, r12}
+ add r4, r4, r1
+ add r5, r5, r2
+ add r6, r6, r3
+ add r7, r7, r12
+ stmia r0!, {r4-r7}
+ ldmia r0, {r1, r2, r3, r12}
+ add r8, r8, r1
+ add r9, r9, r2
+ add r10, r10, r3
+ add r11, r11, r12
+ stmia r0, {r8-r11}
+
+ add sp, sp, #64*4
+#ifdef __thumb__
+ ldmfd sp!, {r4-r11, lr}
+ bx lr
+#else
+ ldmfd sp!, {r4-r11, pc}
+#endif
+
+
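+/*
+ * sha256d_ms(hash, W, midstate, prehash)
+ *   r0 = output hash, r1 = 64-word message schedule, r2 = midstate,
+ *   r3 = prehash (the midstate advanced by the first three rounds, whose
+ *        message words do not depend on the nonce)
+ * Double SHA-256 specialised for mining: between calls only W[3] (the
+ * nonce) changes, so large parts of the message schedule and of the second
+ * hash's input are precomputed or constant and are patched in rather than
+ * recomputed.
+ */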
+ .text
+ .code 32
+ .align 2
+ .globl sha256d_ms
+ .globl _sha256d_ms
+#ifdef __ELF__
+ .type sha256d_ms, %function
+#endif
+sha256d_ms:
+_sha256d_ms:
+ stmfd sp!, {r4-r11, lr}
+ sub sp, sp, #64*4
+
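+	/* cmp r0, r0 forces Z = 1; none of the instructions below alter the
+	 * flags, so the shared extend/main code falls through its bne exits on
+	 * this first hash and takes them on the second (see movs r1, sp). */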
+ cmp r0, r0
+
+ ldr lr, [r1, #3*4]
+ ldr r6, [r1, #18*4]
+ ldr r7, [r1, #19*4]
+
+ mov r12, lr, ror #7
+ str r6, [sp, #18*4]
+ eor r12, r12, lr, ror #18
+ str r7, [sp, #19*4]
+ eor r12, r12, lr, lsr #3
+ ldr r8, [r1, #20*4]
+ add r6, r6, r12
+ ldr r10, [r1, #22*4]
+ add r7, r7, lr
+ str r6, [r1, #18*4]
+
+ mov r12, r6, ror #17
+ str r7, [r1, #19*4]
+ eor r12, r12, r6, ror #19
+ str r8, [sp, #20*4]
+ eor r12, r12, r6, lsr #10
+ ldr r4, [r1, #23*4]
+ add r8, r8, r12
+ ldr r5, [r1, #24*4]
+
+ mov r9, r7, ror #17
+ str r8, [r1, #20*4]
+ eor r9, r9, r7, ror #19
+ str r10, [sp, #21*4]
+ eor r9, r9, r7, lsr #10
+ str r4, [sp, #22*4]
+
+ mov r12, r8, ror #17
+ str r9, [r1, #21*4]
+ eor r12, r12, r8, ror #19
+ str r5, [sp, #23*4]
+ eor r12, r12, r8, lsr #10
+ mov lr, r9, ror #17
+ add r10, r10, r12
+ ldr r11, [r1, #30*4]
+
+ eor lr, lr, r9, ror #19
+ str r10, [r1, #22*4]
+ eor lr, lr, r9, lsr #10
+ str r11, [sp, #24*4]
+ add r4, r4, lr
+
+ mov r12, r10, ror #17
+ str r4, [r1, #23*4]
+ eor r12, r12, r10, ror #19
+ mov lr, r4, ror #17
+ eor r12, r12, r10, lsr #10
+ eor lr, lr, r4, ror #19
+ add r5, r5, r12
+ eor lr, lr, r4, lsr #10
+ str r5, [r1, #24*4]
+ add r6, r6, lr
+
+ mov r12, r5, ror #17
+ str r6, [r1, #25*4]
+ eor r12, r12, r5, ror #19
+ mov lr, r6, ror #17
+ eor r12, r12, r5, lsr #10
+ eor lr, lr, r6, ror #19
+ add r7, r7, r12
+ eor lr, lr, r6, lsr #10
+ str r7, [r1, #26*4]
+ add r8, r8, lr
+
+ mov r12, r7, ror #17
+ str r8, [r1, #27*4]
+ eor r12, r12, r7, ror #19
+ mov lr, r8, ror #17
+ eor r12, r12, r7, lsr #10
+ eor lr, lr, r8, ror #19
+ add r9, r9, r12
+ eor lr, lr, r8, lsr #10
+ str r9, [r1, #28*4]
+ add r10, r10, lr
+
+ ldr lr, [r1, #31*4]
+ mov r12, r9, ror #17
+ str r10, [r1, #29*4]
+ eor r12, r12, r9, ror #19
+ str lr, [sp, #25*4]
+ eor r12, r12, r9, lsr #10
+ add r11, r11, r12
+ add r5, r5, lr
+ mov r12, r10, ror #17
+ add r4, r4, r11
+
+ ldr r11, [r1, #16*4]
+ eor r12, r12, r10, ror #19
+ str r4, [r1, #30*4]
+ eor r12, r12, r10, lsr #10
+ add r5, r5, r12
+ ldr lr, [r1, #17*4]
+
+sha256d_ms_extend_loop2:
+ sha256_extend_doubleround_body 16, r1, r6, r7, r4, r5
+ sha256_extend_doubleround_body 18, r1, r8, r9, r6, r7
+ sha256_extend_doubleround_body 20, r1, r10, r4, r8, r9
+ sha256_extend_doubleround_body 22, r1, r5, r6, r10, r4
+ sha256_extend_doubleround_body 24, r1, r7, r8, r5, r6
+ sha256_extend_doubleround_body 26, r1, r9, r10, r7, r8
+ sha256_extend_doubleround_body 28, r1, r4, r5, r9, r10
+ sha256_extend_doubleround_body 30, r1, r6, r7, r4, r5
+ sha256_extend_doubleround_body 32, r1, r8, r9, r6, r7
+ sha256_extend_doubleround_body 34, r1, r10, r4, r8, r9
+ sha256_extend_doubleround_body 36, r1, r5, r6, r10, r4
+ sha256_extend_doubleround_body 38, r1, r7, r8, r5, r6
+ sha256_extend_doubleround_body 40, r1, r9, r10, r7, r8
+ sha256_extend_doubleround_body 42, r1, r4, r5, r9, r10
+ bne sha256d_ms_extend_coda2
+ sha256_extend_doubleround_body 44, r1, r6, r7, r4, r5
+ sha256_extend_doubleround_foot 46, r1, r8, r9, r6, r7
+
+ ldr r4, [r3, #0*4]
+ ldr r9, [r3, #1*4]
+ ldr r10, [r3, #2*4]
+ ldr r11, [r3, #3*4]
+ ldr r8, [r3, #4*4]
+ ldr r5, [r3, #5*4]
+ ldr r6, [r3, #6*4]
+ ldr r7, [r3, #7*4]
+ b sha256d_ms_main_loop1
+
+sha256d_ms_main_loop2:
+ sha256_main_round 0, sha256d_ms_k, r1, r4, r5, r6, r7, r8, r9, r10, r11
+ sha256_main_round 1, sha256d_ms_k, r1, r7, r4, r5, r6, r11, r8, r9, r10
+ sha256_main_round 2, sha256d_ms_k, r1, r6, r7, r4, r5, r10, r11, r8, r9
+sha256d_ms_main_loop1:
+ sha256_main_round 3, sha256d_ms_k, r1, r5, r6, r7, r4, r9, r10, r11, r8
+ sha256_main_quadround 4, sha256d_ms_k, r1
+ sha256_main_quadround 8, sha256d_ms_k, r1
+ sha256_main_quadround 12, sha256d_ms_k, r1
+ sha256_main_quadround 16, sha256d_ms_k, r1
+ sha256_main_quadround 20, sha256d_ms_k, r1
+ sha256_main_quadround 24, sha256d_ms_k, r1
+ sha256_main_quadround 28, sha256d_ms_k, r1
+ b sha256d_ms_k_over
+sha256d_ms_k:
+ sha256_k
+sha256d_ms_k_over:
+ sha256_main_quadround 32, sha256d_ms_k, r1
+ sha256_main_quadround 36, sha256d_ms_k, r1
+ sha256_main_quadround 40, sha256d_ms_k, r1
+ sha256_main_quadround 44, sha256d_ms_k, r1
+ sha256_main_quadround 48, sha256d_ms_k, r1
+ sha256_main_quadround 52, sha256d_ms_k, r1
+ sha256_main_round 56, sha256d_ms_k, r1, r4, r5, r6, r7, r8, r9, r10, r11
+ bne sha256d_ms_finish
+ sha256_main_round 57, sha256d_ms_k, r1, r7, r4, r5, r6, r11, r8, r9, r10
+ sha256_main_round 58, sha256d_ms_k, r1, r6, r7, r4, r5, r10, r11, r8, r9
+ sha256_main_round 59, sha256d_ms_k, r1, r5, r6, r7, r4, r9, r10, r11, r8
+ sha256_main_quadround 60, sha256d_ms_k, r1
+
+ ldmia r2!, {r3, r12, lr}
+ add r4, r4, r3
+ add r5, r5, r12
+ add r6, r6, lr
+ stmia sp, {r4-r6}
+ ldmia r2, {r3, r4, r5, r6, r12}
+ add lr, sp, #3*4
+ add r7, r7, r3
+ add r8, r8, r4
+ add r9, r9, r5
+ add r10, r10, r6
+ add r11, r11, r12
+ add r12, sp, #18*4
+ stmia lr!, {r7-r11}
+
+ ldmia r12, {r4-r11}
+ str r4, [r1, #18*4]
+ str r5, [r1, #19*4]
+ str r6, [r1, #20*4]
+ str r7, [r1, #22*4]
+ str r8, [r1, #23*4]
+ str r9, [r1, #24*4]
+ str r10, [r1, #30*4]
+ str r11, [r1, #31*4]
+
+ mov r3, #0x80000000
+ mov r4, #0
+ mov r5, #0
+ mov r6, #0
+ mov r7, #0
+ mov r8, #0
+ mov r9, #0
+ mov r10, #0x00000100
+ stmia lr, {r3-r10}
+
+ ldr lr, [sp, #1*4]
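+	/* movs also clears Z (sp is never zero), so the second pass through the
+	 * shared code branches to the coda and finish paths. */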
+ movs r1, sp
+ ldr r4, [sp, #0*4]
+
+ ldr r11, [sp, #2*4]
+ mov r12, lr, ror #7
+ eor r12, r12, lr, ror #18
+ add r5, lr, #0x00a00000
+ eor r12, r12, lr, lsr #3
+ mov lr, r11, ror #7
+ add r4, r4, r12
+ eor lr, lr, r11, ror #18
+ str r4, [sp, #16*4]
+ eor lr, lr, r11, lsr #3
+ mov r12, r4, ror #17
+ add r5, r5, lr
+ ldr lr, [sp, #3*4]
+
+ str r5, [sp, #17*4]
+ eor r12, r12, r4, ror #19
+ mov r6, lr, ror #7
+ eor r12, r12, r4, lsr #10
+ eor r6, r6, lr, ror #18
+ add r11, r11, r12
+ eor r6, r6, lr, lsr #3
+ mov r12, r5, ror #17
+ add r6, r6, r11
+ ldr r11, [sp, #4*4]
+
+ str r6, [sp, #18*4]
+ eor r12, r12, r5, ror #19
+ mov r7, r11, ror #7
+ eor r12, r12, r5, lsr #10
+ eor r7, r7, r11, ror #18
+ add lr, lr, r12
+ eor r7, r7, r11, lsr #3
+ mov r12, r6, ror #17
+ add r7, r7, lr
+ ldr lr, [sp, #5*4]
+
+ str r7, [sp, #19*4]
+ eor r12, r12, r6, ror #19
+ mov r8, lr, ror #7
+ eor r12, r12, r6, lsr #10
+ eor r8, r8, lr, ror #18
+ add r11, r11, r12
+ eor r8, r8, lr, lsr #3
+ mov r12, r7, ror #17
+ add r8, r8, r11
+ ldr r11, [sp, #6*4]
+
+ str r8, [sp, #20*4]
+ eor r12, r12, r7, ror #19
+ mov r9, r11, ror #7
+ eor r12, r12, r7, lsr #10
+ eor r9, r9, r11, ror #18
+ add lr, lr, r12
+ eor r9, r9, r11, lsr #3
+ mov r12, r8, ror #17
+ add r9, r9, lr
+ ldr lr, [sp, #7*4]
+
+ str r9, [sp, #21*4]
+ eor r12, r12, r8, ror #19
+ mov r10, lr, ror #7
+ eor r12, r12, r8, lsr #10
+ eor r10, r10, lr, ror #18
+ add r11, r11, r12
+ eor r10, r10, lr, lsr #3
+ mov r12, r9, ror #17
+ add r11, r11, #0x00000100
+ add lr, lr, r4
+ add r10, r10, r11
+
+ eor r12, r12, r9, ror #19
+ str r10, [sp, #22*4]
+ add lr, lr, #0x11000000
+ eor r12, r12, r9, lsr #10
+ add lr, lr, r12
+ mov r12, r10, ror #17
+ add r4, lr, #0x00002000
+ eor r12, r12, r10, ror #19
+ str r4, [sp, #23*4]
+ add r5, r5, #0x80000000
+ eor r12, r12, r10, lsr #10
+ add r5, r5, r12
+
+ mov r12, r4, ror #17
+ str r5, [sp, #24*4]
+ eor r12, r12, r4, ror #19
+ mov r11, r5, ror #17
+ eor r12, r12, r4, lsr #10
+ eor r11, r11, r5, ror #19
+ add r6, r6, r12
+ eor r11, r11, r5, lsr #10
+ str r6, [sp, #25*4]
+ add r7, r7, r11
+
+ mov r12, r6, ror #17
+ str r7, [sp, #26*4]
+ eor r12, r12, r6, ror #19
+ mov r11, r7, ror #17
+ eor r12, r12, r6, lsr #10
+ eor r11, r11, r7, ror #19
+ add r8, r8, r12
+ eor r11, r11, r7, lsr #10
+ str r8, [sp, #27*4]
+ add r9, r9, r11
+
+ mov lr, r8, ror #17
+ mov r12, r9, ror #17
+ str r9, [sp, #28*4]
+ add r4, r4, #0x00400000
+ eor lr, lr, r8, ror #19
+ eor r12, r12, r9, ror #19
+ eor lr, lr, r8, lsr #10
+ eor r12, r12, r9, lsr #10
+ add r4, r4, #0x00000022
+ add r10, r10, lr
+ add r4, r4, r12
+ ldr r11, [sp, #16*4]
+
+ add r5, r5, #0x00000100
+ str r4, [sp, #30*4]
+ mov lr, r11, ror #7
+ str r10, [sp, #29*4]
+ mov r12, r10, ror #17
+ eor lr, lr, r11, ror #18
+ eor r12, r12, r10, ror #19
+ eor lr, lr, r11, lsr #3
+ eor r12, r12, r10, lsr #10
+ add r5, r5, lr
+ ldr lr, [r1, #17*4]
+ add r5, r5, r12
+
+ b sha256d_ms_extend_loop2
+
+sha256d_ms_extend_coda2:
+ str r5, [r1, #(44+15)*4]
+ mov r12, r4, ror #17
+ add r11, r11, r6
+ mov r6, lr, ror #7
+ eor r12, r12, r4, ror #19
+ eor r6, r6, lr, ror #18
+ eor r12, r12, r4, lsr #10
+ eor r6, r6, lr, lsr #3
+ add r12, r12, r11
+ add r6, r6, r12
+ str r6, [r1, #(44+16)*4]
+
+ adr r2, sha256d_ms_h
+ ldmia r2, {r4-r11}
+ b sha256d_ms_main_loop2
+
+sha256d_ms_h:
+ .long 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a
+ .long 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
+
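+/* Reduced final round: only the new value of h is computed, which is all
+ * that is needed to produce output word 7. */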
+.macro sha256_main_round_red i, ka, rw, rd, re, rf, rg, rh
+ ldr r12, [\rw, #(\i)*4]
+ and r3, \rf, \re
+ bic lr, \rg, \re
+ add \rh, \rh, \rd
+ orr lr, lr, r3
+ ldr r3, \ka + (\i)*4
+ add \rh, \rh, lr
+ eor lr, \re, \re, ror #5
+ add \rh, \rh, r12
+ eor lr, lr, \re, ror #19
+ add \rh, \rh, r3
+ add \rh, \rh, lr, ror #6
+.endm
+
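+/* Second-hash tail: rounds 57-60 in reduced form; only hash[7] is written
+ * back, which is sufficient for the usual hash-below-target check. */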
+sha256d_ms_finish:
+ sha256_main_round_red 57, sha256d_ms_k, r1, r6, r11, r8, r9, r10
+ sha256_main_round_red 58, sha256d_ms_k, r1, r5, r10, r11, r8, r9
+ sha256_main_round_red 59, sha256d_ms_k, r1, r4, r9, r10, r11, r8
+ ldr r5, [r2, #7*4]
+ sha256_main_round_red 60, sha256d_ms_k, r1, r7, r8, r9, r10, r11
+
+ add r11, r11, r5
+ str r11, [r0, #7*4]
+
+ add sp, sp, #64*4
+#ifdef __thumb__
+ ldmfd sp!, {r4-r11, lr}
+ bx lr
+#else
+ ldmfd sp!, {r4-r11, pc}
+#endif
+
+
+#ifdef __ARM_NEON__
+
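+/*
+ * 4-way NEON code: four independent SHA-256 computations are carried in the
+ * four 32-bit lanes of each q register, with states, blocks and message
+ * schedules stored in interleaved form.  sha256_init_4way(state) below
+ * simply copies the replicated initial values into the caller's buffer.
+ */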
+ .text
+ .code 32
+ .align 2
+ .globl sha256_init_4way
+ .globl _sha256_init_4way
+#ifdef __ELF__
+ .type sha256_init_4way, %function
+#endif
+sha256_init_4way:
+_sha256_init_4way:
+ adr r12, sha256_4h
+ vldmia r12, {q8-q15}
+ vstmia r0, {q8-q15}
+ bx lr
+ .align 4
+sha256_4h:
+ .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667
+ .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85
+ .long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372
+ .long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a
+ .long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f
+ .long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c
+ .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab
+ .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19
+
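+/* Round constants replicated across the four lanes. */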
+.macro sha256_4k
+ .long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98
+ .long 0x71374491, 0x71374491, 0x71374491, 0x71374491
+ .long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf
+ .long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5
+ .long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b
+ .long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1
+ .long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4
+ .long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5
+ .long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98
+ .long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01
+ .long 0x243185be, 0x243185be, 0x243185be, 0x243185be
+ .long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3
+ .long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74
+ .long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe
+ .long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7
+ .long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174
+ .long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1
+ .long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786
+ .long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6
+ .long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc
+ .long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f
+ .long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa
+ .long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc
+ .long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da
+ .long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152
+ .long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d
+ .long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8
+ .long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7
+ .long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3
+ .long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147
+ .long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351
+ .long 0x14292967, 0x14292967, 0x14292967, 0x14292967
+ .long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85
+ .long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138
+ .long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc
+ .long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13
+ .long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354
+ .long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb
+ .long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e
+ .long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85
+ .long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1
+ .long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b
+ .long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70
+ .long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3
+ .long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819
+ .long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624
+ .long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585
+ .long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070
+ .long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116
+ .long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08
+ .long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c
+ .long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5
+ .long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3
+ .long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a
+ .long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f
+ .long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3
+ .long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee
+ .long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f
+ .long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814
+ .long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208
+ .long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa
+ .long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb
+ .long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7
+ .long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2
+.endm
+
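+/*
+ * 4-way message-schedule extension, two words per lane at a time.  Rotates
+ * are synthesised from vshr/vshl pairs combined with veor.  rr is the read
+ * pointer walking through the earlier schedule words, rw the write pointer
+ * where the newly extended words are appended; ra/rb receive the two new
+ * words and ry/rz hold the two most recently computed ones.
+ */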
+.macro sha256_4way_extend_doubleround_core i, rr, rw, ra, rb, ry, rz
+ vadd.u32 q5, q5, \ra
+ veor.u32 q4, q4, q0
+ vshr.u32 q0, \ry, #19
+ vshl.u32 q1, \ry, #32-19
+ veor.u32 q4, q4, q0
+ vshr.u32 \ra, q6, #7
+ vshl.u32 q0, q6, #32-7
+ veor.u32 q4, q4, q1
+ veor.u32 \ra, \ra, q0
+ vshr.u32 q1, \ry, #10
+ vshr.u32 q0, q6, #18
+ veor.u32 q4, q4, q1
+ veor.u32 \ra, \ra, q0
+ vshl.u32 q1, q6, #32-18
+ vshr.u32 q0, q6, #3
+ veor.u32 \ra, \ra, q1
+ vadd.u32 q4, q4, q5
+ veor.u32 \ra, \ra, q0
+ vld1.u32 {q5}, [\rr]!
+ vadd.u32 \ra, \ra, q4
+
+ vshr.u32 q4, \rz, #17
+ vshl.u32 q0, \rz, #32-17
+ vadd.u32 q6, q6, \rb
+ vst1.u32 {\ra}, [\rw]!
+ veor.u32 q4, q4, q0
+ vshr.u32 q0, \rz, #19
+ vshl.u32 q1, \rz, #32-19
+ veor.u32 q4, q4, q0
+ vshr.u32 \rb, q5, #7
+ veor.u32 q4, q4, q1
+ vshl.u32 q0, q5, #32-7
+ vshr.u32 q1, \rz, #10
+ veor.u32 \rb, \rb, q0
+ vshr.u32 q0, q5, #18
+ veor.u32 q4, q4, q1
+ veor.u32 \rb, \rb, q0
+ vshl.u32 q1, q5, #32-18
+ vshr.u32 q0, q5, #3
+ veor.u32 \rb, \rb, q1
+ vadd.u32 q1, q6, q4
+ veor.u32 \rb, \rb, q0
+.endm
+
+.macro sha256_4way_extend_doubleround_head i, rr, rw, ra, rb, ry, rz
+ vld1.u32 {q6}, [\rr]!
+ vshr.u32 q4, \ry, #17
+ vshl.u32 q0, \ry, #32-17
+ sha256_4way_extend_doubleround_core \i, \rr, \rw, \ra, \rb, \ry, \rz
+ vld1.u32 {q6}, [\rr]!
+ vadd.u32 \rb, \rb, q1
+.endm
+
+.macro sha256_4way_extend_doubleround_body i, rr, rw, ra, rb, ry, rz
+ vshr.u32 q4, \ry, #17
+ vshl.u32 q0, \ry, #32-17
+ vst1.u32 {\rz}, [\rw]!
+ sha256_4way_extend_doubleround_core \i, \rr, \rw, \ra, \rb, \ry, \rz
+ vld1.u32 {q6}, [\rr]!
+ vadd.u32 \rb, \rb, q1
+.endm
+
+.macro sha256_4way_extend_doubleround_foot i, rr, rw, ra, rb, ry, rz
+ vshr.u32 q4, \ry, #17
+ vshl.u32 q0, \ry, #32-17
+ vst1.u32 {\rz}, [\rw]!
+ sha256_4way_extend_doubleround_core \i, \rr, \rw, \ra, \rb, \ry, \rz
+ vadd.u32 \rb, \rb, q1
+ vst1.u32 {\rb}, [\rw]!
+.endm
+
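+/*
+ * 4-way compression round: same structure as the scalar sha256_main_round,
+ * with rk pointing at the replicated round constants and rw at the
+ * interleaved message schedule; both pointers are post-incremented.
+ */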
+.macro sha256_4way_main_round i, rk, rw, ra, rb, rc, rd, re, rf, rg, rh
+ vld1.u32 {q8}, [\rw]!
+ vand.u32 q9, \rf, \re
+ vbic.u32 q10, \rg, \re
+ vshr.u32 q11, \re, #5
+ vorr.u32 q10, q10, q9
+ vld1.u32 {q9}, [\rk]!
+ vadd.u32 \rh, \rh, q10
+ vshl.u32 q12, \re, #32-5
+ veor.u32 q10, \re, q11
+ vshr.u32 q11, \re, #19
+ veor.u32 q10, q10, q12
+ vshl.u32 q12, \re, #32-19
+ veor.u32 q10, q10, q11
+ vadd.u32 \rh, \rh, q8
+ veor.u32 q10, q10, q12
+ vadd.u32 \rh, \rh, q9
+ veor.u32 q9, \ra, \rb
+ vshr.u32 q11, q10, #6
+ vshl.u32 q13, q10, #32-6
+ vadd.u32 \rh, \rh, q11
+
+ vshr.u32 q11, \ra, #11
+ vshl.u32 q12, \ra, #32-11
+ veor.u32 q8, \ra, q11
+ vand.u32 q10, \ra, \rb
+ veor.u32 q8, q8, q12
+ vshr.u32 q11, \ra, #20
+ vshl.u32 q12, \ra, #32-20
+ veor.u32 q8, q8, q11
+ vand.u32 q9, q9, \rc
+ veor.u32 q8, q8, q12
+ vadd.u32 \rh, \rh, q13
+ veor.u32 q10, q10, q9
+ vshr.u32 q11, q8, #2
+ vshl.u32 q12, q8, #32-2
+ vadd.u32 q9, \rh, q10
+ vadd.u32 q12, q12, q11
+ vadd.u32 \rh, \rh, \rd
+ vadd.u32 \rd, q9, q12
+.endm
+
+.macro sha256_4way_main_quadround i, rk, rw
+ sha256_4way_main_round \i+0, \rk, \rw, q0, q1, q2, q3, q4, q5, q6, q7
+ sha256_4way_main_round \i+1, \rk, \rw, q3, q0, q1, q2, q7, q4, q5, q6
+ sha256_4way_main_round \i+2, \rk, \rw, q2, q3, q0, q1, q6, q7, q4, q5
+ sha256_4way_main_round \i+3, \rk, \rw, q1, q2, q3, q0, q5, q6, q7, q4
+.endm
+
+
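+/*
+ * sha256_transform_4way(state, block, swap)
+ *   r0 = 4-way interleaved 8-word state, r1 = 4-way interleaved 16-word
+ *   block, r2 = nonzero to byte-swap the input (vrev32.8).
+ */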
+ .text
+ .code 32
+ .align 2
+ .globl sha256_transform_4way
+ .globl _sha256_transform_4way
+#ifdef __ELF__
+ .type sha256_transform_4way, %function
+#endif
+sha256_transform_4way:
+_sha256_transform_4way:
+ stmfd sp!, {r4, lr}
+ vpush {q4-q7}
+ mov r12, sp
+ sub sp, sp, #64*16
+ bic sp, sp, #63
+ cmp r2, #0
+ bne sha256_transform_4way_swap
+
+ vldmia r1!, {q0-q7}
+ vstmia sp, {q0-q7}
+ add r3, sp, #8*16
+ vldmia r1, {q8-q15}
+ vstmia r3, {q8-q15}
+ b sha256_transform_4way_extend
+
+sha256_transform_4way_swap:
+ vldmia r1!, {q0-q7}
+ vrev32.8 q0, q0
+ vrev32.8 q1, q1
+ vrev32.8 q2, q2
+ vrev32.8 q3, q3
+ vldmia r1, {q8-q15}
+ vrev32.8 q4, q4
+ vrev32.8 q5, q5
+ vrev32.8 q6, q6
+ vrev32.8 q7, q7
+ vstmia sp, {q0-q7}
+ vrev32.8 q8, q8
+ vrev32.8 q9, q9
+ vrev32.8 q10, q10
+ vrev32.8 q11, q11
+ vrev32.8 q12, q12
+ vrev32.8 q13, q13
+ vrev32.8 q14, q14
+ vrev32.8 q15, q15
+ add r3, sp, #8*16
+ vstmia r3, {q8-q15}
+
+sha256_transform_4way_extend:
+ add r1, sp, #1*16
+ add r2, sp, #16*16
+ vmov.u32 q5, q0
+ sha256_4way_extend_doubleround_head 0, r1, r2, q9, q10, q14, q15
+ sha256_4way_extend_doubleround_body 2, r1, r2, q11, q12, q9, q10
+ sha256_4way_extend_doubleround_body 4, r1, r2, q13, q14, q11, q12
+ sha256_4way_extend_doubleround_body 6, r1, r2, q15, q9, q13, q14
+ sha256_4way_extend_doubleround_body 8, r1, r2, q10, q11, q15, q9
+ sha256_4way_extend_doubleround_body 10, r1, r2, q12, q13, q10, q11
+ sha256_4way_extend_doubleround_body 12, r1, r2, q14, q15, q12, q13
+ sha256_4way_extend_doubleround_body 14, r1, r2, q9, q10, q14, q15
+ sha256_4way_extend_doubleround_body 16, r1, r2, q11, q12, q9, q10
+ sha256_4way_extend_doubleround_body 18, r1, r2, q13, q14, q11, q12
+ sha256_4way_extend_doubleround_body 20, r1, r2, q15, q9, q13, q14
+ sha256_4way_extend_doubleround_body 22, r1, r2, q10, q11, q15, q9
+ sha256_4way_extend_doubleround_body 24, r1, r2, q12, q13, q10, q11
+ sha256_4way_extend_doubleround_body 26, r1, r2, q14, q15, q12, q13
+ sha256_4way_extend_doubleround_body 28, r1, r2, q9, q10, q14, q15
+ sha256_4way_extend_doubleround_body 30, r1, r2, q11, q12, q9, q10
+ sha256_4way_extend_doubleround_body 32, r1, r2, q13, q14, q11, q12
+ sha256_4way_extend_doubleround_body 34, r1, r2, q15, q9, q13, q14
+ sha256_4way_extend_doubleround_body 36, r1, r2, q10, q11, q15, q9
+ sha256_4way_extend_doubleround_body 38, r1, r2, q12, q13, q10, q11
+ sha256_4way_extend_doubleround_body 40, r1, r2, q14, q15, q12, q13
+ sha256_4way_extend_doubleround_body 42, r1, r2, q9, q10, q14, q15
+ sha256_4way_extend_doubleround_body 44, r1, r2, q11, q12, q9, q10
+ sha256_4way_extend_doubleround_foot 46, r1, r2, q13, q14, q11, q12
+
+ vldmia r0, {q0-q7}
+ adr r4, sha256_transform_4way_4k
+ b sha256_transform_4way_4k_over
+ .align 4
+sha256_transform_4way_4k:
+ sha256_4k
+sha256_transform_4way_4k_over:
+ sha256_4way_main_quadround 0, r4, sp
+ sha256_4way_main_quadround 4, r4, sp
+ sha256_4way_main_quadround 8, r4, sp
+ sha256_4way_main_quadround 12, r4, sp
+ sha256_4way_main_quadround 16, r4, sp
+ sha256_4way_main_quadround 20, r4, sp
+ sha256_4way_main_quadround 24, r4, sp
+ sha256_4way_main_quadround 28, r4, sp
+ sha256_4way_main_quadround 32, r4, sp
+ sha256_4way_main_quadround 36, r4, sp
+ sha256_4way_main_quadround 40, r4, sp
+ sha256_4way_main_quadround 44, r4, sp
+ sha256_4way_main_quadround 48, r4, sp
+ sha256_4way_main_quadround 52, r4, sp
+ sha256_4way_main_quadround 56, r4, sp
+ sha256_4way_main_quadround 60, r4, sp
+
+ vldmia r0, {q8-q15}
+ vadd.u32 q0, q0, q8
+ vadd.u32 q1, q1, q9
+ vadd.u32 q2, q2, q10
+ vadd.u32 q3, q3, q11
+ vadd.u32 q4, q4, q12
+ vadd.u32 q5, q5, q13
+ vadd.u32 q6, q6, q14
+ vadd.u32 q7, q7, q15
+ vstmia r0, {q0-q7}
+
+ mov sp, r12
+ vpop {q4-q7}
+ ldmfd sp!, {r4, pc}
+
+
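+/*
+ * sha256d_ms_4way(hash, W, midstate, prehash): 4-way variant of sha256d_ms
+ * above, with the same argument layout in r0-r3 and the same precomputation
+ * tricks, operating on interleaved data.
+ */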
+ .text
+ .code 32
+ .align 2
+ .globl sha256d_ms_4way
+ .globl _sha256d_ms_4way
+#ifdef __ELF__
+ .type sha256d_ms_4way, %function
+#endif
+sha256d_ms_4way:
+_sha256d_ms_4way:
+ stmfd sp!, {r4, lr}
+ vpush {q4-q7}
+ mov r12, sp
+ sub sp, sp, #64*16
+ bic sp, sp, #63
+
+ add r4, r1, #3*16
+ vld1.u32 {q6}, [r4]!
+ add r1, r1, #18*16
+ vldmia r1, {q11-q13}
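+	/* as in sha256d_ms: force Z = 1 for the first pass through the shared
+	 * extend and main loops */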
+ cmp r0, r0
+
+ vshr.u32 q10, q6, #7
+ vshl.u32 q0, q6, #32-7
+ vshr.u32 q1, q6, #18
+ veor.u32 q10, q10, q0
+ vshl.u32 q0, q6, #32-18
+ veor.u32 q10, q10, q1
+ vshr.u32 q1, q6, #3
+ veor.u32 q10, q10, q0
+ vstmia sp!, {q11-q13}
+ veor.u32 q4, q10, q1
+ vadd.u32 q12, q12, q6
+ vadd.u32 q11, q11, q4
+
+ vshr.u32 q14, q12, #17
+ vshr.u32 q4, q11, #17
+ vshl.u32 q0, q11, #32-17
+ vst1.u32 {q11}, [r1]!
+ veor.u32 q4, q4, q0
+ vshr.u32 q0, q11, #19
+ vshl.u32 q1, q11, #32-19
+ veor.u32 q4, q4, q0
+ vst1.u32 {q12}, [r1]!
+ veor.u32 q4, q4, q1
+ vshr.u32 q1, q11, #10
+ vshl.u32 q0, q12, #32-17
+ veor.u32 q4, q4, q1
+ veor.u32 q14, q14, q0
+ vadd.u32 q13, q13, q4
+ vshr.u32 q0, q12, #19
+ vshl.u32 q1, q12, #32-19
+ veor.u32 q14, q14, q0
+ vst1.u32 {q13}, [r1]!
+ veor.u32 q14, q14, q1
+ vshr.u32 q1, q12, #10
+
+ vshr.u32 q4, q13, #17
+ vshl.u32 q0, q13, #32-17
+ veor.u32 q14, q14, q1
+ veor.u32 q4, q4, q0
+ vshr.u32 q0, q13, #19
+ vshl.u32 q1, q13, #32-19
+ veor.u32 q4, q4, q0
+ vst1.u32 {q14}, [r1]!
+ veor.u32 q4, q4, q1
+ vshr.u32 q1, q13, #10
+ vld1.u32 {q15}, [r1]
+ veor.u32 q4, q4, q1
+ vst1.u32 {q15}, [sp]!
+ vadd.u32 q15, q15, q4
+ vshr.u32 q4, q14, #17
+ vshl.u32 q0, q14, #32-17
+ vshl.u32 q1, q14, #32-19
+ veor.u32 q4, q4, q0
+ vshr.u32 q0, q14, #19
+ vst1.u32 {q15}, [r1]!
+ veor.u32 q4, q4, q0
+ vld1.u32 {q9}, [r1]
+ veor.u32 q4, q4, q1
+ vshr.u32 q1, q14, #10
+ vst1.u32 {q9}, [sp]!
+ veor.u32 q5, q4, q1
+
+ vshr.u32 q4, q15, #17
+ vadd.u32 q9, q9, q5
+ vshl.u32 q0, q15, #32-17
+ vshl.u32 q1, q15, #32-19
+ veor.u32 q4, q4, q0
+ vshr.u32 q0, q15, #19
+ vst1.u32 {q9}, [r1]!
+ veor.u32 q4, q4, q0
+ vld1.u32 {q10}, [r1]
+ veor.u32 q4, q4, q1
+ vshr.u32 q1, q15, #10
+ vst1.u32 {q10}, [sp]!
+ veor.u32 q4, q4, q1
+ vshl.u32 q0, q9, #32-17
+ vadd.u32 q10, q10, q4
+ vshr.u32 q4, q9, #17
+ vshl.u32 q1, q9, #32-19
+ veor.u32 q4, q4, q0
+ vshr.u32 q0, q9, #19
+ veor.u32 q4, q4, q1
+ vshr.u32 q1, q9, #10
+ veor.u32 q4, q4, q0
+ vst1.u32 {q10}, [r1]!
+ veor.u32 q5, q4, q1
+
+ vshr.u32 q4, q10, #17
+ vshl.u32 q0, q10, #32-17
+ vadd.u32 q11, q11, q5
+ veor.u32 q4, q4, q0
+ vshr.u32 q0, q10, #19
+ vshl.u32 q1, q10, #32-19
+ veor.u32 q4, q4, q0
+ vst1.u32 {q11}, [r1]!
+ veor.u32 q4, q4, q1
+ vshr.u32 q1, q10, #10
+ vshl.u32 q0, q11, #32-17
+ veor.u32 q2, q4, q1
+ vshr.u32 q4, q11, #17
+ vadd.u32 q12, q12, q2
+ vshl.u32 q1, q11, #32-19
+ veor.u32 q4, q4, q0
+ vshr.u32 q0, q11, #19
+ veor.u32 q4, q4, q1
+ vshr.u32 q1, q11, #10
+ veor.u32 q4, q4, q0
+ vst1.u32 {q12}, [r1]!
+ veor.u32 q5, q4, q1
+
+ vshr.u32 q4, q12, #17
+ vshl.u32 q0, q12, #32-17
+ vadd.u32 q13, q13, q5
+ veor.u32 q4, q4, q0
+ vshr.u32 q0, q12, #19
+ vshl.u32 q1, q12, #32-19
+ veor.u32 q4, q4, q0
+ vst1.u32 {q13}, [r1]!
+ veor.u32 q4, q4, q1
+ vshr.u32 q1, q12, #10
+ vshl.u32 q0, q13, #32-17
+ veor.u32 q2, q4, q1
+ vshr.u32 q4, q13, #17
+ vadd.u32 q14, q14, q2
+ vshl.u32 q1, q13, #32-19
+ veor.u32 q4, q4, q0
+ vshr.u32 q0, q13, #19
+ veor.u32 q4, q4, q1
+ vshr.u32 q1, q13, #10
+ veor.u32 q4, q4, q0
+ vst1.u32 {q14}, [r1]!
+ veor.u32 q5, q4, q1
+ add r4, r4, #12*16
+
+ vshr.u32 q4, q14, #17
+ vshl.u32 q0, q14, #32-17
+ vadd.u32 q15, q15, q5
+ veor.u32 q4, q4, q0
+ vshr.u32 q0, q14, #19
+ vshl.u32 q1, q14, #32-19
+ veor.u32 q4, q4, q0
+ vst1.u32 {q15}, [r1]!
+ veor.u32 q4, q4, q1
+ vshr.u32 q1, q14, #10
+ vld1.u32 {q2}, [r1]
+ veor.u32 q4, q4, q1
+ vshl.u32 q0, q15, #32-17
+ vadd.u32 q9, q9, q4
+ vst1.u32 {q2}, [sp]!
+ vadd.u32 q9, q9, q2
+ vshr.u32 q4, q15, #17
+ vshr.u32 q2, q15, #19
+ veor.u32 q4, q4, q0
+ vst1.u32 {q9}, [r1]!
+ vshl.u32 q1, q15, #32-19
+ veor.u32 q4, q4, q2
+ vshr.u32 q0, q15, #10
+ veor.u32 q4, q4, q1
+ vld1.u32 {q5-q6}, [r4]!
+ veor.u32 q4, q4, q0
+ vld1.u32 {q2}, [r1]
+ vadd.u32 q10, q10, q4
+ vst1.u32 {q2}, [sp]!
+ vadd.u32 q10, q10, q2
+
+ sub sp, sp, #8*16
+
+sha256d_ms_4way_extend_loop2:
+ sha256_4way_extend_doubleround_body 16, r4, r1, q11, q12, q9, q10
+ sha256_4way_extend_doubleround_body 18, r4, r1, q13, q14, q11, q12
+ sha256_4way_extend_doubleround_body 20, r4, r1, q15, q9, q13, q14
+ sha256_4way_extend_doubleround_body 22, r4, r1, q10, q11, q15, q9
+ sha256_4way_extend_doubleround_body 24, r4, r1, q12, q13, q10, q11
+ sha256_4way_extend_doubleround_body 26, r4, r1, q14, q15, q12, q13
+ sha256_4way_extend_doubleround_body 28, r4, r1, q9, q10, q14, q15
+ sha256_4way_extend_doubleround_body 30, r4, r1, q11, q12, q9, q10
+ sha256_4way_extend_doubleround_body 32, r4, r1, q13, q14, q11, q12
+ sha256_4way_extend_doubleround_body 34, r4, r1, q15, q9, q13, q14
+ sha256_4way_extend_doubleround_body 36, r4, r1, q10, q11, q15, q9
+ sha256_4way_extend_doubleround_body 38, r4, r1, q12, q13, q10, q11
+ sha256_4way_extend_doubleround_body 40, r4, r1, q14, q15, q12, q13
+ sha256_4way_extend_doubleround_body 42, r4, r1, q9, q10, q14, q15
+ sha256_4way_extend_doubleround_body 44, r4, r1, q11, q12, q9, q10
+ sha256_4way_extend_doubleround_foot 46, r4, r1, q13, q14, q11, q12
+ bne sha256d_ms_4way_extend_coda2
+
+ vldmia r3!, {q4-q7}
+ vldmia r3, {q0-q3}
+ vswp q0, q4
+ adr r3, sha256d_ms_4way_4k+3*16
+ sub r1, r1, #(64-3)*16
+ b sha256d_ms_4way_main_loop1
+
+ .align 4
+sha256d_ms_4way_4k:
+ sha256_4k
+
+sha256d_ms_4way_main_loop2:
+ sha256_4way_main_round 0, r3, r1, q0, q1, q2, q3, q4, q5, q6, q7
+ sha256_4way_main_round 1, r3, r1, q3, q0, q1, q2, q7, q4, q5, q6
+ sha256_4way_main_round 2, r3, r1, q2, q3, q0, q1, q6, q7, q4, q5
+sha256d_ms_4way_main_loop1:
+ sha256_4way_main_round 3, r3, r1, q1, q2, q3, q0, q5, q6, q7, q4
+ sha256_4way_main_quadround 4, r3, r1
+ sha256_4way_main_quadround 8, r3, r1
+ sha256_4way_main_quadround 12, r3, r1
+ sha256_4way_main_quadround 16, r3, r1
+ sha256_4way_main_quadround 20, r3, r1
+ sha256_4way_main_quadround 24, r3, r1
+ sha256_4way_main_quadround 28, r3, r1
+ sha256_4way_main_quadround 32, r3, r1
+ sha256_4way_main_quadround 36, r3, r1
+ sha256_4way_main_quadround 40, r3, r1
+ sha256_4way_main_quadround 44, r3, r1
+ sha256_4way_main_quadround 48, r3, r1
+ sha256_4way_main_quadround 52, r3, r1
+ sha256_4way_main_round 56, r3, r1, q0, q1, q2, q3, q4, q5, q6, q7
+ bne sha256d_ms_4way_finish
+ sha256_4way_main_round 57, r3, r1, q3, q0, q1, q2, q7, q4, q5, q6
+ sha256_4way_main_round 58, r3, r1, q2, q3, q0, q1, q6, q7, q4, q5
+ sha256_4way_main_round 59, r3, r1, q1, q2, q3, q0, q5, q6, q7, q4
+ sha256_4way_main_quadround 60, r3, r1
+
+ vldmia r2, {q8-q15}
+ vadd.u32 q0, q0, q8
+ vadd.u32 q1, q1, q9
+ vadd.u32 q2, q2, q10
+ vadd.u32 q3, q3, q11
+ vadd.u32 q4, q4, q12
+ vadd.u32 q5, q5, q13
+ vadd.u32 q6, q6, q14
+ vadd.u32 q7, q7, q15
+
+ vldmia sp, {q8-q15}
+ sub r1, r1, #(64-18)*16
+ vstmia r1, {q8-q10}
+ add r1, r1, #4*16
+ vstmia r1, {q11-q13}
+ add r1, r1, #8*16
+ vstmia r1, {q14-q15}
+
+ vstmia sp, {q0-q7}
+ vmov.u32 q8, #0x80000000
+ vmov.u32 q9, #0
+ vmov.u32 q10, #0
+ vmov.u32 q11, #0
+ vmov.u32 q12, #0
+ vmov.u32 q13, #0
+ vmov.u32 q14, #0
+ vmov.u32 q15, #0x00000100
+ add r1, sp, #8*16
+ vstmia r1!, {q8-q15}
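+	/* adds clears Z (the result is nonzero), so the shared loops below take
+	 * their bne exits on this second hash */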
+ adds r4, sp, #2*16
+
+ vshr.u32 q9, q1, #7
+ vshl.u32 q2, q1, #32-7
+ vshr.u32 q4, q1, #18
+ veor.u32 q9, q9, q2
+ vshl.u32 q3, q1, #32-18
+ veor.u32 q9, q9, q4
+ vshr.u32 q2, q1, #3
+ veor.u32 q9, q9, q3
+ vld1.u32 {q5}, [r4]!
+ veor.u32 q9, q9, q2
+ vmov.u32 q7, #0x00a00000
+ vadd.u32 q9, q9, q0
+ vshr.u32 q10, q5, #7
+ vshl.u32 q0, q5, #32-7
+ vshl.u32 q3, q5, #32-18
+ veor.u32 q10, q10, q0
+ vshr.u32 q0, q5, #18
+ veor.u32 q10, q10, q3
+ vst1.u32 {q9}, [r1]!
+ vadd.u32 q3, q1, q7
+ veor.u32 q10, q10, q0
+ vshr.u32 q0, q5, #3
+ vld1.u32 {q6}, [r4]!
+ veor.u32 q10, q10, q0
+
+ vshr.u32 q4, q9, #17
+ vshl.u32 q0, q9, #32-17
+ vadd.u32 q10, q10, q3
+ veor.u32 q4, q4, q0
+ vshr.u32 q0, q9, #19
+ vshl.u32 q1, q9, #32-19
+ veor.u32 q4, q4, q0
+ vshr.u32 q11, q6, #7
+ vshl.u32 q0, q6, #32-7
+ veor.u32 q4, q4, q1
+ veor.u32 q11, q11, q0
+ vshr.u32 q1, q9, #10
+ vshr.u32 q0, q6, #18
+ veor.u32 q4, q4, q1
+ veor.u32 q11, q11, q0
+ vshl.u32 q1, q6, #32-18
+ vshr.u32 q0, q6, #3
+ veor.u32 q11, q11, q1
+ vadd.u32 q4, q4, q5
+ veor.u32 q11, q11, q0
+ vld1.u32 {q5}, [r4]!
+ vadd.u32 q11, q11, q4
+ vshr.u32 q4, q10, #17
+ vshl.u32 q0, q10, #32-17
+ vst1.u32 {q10}, [r1]!
+ veor.u32 q4, q4, q0
+ vshr.u32 q0, q10, #19
+ vshl.u32 q1, q10, #32-19
+ veor.u32 q4, q4, q0
+ vshr.u32 q12, q5, #7
+ veor.u32 q4, q4, q1
+ vshl.u32 q0, q5, #32-7
+ vshr.u32 q1, q10, #10
+ veor.u32 q12, q12, q0
+ vshr.u32 q0, q5, #18
+ veor.u32 q4, q4, q1
+ veor.u32 q12, q12, q0
+ vshl.u32 q1, q5, #32-18
+ vst1.u32 {q11}, [r1]!
+ veor.u32 q12, q12, q1
+ vshr.u32 q0, q5, #3
+ vadd.u32 q1, q6, q4
+ veor.u32 q12, q12, q0
+
+ vshr.u32 q4, q11, #17
+ vshl.u32 q0, q11, #32-17
+ vadd.u32 q12, q12, q1
+ vld1.u32 {q6}, [r4]!
+ veor.u32 q4, q4, q0
+ vshr.u32 q0, q11, #19
+ vshl.u32 q1, q11, #32-19
+ veor.u32 q4, q4, q0
+ vshr.u32 q13, q6, #7
+ vshl.u32 q0, q6, #32-7
+ veor.u32 q4, q4, q1
+ veor.u32 q13, q13, q0
+ vshr.u32 q1, q11, #10
+ vshr.u32 q0, q6, #18
+ veor.u32 q4, q4, q1
+ veor.u32 q13, q13, q0
+ vshl.u32 q1, q6, #32-18
+ vshr.u32 q0, q6, #3
+ veor.u32 q13, q13, q1
+ vadd.u32 q4, q4, q5
+ veor.u32 q13, q13, q0
+ vld1.u32 {q5}, [r4]!
+ vadd.u32 q13, q13, q4
+ vshr.u32 q4, q12, #17
+ vshl.u32 q0, q12, #32-17
+ vst1.u32 {q12}, [r1]!
+ veor.u32 q4, q4, q0
+ vshr.u32 q0, q12, #19
+ vshl.u32 q1, q12, #32-19
+ veor.u32 q4, q4, q0
+ vshr.u32 q14, q5, #7
+ veor.u32 q4, q4, q1
+ vshl.u32 q0, q5, #32-7
+ vshr.u32 q1, q12, #10
+ veor.u32 q14, q14, q0
+ vshr.u32 q0, q5, #18
+ veor.u32 q4, q4, q1
+ veor.u32 q14, q14, q0
+ vshl.u32 q1, q5, #32-18
+ vst1.u32 {q13}, [r1]!
+ veor.u32 q14, q14, q1
+ vshr.u32 q0, q5, #3
+ vadd.u32 q1, q6, q4
+ veor.u32 q14, q14, q0
+
+ vshr.u32 q4, q13, #17
+ vshl.u32 q0, q13, #32-17
+ vadd.u32 q14, q14, q1
+ vld1.u32 {q6}, [r4]!
+ vadd.u32 q5, q5, q15
+ veor.u32 q4, q4, q0
+ vshr.u32 q0, q13, #19
+ vshl.u32 q1, q13, #32-19
+ veor.u32 q4, q4, q0
+ vshr.u32 q15, q6, #7
+ vshl.u32 q0, q6, #32-7
+ veor.u32 q4, q4, q1
+ veor.u32 q15, q15, q0
+ vshr.u32 q1, q13, #10
+ vshr.u32 q0, q6, #18
+ veor.u32 q4, q4, q1
+ veor.u32 q15, q15, q0
+ vshl.u32 q1, q6, #32-18
+ vshr.u32 q0, q6, #3
+ veor.u32 q15, q15, q1
+ vadd.u32 q4, q4, q5
+ veor.u32 q15, q15, q0
+ vmov.u32 q5, #0x80000000
+ vadd.u32 q15, q15, q4
+ vshr.u32 q4, q14, #17
+ vshl.u32 q0, q14, #32-17
+ vadd.u32 q6, q6, q9
+ vst1.u32 {q14}, [r1]!
+ vmov.u32 q7, #0x11000000
+ veor.u32 q4, q4, q0
+ vshr.u32 q0, q14, #19
+ vshl.u32 q1, q14, #32-19
+ vadd.u32 q6, q6, q7
+ vmov.u32 q2, #0x00002000
+ veor.u32 q4, q4, q0
+ vst1.u32 {q15}, [r1]!
+ veor.u32 q4, q4, q1
+ vshr.u32 q1, q14, #10
+ vadd.u32 q6, q6, q2
+ veor.u32 q1, q4, q1
+ add r4, r4, #8*16
+
+ vshr.u32 q4, q15, #17
+ vshl.u32 q0, q15, #32-17
+ vadd.u32 q9, q6, q1
+ veor.u32 q4, q4, q0
+ vshr.u32 q0, q15, #19
+ vshl.u32 q1, q15, #32-19
+ veor.u32 q4, q4, q0
+ vst1.u32 {q9}, [r1]!
+ vadd.u32 q5, q5, q10
+ veor.u32 q4, q4, q1
+ vshr.u32 q1, q15, #10
+ vshl.u32 q0, q9, #32-17
+ veor.u32 q10, q4, q1
+ vshr.u32 q4, q9, #17
+ vadd.u32 q10, q10, q5
+ veor.u32 q4, q4, q0
+ vshr.u32 q0, q9, #19
+ vshl.u32 q1, q9, #32-19
+ veor.u32 q4, q4, q0
+ vshr.u32 q0, q9, #10
+ veor.u32 q4, q4, q1
+ vst1.u32 {q10}, [r1]!
+ veor.u32 q1, q4, q0
+
+ vshr.u32 q4, q10, #17
+ vshl.u32 q0, q10, #32-17
+ vadd.u32 q11, q11, q1
+ veor.u32 q4, q4, q0
+ vshr.u32 q0, q10, #19
+ vshl.u32 q1, q10, #32-19
+ veor.u32 q4, q4, q0
+ vst1.u32 {q11}, [r1]!
+ veor.u32 q4, q4, q1
+ vshr.u32 q1, q10, #10
+ vshl.u32 q0, q11, #32-17
+ veor.u32 q1, q4, q1
+ vshr.u32 q4, q11, #17
+ vadd.u32 q12, q12, q1
+ veor.u32 q4, q4, q0
+ vshr.u32 q0, q11, #19
+ vshl.u32 q1, q11, #32-19
+ veor.u32 q4, q4, q0
+ vshr.u32 q0, q11, #10
+ veor.u32 q4, q4, q1
+ vst1.u32 {q12}, [r1]!
+ veor.u32 q1, q4, q0
+
+ vshr.u32 q4, q12, #17
+ vshl.u32 q0, q12, #32-17
+ vadd.u32 q13, q13, q1
+ veor.u32 q4, q4, q0
+ vshr.u32 q0, q12, #19
+ vshl.u32 q1, q12, #32-19
+ veor.u32 q4, q4, q0
+ vst1.u32 {q13}, [r1]!
+ veor.u32 q4, q4, q1
+ vshr.u32 q1, q12, #10
+ vshl.u32 q0, q13, #32-17
+ veor.u32 q1, q4, q1
+ vshr.u32 q4, q13, #17
+ vadd.u32 q14, q14, q1
+ veor.u32 q4, q4, q0
+ vshr.u32 q0, q13, #19
+ vshl.u32 q1, q13, #32-19
+ veor.u32 q4, q4, q0
+ vshr.u32 q0, q13, #10
+ veor.u32 q4, q4, q1
+ vst1.u32 {q14}, [r1]!
+ veor.u32 q4, q4, q0
+ vmov.u32 q6, #0x00000100
+ vadd.u32 q15, q15, q4
+
+ vshr.u32 q4, q14, #17
+ vshl.u32 q0, q14, #32-17
+ vmov.u32 q7, #0x00400000
+ vst1.u32 {q15}, [r1]!
+ veor.u32 q4, q4, q0
+ vshr.u32 q0, q14, #19
+ vshl.u32 q1, q14, #32-19
+ veor.u32 q4, q4, q0
+ vadd.u32 q9, q9, q7
+ veor.u32 q4, q4, q1
+ vshr.u32 q1, q14, #10
+ vmov.u32 q2, #0x00000022
+ veor.u32 q4, q4, q1
+ vadd.u32 q9, q9, q2
+ vld1.u32 {q5}, [r4]!
+ vadd.u32 q9, q9, q4
+ vshr.u32 q4, q15, #17
+ vshl.u32 q0, q15, #32-17
+ vadd.u32 q6, q6, q10
+ vst1.u32 {q9}, [r1]!
+ veor.u32 q4, q4, q0
+ vshr.u32 q0, q15, #19
+ vshl.u32 q1, q15, #32-19
+ veor.u32 q4, q4, q0
+ vshr.u32 q10, q5, #7
+ veor.u32 q4, q4, q1
+ vshl.u32 q0, q5, #32-7
+ vshr.u32 q1, q15, #10
+ veor.u32 q10, q10, q0
+ vshr.u32 q0, q5, #18
+ veor.u32 q4, q4, q1
+ veor.u32 q10, q10, q0
+ vshl.u32 q1, q5, #32-18
+ vshr.u32 q0, q5, #3
+ veor.u32 q10, q10, q1
+ vadd.u32 q1, q6, q4
+ veor.u32 q10, q10, q0
+ vld1.u32 {q6}, [r4]!
+ vadd.u32 q10, q10, q1
+
+ b sha256d_ms_4way_extend_loop2
+
+ .align 4
+sha256d_ms_4way_4h:
+ .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667
+ .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85
+ .long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372
+ .long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a
+ .long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f
+ .long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c
+ .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab
+ .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19
+
+sha256d_ms_4way_extend_coda2:
+ adr r4, sha256d_ms_4way_4h
+ mov r1, sp
+ vldmia r4, {q0-q7}
+ vmov.u32 q15, q7
+ sub r3, r3, #64*16
+ b sha256d_ms_4way_main_loop2
+
+.macro sha256_4way_main_round_red i, rk, rw, rd, re, rf, rg, rh
+ vld1.u32 {q8}, [\rw]!
+ vand.u32 q9, \rf, \re
+ vbic.u32 q10, \rg, \re
+ vshr.u32 q11, \re, #5
+ vorr.u32 q10, q10, q9
+ vshl.u32 q12, \re, #32-5
+ vadd.u32 \rh, \rh, q10
+ veor.u32 q10, \re, q11
+ vshr.u32 q11, \re, #19
+ veor.u32 q10, q10, q12
+ vshl.u32 q12, \re, #32-19
+ veor.u32 q10, q10, q11
+ vadd.u32 \rh, \rh, q8
+ veor.u32 q10, q10, q12
+ vld1.u32 {q9}, [\rk]!
+ vadd.u32 \rh, \rh, \rd
+ vshr.u32 q11, q10, #6
+ vadd.u32 \rh, \rh, q9
+ vshl.u32 q13, q10, #32-6
+ vadd.u32 \rh, \rh, q11
+ vadd.u32 \rh, \rh, q13
+.endm
+
+sha256d_ms_4way_finish:
+ sha256_4way_main_round_red 57, r3, r1, q2, q7, q4, q5, q6
+ sha256_4way_main_round_red 58, r3, r1, q1, q6, q7, q4, q5
+ sha256_4way_main_round_red 59, r3, r1, q0, q5, q6, q7, q4
+ sha256_4way_main_round_red 60, r3, r1, q3, q4, q5, q6, q7
+
+ vadd.u32 q7, q7, q15
+ add r0, r0, #7*16
+ vst1.u32 {q7}, [r0]
+
+ mov sp, r12
+ vpop {q4-q7}
+ ldmfd sp!, {r4, pc}
+
+
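+/* sha256_use_4way(): this file is only assembled with NEON enabled, so the
+ * 4-way path is always available and the function simply returns 1. */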
+ .text
+ .code 32
+ .align 2
+ .globl sha256_use_4way
+ .globl _sha256_use_4way
+#ifdef __ELF__
+ .type sha256_use_4way, %function
+#endif
+sha256_use_4way:
+_sha256_use_4way:
+ mov r0, #1
+ bx lr
+
+#endif /* __ARM_NEON__ */
+
+#endif
--- /dev/null
+/*
+ * Copyright 2012-2015 pooler@litecoinpool.org
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version. See COPYING for more details.
+ */
+
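+/* x86-64 SHA-256: SSE2 and VIA PadLock implementations of sha256_transform
+ * plus mining helpers, with runtime dispatch through a function pointer. */
+
+/* Mark the stack as non-executable for GNU ELF linkers. */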
+#if defined(__linux__) && defined(__ELF__)
+ .section .note.GNU-stack,"",%progbits
+#endif
+
+#if defined(__x86_64__)
+ .data
+ .p2align 4
+sha256_h:
+ .long 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a
+ .long 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
+
+ .data
+ .p2align 6
+sha256_k:
+ .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+ .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+ .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+ .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+ .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+ .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+ .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+ .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+ .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+ .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+ .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+ .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+ .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+ .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+ .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+ .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+
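+/* pshufb control mask that byte-swaps each 32-bit lane of an XMM register. */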
+bswap_xmm_mask:
+ .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f
+
+
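+/*
+ * Four scalar compression rounds (working variables kept in %r8d-%r15d,
+ * W[i]+K[i] already staged at (%rsp)) interleaved with the SSE2 message-
+ * schedule extension that produces the next four W words in \x0.
+ */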
+.macro sha256_mixed_quadround ra, rb, rc, rd, re, rf, rg, rh, x0, x1, x2, x3
+ movdqa \x3, %xmm4
+ movl \re, %eax
+ movdqa \x2, %xmm6
+ rorl $(25-11), %eax
+ movl \ra, %ebx
+ pslldq $12, %xmm4
+ rorl $(22-13), %ebx
+ psrldq $4, %xmm6
+ xorl \re, %eax
+ movl \rf, %ecx
+ rorl $(11-6), %eax
+ pxor %xmm6, %xmm4
+ movdqa \x1, %xmm5
+ xorl \ra, %ebx
+ xorl \rg, %ecx
+ xorl \re, %eax
+ paddd \x0, %xmm4
+ movdqa \x0, %xmm7
+ andl \re, %ecx
+ rorl $(13-2), %ebx
+ xorl \ra, %ebx
+ pslldq $12, %xmm5
+ psrldq $4, %xmm7
+ rorl $6, %eax
+ xorl \rg, %ecx
+ pxor %xmm7, %xmm5
+ rorl $2, %ebx
+ addl %eax, %ecx
+	addl (%rsp), %ecx
+ movdqa %xmm5, %xmm6
+ movl \ra, %eax
+ addl %ecx, \rh
+ movl \ra, %ecx
+ movdqa %xmm5, %xmm7
+ orl \rc, %eax
+ addl \rh, \rd
+ andl \rc, %ecx
+ pslld $(32-7), %xmm5
+ psrld $7, %xmm6
+ andl \rb, %eax
+ addl %ebx, \rh
+ orl %ecx, %eax
+ por %xmm6, %xmm5
+ addl %eax, \rh
+
+ movl \rd, %eax
+ movdqa %xmm7, %xmm6
+ movl \rh, %ebx
+ rorl $(25-11), %eax
+ xorl \rd, %eax
+ movdqa %xmm7, %xmm8
+ movl \re, %ecx
+ rorl $(22-13), %ebx
+ xorl \rh, %ebx
+ pslld $(32-18), %xmm7
+ rorl $(11-6), %eax
+ xorl \rf, %ecx
+ rorl $(13-2), %ebx
+ psrld $18, %xmm6
+ xorl \rd, %eax
+ andl \rd, %ecx
+ rorl $6, %eax
+ pxor %xmm7, %xmm5
+ xorl \rh, %ebx
+ xorl \rf, %ecx
+ psrld $3, %xmm8
+ addl %eax, %ecx
+ addl 1*4(%rsp), %ecx
+ rorl $2, %ebx
+ pxor %xmm6, %xmm5
+ movl \rh, %eax
+ addl %ecx, \rg
+ movl \rh, %ecx
+ pxor %xmm8, %xmm5
+ orl \rb, %eax
+ addl \rg, \rc
+ andl \rb, %ecx
+ pshufd $0xfa, \x3, %xmm6
+ andl \ra, %eax
+ addl %ebx, \rg
+ paddd %xmm5, %xmm4
+ orl %ecx, %eax
+ addl %eax, \rg
+
+ movl \rc, %eax
+ movdqa %xmm6, %xmm7
+ movl \rg, %ebx
+ rorl $(25-11), %eax
+ xorl \rc, %eax
+ movdqa %xmm6, %xmm8
+ rorl $(22-13), %ebx
+ movl \rd, %ecx
+ xorl \rg, %ebx
+ psrlq $17, %xmm6
+ psrlq $19, %xmm7
+ rorl $(11-6), %eax
+ xorl \re, %ecx
+ xorl \rc, %eax
+ psrld $10, %xmm8
+ pxor %xmm7, %xmm6
+ andl \rc, %ecx
+ rorl $(13-2), %ebx
+ xorl \rg, %ebx
+ pxor %xmm6, %xmm8
+ xorl \re, %ecx
+ rorl $6, %eax
+ addl %eax, %ecx
+ pshufd $0x8f, %xmm8, %xmm8
+ rorl $2, %ebx
+ addl 2*4(%rsp), %ecx
+ movl \rg, %eax
+ psrldq $8, %xmm8
+ addl %ecx, \rf
+ movl \rg, %ecx
+ orl \ra, %eax
+ paddd %xmm8, %xmm4
+ addl \rf, \rb
+ andl \ra, %ecx
+ andl \rh, %eax
+ pshufd $0x50, %xmm4, %xmm6
+ addl %ebx, \rf
+ orl %ecx, %eax
+ addl %eax, \rf
+
+ movdqa %xmm6, %xmm7
+ movl \rb, %eax
+ rorl $(25-11), %eax
+ movl \rf, %ebx
+ movdqa %xmm6, \x0
+ rorl $(22-13), %ebx
+ xorl \rb, %eax
+ movl \rc, %ecx
+ psrlq $17, %xmm6
+ rorl $(11-6), %eax
+ xorl \rf, %ebx
+ xorl \rd, %ecx
+ psrlq $19, %xmm7
+ xorl \rb, %eax
+ andl \rb, %ecx
+ rorl $(13-2), %ebx
+ psrld $10, \x0
+ xorl \rf, %ebx
+ rorl $6, %eax
+ pxor %xmm7, %xmm6
+ xorl \rd, %ecx
+ rorl $2, %ebx
+ addl %eax, %ecx
+ pxor %xmm6, \x0
+ addl 3*4(%rsp), %ecx
+ movl \rf, %eax
+ addl %ecx, \re
+ pshufd $0xf8, \x0, \x0
+ movl \rf, %ecx
+ orl \rh, %eax
+ addl \re, \ra
+ pslldq $8, \x0
+ andl \rh, %ecx
+ andl \rg, %eax
+ paddd %xmm4, \x0
+ addl %ebx, \re
+ orl %ecx, %eax
+ addl %eax, \re
+.endm
+
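+/* One scalar round; the precomputed W[i]+K[i] values for the current group
+ * of four rounds are read from \i*4(%rsp). */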
+.macro sha256_main_round i, ra, rb, rc, rd, re, rf, rg, rh
+ movl \re, %eax
+ rorl $(25-11), %eax
+ movl \ra, %ebx
+ xorl \re, %eax
+ rorl $(22-13), %ebx
+ movl \rf, %ecx
+ xorl \ra, %ebx
+ rorl $(11-6), %eax
+ xorl \rg, %ecx
+ xorl \re, %eax
+ rorl $(13-2), %ebx
+ andl \re, %ecx
+ xorl \ra, %ebx
+ rorl $6, %eax
+ xorl \rg, %ecx
+ addl %eax, %ecx
+ rorl $2, %ebx
+ addl \i*4(%rsp), %ecx
+ movl \ra, %eax
+ addl %ecx, \rh
+ movl \ra, %ecx
+ orl \rc, %eax
+ addl \rh, \rd
+ andl \rc, %ecx
+ andl \rb, %eax
+ addl %ebx, \rh
+ orl %ecx, %eax
+ addl %eax, \rh
+.endm
+
+
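+/*
+ * SSE2 sha256_transform: %rdi = state, %rsi = block, %rdx = swap flag
+ * (System V); on Win64/Cygwin the incoming %rcx/%rdx/%r8 are moved into
+ * those registers and the callee-saved %xmm6-%xmm9 are spilled.
+ */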
+ .text
+ .p2align 6
+sha256_transform_sse2:
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+#if defined(_WIN64) || defined(__CYGWIN__)
+ pushq %rdi
+ pushq %rsi
+ subq $5*16, %rsp
+ movdqa %xmm6, 1*16(%rsp)
+ movdqa %xmm7, 2*16(%rsp)
+ movdqa %xmm8, 3*16(%rsp)
+ movdqa %xmm9, 4*16(%rsp)
+ movq %rcx, %rdi
+ movq %rdx, %rsi
+ movq %r8, %rdx
+#else
+ subq $16, %rsp
+#endif
+
+ movl 0*4(%rdi), %r8d
+ movl 1*4(%rdi), %r9d
+ movl 2*4(%rdi), %r10d
+ movl 3*4(%rdi), %r11d
+ movl 4*4(%rdi), %r12d
+ movl 5*4(%rdi), %r13d
+ movl 6*4(%rdi), %r14d
+ movl 7*4(%rdi), %r15d
+
+ testq %rdx, %rdx
+ jnz sha256_transform_sse2_swap
+
+ movdqu 0*16(%rsi), %xmm0
+ movdqu 1*16(%rsi), %xmm1
+ movdqu 2*16(%rsi), %xmm2
+ movdqu 3*16(%rsi), %xmm3
+ jmp sha256_transform_sse2_core
+
+sha256_transform_sse2_swap:
+ movdqu 0*16(%rsi), %xmm0
+ movdqu 1*16(%rsi), %xmm1
+ movdqu 2*16(%rsi), %xmm2
+ movdqu 3*16(%rsi), %xmm3
+ pshuflw $0xb1, %xmm0, %xmm0
+ pshuflw $0xb1, %xmm1, %xmm1
+ pshuflw $0xb1, %xmm2, %xmm2
+ pshuflw $0xb1, %xmm3, %xmm3
+ pshufhw $0xb1, %xmm0, %xmm0
+ pshufhw $0xb1, %xmm1, %xmm1
+ pshufhw $0xb1, %xmm2, %xmm2
+ pshufhw $0xb1, %xmm3, %xmm3
+ movdqa %xmm0, %xmm4
+ movdqa %xmm1, %xmm5
+ movdqa %xmm2, %xmm6
+ movdqa %xmm3, %xmm7
+ psrlw $8, %xmm4
+ psrlw $8, %xmm5
+ psrlw $8, %xmm6
+ psrlw $8, %xmm7
+ psllw $8, %xmm0
+ psllw $8, %xmm1
+ psllw $8, %xmm2
+ psllw $8, %xmm3
+ pxor %xmm4, %xmm0
+ pxor %xmm5, %xmm1
+ pxor %xmm6, %xmm2
+ pxor %xmm7, %xmm3
+
+sha256_transform_sse2_core:
+ leaq sha256_k(%rip), %rdx
+ movq $48, %rsi
+ .p2align 4
+sha256_transform_sse2_loop:
+ movdqa 0*16(%rdx), %xmm9
+ paddd %xmm0, %xmm9
+ movdqa %xmm9, (%rsp)
+ sha256_mixed_quadround %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %xmm0, %xmm1, %xmm2, %xmm3
+ movdqa 1*16(%rdx), %xmm9
+ paddd %xmm1, %xmm9
+ movdqa %xmm9, (%rsp)
+ sha256_mixed_quadround %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %xmm1, %xmm2, %xmm3, %xmm0
+ movdqa 2*16(%rdx), %xmm9
+ paddd %xmm2, %xmm9
+ movdqa %xmm9, (%rsp)
+ sha256_mixed_quadround %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %xmm2, %xmm3, %xmm0, %xmm1
+ movdqa 3*16(%rdx), %xmm9
+ paddd %xmm3, %xmm9
+ movdqa %xmm9, (%rsp)
+ addq $4*16, %rdx
+ sha256_mixed_quadround %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %xmm3, %xmm0, %xmm1, %xmm2
+
+ subq $16, %rsi
+ jne sha256_transform_sse2_loop
+
+ paddd 0*16(%rdx), %xmm0
+ movdqa %xmm0, (%rsp)
+ sha256_main_round 0, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d
+ sha256_main_round 1, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d
+ sha256_main_round 2, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d
+ sha256_main_round 3, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d
+ paddd 1*16(%rdx), %xmm1
+ movdqa %xmm1, (%rsp)
+ sha256_main_round 0, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d
+ sha256_main_round 1, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d
+ sha256_main_round 2, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d
+ sha256_main_round 3, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d
+ paddd 2*16(%rdx), %xmm2
+ movdqa %xmm2, (%rsp)
+ sha256_main_round 0, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d
+ sha256_main_round 1, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d
+ sha256_main_round 2, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d
+ sha256_main_round 3, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d
+ paddd 3*16(%rdx), %xmm3
+ movdqa %xmm3, (%rsp)
+ sha256_main_round 0, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d
+ sha256_main_round 1, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d
+ sha256_main_round 2, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d
+ sha256_main_round 3, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d
+
+ addl %r8d, 0*4(%rdi)
+ addl %r9d, 1*4(%rdi)
+ addl %r10d, 2*4(%rdi)
+ addl %r11d, 3*4(%rdi)
+ addl %r12d, 4*4(%rdi)
+ addl %r13d, 5*4(%rdi)
+ addl %r14d, 6*4(%rdi)
+ addl %r15d, 7*4(%rdi)
+
+#if defined(_WIN64) || defined(__CYGWIN__)
+ movdqa 1*16(%rsp), %xmm6
+ movdqa 2*16(%rsp), %xmm7
+ movdqa 3*16(%rsp), %xmm8
+ movdqa 4*16(%rsp), %xmm9
+ addq $5*16, %rsp
+ popq %rsi
+ popq %rdi
+#else
+ addq $16, %rsp
+#endif
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ ret
+
+
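+/*
+ * sha256_transform via the VIA PadLock hash engine.  The block is copied
+ * (byte-swapped when requested) into a 64-byte-aligned stack buffer and
+ * processed with xsha256, emitted as raw bytes below since the mnemonic is
+ * not universally supported by assemblers.
+ */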
+ .text
+ .p2align 6
+sha256_transform_phe:
+#if defined(_WIN64) || defined(__CYGWIN__)
+ pushq %rdi
+ pushq %rsi
+ movq %rcx, %rdi
+ movq %rdx, %rsi
+ movq %r8, %rdx
+#endif
+ movq %rsp, %r8
+ subq $64, %rsp
+ andq $-64, %rsp
+
+ testq %rdx, %rdx
+ jnz sha256_transform_phe_noswap
+
+ movl 0*4(%rsi), %eax
+ movl 1*4(%rsi), %ecx
+ movl 2*4(%rsi), %edx
+ movl 3*4(%rsi), %r9d
+ bswapl %eax
+ bswapl %ecx
+ bswapl %edx
+ bswapl %r9d
+ movl %eax, 0*4(%rsp)
+ movl %ecx, 1*4(%rsp)
+ movl %edx, 2*4(%rsp)
+ movl %r9d, 3*4(%rsp)
+ movl 4*4(%rsi), %eax
+ movl 5*4(%rsi), %ecx
+ movl 6*4(%rsi), %edx
+ movl 7*4(%rsi), %r9d
+ bswapl %eax
+ bswapl %ecx
+ bswapl %edx
+ bswapl %r9d
+ movl %eax, 4*4(%rsp)
+ movl %ecx, 5*4(%rsp)
+ movl %edx, 6*4(%rsp)
+ movl %r9d, 7*4(%rsp)
+
+ movdqu 2*16(%rsi), %xmm0
+ movdqu 3*16(%rsi), %xmm2
+ pshuflw $0xb1, %xmm0, %xmm0
+ pshuflw $0xb1, %xmm2, %xmm2
+ pshufhw $0xb1, %xmm0, %xmm0
+ pshufhw $0xb1, %xmm2, %xmm2
+ movdqa %xmm0, %xmm1
+ movdqa %xmm2, %xmm3
+ psrlw $8, %xmm1
+ psrlw $8, %xmm3
+ psllw $8, %xmm0
+ psllw $8, %xmm2
+ pxor %xmm1, %xmm0
+ pxor %xmm3, %xmm2
+ movdqa %xmm0, 2*16(%rsp)
+ movdqa %xmm2, 3*16(%rsp)
+
+ jmp sha256_transform_phe_core
+
+sha256_transform_phe_noswap:
+ movdqu 0*16(%rsi), %xmm0
+ movdqu 1*16(%rsi), %xmm1
+ movdqu 2*16(%rsi), %xmm2
+ movdqu 3*16(%rsi), %xmm3
+ movdqa %xmm0, 0*16(%rsp)
+ movdqa %xmm1, 1*16(%rsp)
+ movdqa %xmm2, 2*16(%rsp)
+ movdqa %xmm3, 3*16(%rsp)
+
+sha256_transform_phe_core:
+ movq %rsp, %rsi
+ movq $-1, %rax
+ movq $1, %rcx
+ /* rep xsha256 */
+ .byte 0xf3, 0x0f, 0xa6, 0xd0
+
+ movq %r8, %rsp
+#if defined(_WIN64) || defined(__CYGWIN__)
+ popq %rsi
+ popq %rdi
+#endif
+ ret
+
+
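+/* Runtime dispatch: sha256_transform jumps through this pointer, which is
+ * initialised to the SSE2 implementation. */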
+ .data
+ .p2align 3
+sha256_transform_addr:
+ .quad sha256_transform_sse2
+
+ .text
+ .p2align 3
+ .globl sha256_transform
+ .globl _sha256_transform
+sha256_transform:
+_sha256_transform:
+ jmp *sha256_transform_addr(%rip)
+
+
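+/*
+ * sha256d_ms using the PadLock engine: %rdi = output hash, %rsi = the
+ * remaining 16 bytes of message data, %rdx = midstate.  The first
+ * rep xsha256 resumes the first hash from the midstate over the
+ * byte-swapped data words; the 32-byte result is then byte-swapped and
+ * hashed again starting from the standard initial values in sha256_h.
+ */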
+ .text
+ .p2align 6
+ .globl sha256d_ms
+ .globl _sha256d_ms
+sha256d_ms:
+_sha256d_ms:
+#if defined(_WIN64) || defined(__CYGWIN__)
+ pushq %rdi
+ pushq %rsi
+ movq %rcx, %rdi
+ movq %rdx, %rsi
+ movq %r8, %rdx
+#endif
+ movq %rsp, %r8
+ subq $32, %rsp
+ andq $-32, %rsp
+
+ movdqa 0*16(%rdx), %xmm0
+ movdqa 1*16(%rdx), %xmm1
+ movdqa %xmm0, 0*16(%rdi)
+ movdqa %xmm1, 1*16(%rdi)
+
+ movl 0*4(%rsi), %eax
+ movl 1*4(%rsi), %ecx
+ movl 2*4(%rsi), %edx
+ movl 3*4(%rsi), %r9d
+ bswapl %eax
+ bswapl %ecx
+ bswapl %edx
+ bswapl %r9d
+ movl %eax, 0*4(%rsp)
+ movl %ecx, 1*4(%rsp)
+ movl %edx, 2*4(%rsp)
+ movl %r9d, 3*4(%rsp)
+
+ movq %rsp, %rsi
+ movl $64, %eax
+ movl $80, %ecx
+ /* rep xsha256 */
+ .byte 0xf3, 0x0f, 0xa6, 0xd0
+
+ movdqa bswap_xmm_mask(%rip), %xmm1
+ movdqa 0*16(%rdi), %xmm0
+ movdqa 1*16(%rdi), %xmm2
+ pshufb %xmm1, %xmm0
+ pshufb %xmm1, %xmm2
+ movdqa %xmm0, 0*16(%rsp)
+ movdqa %xmm2, 1*16(%rsp)
+
+ movdqa sha256_h+0*16(%rip), %xmm0
+ movdqa sha256_h+1*16(%rip), %xmm1
+ movdqa %xmm0, 0*16(%rdi)
+ movdqa %xmm1, 1*16(%rdi)
+
+ movq %rsp, %rsi
+ xorq %rax, %rax
+ movl $32, %ecx
+ /* rep xsha256 */
+ .byte 0xf3, 0x0f, 0xa6, 0xd0
+
+ movq %r8, %rsp
+#if defined(_WIN64) || defined(__CYGWIN__)
+ popq %rsi
+ popq %rdi
+#endif
+ ret
+
+
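+/* Constant tables for the 4-way routines: initial values and round
+ * constants replicated across four 32-bit lanes. */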
+ .data
+ .p2align 7
+sha256_4h:
+ .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667
+ .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85
+ .long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372
+ .long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a
+ .long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f
+ .long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c
+ .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab
+ .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19
+
+ .data
+ .p2align 7
+sha256_4k:
+ .long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98
+ .long 0x71374491, 0x71374491, 0x71374491, 0x71374491
+ .long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf
+ .long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5
+ .long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b
+ .long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1
+ .long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4
+ .long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5
+ .long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98
+ .long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01
+ .long 0x243185be, 0x243185be, 0x243185be, 0x243185be
+ .long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3
+ .long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74
+ .long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe
+ .long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7
+ .long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174
+ .long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1
+ .long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786
+ .long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6
+ .long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc
+ .long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f
+ .long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa
+ .long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc
+ .long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da
+ .long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152
+ .long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d
+ .long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8
+ .long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7
+ .long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3
+ .long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147
+ .long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351
+ .long 0x14292967, 0x14292967, 0x14292967, 0x14292967
+ .long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85
+ .long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138
+ .long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc
+ .long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13
+ .long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354
+ .long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb
+ .long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e
+ .long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85
+ .long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1
+ .long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b
+ .long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70
+ .long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3
+ .long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819
+ .long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624
+ .long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585
+ .long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070
+ .long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116
+ .long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08
+ .long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c
+ .long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5
+ .long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3
+ .long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a
+ .long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f
+ .long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3
+ .long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee
+ .long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f
+ .long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814
+ .long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208
+ .long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa
+ .long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb
+ .long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7
+ .long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2
+
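+/*
+ * Precomputed message-schedule terms for the second block of an 80-byte
+ * header (whose padding and length words are fixed), used by the
+ * sha256d_ms_4way variants below.
+ */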
+ .data
+ .p2align 6
+sha256d_4preext2_17:
+ .long 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000
+sha256d_4preext2_23:
+ .long 0x11002000, 0x11002000, 0x11002000, 0x11002000
+sha256d_4preext2_24:
+ .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
+sha256d_4preext2_30:
+ .long 0x00400022, 0x00400022, 0x00400022, 0x00400022
+
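+/* 8-way (AVX2) counterparts of sha256_4h, sha256_4k and the preext2 constants. */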
+ .data
+ .p2align 7
+sha256_8h:
+ .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667
+ .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85
+ .long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372
+ .long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a
+ .long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f
+ .long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c
+ .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab
+ .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19
+
+ .data
+ .p2align 7
+sha256_8k:
+ .long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98
+ .long 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491
+ .long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf
+ .long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5
+ .long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b
+ .long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1
+ .long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4
+ .long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5
+ .long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98
+ .long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01
+ .long 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be
+ .long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3
+ .long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74
+ .long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe
+ .long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7
+ .long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174
+ .long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1
+ .long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786
+ .long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6
+ .long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc
+ .long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f
+ .long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa
+ .long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc
+ .long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da
+ .long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152
+ .long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d
+ .long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8
+ .long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7
+ .long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3
+ .long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147
+ .long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351
+ .long 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967
+ .long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85
+ .long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138
+ .long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc
+ .long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13
+ .long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354
+ .long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb
+ .long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e
+ .long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85
+ .long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1
+ .long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b
+ .long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70
+ .long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3
+ .long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819
+ .long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624
+ .long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585
+ .long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070
+ .long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116
+ .long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08
+ .long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c
+ .long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5
+ .long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3
+ .long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a
+ .long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f
+ .long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3
+ .long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee
+ .long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f
+ .long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814
+ .long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208
+ .long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa
+ .long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb
+ .long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7
+ .long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2
+
+ .data
+ .p2align 6
+sha256d_8preext2_17:
+ .long 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000
+sha256d_8preext2_23:
+ .long 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000
+sha256d_8preext2_24:
+ .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
+sha256d_8preext2_30:
+ .long 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022
+
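+/*
+ * sha256_init_4way(state): store the 4-way interleaved initial state
+ * (8 words x 4 lanes = 128 bytes).  On Win64 the argument arrives in rcx
+ * and is moved to rdi so the SysV body can be shared.
+ */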
+ .text
+ .p2align 6
+ .globl sha256_init_4way
+ .globl _sha256_init_4way
+sha256_init_4way:
+_sha256_init_4way:
+#if defined(_WIN64) || defined(__CYGWIN__)
+ pushq %rdi
+ movq %rcx, %rdi
+#endif
+ movdqa sha256_4h+0(%rip), %xmm0
+ movdqa sha256_4h+16(%rip), %xmm1
+ movdqa sha256_4h+32(%rip), %xmm2
+ movdqa sha256_4h+48(%rip), %xmm3
+ movdqu %xmm0, 0(%rdi)
+ movdqu %xmm1, 16(%rdi)
+ movdqu %xmm2, 32(%rdi)
+ movdqu %xmm3, 48(%rdi)
+ movdqa sha256_4h+64(%rip), %xmm0
+ movdqa sha256_4h+80(%rip), %xmm1
+ movdqa sha256_4h+96(%rip), %xmm2
+ movdqa sha256_4h+112(%rip), %xmm3
+ movdqu %xmm0, 64(%rdi)
+ movdqu %xmm1, 80(%rdi)
+ movdqu %xmm2, 96(%rdi)
+ movdqu %xmm3, 112(%rdi)
+#if defined(_WIN64) || defined(__CYGWIN__)
+ popq %rdi
+#endif
+ ret
+
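+/*
+ * sha256_init_8way(state): same as above for eight lanes; each initial word
+ * is broadcast out of the 4-way table with vpbroadcastd.
+ */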
+ .text
+ .p2align 6
+ .globl sha256_init_8way
+ .globl _sha256_init_8way
+sha256_init_8way:
+_sha256_init_8way:
+#if defined(_WIN64) || defined(__CYGWIN__)
+ pushq %rdi
+ movq %rcx, %rdi
+#endif
+ vpbroadcastd sha256_4h+0(%rip), %ymm0
+ vpbroadcastd sha256_4h+16(%rip), %ymm1
+ vpbroadcastd sha256_4h+32(%rip), %ymm2
+ vpbroadcastd sha256_4h+48(%rip), %ymm3
+ vmovdqu %ymm0, 0*32(%rdi)
+ vmovdqu %ymm1, 1*32(%rdi)
+ vmovdqu %ymm2, 2*32(%rdi)
+ vmovdqu %ymm3, 3*32(%rdi)
+ vpbroadcastd sha256_4h+64(%rip), %ymm0
+ vpbroadcastd sha256_4h+80(%rip), %ymm1
+ vpbroadcastd sha256_4h+96(%rip), %ymm2
+ vpbroadcastd sha256_4h+112(%rip), %ymm3
+ vmovdqu %ymm0, 4*32(%rdi)
+ vmovdqu %ymm1, 5*32(%rdi)
+ vmovdqu %ymm2, 6*32(%rdi)
+ vmovdqu %ymm3, 7*32(%rdi)
+#if defined(_WIN64) || defined(__CYGWIN__)
+ popq %rdi
+#endif
+ ret
+
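+/*
+ * Message-schedule extension, four lanes at a time:
+ *   W[i] = s1(W[i-2]) + W[i-7] + s0(W[i-15]) + W[i-16]
+ *   s0(x) = (x >>> 7) ^ (x >>> 18) ^ (x >> 3)
+ *   s1(x) = (x >>> 17) ^ (x >>> 19) ^ (x >> 10)
+ * SSE2 has no rotate instruction, so each rotation is built from paired
+ * shifts and xors.  xmm3 (and xmm7 in the doubleround) carry the previous
+ * schedule words between invocations.
+ */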
+.macro sha256_sse2_extend_round i
+ movdqa (\i-15)*16(%rax), %xmm0
+ movdqa %xmm0, %xmm2
+ psrld $3, %xmm0
+ movdqa %xmm0, %xmm1
+ pslld $14, %xmm2
+ psrld $4, %xmm1
+ pxor %xmm1, %xmm0
+ pxor %xmm2, %xmm0
+ psrld $11, %xmm1
+ pslld $11, %xmm2
+ pxor %xmm1, %xmm0
+ pxor %xmm2, %xmm0
+ paddd (\i-16)*16(%rax), %xmm0
+ paddd (\i-7)*16(%rax), %xmm0
+
+ movdqa %xmm3, %xmm2
+ psrld $10, %xmm3
+ pslld $13, %xmm2
+ movdqa %xmm3, %xmm1
+ psrld $7, %xmm1
+ pxor %xmm1, %xmm3
+ pxor %xmm2, %xmm3
+ psrld $2, %xmm1
+ pslld $2, %xmm2
+ pxor %xmm1, %xmm3
+ pxor %xmm2, %xmm3
+ paddd %xmm0, %xmm3
+ movdqa %xmm3, \i*16(%rax)
+.endm
+
+.macro sha256_sse2_extend_doubleround i
+ movdqa (\i-15)*16(%rax), %xmm0
+ movdqa (\i-14)*16(%rax), %xmm4
+ movdqa %xmm0, %xmm2
+ movdqa %xmm4, %xmm6
+ psrld $3, %xmm0
+ psrld $3, %xmm4
+ movdqa %xmm0, %xmm1
+ movdqa %xmm4, %xmm5
+ pslld $14, %xmm2
+ pslld $14, %xmm6
+ psrld $4, %xmm1
+ psrld $4, %xmm5
+ pxor %xmm1, %xmm0
+ pxor %xmm5, %xmm4
+ psrld $11, %xmm1
+ psrld $11, %xmm5
+ pxor %xmm2, %xmm0
+ pxor %xmm6, %xmm4
+ pslld $11, %xmm2
+ pslld $11, %xmm6
+ pxor %xmm1, %xmm0
+ pxor %xmm5, %xmm4
+ pxor %xmm2, %xmm0
+ pxor %xmm6, %xmm4
+
+ paddd (\i-16)*16(%rax), %xmm0
+ paddd (\i-15)*16(%rax), %xmm4
+
+ movdqa %xmm3, %xmm2
+ movdqa %xmm7, %xmm6
+ psrld $10, %xmm3
+ psrld $10, %xmm7
+ movdqa %xmm3, %xmm1
+ movdqa %xmm7, %xmm5
+ pslld $13, %xmm2
+ pslld $13, %xmm6
+ psrld $7, %xmm1
+ psrld $7, %xmm5
+
+ paddd (\i-7)*16(%rax), %xmm0
+ paddd (\i-6)*16(%rax), %xmm4
+
+ pxor %xmm1, %xmm3
+ pxor %xmm5, %xmm7
+ psrld $2, %xmm1
+ psrld $2, %xmm5
+ pxor %xmm2, %xmm3
+ pxor %xmm6, %xmm7
+ pslld $2, %xmm2
+ pslld $2, %xmm6
+ pxor %xmm1, %xmm3
+ pxor %xmm5, %xmm7
+ pxor %xmm2, %xmm3
+ pxor %xmm6, %xmm7
+
+ paddd %xmm0, %xmm3
+ paddd %xmm4, %xmm7
+ movdqa %xmm3, \i*16(%rax)
+ movdqa %xmm7, (\i+1)*16(%rax)
+.endm
+
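+/*
+ * One SHA-256 compression round, four lanes at a time:
+ *   T1 = h + S1(e) + Ch(e,f,g) + K[i] + W[i];  e' = d + T1;  a' = T1 + S0(a) + Maj(a,b,c)
+ *   S0(x) = (x >>> 2) ^ (x >>> 13) ^ (x >>> 22)
+ *   S1(x) = (x >>> 6) ^ (x >>> 11) ^ (x >>> 25)
+ *   Ch(e,f,g) = (e & f) ^ (~e & g),  Maj(a,b,c) = (a & b) ^ (a & c) ^ (b & c)
+ * The working variables are not renamed each round: e stays in xmm0 with
+ * three of them spilled to 0/16/32(%rsp), and the rest rotate through
+ * xmm3..xmm7.
+ */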
+.macro sha256_sse2_main_round i
+ movdqa 16*(\i)(%rax), %xmm6
+
+ movdqa %xmm0, %xmm1
+ movdqa 16(%rsp), %xmm2
+ pandn %xmm2, %xmm1
+ paddd 32(%rsp), %xmm6
+
+ movdqa %xmm2, 32(%rsp)
+ movdqa 0(%rsp), %xmm2
+ movdqa %xmm2, 16(%rsp)
+
+ pand %xmm0, %xmm2
+ pxor %xmm2, %xmm1
+ movdqa %xmm0, 0(%rsp)
+
+ paddd %xmm1, %xmm6
+
+ movdqa %xmm0, %xmm1
+ psrld $6, %xmm0
+ paddd 16*(\i)(%rcx), %xmm6
+ movdqa %xmm0, %xmm2
+ pslld $7, %xmm1
+ psrld $5, %xmm2
+ pxor %xmm1, %xmm0
+ pxor %xmm2, %xmm0
+ pslld $14, %xmm1
+ psrld $14, %xmm2
+ pxor %xmm1, %xmm0
+ pslld $5, %xmm1
+ pxor %xmm2, %xmm0
+ pxor %xmm1, %xmm0
+ movdqa %xmm5, %xmm1
+ paddd %xmm0, %xmm6
+
+ movdqa %xmm3, %xmm0
+ movdqa %xmm4, %xmm3
+ movdqa %xmm4, %xmm2
+ paddd %xmm6, %xmm0
+ pand %xmm5, %xmm2
+ pand %xmm7, %xmm1
+ pand %xmm7, %xmm4
+ pxor %xmm4, %xmm1
+ movdqa %xmm5, %xmm4
+ movdqa %xmm7, %xmm5
+ pxor %xmm2, %xmm1
+ paddd %xmm1, %xmm6
+
+ movdqa %xmm7, %xmm2
+ psrld $2, %xmm7
+ movdqa %xmm7, %xmm1
+ pslld $10, %xmm2
+ psrld $11, %xmm1
+ pxor %xmm2, %xmm7
+ pslld $9, %xmm2
+ pxor %xmm1, %xmm7
+ psrld $9, %xmm1
+ pxor %xmm2, %xmm7
+ pslld $11, %xmm2
+ pxor %xmm1, %xmm7
+ pxor %xmm2, %xmm7
+ paddd %xmm6, %xmm7
+.endm
+
+.macro sha256_sse2_main_quadround i
+ sha256_sse2_main_round \i+0
+ sha256_sse2_main_round \i+1
+ sha256_sse2_main_round \i+2
+ sha256_sse2_main_round \i+3
+.endm
+
+
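+/*
+ * AVX versions of the schedule extension: same formulas as the SSE2 macros
+ * above, but the three-operand forms avoid most of the register copies and
+ * let the doubleround keep both streams in registers (xmm8 as scratch).
+ */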
+.macro sha256_avx_extend_round i
+ vmovdqa (\i-15)*16(%rax), %xmm0
+ vpslld $14, %xmm0, %xmm2
+ vpsrld $3, %xmm0, %xmm0
+ vpsrld $4, %xmm0, %xmm1
+ vpxor %xmm1, %xmm0, %xmm0
+ vpxor %xmm2, %xmm0, %xmm0
+ vpsrld $11, %xmm1, %xmm1
+ vpslld $11, %xmm2, %xmm2
+ vpxor %xmm1, %xmm0, %xmm0
+ vpxor %xmm2, %xmm0, %xmm0
+ vpaddd (\i-16)*16(%rax), %xmm0, %xmm0
+ vpaddd (\i-7)*16(%rax), %xmm0, %xmm0
+
+ vpslld $13, %xmm3, %xmm2
+ vpsrld $10, %xmm3, %xmm3
+ vpsrld $7, %xmm3, %xmm1
+ vpxor %xmm1, %xmm3, %xmm3
+ vpxor %xmm2, %xmm3, %xmm3
+ vpsrld $2, %xmm1, %xmm1
+ vpslld $2, %xmm2, %xmm2
+ vpxor %xmm1, %xmm3, %xmm3
+ vpxor %xmm2, %xmm3, %xmm3
+ vpaddd %xmm0, %xmm3, %xmm3
+ vmovdqa %xmm3, \i*16(%rax)
+.endm
+
+.macro sha256_avx_extend_doubleround i
+ vmovdqa (\i-15)*16(%rax), %xmm0
+ vmovdqa (\i-14)*16(%rax), %xmm4
+ vpslld $14, %xmm0, %xmm2
+ vpslld $14, %xmm4, %xmm6
+ vpsrld $3, %xmm0, %xmm8
+ vpsrld $3, %xmm4, %xmm4
+ vpsrld $7, %xmm0, %xmm1
+ vpsrld $4, %xmm4, %xmm5
+ vpxor %xmm1, %xmm8, %xmm8
+ vpxor %xmm5, %xmm4, %xmm4
+ vpsrld $11, %xmm1, %xmm1
+ vpsrld $11, %xmm5, %xmm5
+ vpxor %xmm2, %xmm8, %xmm8
+ vpxor %xmm6, %xmm4, %xmm4
+ vpslld $11, %xmm2, %xmm2
+ vpslld $11, %xmm6, %xmm6
+ vpxor %xmm1, %xmm8, %xmm8
+ vpxor %xmm5, %xmm4, %xmm4
+ vpxor %xmm2, %xmm8, %xmm8
+ vpxor %xmm6, %xmm4, %xmm4
+
+ vpaddd %xmm0, %xmm4, %xmm4
+ vpaddd (\i-16)*16(%rax), %xmm8, %xmm0
+
+ vpslld $13, %xmm3, %xmm2
+ vpslld $13, %xmm7, %xmm6
+ vpsrld $10, %xmm3, %xmm3
+ vpsrld $10, %xmm7, %xmm7
+
+ vpaddd (\i-7)*16(%rax), %xmm0, %xmm0
+ vpaddd (\i-6)*16(%rax), %xmm4, %xmm4
+
+ vpsrld $7, %xmm3, %xmm1
+ vpsrld $7, %xmm7, %xmm5
+ vpxor %xmm1, %xmm3, %xmm3
+ vpxor %xmm5, %xmm7, %xmm7
+ vpsrld $2, %xmm1, %xmm1
+ vpsrld $2, %xmm5, %xmm5
+ vpxor %xmm2, %xmm3, %xmm3
+ vpxor %xmm6, %xmm7, %xmm7
+ vpslld $2, %xmm2, %xmm2
+ vpslld $2, %xmm6, %xmm6
+ vpxor %xmm1, %xmm3, %xmm3
+ vpxor %xmm5, %xmm7, %xmm7
+ vpxor %xmm2, %xmm3, %xmm3
+ vpxor %xmm6, %xmm7, %xmm7
+
+ vpaddd %xmm0, %xmm3, %xmm3
+ vpaddd %xmm4, %xmm7, %xmm7
+ vmovdqa %xmm3, \i*16(%rax)
+ vmovdqa %xmm7, (\i+1)*16(%rax)
+.endm
+
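+/*
+ * AVX main round: the eight working variables are passed as macro arguments,
+ * so the quadround below rotates them by argument order instead of by moves.
+ */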
+.macro sha256_avx_main_round i, r0, r1, r2, r3, r4, r5, r6, r7
+ vpaddd 16*(\i)(%rax), \r0, %xmm6
+ vpaddd 16*(\i)(%rcx), %xmm6, %xmm6
+
+ vpandn \r1, \r3, %xmm1
+ vpand \r3, \r2, %xmm2
+ vpxor %xmm2, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm6, %xmm6
+
+ vpslld $7, \r3, %xmm1
+ vpsrld $6, \r3, \r0
+ vpsrld $5, \r0, %xmm2
+ vpxor %xmm1, \r0, \r0
+ vpxor %xmm2, \r0, \r0
+ vpslld $14, %xmm1, %xmm1
+ vpsrld $14, %xmm2, %xmm2
+ vpxor %xmm1, \r0, \r0
+ vpxor %xmm2, \r0, \r0
+ vpslld $5, %xmm1, %xmm1
+ vpxor %xmm1, \r0, \r0
+ vpaddd \r0, %xmm6, %xmm6
+ vpaddd %xmm6, \r4, \r0
+
+ vpand \r6, \r5, %xmm2
+ vpand \r7, \r5, \r4
+ vpand \r7, \r6, %xmm1
+ vpxor \r4, %xmm1, %xmm1
+ vpxor %xmm2, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm6, %xmm6
+
+ vpslld $10, \r7, %xmm2
+ vpsrld $2, \r7, \r4
+ vpsrld $11, \r4, %xmm1
+ vpxor %xmm2, \r4, \r4
+ vpxor %xmm1, \r4, \r4
+ vpslld $9, %xmm2, %xmm2
+ vpsrld $9, %xmm1, %xmm1
+ vpxor %xmm2, \r4, \r4
+ vpxor %xmm1, \r4, \r4
+ vpslld $11, %xmm2, %xmm2
+ vpxor %xmm2, \r4, \r4
+ vpaddd %xmm6, \r4, \r4
+.endm
+
+.macro sha256_avx_main_quadround i
+ sha256_avx_main_round \i+0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
+ sha256_avx_main_round \i+1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
+ sha256_avx_main_round \i+2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
+ sha256_avx_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
+.endm
+
+
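+/*
+ * AVX2 variants: identical structure to the AVX macros above, but eight
+ * lanes wide (ymm registers and 32-byte strides into the schedule).
+ */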
+.macro sha256_avx2_extend_round i
+ vmovdqa (\i-15)*32(%rax), %ymm0
+ vpslld $14, %ymm0, %ymm2
+ vpsrld $3, %ymm0, %ymm0
+ vpsrld $4, %ymm0, %ymm1
+ vpxor %ymm1, %ymm0, %ymm0
+ vpxor %ymm2, %ymm0, %ymm0
+ vpsrld $11, %ymm1, %ymm1
+ vpslld $11, %ymm2, %ymm2
+ vpxor %ymm1, %ymm0, %ymm0
+ vpxor %ymm2, %ymm0, %ymm0
+ vpaddd (\i-16)*32(%rax), %ymm0, %ymm0
+ vpaddd (\i-7)*32(%rax), %ymm0, %ymm0
+
+ vpslld $13, %ymm3, %ymm2
+ vpsrld $10, %ymm3, %ymm3
+ vpsrld $7, %ymm3, %ymm1
+ vpxor %ymm1, %ymm3, %ymm3
+ vpxor %ymm2, %ymm3, %ymm3
+ vpsrld $2, %ymm1, %ymm1
+ vpslld $2, %ymm2, %ymm2
+ vpxor %ymm1, %ymm3, %ymm3
+ vpxor %ymm2, %ymm3, %ymm3
+ vpaddd %ymm0, %ymm3, %ymm3
+ vmovdqa %ymm3, \i*32(%rax)
+.endm
+
+.macro sha256_avx2_extend_doubleround i
+ vmovdqa (\i-15)*32(%rax), %ymm0
+ vmovdqa (\i-14)*32(%rax), %ymm4
+ vpslld $14, %ymm0, %ymm2
+ vpslld $14, %ymm4, %ymm6
+ vpsrld $3, %ymm0, %ymm8
+ vpsrld $3, %ymm4, %ymm4
+ vpsrld $7, %ymm0, %ymm1
+ vpsrld $4, %ymm4, %ymm5
+ vpxor %ymm1, %ymm8, %ymm8
+ vpxor %ymm5, %ymm4, %ymm4
+ vpsrld $11, %ymm1, %ymm1
+ vpsrld $11, %ymm5, %ymm5
+ vpxor %ymm2, %ymm8, %ymm8
+ vpxor %ymm6, %ymm4, %ymm4
+ vpslld $11, %ymm2, %ymm2
+ vpslld $11, %ymm6, %ymm6
+ vpxor %ymm1, %ymm8, %ymm8
+ vpxor %ymm5, %ymm4, %ymm4
+ vpxor %ymm2, %ymm8, %ymm8
+ vpxor %ymm6, %ymm4, %ymm4
+
+ vpaddd %ymm0, %ymm4, %ymm4
+ vpaddd (\i-16)*32(%rax), %ymm8, %ymm0
+
+ vpslld $13, %ymm3, %ymm2
+ vpslld $13, %ymm7, %ymm6
+ vpsrld $10, %ymm3, %ymm3
+ vpsrld $10, %ymm7, %ymm7
+
+ vpaddd (\i-7)*32(%rax), %ymm0, %ymm0
+ vpaddd (\i-6)*32(%rax), %ymm4, %ymm4
+
+ vpsrld $7, %ymm3, %ymm1
+ vpsrld $7, %ymm7, %ymm5
+ vpxor %ymm1, %ymm3, %ymm3
+ vpxor %ymm5, %ymm7, %ymm7
+ vpsrld $2, %ymm1, %ymm1
+ vpsrld $2, %ymm5, %ymm5
+ vpxor %ymm2, %ymm3, %ymm3
+ vpxor %ymm6, %ymm7, %ymm7
+ vpslld $2, %ymm2, %ymm2
+ vpslld $2, %ymm6, %ymm6
+ vpxor %ymm1, %ymm3, %ymm3
+ vpxor %ymm5, %ymm7, %ymm7
+ vpxor %ymm2, %ymm3, %ymm3
+ vpxor %ymm6, %ymm7, %ymm7
+
+ vpaddd %ymm0, %ymm3, %ymm3
+ vpaddd %ymm4, %ymm7, %ymm7
+ vmovdqa %ymm3, \i*32(%rax)
+ vmovdqa %ymm7, (\i+1)*32(%rax)
+.endm
+
+.macro sha256_avx2_main_round i, r0, r1, r2, r3, r4, r5, r6, r7
+ vpaddd 32*(\i)(%rax), \r0, %ymm6
+ vpaddd 32*(\i)(%rcx), %ymm6, %ymm6
+
+ vpandn \r1, \r3, %ymm1
+ vpand \r3, \r2, %ymm2
+ vpxor %ymm2, %ymm1, %ymm1
+ vpaddd %ymm1, %ymm6, %ymm6
+
+ vpslld $7, \r3, %ymm1
+ vpsrld $6, \r3, \r0
+ vpsrld $5, \r0, %ymm2
+ vpxor %ymm1, \r0, \r0
+ vpxor %ymm2, \r0, \r0
+ vpslld $14, %ymm1, %ymm1
+ vpsrld $14, %ymm2, %ymm2
+ vpxor %ymm1, \r0, \r0
+ vpxor %ymm2, \r0, \r0
+ vpslld $5, %ymm1, %ymm1
+ vpxor %ymm1, \r0, \r0
+ vpaddd \r0, %ymm6, %ymm6
+ vpaddd %ymm6, \r4, \r0
+
+ vpand \r6, \r5, %ymm2
+ vpand \r7, \r5, \r4
+ vpand \r7, \r6, %ymm1
+ vpxor \r4, %ymm1, %ymm1
+ vpxor %ymm2, %ymm1, %ymm1
+ vpaddd %ymm1, %ymm6, %ymm6
+
+ vpslld $10, \r7, %ymm2
+ vpsrld $2, \r7, \r4
+ vpsrld $11, \r4, %ymm1
+ vpxor %ymm2, \r4, \r4
+ vpxor %ymm1, \r4, \r4
+ vpslld $9, %ymm2, %ymm2
+ vpsrld $9, %ymm1, %ymm1
+ vpxor %ymm2, \r4, \r4
+ vpxor %ymm1, \r4, \r4
+ vpslld $11, %ymm2, %ymm2
+ vpxor %ymm2, \r4, \r4
+ vpaddd %ymm6, \r4, \r4
+.endm
+
+.macro sha256_avx2_main_quadround i
+ sha256_avx2_main_round \i+0, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7
+ sha256_avx2_main_round \i+1, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3
+ sha256_avx2_main_round \i+2, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4
+ sha256_avx2_main_round \i+3, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5
+.endm
+
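+/*
+ * XOP variants: vprotd provides a true rotate, so each sigma collapses to
+ * two rotates plus a shift.  vprotd rotates left, hence ROTR n appears as a
+ * rotate by 32-n (e.g. $25/$14 for s0 and $15/$13 for s1).
+ */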
+.macro sha256_xop_extend_round i
+ vmovdqa (\i-15)*16(%rax), %xmm0
+ vprotd $25, %xmm0, %xmm1
+ vprotd $14, %xmm0, %xmm2
+ vpsrld $3, %xmm0, %xmm0
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm2, %xmm0, %xmm0
+
+ vpaddd (\i-16)*16(%rax), %xmm0, %xmm0
+ vpaddd (\i-7)*16(%rax), %xmm0, %xmm0
+
+ vprotd $15, %xmm3, %xmm1
+ vprotd $13, %xmm3, %xmm2
+ vpsrld $10, %xmm3, %xmm3
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm2, %xmm3, %xmm3
+ vpaddd %xmm0, %xmm3, %xmm3
+ vmovdqa %xmm3, \i*16(%rax)
+.endm
+
+.macro sha256_xop_extend_doubleround i
+ vmovdqa (\i-15)*16(%rax), %xmm0
+ vmovdqa (\i-14)*16(%rax), %xmm4
+ vprotd $25, %xmm0, %xmm1
+ vprotd $25, %xmm4, %xmm5
+ vprotd $14, %xmm0, %xmm2
+ vprotd $14, %xmm4, %xmm6
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm5, %xmm6, %xmm6
+ vpsrld $3, %xmm0, %xmm0
+ vpsrld $3, %xmm4, %xmm4
+ vpxor %xmm2, %xmm0, %xmm0
+ vpxor %xmm6, %xmm4, %xmm4
+
+ vpaddd (\i-16)*16(%rax), %xmm0, %xmm0
+ vpaddd (\i-15)*16(%rax), %xmm4, %xmm4
+
+ vprotd $15, %xmm3, %xmm1
+ vprotd $15, %xmm7, %xmm5
+ vprotd $13, %xmm3, %xmm2
+ vprotd $13, %xmm7, %xmm6
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm5, %xmm6, %xmm6
+
+ vpaddd (\i-7)*16(%rax), %xmm0, %xmm0
+ vpaddd (\i-6)*16(%rax), %xmm4, %xmm4
+
+ vpsrld $10, %xmm3, %xmm3
+ vpsrld $10, %xmm7, %xmm7
+ vpxor %xmm2, %xmm3, %xmm3
+ vpxor %xmm6, %xmm7, %xmm7
+
+ vpaddd %xmm0, %xmm3, %xmm3
+ vpaddd %xmm4, %xmm7, %xmm7
+ vmovdqa %xmm3, \i*16(%rax)
+ vmovdqa %xmm7, (\i+1)*16(%rax)
+.endm
+
+.macro sha256_xop_main_round i, r0, r1, r2, r3, r4, r5, r6, r7
+ vpaddd 16*(\i)(%rax), \r0, %xmm6
+ vpaddd 16*(\i)(%rcx), %xmm6, %xmm6
+
+ vpandn \r1, \r3, %xmm1
+ vpand \r3, \r2, %xmm2
+ vpxor %xmm2, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm6, %xmm6
+
+ vprotd $26, \r3, %xmm1
+ vprotd $21, \r3, %xmm2
+ vpxor %xmm1, %xmm2, %xmm2
+ vprotd $7, \r3, \r0
+ vpxor %xmm2, \r0, \r0
+ vpaddd \r0, %xmm6, %xmm6
+ vpaddd %xmm6, \r4, \r0
+
+ vpand \r6, \r5, %xmm2
+ vpand \r7, \r5, \r4
+ vpand \r7, \r6, %xmm1
+ vpxor \r4, %xmm1, %xmm1
+ vpxor %xmm2, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm6, %xmm6
+
+ vprotd $30, \r7, %xmm1
+ vprotd $19, \r7, %xmm2
+ vpxor %xmm1, %xmm2, %xmm2
+ vprotd $10, \r7, \r4
+ vpxor %xmm2, \r4, \r4
+ vpaddd %xmm6, \r4, \r4
+.endm
+
+.macro sha256_xop_main_quadround i
+ sha256_xop_main_round \i+0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
+ sha256_xop_main_round \i+1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
+ sha256_xop_main_round \i+2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
+ sha256_xop_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
+.endm
+
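+/*
+ * 4-way transform core (SSE2): expects the 16 interleaved input words at
+ * (%rsp), extends the schedule in place up to W[63], runs the 64 rounds in
+ * a loop against sha256_4k, then jumps to the shared finish code.
+ */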
+ .text
+ .p2align 6
+sha256_transform_4way_core_sse2:
+ leaq 256(%rsp), %rcx
+ leaq 48*16(%rcx), %rax
+ movdqa -2*16(%rcx), %xmm3
+ movdqa -1*16(%rcx), %xmm7
+sha256_transform_4way_sse2_extend_loop:
+ movdqa -15*16(%rcx), %xmm0
+ movdqa -14*16(%rcx), %xmm4
+ movdqa %xmm0, %xmm2
+ movdqa %xmm4, %xmm6
+ psrld $3, %xmm0
+ psrld $3, %xmm4
+ movdqa %xmm0, %xmm1
+ movdqa %xmm4, %xmm5
+ pslld $14, %xmm2
+ pslld $14, %xmm6
+ psrld $4, %xmm1
+ psrld $4, %xmm5
+ pxor %xmm1, %xmm0
+ pxor %xmm5, %xmm4
+ psrld $11, %xmm1
+ psrld $11, %xmm5
+ pxor %xmm2, %xmm0
+ pxor %xmm6, %xmm4
+ pslld $11, %xmm2
+ pslld $11, %xmm6
+ pxor %xmm1, %xmm0
+ pxor %xmm5, %xmm4
+ pxor %xmm2, %xmm0
+ pxor %xmm6, %xmm4
+
+ paddd -16*16(%rcx), %xmm0
+ paddd -15*16(%rcx), %xmm4
+
+ movdqa %xmm3, %xmm2
+ movdqa %xmm7, %xmm6
+ psrld $10, %xmm3
+ psrld $10, %xmm7
+ movdqa %xmm3, %xmm1
+ movdqa %xmm7, %xmm5
+ pslld $13, %xmm2
+ pslld $13, %xmm6
+ psrld $7, %xmm1
+ psrld $7, %xmm5
+
+ paddd -7*16(%rcx), %xmm0
+ paddd -6*16(%rcx), %xmm4
+
+ pxor %xmm1, %xmm3
+ pxor %xmm5, %xmm7
+ psrld $2, %xmm1
+ psrld $2, %xmm5
+ pxor %xmm2, %xmm3
+ pxor %xmm6, %xmm7
+ pslld $2, %xmm2
+ pslld $2, %xmm6
+ pxor %xmm1, %xmm3
+ pxor %xmm5, %xmm7
+ pxor %xmm2, %xmm3
+ pxor %xmm6, %xmm7
+
+ paddd %xmm0, %xmm3
+ paddd %xmm4, %xmm7
+ movdqa %xmm3, (%rcx)
+ movdqa %xmm7, 16(%rcx)
+ addq $2*16, %rcx
+ cmpq %rcx, %rax
+ jne sha256_transform_4way_sse2_extend_loop
+
+ movdqu 0(%rdi), %xmm7
+ movdqu 16(%rdi), %xmm5
+ movdqu 32(%rdi), %xmm4
+ movdqu 48(%rdi), %xmm3
+ movdqu 64(%rdi), %xmm0
+ movdqu 80(%rdi), %xmm8
+ movdqu 96(%rdi), %xmm9
+ movdqu 112(%rdi), %xmm10
+
+ leaq sha256_4k(%rip), %rcx
+ xorq %rax, %rax
+sha256_transform_4way_sse2_main_loop:
+ movdqa (%rsp, %rax), %xmm6
+ paddd (%rcx, %rax), %xmm6
+ paddd %xmm10, %xmm6
+
+ movdqa %xmm0, %xmm1
+ movdqa %xmm9, %xmm2
+ pandn %xmm2, %xmm1
+
+ movdqa %xmm2, %xmm10
+ movdqa %xmm8, %xmm2
+ movdqa %xmm2, %xmm9
+
+ pand %xmm0, %xmm2
+ pxor %xmm2, %xmm1
+ movdqa %xmm0, %xmm8
+
+ paddd %xmm1, %xmm6
+
+ movdqa %xmm0, %xmm1
+ psrld $6, %xmm0
+ movdqa %xmm0, %xmm2
+ pslld $7, %xmm1
+ psrld $5, %xmm2
+ pxor %xmm1, %xmm0
+ pxor %xmm2, %xmm0
+ pslld $14, %xmm1
+ psrld $14, %xmm2
+ pxor %xmm1, %xmm0
+ pxor %xmm2, %xmm0
+ pslld $5, %xmm1
+ pxor %xmm1, %xmm0
+ paddd %xmm0, %xmm6
+
+ movdqa %xmm3, %xmm0
+ paddd %xmm6, %xmm0
+
+ movdqa %xmm5, %xmm1
+ movdqa %xmm4, %xmm3
+ movdqa %xmm4, %xmm2
+ pand %xmm5, %xmm2
+ pand %xmm7, %xmm4
+ pand %xmm7, %xmm1
+ pxor %xmm4, %xmm1
+ movdqa %xmm5, %xmm4
+ movdqa %xmm7, %xmm5
+ pxor %xmm2, %xmm1
+ paddd %xmm1, %xmm6
+
+ movdqa %xmm7, %xmm2
+ psrld $2, %xmm7
+ movdqa %xmm7, %xmm1
+ pslld $10, %xmm2
+ psrld $11, %xmm1
+ pxor %xmm2, %xmm7
+ pxor %xmm1, %xmm7
+ pslld $9, %xmm2
+ psrld $9, %xmm1
+ pxor %xmm2, %xmm7
+ pxor %xmm1, %xmm7
+ pslld $11, %xmm2
+ pxor %xmm2, %xmm7
+ paddd %xmm6, %xmm7
+
+ addq $16, %rax
+ cmpq $16*64, %rax
+ jne sha256_transform_4way_sse2_main_loop
+ jmp sha256_transform_4way_finish
+
+ .text
+ .p2align 6
+sha256_transform_4way_core_avx:
+ leaq 256(%rsp), %rax
+ movdqa -2*16(%rax), %xmm3
+ movdqa -1*16(%rax), %xmm7
+ sha256_avx_extend_doubleround 0
+ sha256_avx_extend_doubleround 2
+ sha256_avx_extend_doubleround 4
+ sha256_avx_extend_doubleround 6
+ sha256_avx_extend_doubleround 8
+ sha256_avx_extend_doubleround 10
+ sha256_avx_extend_doubleround 12
+ sha256_avx_extend_doubleround 14
+ sha256_avx_extend_doubleround 16
+ sha256_avx_extend_doubleround 18
+ sha256_avx_extend_doubleround 20
+ sha256_avx_extend_doubleround 22
+ sha256_avx_extend_doubleround 24
+ sha256_avx_extend_doubleround 26
+ sha256_avx_extend_doubleround 28
+ sha256_avx_extend_doubleround 30
+ sha256_avx_extend_doubleround 32
+ sha256_avx_extend_doubleround 34
+ sha256_avx_extend_doubleround 36
+ sha256_avx_extend_doubleround 38
+ sha256_avx_extend_doubleround 40
+ sha256_avx_extend_doubleround 42
+ sha256_avx_extend_doubleround 44
+ sha256_avx_extend_doubleround 46
+ movdqu 0(%rdi), %xmm7
+ movdqu 16(%rdi), %xmm5
+ movdqu 32(%rdi), %xmm4
+ movdqu 48(%rdi), %xmm3
+ movdqu 64(%rdi), %xmm0
+ movdqu 80(%rdi), %xmm8
+ movdqu 96(%rdi), %xmm9
+ movdqu 112(%rdi), %xmm10
+ movq %rsp, %rax
+ leaq sha256_4k(%rip), %rcx
+ sha256_avx_main_quadround 0
+ sha256_avx_main_quadround 4
+ sha256_avx_main_quadround 8
+ sha256_avx_main_quadround 12
+ sha256_avx_main_quadround 16
+ sha256_avx_main_quadround 20
+ sha256_avx_main_quadround 24
+ sha256_avx_main_quadround 28
+ sha256_avx_main_quadround 32
+ sha256_avx_main_quadround 36
+ sha256_avx_main_quadround 40
+ sha256_avx_main_quadround 44
+ sha256_avx_main_quadround 48
+ sha256_avx_main_quadround 52
+ sha256_avx_main_quadround 56
+ sha256_avx_main_quadround 60
+ jmp sha256_transform_4way_finish
+
+ .text
+ .p2align 6
+sha256_transform_4way_core_xop:
+ leaq 256(%rsp), %rax
+ movdqa -2*16(%rax), %xmm3
+ movdqa -1*16(%rax), %xmm7
+ sha256_xop_extend_doubleround 0
+ sha256_xop_extend_doubleround 2
+ sha256_xop_extend_doubleround 4
+ sha256_xop_extend_doubleround 6
+ sha256_xop_extend_doubleround 8
+ sha256_xop_extend_doubleround 10
+ sha256_xop_extend_doubleround 12
+ sha256_xop_extend_doubleround 14
+ sha256_xop_extend_doubleround 16
+ sha256_xop_extend_doubleround 18
+ sha256_xop_extend_doubleround 20
+ sha256_xop_extend_doubleround 22
+ sha256_xop_extend_doubleround 24
+ sha256_xop_extend_doubleround 26
+ sha256_xop_extend_doubleround 28
+ sha256_xop_extend_doubleround 30
+ sha256_xop_extend_doubleround 32
+ sha256_xop_extend_doubleround 34
+ sha256_xop_extend_doubleround 36
+ sha256_xop_extend_doubleround 38
+ sha256_xop_extend_doubleround 40
+ sha256_xop_extend_doubleround 42
+ sha256_xop_extend_doubleround 44
+ sha256_xop_extend_doubleround 46
+ movdqu 0(%rdi), %xmm7
+ movdqu 16(%rdi), %xmm5
+ movdqu 32(%rdi), %xmm4
+ movdqu 48(%rdi), %xmm3
+ movdqu 64(%rdi), %xmm0
+ movdqu 80(%rdi), %xmm8
+ movdqu 96(%rdi), %xmm9
+ movdqu 112(%rdi), %xmm10
+ movq %rsp, %rax
+ leaq sha256_4k(%rip), %rcx
+ sha256_xop_main_quadround 0
+ sha256_xop_main_quadround 4
+ sha256_xop_main_quadround 8
+ sha256_xop_main_quadround 12
+ sha256_xop_main_quadround 16
+ sha256_xop_main_quadround 20
+ sha256_xop_main_quadround 24
+ sha256_xop_main_quadround 28
+ sha256_xop_main_quadround 32
+ sha256_xop_main_quadround 36
+ sha256_xop_main_quadround 40
+ sha256_xop_main_quadround 44
+ sha256_xop_main_quadround 48
+ sha256_xop_main_quadround 52
+ sha256_xop_main_quadround 56
+ sha256_xop_main_quadround 60
+ jmp sha256_transform_4way_finish
+
+ .data
+ .p2align 3
+sha256_transform_4way_core_addr:
+ .quad 0x0
+
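+/*
+ * Byte-swap two 16-byte rows of input into the stack buffer.  pshuflw/pshufhw
+ * with 0xb1 swap the 16-bit halves of each dword, and the psllw/psrlw/pxor
+ * sequence swaps the bytes within each half: a 32-bit bswap without SSSE3.
+ */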
+.macro p2bswap_rsi_rsp i
+ movdqu \i*16(%rsi), %xmm0
+ movdqu (\i+1)*16(%rsi), %xmm2
+ pshuflw $0xb1, %xmm0, %xmm0
+ pshuflw $0xb1, %xmm2, %xmm2
+ pshufhw $0xb1, %xmm0, %xmm0
+ pshufhw $0xb1, %xmm2, %xmm2
+ movdqa %xmm0, %xmm1
+ movdqa %xmm2, %xmm3
+ psrlw $8, %xmm1
+ psrlw $8, %xmm3
+ psllw $8, %xmm0
+ psllw $8, %xmm2
+ pxor %xmm1, %xmm0
+ pxor %xmm3, %xmm2
+ movdqa %xmm0, \i*16(%rsp)
+ movdqa %xmm2, (\i+1)*16(%rsp)
+.endm
+
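+/*
+ * sha256_transform_4way(state, data, swap): 4-way interleaved counterpart of
+ * sha256_transform.  The Win64 prologue saves xmm6-xmm11 and remaps the
+ * Microsoft-ABI argument registers; the block is copied (optionally
+ * byte-swapped) into an aligned stack buffer before dispatching to the
+ * selected core.
+ */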
+ .text
+ .p2align 6
+ .globl sha256_transform_4way
+ .globl _sha256_transform_4way
+sha256_transform_4way:
+_sha256_transform_4way:
+#if defined(_WIN64) || defined(__CYGWIN__)
+ pushq %rdi
+ subq $96, %rsp
+ movdqa %xmm6, 0(%rsp)
+ movdqa %xmm7, 16(%rsp)
+ movdqa %xmm8, 32(%rsp)
+ movdqa %xmm9, 48(%rsp)
+ movdqa %xmm10, 64(%rsp)
+ movdqa %xmm11, 80(%rsp)
+ pushq %rsi
+ movq %rcx, %rdi
+ movq %rdx, %rsi
+ movq %r8, %rdx
+#endif
+ movq %rsp, %r8
+ subq $1032, %rsp
+ andq $-128, %rsp
+
+ testq %rdx, %rdx
+ jnz sha256_transform_4way_swap
+
+ movdqu 0*16(%rsi), %xmm0
+ movdqu 1*16(%rsi), %xmm1
+ movdqu 2*16(%rsi), %xmm2
+ movdqu 3*16(%rsi), %xmm3
+ movdqu 4*16(%rsi), %xmm4
+ movdqu 5*16(%rsi), %xmm5
+ movdqu 6*16(%rsi), %xmm6
+ movdqu 7*16(%rsi), %xmm7
+ movdqa %xmm0, 0*16(%rsp)
+ movdqa %xmm1, 1*16(%rsp)
+ movdqa %xmm2, 2*16(%rsp)
+ movdqa %xmm3, 3*16(%rsp)
+ movdqa %xmm4, 4*16(%rsp)
+ movdqa %xmm5, 5*16(%rsp)
+ movdqa %xmm6, 6*16(%rsp)
+ movdqa %xmm7, 7*16(%rsp)
+ movdqu 8*16(%rsi), %xmm0
+ movdqu 9*16(%rsi), %xmm1
+ movdqu 10*16(%rsi), %xmm2
+ movdqu 11*16(%rsi), %xmm3
+ movdqu 12*16(%rsi), %xmm4
+ movdqu 13*16(%rsi), %xmm5
+ movdqu 14*16(%rsi), %xmm6
+ movdqu 15*16(%rsi), %xmm7
+ movdqa %xmm0, 8*16(%rsp)
+ movdqa %xmm1, 9*16(%rsp)
+ movdqa %xmm2, 10*16(%rsp)
+ movdqa %xmm3, 11*16(%rsp)
+ movdqa %xmm4, 12*16(%rsp)
+ movdqa %xmm5, 13*16(%rsp)
+ movdqa %xmm6, 14*16(%rsp)
+ movdqa %xmm7, 15*16(%rsp)
+ jmp *sha256_transform_4way_core_addr(%rip)
+
+ .p2align 6
+sha256_transform_4way_swap:
+ p2bswap_rsi_rsp 0
+ p2bswap_rsi_rsp 2
+ p2bswap_rsi_rsp 4
+ p2bswap_rsi_rsp 6
+ p2bswap_rsi_rsp 8
+ p2bswap_rsi_rsp 10
+ p2bswap_rsi_rsp 12
+ p2bswap_rsi_rsp 14
+ jmp *sha256_transform_4way_core_addr(%rip)
+
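+/*
+ * Shared epilogue for the 4-way cores: add the working variables back into
+ * the caller's state (the usual feed-forward), then restore the stack and,
+ * on Win64, the saved xmm registers.
+ */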
+ .p2align 6
+sha256_transform_4way_finish:
+ movdqu 0(%rdi), %xmm2
+ movdqu 16(%rdi), %xmm6
+ movdqu 32(%rdi), %xmm11
+ movdqu 48(%rdi), %xmm1
+ paddd %xmm2, %xmm7
+ paddd %xmm6, %xmm5
+ paddd %xmm11, %xmm4
+ paddd %xmm1, %xmm3
+ movdqu 64(%rdi), %xmm2
+ movdqu 80(%rdi), %xmm6
+ movdqu 96(%rdi), %xmm11
+ movdqu 112(%rdi), %xmm1
+ paddd %xmm2, %xmm0
+ paddd %xmm6, %xmm8
+ paddd %xmm11, %xmm9
+ paddd %xmm1, %xmm10
+
+ movdqu %xmm7, 0(%rdi)
+ movdqu %xmm5, 16(%rdi)
+ movdqu %xmm4, 32(%rdi)
+ movdqu %xmm3, 48(%rdi)
+ movdqu %xmm0, 64(%rdi)
+ movdqu %xmm8, 80(%rdi)
+ movdqu %xmm9, 96(%rdi)
+ movdqu %xmm10, 112(%rdi)
+
+ movq %r8, %rsp
+#if defined(_WIN64) || defined(__CYGWIN__)
+ popq %rsi
+ movdqa 0(%rsp), %xmm6
+ movdqa 16(%rsp), %xmm7
+ movdqa 32(%rsp), %xmm8
+ movdqa 48(%rsp), %xmm9
+ movdqa 64(%rsp), %xmm10
+ movdqa 80(%rsp), %xmm11
+ addq $96, %rsp
+ popq %rdi
+#endif
+ ret
+
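+/*
+ * 8-way transform core (AVX2): same schedule-extend / 64-round structure as
+ * the 4-way AVX core, operating on 32-byte rows.
+ */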
+ .text
+ .p2align 6
+sha256_transform_8way_core_avx2:
+ leaq 8*64(%rsp), %rax
+ vmovdqa -2*32(%rax), %ymm3
+ vmovdqa -1*32(%rax), %ymm7
+ sha256_avx2_extend_doubleround 0
+ sha256_avx2_extend_doubleround 2
+ sha256_avx2_extend_doubleround 4
+ sha256_avx2_extend_doubleround 6
+ sha256_avx2_extend_doubleround 8
+ sha256_avx2_extend_doubleround 10
+ sha256_avx2_extend_doubleround 12
+ sha256_avx2_extend_doubleround 14
+ sha256_avx2_extend_doubleround 16
+ sha256_avx2_extend_doubleround 18
+ sha256_avx2_extend_doubleround 20
+ sha256_avx2_extend_doubleround 22
+ sha256_avx2_extend_doubleround 24
+ sha256_avx2_extend_doubleround 26
+ sha256_avx2_extend_doubleround 28
+ sha256_avx2_extend_doubleround 30
+ sha256_avx2_extend_doubleround 32
+ sha256_avx2_extend_doubleround 34
+ sha256_avx2_extend_doubleround 36
+ sha256_avx2_extend_doubleround 38
+ sha256_avx2_extend_doubleround 40
+ sha256_avx2_extend_doubleround 42
+ sha256_avx2_extend_doubleround 44
+ sha256_avx2_extend_doubleround 46
+ vmovdqu 0*32(%rdi), %ymm7
+ vmovdqu 1*32(%rdi), %ymm5
+ vmovdqu 2*32(%rdi), %ymm4
+ vmovdqu 3*32(%rdi), %ymm3
+ vmovdqu 4*32(%rdi), %ymm0
+ vmovdqu 5*32(%rdi), %ymm8
+ vmovdqu 6*32(%rdi), %ymm9
+ vmovdqu 7*32(%rdi), %ymm10
+ movq %rsp, %rax
+ leaq sha256_8k(%rip), %rcx
+ sha256_avx2_main_quadround 0
+ sha256_avx2_main_quadround 4
+ sha256_avx2_main_quadround 8
+ sha256_avx2_main_quadround 12
+ sha256_avx2_main_quadround 16
+ sha256_avx2_main_quadround 20
+ sha256_avx2_main_quadround 24
+ sha256_avx2_main_quadround 28
+ sha256_avx2_main_quadround 32
+ sha256_avx2_main_quadround 36
+ sha256_avx2_main_quadround 40
+ sha256_avx2_main_quadround 44
+ sha256_avx2_main_quadround 48
+ sha256_avx2_main_quadround 52
+ sha256_avx2_main_quadround 56
+ sha256_avx2_main_quadround 60
+ jmp sha256_transform_8way_finish
+
+.macro p2bswap_avx2_rsi_rsp i
+ vmovdqu \i*32(%rsi), %ymm0
+ vmovdqu (\i+1)*32(%rsi), %ymm2
+ vpshuflw $0xb1, %ymm0, %ymm0
+ vpshuflw $0xb1, %ymm2, %ymm2
+ vpshufhw $0xb1, %ymm0, %ymm0
+ vpshufhw $0xb1, %ymm2, %ymm2
+ vpsrlw $8, %ymm0, %ymm1
+ vpsrlw $8, %ymm2, %ymm3
+ vpsllw $8, %ymm0, %ymm0
+ vpsllw $8, %ymm2, %ymm2
+ vpxor %ymm1, %ymm0, %ymm0
+ vpxor %ymm3, %ymm2, %ymm2
+ vmovdqa %ymm0, \i*32(%rsp)
+ vmovdqa %ymm2, (\i+1)*32(%rsp)
+.endm
+
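+/*
+ * sha256_transform_8way(state, data, swap): AVX2 entry point.  Only the low
+ * xmm halves of xmm6-xmm11 are saved on Win64, since the upper ymm halves
+ * are volatile in that ABI.
+ */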
+ .text
+ .p2align 6
+ .globl sha256_transform_8way
+ .globl _sha256_transform_8way
+sha256_transform_8way:
+_sha256_transform_8way:
+#if defined(_WIN64) || defined(__CYGWIN__)
+ pushq %rdi
+ subq $96, %rsp
+ vmovdqa %xmm6, 0(%rsp)
+ vmovdqa %xmm7, 16(%rsp)
+ vmovdqa %xmm8, 32(%rsp)
+ vmovdqa %xmm9, 48(%rsp)
+ vmovdqa %xmm10, 64(%rsp)
+ vmovdqa %xmm11, 80(%rsp)
+ pushq %rsi
+ movq %rcx, %rdi
+ movq %rdx, %rsi
+ movq %r8, %rdx
+#endif
+ movq %rsp, %r8
+ subq $64*32, %rsp
+ andq $-128, %rsp
+
+ testq %rdx, %rdx
+ jnz sha256_transform_8way_swap
+
+ vmovdqu 0*32(%rsi), %ymm0
+ vmovdqu 1*32(%rsi), %ymm1
+ vmovdqu 2*32(%rsi), %ymm2
+ vmovdqu 3*32(%rsi), %ymm3
+ vmovdqu 4*32(%rsi), %ymm4
+ vmovdqu 5*32(%rsi), %ymm5
+ vmovdqu 6*32(%rsi), %ymm6
+ vmovdqu 7*32(%rsi), %ymm7
+ vmovdqa %ymm0, 0*32(%rsp)
+ vmovdqa %ymm1, 1*32(%rsp)
+ vmovdqa %ymm2, 2*32(%rsp)
+ vmovdqa %ymm3, 3*32(%rsp)
+ vmovdqa %ymm4, 4*32(%rsp)
+ vmovdqa %ymm5, 5*32(%rsp)
+ vmovdqa %ymm6, 6*32(%rsp)
+ vmovdqa %ymm7, 7*32(%rsp)
+ vmovdqu 8*32(%rsi), %ymm0
+ vmovdqu 9*32(%rsi), %ymm1
+ vmovdqu 10*32(%rsi), %ymm2
+ vmovdqu 11*32(%rsi), %ymm3
+ vmovdqu 12*32(%rsi), %ymm4
+ vmovdqu 13*32(%rsi), %ymm5
+ vmovdqu 14*32(%rsi), %ymm6
+ vmovdqu 15*32(%rsi), %ymm7
+ vmovdqa %ymm0, 8*32(%rsp)
+ vmovdqa %ymm1, 9*32(%rsp)
+ vmovdqa %ymm2, 10*32(%rsp)
+ vmovdqa %ymm3, 11*32(%rsp)
+ vmovdqa %ymm4, 12*32(%rsp)
+ vmovdqa %ymm5, 13*32(%rsp)
+ vmovdqa %ymm6, 14*32(%rsp)
+ vmovdqa %ymm7, 15*32(%rsp)
+ jmp sha256_transform_8way_core_avx2
+
+ .p2align 6
+sha256_transform_8way_swap:
+ p2bswap_avx2_rsi_rsp 0
+ p2bswap_avx2_rsi_rsp 2
+ p2bswap_avx2_rsi_rsp 4
+ p2bswap_avx2_rsi_rsp 6
+ p2bswap_avx2_rsi_rsp 8
+ p2bswap_avx2_rsi_rsp 10
+ p2bswap_avx2_rsi_rsp 12
+ p2bswap_avx2_rsi_rsp 14
+ jmp sha256_transform_8way_core_avx2
+
+ .p2align 6
+sha256_transform_8way_finish:
+ vmovdqu 0*32(%rdi), %ymm2
+ vmovdqu 1*32(%rdi), %ymm6
+ vmovdqu 2*32(%rdi), %ymm11
+ vmovdqu 3*32(%rdi), %ymm1
+ vpaddd %ymm2, %ymm7, %ymm7
+ vpaddd %ymm6, %ymm5, %ymm5
+ vpaddd %ymm11, %ymm4, %ymm4
+ vpaddd %ymm1, %ymm3, %ymm3
+ vmovdqu 4*32(%rdi), %ymm2
+ vmovdqu 5*32(%rdi), %ymm6
+ vmovdqu 6*32(%rdi), %ymm11
+ vmovdqu 7*32(%rdi), %ymm1
+ vpaddd %ymm2, %ymm0, %ymm0
+ vpaddd %ymm6, %ymm8, %ymm8
+ vpaddd %ymm11, %ymm9, %ymm9
+ vpaddd %ymm1, %ymm10, %ymm10
+
+ vmovdqu %ymm7, 0*32(%rdi)
+ vmovdqu %ymm5, 1*32(%rdi)
+ vmovdqu %ymm4, 2*32(%rdi)
+ vmovdqu %ymm3, 3*32(%rdi)
+ vmovdqu %ymm0, 4*32(%rdi)
+ vmovdqu %ymm8, 5*32(%rdi)
+ vmovdqu %ymm9, 6*32(%rdi)
+ vmovdqu %ymm10, 7*32(%rdi)
+
+ movq %r8, %rsp
+#if defined(_WIN64) || defined(__CYGWIN__)
+ popq %rsi
+ vmovdqa 0(%rsp), %xmm6
+ vmovdqa 16(%rsp), %xmm7
+ vmovdqa 32(%rsp), %xmm8
+ vmovdqa 48(%rsp), %xmm9
+ vmovdqa 64(%rsp), %xmm10
+ vmovdqa 80(%rsp), %xmm11
+ addq $96, %rsp
+ popq %rdi
+#endif
+ ret
+
+
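+/*
+ * Runtime-selected implementation of sha256d_ms_4way; the pointer is
+ * presumably filled in by CPU-feature detection before first use.
+ */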
+ .data
+ .p2align 3
+sha256d_ms_4way_addr:
+ .quad 0x0
+
+ .text
+ .p2align 6
+ .globl sha256d_ms_4way
+ .globl _sha256d_ms_4way
+sha256d_ms_4way:
+_sha256d_ms_4way:
+ jmp *sha256d_ms_4way_addr(%rip)
+
+
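+/*
+ * sha256d_ms_4way (SSE2): specialized double SHA-256 for the mining inner
+ * loop.  Arguments are hash, data/W, midstate and prehash; the routine
+ * reuses a partially precomputed message schedule (saving and restoring the
+ * words it overwrites) and reduces the final rounds to just what is needed
+ * to produce hash[7].
+ */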
+ .p2align 6
+sha256d_ms_4way_sse2:
+#if defined(_WIN64) || defined(__CYGWIN__)
+ pushq %rdi
+ subq $32, %rsp
+ movdqa %xmm6, 0(%rsp)
+ movdqa %xmm7, 16(%rsp)
+ pushq %rsi
+ movq %rcx, %rdi
+ movq %rdx, %rsi
+ movq %r8, %rdx
+ movq %r9, %rcx
+#endif
+ subq $8+67*16, %rsp
+
+ leaq 256(%rsi), %rax
+
+sha256d_ms_4way_sse2_extend_loop1:
+ movdqa 3*16(%rsi), %xmm0
+ movdqa 2*16(%rax), %xmm3
+ movdqa 3*16(%rax), %xmm7
+ movdqa %xmm3, 5*16(%rsp)
+ movdqa %xmm7, 6*16(%rsp)
+ movdqa %xmm0, %xmm2
+ paddd %xmm0, %xmm7
+ psrld $3, %xmm0
+ movdqa %xmm0, %xmm1
+ pslld $14, %xmm2
+ psrld $4, %xmm1
+ pxor %xmm1, %xmm0
+ pxor %xmm2, %xmm0
+ psrld $11, %xmm1
+ pslld $11, %xmm2
+ pxor %xmm1, %xmm0
+ pxor %xmm2, %xmm0
+ paddd %xmm0, %xmm3
+ movdqa %xmm3, 2*16(%rax)
+ movdqa %xmm7, 3*16(%rax)
+
+ movdqa 4*16(%rax), %xmm0
+ movdqa %xmm0, 7*16(%rsp)
+ movdqa %xmm3, %xmm2
+ movdqa %xmm7, %xmm6
+ psrld $10, %xmm3
+ psrld $10, %xmm7
+ movdqa %xmm3, %xmm1
+ movdqa %xmm7, %xmm5
+ pslld $13, %xmm2
+ pslld $13, %xmm6
+ psrld $7, %xmm1
+ psrld $7, %xmm5
+ pxor %xmm1, %xmm3
+ pxor %xmm5, %xmm7
+ psrld $2, %xmm1
+ psrld $2, %xmm5
+ pxor %xmm2, %xmm3
+ pxor %xmm6, %xmm7
+ pslld $2, %xmm2
+ pslld $2, %xmm6
+ pxor %xmm1, %xmm3
+ pxor %xmm5, %xmm7
+ pxor %xmm2, %xmm3
+ pxor %xmm6, %xmm7
+ paddd %xmm0, %xmm3
+ movdqa %xmm3, 4*16(%rax)
+ movdqa %xmm7, 5*16(%rax)
+
+ movdqa 6*16(%rax), %xmm0
+ movdqa 7*16(%rax), %xmm4
+ movdqa %xmm0, 9*16(%rsp)
+ movdqa %xmm4, 10*16(%rsp)
+ movdqa %xmm3, %xmm2
+ movdqa %xmm7, %xmm6
+ psrld $10, %xmm3
+ psrld $10, %xmm7
+ movdqa %xmm3, %xmm1
+ movdqa %xmm7, %xmm5
+ pslld $13, %xmm2
+ pslld $13, %xmm6
+ psrld $7, %xmm1
+ psrld $7, %xmm5
+ pxor %xmm1, %xmm3
+ pxor %xmm5, %xmm7
+ psrld $2, %xmm1
+ psrld $2, %xmm5
+ pxor %xmm2, %xmm3
+ pxor %xmm6, %xmm7
+ pslld $2, %xmm2
+ pslld $2, %xmm6
+ pxor %xmm1, %xmm3
+ pxor %xmm5, %xmm7
+ pxor %xmm2, %xmm3
+ pxor %xmm6, %xmm7
+ paddd %xmm0, %xmm3
+ paddd %xmm4, %xmm7
+ movdqa %xmm3, 6*16(%rax)
+ movdqa %xmm7, 7*16(%rax)
+
+ movdqa 8*16(%rax), %xmm0
+ movdqa 2*16(%rax), %xmm4
+ movdqa %xmm0, 11*16(%rsp)
+ movdqa %xmm3, %xmm2
+ movdqa %xmm7, %xmm6
+ psrld $10, %xmm3
+ psrld $10, %xmm7
+ movdqa %xmm3, %xmm1
+ movdqa %xmm7, %xmm5
+ pslld $13, %xmm2
+ pslld $13, %xmm6
+ psrld $7, %xmm1
+ psrld $7, %xmm5
+ pxor %xmm1, %xmm3
+ pxor %xmm5, %xmm7
+ psrld $2, %xmm1
+ psrld $2, %xmm5
+ pxor %xmm2, %xmm3
+ pxor %xmm6, %xmm7
+ pslld $2, %xmm2
+ pslld $2, %xmm6
+ pxor %xmm1, %xmm3
+ pxor %xmm5, %xmm7
+ pxor %xmm2, %xmm3
+ pxor %xmm6, %xmm7
+ paddd %xmm0, %xmm3
+ paddd %xmm4, %xmm7
+ movdqa %xmm3, 8*16(%rax)
+ movdqa %xmm7, 9*16(%rax)
+
+ movdqa %xmm3, %xmm2
+ movdqa %xmm7, %xmm6
+ psrld $10, %xmm3
+ psrld $10, %xmm7
+ movdqa %xmm3, %xmm1
+ movdqa %xmm7, %xmm5
+ pslld $13, %xmm2
+ pslld $13, %xmm6
+ psrld $7, %xmm1
+ psrld $7, %xmm5
+ pxor %xmm1, %xmm3
+ pxor %xmm5, %xmm7
+ psrld $2, %xmm1
+ psrld $2, %xmm5
+ pxor %xmm2, %xmm3
+ pxor %xmm6, %xmm7
+ pslld $2, %xmm2
+ pslld $2, %xmm6
+ pxor %xmm1, %xmm3
+ pxor %xmm5, %xmm7
+ pxor %xmm2, %xmm3
+ pxor %xmm6, %xmm7
+ paddd 3*16(%rax), %xmm3
+ paddd 4*16(%rax), %xmm7
+ movdqa %xmm3, 10*16(%rax)
+ movdqa %xmm7, 11*16(%rax)
+
+ movdqa %xmm3, %xmm2
+ movdqa %xmm7, %xmm6
+ psrld $10, %xmm3
+ psrld $10, %xmm7
+ movdqa %xmm3, %xmm1
+ movdqa %xmm7, %xmm5
+ pslld $13, %xmm2
+ pslld $13, %xmm6
+ psrld $7, %xmm1
+ psrld $7, %xmm5
+ pxor %xmm1, %xmm3
+ pxor %xmm5, %xmm7
+ psrld $2, %xmm1
+ psrld $2, %xmm5
+ pxor %xmm2, %xmm3
+ pxor %xmm6, %xmm7
+ pslld $2, %xmm2
+ pslld $2, %xmm6
+ pxor %xmm1, %xmm3
+ pxor %xmm5, %xmm7
+ pxor %xmm2, %xmm3
+ pxor %xmm6, %xmm7
+ paddd 5*16(%rax), %xmm3
+ paddd 6*16(%rax), %xmm7
+ movdqa %xmm3, 12*16(%rax)
+ movdqa %xmm7, 13*16(%rax)
+
+ movdqa 14*16(%rax), %xmm0
+ movdqa 15*16(%rax), %xmm4
+ movdqa %xmm0, 17*16(%rsp)
+ movdqa %xmm4, 18*16(%rsp)
+ movdqa %xmm3, %xmm2
+ movdqa %xmm7, %xmm6
+ psrld $10, %xmm3
+ psrld $10, %xmm7
+ movdqa %xmm3, %xmm1
+ movdqa %xmm7, %xmm5
+ paddd 7*16(%rax), %xmm0
+ paddd 8*16(%rax), %xmm4
+ pslld $13, %xmm2
+ pslld $13, %xmm6
+ psrld $7, %xmm1
+ psrld $7, %xmm5
+ pxor %xmm1, %xmm3
+ pxor %xmm5, %xmm7
+ psrld $2, %xmm1
+ psrld $2, %xmm5
+ pxor %xmm2, %xmm3
+ pxor %xmm6, %xmm7
+ pslld $2, %xmm2
+ pslld $2, %xmm6
+ pxor %xmm1, %xmm3
+ pxor %xmm5, %xmm7
+ pxor %xmm2, %xmm3
+ pxor %xmm6, %xmm7
+ paddd %xmm0, %xmm3
+ paddd %xmm4, %xmm7
+ movdqa %xmm3, 14*16(%rax)
+ movdqa %xmm7, 15*16(%rax)
+
+sha256d_ms_4way_sse2_extend_loop2:
+ sha256_sse2_extend_doubleround 16
+ sha256_sse2_extend_doubleround 18
+ sha256_sse2_extend_doubleround 20
+ sha256_sse2_extend_doubleround 22
+ sha256_sse2_extend_doubleround 24
+ sha256_sse2_extend_doubleround 26
+ sha256_sse2_extend_doubleround 28
+ sha256_sse2_extend_doubleround 30
+ sha256_sse2_extend_doubleround 32
+ sha256_sse2_extend_doubleround 34
+ sha256_sse2_extend_doubleround 36
+ sha256_sse2_extend_doubleround 38
+ sha256_sse2_extend_doubleround 40
+ sha256_sse2_extend_doubleround 42
+ jz sha256d_ms_4way_sse2_extend_coda2
+ sha256_sse2_extend_doubleround 44
+ sha256_sse2_extend_doubleround 46
+
+ movdqa 0(%rcx), %xmm3
+ movdqa 16(%rcx), %xmm0
+ movdqa 32(%rcx), %xmm1
+ movdqa 48(%rcx), %xmm2
+ movdqa 64(%rcx), %xmm6
+ movdqa 80(%rcx), %xmm7
+ movdqa 96(%rcx), %xmm5
+ movdqa 112(%rcx), %xmm4
+ movdqa %xmm1, 0(%rsp)
+ movdqa %xmm2, 16(%rsp)
+ movdqa %xmm6, 32(%rsp)
+
+ movq %rsi, %rax
+ leaq sha256_4k(%rip), %rcx
+ jmp sha256d_ms_4way_sse2_main_loop1
+
+sha256d_ms_4way_sse2_main_loop2:
+ sha256_sse2_main_round 0
+ sha256_sse2_main_round 1
+ sha256_sse2_main_round 2
+sha256d_ms_4way_sse2_main_loop1:
+ sha256_sse2_main_round 3
+ sha256_sse2_main_quadround 4
+ sha256_sse2_main_quadround 8
+ sha256_sse2_main_quadround 12
+ sha256_sse2_main_quadround 16
+ sha256_sse2_main_quadround 20
+ sha256_sse2_main_quadround 24
+ sha256_sse2_main_quadround 28
+ sha256_sse2_main_quadround 32
+ sha256_sse2_main_quadround 36
+ sha256_sse2_main_quadround 40
+ sha256_sse2_main_quadround 44
+ sha256_sse2_main_quadround 48
+ sha256_sse2_main_quadround 52
+ sha256_sse2_main_round 56
+ jz sha256d_ms_4way_sse2_finish
+ sha256_sse2_main_round 57
+ sha256_sse2_main_round 58
+ sha256_sse2_main_round 59
+ sha256_sse2_main_quadround 60
+
+ movdqa 5*16(%rsp), %xmm1
+ movdqa 6*16(%rsp), %xmm2
+ movdqa 7*16(%rsp), %xmm6
+ movdqa %xmm1, 18*16(%rsi)
+ movdqa %xmm2, 19*16(%rsi)
+ movdqa %xmm6, 20*16(%rsi)
+ movdqa 9*16(%rsp), %xmm1
+ movdqa 10*16(%rsp), %xmm2
+ movdqa 11*16(%rsp), %xmm6
+ movdqa %xmm1, 22*16(%rsi)
+ movdqa %xmm2, 23*16(%rsi)
+ movdqa %xmm6, 24*16(%rsi)
+ movdqa 17*16(%rsp), %xmm1
+ movdqa 18*16(%rsp), %xmm2
+ movdqa %xmm1, 30*16(%rsi)
+ movdqa %xmm2, 31*16(%rsi)
+
+ movdqa 0(%rsp), %xmm1
+ movdqa 16(%rsp), %xmm2
+ movdqa 32(%rsp), %xmm6
+ paddd 0(%rdx), %xmm7
+ paddd 16(%rdx), %xmm5
+ paddd 32(%rdx), %xmm4
+ paddd 48(%rdx), %xmm3
+ paddd 64(%rdx), %xmm0
+ paddd 80(%rdx), %xmm1
+ paddd 96(%rdx), %xmm2
+ paddd 112(%rdx), %xmm6
+
+ movdqa %xmm7, 48+0(%rsp)
+ movdqa %xmm5, 48+16(%rsp)
+ movdqa %xmm4, 48+32(%rsp)
+ movdqa %xmm3, 48+48(%rsp)
+ movdqa %xmm0, 48+64(%rsp)
+ movdqa %xmm1, 48+80(%rsp)
+ movdqa %xmm2, 48+96(%rsp)
+ movdqa %xmm6, 48+112(%rsp)
+
+ pxor %xmm0, %xmm0
+ movq $0x8000000000000100, %rax
+ movd %rax, %xmm1
+ pshufd $0x55, %xmm1, %xmm2
+ pshufd $0x00, %xmm1, %xmm1
+ movdqa %xmm2, 48+128(%rsp)
+ movdqa %xmm0, 48+144(%rsp)
+ movdqa %xmm0, 48+160(%rsp)
+ movdqa %xmm0, 48+176(%rsp)
+ movdqa %xmm0, 48+192(%rsp)
+ movdqa %xmm0, 48+208(%rsp)
+ movdqa %xmm0, 48+224(%rsp)
+ movdqa %xmm1, 48+240(%rsp)
+
+ leaq 19*16(%rsp), %rax
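+ /* force ZF=1 so the jz shortcuts in the shared extend and main loops are taken on this final pass */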
+ cmpq %rax, %rax
+
+ movdqa -15*16(%rax), %xmm0
+ movdqa -14*16(%rax), %xmm4
+ movdqa %xmm0, %xmm2
+ movdqa %xmm4, %xmm6
+ psrld $3, %xmm0
+ psrld $3, %xmm4
+ movdqa %xmm0, %xmm1
+ movdqa %xmm4, %xmm5
+ pslld $14, %xmm2
+ pslld $14, %xmm6
+ psrld $4, %xmm1
+ psrld $4, %xmm5
+ pxor %xmm1, %xmm0
+ pxor %xmm5, %xmm4
+ psrld $11, %xmm1
+ psrld $11, %xmm5
+ pxor %xmm2, %xmm0
+ pxor %xmm6, %xmm4
+ pslld $11, %xmm2
+ pslld $11, %xmm6
+ pxor %xmm1, %xmm0
+ pxor %xmm5, %xmm4
+ pxor %xmm2, %xmm0
+ pxor %xmm6, %xmm4
+ paddd -16*16(%rax), %xmm0
+ paddd -15*16(%rax), %xmm4
+ paddd sha256d_4preext2_17(%rip), %xmm4
+ movdqa %xmm0, %xmm3
+ movdqa %xmm4, %xmm7
+ movdqa %xmm3, 0*16(%rax)
+ movdqa %xmm7, 1*16(%rax)
+
+ sha256_sse2_extend_doubleround 2
+ sha256_sse2_extend_doubleround 4
+
+ movdqa -9*16(%rax), %xmm0
+ movdqa sha256d_4preext2_23(%rip), %xmm4
+ movdqa %xmm0, %xmm2
+ psrld $3, %xmm0
+ movdqa %xmm0, %xmm1
+ pslld $14, %xmm2
+ psrld $4, %xmm1
+ pxor %xmm1, %xmm0
+ pxor %xmm2, %xmm0
+ psrld $11, %xmm1
+ pslld $11, %xmm2
+ pxor %xmm1, %xmm0
+ pxor %xmm2, %xmm0
+ paddd -10*16(%rax), %xmm0
+ paddd -9*16(%rax), %xmm4
+ movdqa %xmm3, %xmm2
+ movdqa %xmm7, %xmm6
+ psrld $10, %xmm3
+ psrld $10, %xmm7
+ movdqa %xmm3, %xmm1
+ movdqa %xmm7, %xmm5
+ paddd -1*16(%rax), %xmm0
+ pslld $13, %xmm2
+ pslld $13, %xmm6
+ psrld $7, %xmm1
+ psrld $7, %xmm5
+ paddd 0*16(%rax), %xmm4
+ pxor %xmm1, %xmm3
+ pxor %xmm5, %xmm7
+ psrld $2, %xmm1
+ psrld $2, %xmm5
+ pxor %xmm2, %xmm3
+ pxor %xmm6, %xmm7
+ pslld $2, %xmm2
+ pslld $2, %xmm6
+ pxor %xmm1, %xmm3
+ pxor %xmm5, %xmm7
+ pxor %xmm2, %xmm3
+ pxor %xmm6, %xmm7
+ paddd %xmm0, %xmm3
+ paddd %xmm4, %xmm7
+ movdqa %xmm3, 6*16(%rax)
+ movdqa %xmm7, 7*16(%rax)
+
+ movdqa sha256d_4preext2_24(%rip), %xmm0
+ movdqa %xmm3, %xmm2
+ movdqa %xmm7, %xmm6
+ psrld $10, %xmm3
+ psrld $10, %xmm7
+ movdqa %xmm3, %xmm1
+ movdqa %xmm7, %xmm5
+ paddd 1*16(%rax), %xmm0
+ pslld $13, %xmm2
+ pslld $13, %xmm6
+ psrld $7, %xmm1
+ psrld $7, %xmm5
+ pxor %xmm1, %xmm3
+ pxor %xmm5, %xmm7
+ psrld $2, %xmm1
+ psrld $2, %xmm5
+ pxor %xmm2, %xmm3
+ pxor %xmm6, %xmm7
+ pslld $2, %xmm2
+ pslld $2, %xmm6
+ pxor %xmm1, %xmm3
+ pxor %xmm5, %xmm7
+ pxor %xmm2, %xmm3
+ pxor %xmm6, %xmm7
+ paddd %xmm0, %xmm3
+ paddd 2*16(%rax), %xmm7
+ movdqa %xmm3, 8*16(%rax)
+ movdqa %xmm7, 9*16(%rax)
+
+ movdqa %xmm3, %xmm2
+ movdqa %xmm7, %xmm6
+ psrld $10, %xmm3
+ psrld $10, %xmm7
+ movdqa %xmm3, %xmm1
+ movdqa %xmm7, %xmm5
+ pslld $13, %xmm2
+ pslld $13, %xmm6
+ psrld $7, %xmm1
+ psrld $7, %xmm5
+ pxor %xmm1, %xmm3
+ pxor %xmm5, %xmm7
+ psrld $2, %xmm1
+ psrld $2, %xmm5
+ pxor %xmm2, %xmm3
+ pxor %xmm6, %xmm7
+ pslld $2, %xmm2
+ pslld $2, %xmm6
+ pxor %xmm1, %xmm3
+ pxor %xmm5, %xmm7
+ pxor %xmm2, %xmm3
+ pxor %xmm6, %xmm7
+ paddd 3*16(%rax), %xmm3
+ paddd 4*16(%rax), %xmm7
+ movdqa %xmm3, 10*16(%rax)
+ movdqa %xmm7, 11*16(%rax)
+
+ movdqa %xmm3, %xmm2
+ movdqa %xmm7, %xmm6
+ psrld $10, %xmm3
+ psrld $10, %xmm7
+ movdqa %xmm3, %xmm1
+ movdqa %xmm7, %xmm5
+ pslld $13, %xmm2
+ pslld $13, %xmm6
+ psrld $7, %xmm1
+ psrld $7, %xmm5
+ pxor %xmm1, %xmm3
+ pxor %xmm5, %xmm7
+ psrld $2, %xmm1
+ psrld $2, %xmm5
+ pxor %xmm2, %xmm3
+ pxor %xmm6, %xmm7
+ pslld $2, %xmm2
+ pslld $2, %xmm6
+ pxor %xmm1, %xmm3
+ pxor %xmm5, %xmm7
+ pxor %xmm2, %xmm3
+ pxor %xmm6, %xmm7
+ paddd 5*16(%rax), %xmm3
+ paddd 6*16(%rax), %xmm7
+ movdqa %xmm3, 12*16(%rax)
+ movdqa %xmm7, 13*16(%rax)
+
+ movdqa sha256d_4preext2_30(%rip), %xmm0
+ movdqa 0*16(%rax), %xmm4
+ movdqa %xmm4, %xmm6
+ psrld $3, %xmm4
+ movdqa %xmm4, %xmm5
+ pslld $14, %xmm6
+ psrld $4, %xmm5
+ pxor %xmm5, %xmm4
+ pxor %xmm6, %xmm4
+ psrld $11, %xmm5
+ pslld $11, %xmm6
+ pxor %xmm5, %xmm4
+ pxor %xmm6, %xmm4
+ paddd -1*16(%rax), %xmm4
+ movdqa %xmm3, %xmm2
+ movdqa %xmm7, %xmm6
+ psrld $10, %xmm3
+ psrld $10, %xmm7
+ movdqa %xmm3, %xmm1
+ movdqa %xmm7, %xmm5
+ paddd 7*16(%rax), %xmm0
+ pslld $13, %xmm2
+ pslld $13, %xmm6
+ psrld $7, %xmm1
+ psrld $7, %xmm5
+ paddd 8*16(%rax), %xmm4
+ pxor %xmm1, %xmm3
+ pxor %xmm5, %xmm7
+ psrld $2, %xmm1
+ psrld $2, %xmm5
+ pxor %xmm2, %xmm3
+ pxor %xmm6, %xmm7
+ pslld $2, %xmm2
+ pslld $2, %xmm6
+ pxor %xmm1, %xmm3
+ pxor %xmm5, %xmm7
+ pxor %xmm2, %xmm3
+ pxor %xmm6, %xmm7
+ paddd %xmm0, %xmm3
+ paddd %xmm4, %xmm7
+ movdqa %xmm3, 14*16(%rax)
+ movdqa %xmm7, 15*16(%rax)
+
+ jmp sha256d_ms_4way_sse2_extend_loop2
+
+sha256d_ms_4way_sse2_extend_coda2:
+ sha256_sse2_extend_round 44
+
+ movdqa sha256_4h+0(%rip), %xmm7
+ movdqa sha256_4h+16(%rip), %xmm5
+ movdqa sha256_4h+32(%rip), %xmm4
+ movdqa sha256_4h+48(%rip), %xmm3
+ movdqa sha256_4h+64(%rip), %xmm0
+ movdqa sha256_4h+80(%rip), %xmm1
+ movdqa sha256_4h+96(%rip), %xmm2
+ movdqa sha256_4h+112(%rip), %xmm6
+ movdqa %xmm1, 0(%rsp)
+ movdqa %xmm2, 16(%rsp)
+ movdqa %xmm6, 32(%rsp)
+
+ leaq 48(%rsp), %rax
+ leaq sha256_4k(%rip), %rcx
+ jmp sha256d_ms_4way_sse2_main_loop2
+
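+/*
+ * Reduced final round: compute only the additions needed to obtain the last
+ * state word, since the caller only checks hash[7] against the target.
+ */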
+.macro sha256_sse2_main_round_red i, r7
+ movdqa 16*\i(%rax), %xmm6
+ paddd 16*\i(%rcx), %xmm6
+ paddd 32(%rsp), %xmm6
+ movdqa %xmm0, %xmm1
+ movdqa 16(%rsp), %xmm2
+ paddd \r7, %xmm6
+ pandn %xmm2, %xmm1
+ movdqa %xmm2, 32(%rsp)
+ movdqa 0(%rsp), %xmm2
+ movdqa %xmm2, 16(%rsp)
+ pand %xmm0, %xmm2
+ pxor %xmm2, %xmm1
+ movdqa %xmm0, 0(%rsp)
+ paddd %xmm1, %xmm6
+ movdqa %xmm0, %xmm1
+ psrld $6, %xmm0
+ movdqa %xmm0, %xmm2
+ pslld $7, %xmm1
+ psrld $5, %xmm2
+ pxor %xmm1, %xmm0
+ pxor %xmm2, %xmm0
+ pslld $14, %xmm1
+ psrld $14, %xmm2
+ pxor %xmm1, %xmm0
+ pxor %xmm2, %xmm0
+ pslld $5, %xmm1
+ pxor %xmm1, %xmm0
+ paddd %xmm6, %xmm0
+.endm
+
+sha256d_ms_4way_sse2_finish:
+ sha256_sse2_main_round_red 57, %xmm3
+ sha256_sse2_main_round_red 58, %xmm4
+ sha256_sse2_main_round_red 59, %xmm5
+ sha256_sse2_main_round_red 60, %xmm7
+
+ paddd sha256_4h+112(%rip), %xmm0
+ movdqa %xmm0, 112(%rdi)
+
+ addq $8+67*16, %rsp
+#if defined(_WIN64) || defined(__CYGWIN__)
+ popq %rsi
+ movdqa 0(%rsp), %xmm6
+ movdqa 16(%rsp), %xmm7
+ addq $32, %rsp
+ popq %rdi
+#endif
+ ret
+
+
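+/* AVX version of sha256d_ms_4way; same structure as the SSE2 routine above. */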
+ .p2align 6
+sha256d_ms_4way_avx:
+#if defined(_WIN64) || defined(__CYGWIN__)
+ pushq %rdi
+ subq $80, %rsp
+ movdqa %xmm6, 0(%rsp)
+ movdqa %xmm7, 16(%rsp)
+ movdqa %xmm8, 32(%rsp)
+ movdqa %xmm9, 48(%rsp)
+ movdqa %xmm10, 64(%rsp)
+ pushq %rsi
+ movq %rcx, %rdi
+ movq %rdx, %rsi
+ movq %r8, %rdx
+ movq %r9, %rcx
+#endif
+ subq $1032, %rsp
+
+ leaq 256(%rsi), %rax
+
+sha256d_ms_4way_avx_extend_loop1:
+ vmovdqa 3*16(%rsi), %xmm0
+ vmovdqa 2*16(%rax), %xmm3
+ vmovdqa 3*16(%rax), %xmm7
+ vmovdqa %xmm3, 2*16(%rsp)
+ vmovdqa %xmm7, 3*16(%rsp)
+ vpaddd %xmm0, %xmm7, %xmm7
+ vpslld $14, %xmm0, %xmm2
+ vpsrld $3, %xmm0, %xmm0
+ vpsrld $4, %xmm0, %xmm1
+ vpxor %xmm1, %xmm0, %xmm0
+ vpxor %xmm2, %xmm0, %xmm0
+ vpsrld $11, %xmm1, %xmm1
+ vpslld $11, %xmm2, %xmm2
+ vpxor %xmm1, %xmm0, %xmm0
+ vpxor %xmm2, %xmm0, %xmm0
+ vpaddd %xmm0, %xmm3, %xmm3
+ vmovdqa %xmm3, 2*16(%rax)
+ vmovdqa %xmm7, 3*16(%rax)
+
+ vmovdqa 4*16(%rax), %xmm0
+ vmovdqa %xmm0, 4*16(%rsp)
+ vpslld $13, %xmm3, %xmm2
+ vpslld $13, %xmm7, %xmm6
+ vpsrld $10, %xmm3, %xmm3
+ vpsrld $10, %xmm7, %xmm7
+ vpsrld $7, %xmm3, %xmm1
+ vpsrld $7, %xmm7, %xmm5
+ vpxor %xmm1, %xmm3, %xmm3
+ vpxor %xmm5, %xmm7, %xmm7
+ vpsrld $2, %xmm1, %xmm1
+ vpsrld $2, %xmm5, %xmm5
+ vpxor %xmm2, %xmm3, %xmm3
+ vpxor %xmm6, %xmm7, %xmm7
+ vpslld $2, %xmm2, %xmm2
+ vpslld $2, %xmm6, %xmm6
+ vpxor %xmm1, %xmm3, %xmm3
+ vpxor %xmm5, %xmm7, %xmm7
+ vpxor %xmm2, %xmm3, %xmm3
+ vpxor %xmm6, %xmm7, %xmm7
+ vpaddd %xmm0, %xmm3, %xmm3
+ vmovdqa %xmm3, 4*16(%rax)
+ vmovdqa %xmm7, 5*16(%rax)
+
+ vmovdqa 6*16(%rax), %xmm0
+ vmovdqa 7*16(%rax), %xmm4
+ vmovdqa %xmm0, 6*16(%rsp)
+ vmovdqa %xmm4, 7*16(%rsp)
+ vpslld $13, %xmm3, %xmm2
+ vpslld $13, %xmm7, %xmm6
+ vpsrld $10, %xmm3, %xmm3
+ vpsrld $10, %xmm7, %xmm7
+ vpsrld $7, %xmm3, %xmm1
+ vpsrld $7, %xmm7, %xmm5
+ vpxor %xmm1, %xmm3, %xmm3
+ vpxor %xmm5, %xmm7, %xmm7
+ vpsrld $2, %xmm1, %xmm1
+ vpsrld $2, %xmm5, %xmm5
+ vpxor %xmm2, %xmm3, %xmm3
+ vpxor %xmm6, %xmm7, %xmm7
+ vpslld $2, %xmm2, %xmm2
+ vpslld $2, %xmm6, %xmm6
+ vpxor %xmm1, %xmm3, %xmm3
+ vpxor %xmm5, %xmm7, %xmm7
+ vpxor %xmm2, %xmm3, %xmm3
+ vpxor %xmm6, %xmm7, %xmm7
+ vpaddd %xmm0, %xmm3, %xmm3
+ vpaddd %xmm4, %xmm7, %xmm7
+ vmovdqa %xmm3, 6*16(%rax)
+ vmovdqa %xmm7, 7*16(%rax)
+
+ vmovdqa 8*16(%rax), %xmm0
+ vmovdqa 2*16(%rax), %xmm4
+ vmovdqa %xmm0, 8*16(%rsp)
+ vpslld $13, %xmm3, %xmm2
+ vpslld $13, %xmm7, %xmm6
+ vpsrld $10, %xmm3, %xmm3
+ vpsrld $10, %xmm7, %xmm7
+ vpsrld $7, %xmm3, %xmm1
+ vpsrld $7, %xmm7, %xmm5
+ vpxor %xmm1, %xmm3, %xmm3
+ vpxor %xmm5, %xmm7, %xmm7
+ vpsrld $2, %xmm1, %xmm1
+ vpsrld $2, %xmm5, %xmm5
+ vpxor %xmm2, %xmm3, %xmm3
+ vpxor %xmm6, %xmm7, %xmm7
+ vpslld $2, %xmm2, %xmm2
+ vpslld $2, %xmm6, %xmm6
+ vpxor %xmm1, %xmm3, %xmm3
+ vpxor %xmm5, %xmm7, %xmm7
+ vpxor %xmm2, %xmm3, %xmm3
+ vpxor %xmm6, %xmm7, %xmm7
+ vpaddd %xmm0, %xmm3, %xmm3
+ vpaddd %xmm4, %xmm7, %xmm7
+ vmovdqa %xmm3, 8*16(%rax)
+ vmovdqa %xmm7, 9*16(%rax)
+
+ vpslld $13, %xmm3, %xmm2
+ vpslld $13, %xmm7, %xmm6
+ vpsrld $10, %xmm3, %xmm3
+ vpsrld $10, %xmm7, %xmm7
+ vpsrld $7, %xmm3, %xmm1
+ vpsrld $7, %xmm7, %xmm5
+ vpxor %xmm1, %xmm3, %xmm3
+ vpxor %xmm5, %xmm7, %xmm7
+ vpsrld $2, %xmm1, %xmm1
+ vpsrld $2, %xmm5, %xmm5
+ vpxor %xmm2, %xmm3, %xmm3
+ vpxor %xmm6, %xmm7, %xmm7
+ vpslld $2, %xmm2, %xmm2
+ vpslld $2, %xmm6, %xmm6
+ vpxor %xmm1, %xmm3, %xmm3
+ vpxor %xmm5, %xmm7, %xmm7
+ vpxor %xmm2, %xmm3, %xmm3
+ vpxor %xmm6, %xmm7, %xmm7
+ vpaddd 3*16(%rax), %xmm3, %xmm3
+ vpaddd 4*16(%rax), %xmm7, %xmm7
+ vmovdqa %xmm3, 10*16(%rax)
+ vmovdqa %xmm7, 11*16(%rax)
+
+ vpslld $13, %xmm3, %xmm2
+ vpslld $13, %xmm7, %xmm6
+ vpsrld $10, %xmm3, %xmm3
+ vpsrld $10, %xmm7, %xmm7
+ vpsrld $7, %xmm3, %xmm1
+ vpsrld $7, %xmm7, %xmm5
+ vpxor %xmm1, %xmm3, %xmm3
+ vpxor %xmm5, %xmm7, %xmm7
+ vpsrld $2, %xmm1, %xmm1
+ vpsrld $2, %xmm5, %xmm5
+ vpxor %xmm2, %xmm3, %xmm3
+ vpxor %xmm6, %xmm7, %xmm7
+ vpslld $2, %xmm2, %xmm2
+ vpslld $2, %xmm6, %xmm6
+ vpxor %xmm1, %xmm3, %xmm3
+ vpxor %xmm5, %xmm7, %xmm7
+ vpxor %xmm2, %xmm3, %xmm3
+ vpxor %xmm6, %xmm7, %xmm7
+ vpaddd 5*16(%rax), %xmm3, %xmm3
+ vpaddd 6*16(%rax), %xmm7, %xmm7
+ vmovdqa %xmm3, 12*16(%rax)
+ vmovdqa %xmm7, 13*16(%rax)
+
+ vmovdqa 14*16(%rax), %xmm0
+ vmovdqa 15*16(%rax), %xmm4
+ vmovdqa %xmm0, 14*16(%rsp)
+ vmovdqa %xmm4, 15*16(%rsp)
+ vpslld $13, %xmm3, %xmm2
+ vpslld $13, %xmm7, %xmm6
+ vpsrld $10, %xmm3, %xmm3
+ vpsrld $10, %xmm7, %xmm7
+ vpaddd 7*16(%rax), %xmm0, %xmm0
+ vpaddd 8*16(%rax), %xmm4, %xmm4
+ vpsrld $7, %xmm3, %xmm1
+ vpsrld $7, %xmm7, %xmm5
+ vpxor %xmm1, %xmm3, %xmm3
+ vpxor %xmm5, %xmm7, %xmm7
+ vpsrld $2, %xmm1, %xmm1
+ vpsrld $2, %xmm5, %xmm5
+ vpxor %xmm2, %xmm3, %xmm3
+ vpxor %xmm6, %xmm7, %xmm7
+ vpslld $2, %xmm2, %xmm2
+ vpslld $2, %xmm6, %xmm6
+ vpxor %xmm1, %xmm3, %xmm3
+ vpxor %xmm5, %xmm7, %xmm7
+ vpxor %xmm2, %xmm3, %xmm3
+ vpxor %xmm6, %xmm7, %xmm7
+ vpaddd %xmm0, %xmm3, %xmm3
+ vpaddd %xmm4, %xmm7, %xmm7
+ vmovdqa %xmm3, 14*16(%rax)
+ vmovdqa %xmm7, 15*16(%rax)
+
+sha256d_ms_4way_avx_extend_loop2:
+ sha256_avx_extend_doubleround 16
+ sha256_avx_extend_doubleround 18
+ sha256_avx_extend_doubleround 20
+ sha256_avx_extend_doubleround 22
+ sha256_avx_extend_doubleround 24
+ sha256_avx_extend_doubleround 26
+ sha256_avx_extend_doubleround 28
+ sha256_avx_extend_doubleround 30
+ sha256_avx_extend_doubleround 32
+ sha256_avx_extend_doubleround 34
+ sha256_avx_extend_doubleround 36
+ sha256_avx_extend_doubleround 38
+ sha256_avx_extend_doubleround 40
+ sha256_avx_extend_doubleround 42
+ jz sha256d_ms_4way_avx_extend_coda2
+ sha256_avx_extend_doubleround 44
+ sha256_avx_extend_doubleround 46
+
+ movdqa 0(%rcx), %xmm7
+ movdqa 16(%rcx), %xmm8
+ movdqa 32(%rcx), %xmm9
+ movdqa 48(%rcx), %xmm10
+ movdqa 64(%rcx), %xmm0
+ movdqa 80(%rcx), %xmm5
+ movdqa 96(%rcx), %xmm4
+ movdqa 112(%rcx), %xmm3
+
+ movq %rsi, %rax
+ leaq sha256_4k(%rip), %rcx
+ jmp sha256d_ms_4way_avx_main_loop1
+
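+ /* The first pass enters at main_loop1: rounds 0-2 are skipped because */
+ /* their effect is already contained in the precomputed state loaded from */
+ /* %rcx above.  The second pass over the padded hash block enters at */
+ /* main_loop2 and branches to the reduced finish after round 56. */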
+sha256d_ms_4way_avx_main_loop2:
+ sha256_avx_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
+ sha256_avx_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
+ sha256_avx_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
+sha256d_ms_4way_avx_main_loop1:
+ sha256_avx_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
+ sha256_avx_main_quadround 4
+ sha256_avx_main_quadround 8
+ sha256_avx_main_quadround 12
+ sha256_avx_main_quadround 16
+ sha256_avx_main_quadround 20
+ sha256_avx_main_quadround 24
+ sha256_avx_main_quadround 28
+ sha256_avx_main_quadround 32
+ sha256_avx_main_quadround 36
+ sha256_avx_main_quadround 40
+ sha256_avx_main_quadround 44
+ sha256_avx_main_quadround 48
+ sha256_avx_main_quadround 52
+ sha256_avx_main_round 56, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
+ jz sha256d_ms_4way_avx_finish
+ sha256_avx_main_round 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
+ sha256_avx_main_round 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
+ sha256_avx_main_round 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
+ sha256_avx_main_quadround 60
+
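+ /* Restore the schedule words saved during the extend pass, keeping the */
+ /* caller's cached message schedule valid for the next call. */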
+ movdqa 2*16(%rsp), %xmm1
+ movdqa 3*16(%rsp), %xmm2
+ movdqa 4*16(%rsp), %xmm6
+ movdqa %xmm1, 18*16(%rsi)
+ movdqa %xmm2, 19*16(%rsi)
+ movdqa %xmm6, 20*16(%rsi)
+ movdqa 6*16(%rsp), %xmm1
+ movdqa 7*16(%rsp), %xmm2
+ movdqa 8*16(%rsp), %xmm6
+ movdqa %xmm1, 22*16(%rsi)
+ movdqa %xmm2, 23*16(%rsi)
+ movdqa %xmm6, 24*16(%rsi)
+ movdqa 14*16(%rsp), %xmm1
+ movdqa 15*16(%rsp), %xmm2
+ movdqa %xmm1, 30*16(%rsi)
+ movdqa %xmm2, 31*16(%rsi)
+
+ paddd 0(%rdx), %xmm7
+ paddd 16(%rdx), %xmm5
+ paddd 32(%rdx), %xmm4
+ paddd 48(%rdx), %xmm3
+ paddd 64(%rdx), %xmm0
+ paddd 80(%rdx), %xmm8
+ paddd 96(%rdx), %xmm9
+ paddd 112(%rdx), %xmm10
+
+ movdqa %xmm7, 0(%rsp)
+ movdqa %xmm5, 16(%rsp)
+ movdqa %xmm4, 32(%rsp)
+ movdqa %xmm3, 48(%rsp)
+ movdqa %xmm0, 64(%rsp)
+ movdqa %xmm8, 80(%rsp)
+ movdqa %xmm9, 96(%rsp)
+ movdqa %xmm10, 112(%rsp)
+
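+ /* Pad the 16-word block for the second hash: word 8 = 0x80000000, */
+ /* words 9-14 = 0, word 15 = 256 (message length in bits). */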
+ pxor %xmm0, %xmm0
+ movq $0x8000000000000100, %rax
+ movd %rax, %xmm1
+ pshufd $0x55, %xmm1, %xmm2
+ pshufd $0x00, %xmm1, %xmm1
+ movdqa %xmm2, 128(%rsp)
+ movdqa %xmm0, 144(%rsp)
+ movdqa %xmm0, 160(%rsp)
+ movdqa %xmm0, 176(%rsp)
+ movdqa %xmm0, 192(%rsp)
+ movdqa %xmm0, 208(%rsp)
+ movdqa %xmm0, 224(%rsp)
+ movdqa %xmm1, 240(%rsp)
+
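+ /* Point %rax at W[16] of the second block.  cmpq %rax, %rax sets ZF; the */
+ /* vector rounds never touch the flags, so the jz exits in the extend and */
+ /* main loops are taken on this final pass. */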
+ leaq 256(%rsp), %rax
+ cmpq %rax, %rax
+
+ vmovdqa -15*16(%rax), %xmm0
+ vmovdqa -14*16(%rax), %xmm4
+ vpslld $14, %xmm0, %xmm2
+ vpslld $14, %xmm4, %xmm6
+ vpsrld $3, %xmm0, %xmm8
+ vpsrld $3, %xmm4, %xmm4
+ vpsrld $7, %xmm0, %xmm1
+ vpsrld $4, %xmm4, %xmm5
+ vpxor %xmm1, %xmm8, %xmm8
+ vpxor %xmm5, %xmm4, %xmm4
+ vpsrld $11, %xmm1, %xmm1
+ vpsrld $11, %xmm5, %xmm5
+ vpxor %xmm2, %xmm8, %xmm8
+ vpxor %xmm6, %xmm4, %xmm4
+ vpslld $11, %xmm2, %xmm2
+ vpslld $11, %xmm6, %xmm6
+ vpxor %xmm1, %xmm8, %xmm8
+ vpxor %xmm5, %xmm4, %xmm4
+ vpxor %xmm2, %xmm8, %xmm8
+ vpxor %xmm6, %xmm4, %xmm4
+ vpaddd %xmm0, %xmm4, %xmm4
+ vpaddd -16*16(%rax), %xmm8, %xmm3
+ vpaddd sha256d_4preext2_17(%rip), %xmm4, %xmm7
+ vmovdqa %xmm3, 0*16(%rax)
+ vmovdqa %xmm7, 1*16(%rax)
+
+ sha256_avx_extend_doubleround 2
+ sha256_avx_extend_doubleround 4
+
+ vmovdqa -9*16(%rax), %xmm0
+ vpslld $14, %xmm0, %xmm2
+ vpsrld $3, %xmm0, %xmm8
+ vpsrld $7, %xmm0, %xmm1
+ vpxor %xmm1, %xmm8, %xmm8
+ vpxor %xmm2, %xmm8, %xmm8
+ vpsrld $11, %xmm1, %xmm1
+ vpslld $11, %xmm2, %xmm2
+ vpxor %xmm1, %xmm8, %xmm8
+ vpxor %xmm2, %xmm8, %xmm8
+ vpaddd sha256d_4preext2_23(%rip), %xmm0, %xmm4
+ vpaddd -10*16(%rax), %xmm8, %xmm0
+ vpslld $13, %xmm3, %xmm2
+ vpslld $13, %xmm7, %xmm6
+ vpsrld $10, %xmm3, %xmm3
+ vpsrld $10, %xmm7, %xmm7
+ vpaddd -1*16(%rax), %xmm0, %xmm0
+ vpaddd 0*16(%rax), %xmm4, %xmm4
+ vpsrld $7, %xmm3, %xmm1
+ vpsrld $7, %xmm7, %xmm5
+ vpxor %xmm1, %xmm3, %xmm3
+ vpxor %xmm5, %xmm7, %xmm7
+ vpsrld $2, %xmm1, %xmm1
+ vpsrld $2, %xmm5, %xmm5
+ vpxor %xmm2, %xmm3, %xmm3
+ vpxor %xmm6, %xmm7, %xmm7
+ vpslld $2, %xmm2, %xmm2
+ vpslld $2, %xmm6, %xmm6
+ vpxor %xmm1, %xmm3, %xmm3
+ vpxor %xmm5, %xmm7, %xmm7
+ vpxor %xmm2, %xmm3, %xmm3
+ vpxor %xmm6, %xmm7, %xmm7
+ vpaddd %xmm0, %xmm3, %xmm3
+ vpaddd %xmm4, %xmm7, %xmm7
+ vmovdqa %xmm3, 6*16(%rax)
+ vmovdqa %xmm7, 7*16(%rax)
+
+ vpslld $13, %xmm3, %xmm2
+ vpslld $13, %xmm7, %xmm6
+ vpsrld $10, %xmm3, %xmm3
+ vpsrld $10, %xmm7, %xmm7
+ vpsrld $7, %xmm3, %xmm1
+ vpsrld $7, %xmm7, %xmm5
+ vpxor %xmm1, %xmm3, %xmm3
+ vpxor %xmm5, %xmm7, %xmm7
+ vpsrld $2, %xmm1, %xmm1
+ vpsrld $2, %xmm5, %xmm5
+ vpxor %xmm2, %xmm3, %xmm3
+ vpxor %xmm6, %xmm7, %xmm7
+ vpslld $2, %xmm2, %xmm2
+ vpslld $2, %xmm6, %xmm6
+ vpxor %xmm1, %xmm3, %xmm3
+ vpxor %xmm5, %xmm7, %xmm7
+ vpxor %xmm2, %xmm3, %xmm3
+ vpxor %xmm6, %xmm7, %xmm7
+ vpaddd sha256d_4preext2_24(%rip), %xmm3, %xmm3
+ vpaddd 1*16(%rax), %xmm3, %xmm3
+ vpaddd 2*16(%rax), %xmm7, %xmm7
+ vmovdqa %xmm3, 8*16(%rax)
+ vmovdqa %xmm7, 9*16(%rax)
+
+ vpslld $13, %xmm3, %xmm2
+ vpslld $13, %xmm7, %xmm6
+ vpsrld $10, %xmm3, %xmm3
+ vpsrld $10, %xmm7, %xmm7
+ vpsrld $7, %xmm3, %xmm1
+ vpsrld $7, %xmm7, %xmm5
+ vpxor %xmm1, %xmm3, %xmm3
+ vpxor %xmm5, %xmm7, %xmm7
+ vpsrld $2, %xmm1, %xmm1
+ vpsrld $2, %xmm5, %xmm5
+ vpxor %xmm2, %xmm3, %xmm3
+ vpxor %xmm6, %xmm7, %xmm7
+ vpslld $2, %xmm2, %xmm2
+ vpslld $2, %xmm6, %xmm6
+ vpxor %xmm1, %xmm3, %xmm3
+ vpxor %xmm5, %xmm7, %xmm7
+ vpxor %xmm2, %xmm3, %xmm3
+ vpxor %xmm6, %xmm7, %xmm7
+ vpaddd 3*16(%rax), %xmm3, %xmm3
+ vpaddd 4*16(%rax), %xmm7, %xmm7
+ vmovdqa %xmm3, 10*16(%rax)
+ vmovdqa %xmm7, 11*16(%rax)
+
+ vpslld $13, %xmm3, %xmm2
+ vpslld $13, %xmm7, %xmm6
+ vpsrld $10, %xmm3, %xmm3
+ vpsrld $10, %xmm7, %xmm7
+ vpsrld $7, %xmm3, %xmm1
+ vpsrld $7, %xmm7, %xmm5
+ vpxor %xmm1, %xmm3, %xmm3
+ vpxor %xmm5, %xmm7, %xmm7
+ vpsrld $2, %xmm1, %xmm1
+ vpsrld $2, %xmm5, %xmm5
+ vpxor %xmm2, %xmm3, %xmm3
+ vpxor %xmm6, %xmm7, %xmm7
+ vpslld $2, %xmm2, %xmm2
+ vpslld $2, %xmm6, %xmm6
+ vpxor %xmm1, %xmm3, %xmm3
+ vpxor %xmm5, %xmm7, %xmm7
+ vpxor %xmm2, %xmm3, %xmm3
+ vpxor %xmm6, %xmm7, %xmm7
+ vpaddd 5*16(%rax), %xmm3, %xmm3
+ vpaddd 6*16(%rax), %xmm7, %xmm7
+ vmovdqa %xmm3, 12*16(%rax)
+ vmovdqa %xmm7, 13*16(%rax)
+
+ vmovdqa sha256d_4preext2_30(%rip), %xmm0
+ vmovdqa 0*16(%rax), %xmm4
+ vpslld $14, %xmm4, %xmm6
+ vpsrld $3, %xmm4, %xmm4
+ vpsrld $4, %xmm4, %xmm5
+ vpxor %xmm5, %xmm4, %xmm4
+ vpxor %xmm6, %xmm4, %xmm4
+ vpsrld $11, %xmm5, %xmm5
+ vpslld $11, %xmm6, %xmm6
+ vpxor %xmm5, %xmm4, %xmm4
+ vpxor %xmm6, %xmm4, %xmm4
+ vpaddd -1*16(%rax), %xmm4, %xmm4
+ vpslld $13, %xmm3, %xmm2
+ vpslld $13, %xmm7, %xmm6
+ vpsrld $10, %xmm3, %xmm3
+ vpsrld $10, %xmm7, %xmm7
+ vpaddd 7*16(%rax), %xmm0, %xmm0
+ vpaddd 8*16(%rax), %xmm4, %xmm4
+ vpsrld $7, %xmm3, %xmm1
+ vpsrld $7, %xmm7, %xmm5
+ vpxor %xmm1, %xmm3, %xmm3
+ vpxor %xmm5, %xmm7, %xmm7
+ vpsrld $2, %xmm1, %xmm1
+ vpsrld $2, %xmm5, %xmm5
+ vpxor %xmm2, %xmm3, %xmm3
+ vpxor %xmm6, %xmm7, %xmm7
+ vpslld $2, %xmm2, %xmm2
+ vpslld $2, %xmm6, %xmm6
+ vpxor %xmm1, %xmm3, %xmm3
+ vpxor %xmm5, %xmm7, %xmm7
+ vpxor %xmm2, %xmm3, %xmm3
+ vpxor %xmm6, %xmm7, %xmm7
+ vpaddd %xmm0, %xmm3, %xmm3
+ vpaddd %xmm4, %xmm7, %xmm7
+ vmovdqa %xmm3, 14*16(%rax)
+ vmovdqa %xmm7, 15*16(%rax)
+
+ jmp sha256d_ms_4way_avx_extend_loop2
+
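+ /* Second-pass tail of the extend loop: only W[60] is still needed. */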
+sha256d_ms_4way_avx_extend_coda2:
+ sha256_avx_extend_round 44
+
+ movdqa sha256_4h+0(%rip), %xmm7
+ movdqa sha256_4h+16(%rip), %xmm5
+ movdqa sha256_4h+32(%rip), %xmm4
+ movdqa sha256_4h+48(%rip), %xmm3
+ movdqa sha256_4h+64(%rip), %xmm0
+ movdqa sha256_4h+80(%rip), %xmm8
+ movdqa sha256_4h+96(%rip), %xmm9
+ movdqa sha256_4h+112(%rip), %xmm10
+
+ movq %rsp, %rax
+ leaq sha256_4k(%rip), %rcx
+ jmp sha256d_ms_4way_avx_main_loop2
+
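+ /* Reduced main round: computes only the new e value, */
+ /* d + h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i], which is all that is */
+ /* needed to produce the last word of the final hash. */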
+.macro sha256_avx_main_round_red i, r0, r1, r2, r3, r4
+ vpaddd 16*\i(%rax), \r0, %xmm6
+ vpaddd 16*\i(%rcx), %xmm6, %xmm6
+ vpandn \r1, \r3, %xmm1
+ vpand \r3, \r2, %xmm2
+ vpxor %xmm2, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm6, %xmm6
+ vpslld $7, \r3, %xmm1
+ vpsrld $6, \r3, \r0
+ vpsrld $5, \r0, %xmm2
+ vpxor %xmm1, \r0, \r0
+ vpxor %xmm2, \r0, \r0
+ vpslld $14, %xmm1, %xmm1
+ vpsrld $14, %xmm2, %xmm2
+ vpxor %xmm1, \r0, \r0
+ vpxor %xmm2, \r0, \r0
+ vpslld $5, %xmm1, %xmm1
+ vpxor %xmm1, \r0, \r0
+ vpaddd \r0, %xmm6, %xmm6
+ vpaddd %xmm6, \r4, \r0
+.endm
+
+sha256d_ms_4way_avx_finish:
+ sha256_avx_main_round_red 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4
+ sha256_avx_main_round_red 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5
+ sha256_avx_main_round_red 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7
+ sha256_avx_main_round_red 60, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3
+
+ paddd sha256_4h+112(%rip), %xmm10
+ movdqa %xmm10, 112(%rdi)
+
+ addq $1032, %rsp
+#if defined(_WIN64) || defined(__CYGWIN__)
+ popq %rsi
+ movdqa 0(%rsp), %xmm6
+ movdqa 16(%rsp), %xmm7
+ movdqa 32(%rsp), %xmm8
+ movdqa 48(%rsp), %xmm9
+ movdqa 64(%rsp), %xmm10
+ addq $80, %rsp
+ popq %rdi
+#endif
+ ret
+
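+ /* XOP variant of sha256d_ms_4way: same structure as the AVX path, with */
+ /* vprotd rotates replacing the shift/xor rotate sequences. */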
+ .p2align 6
+sha256d_ms_4way_xop:
+#if defined(_WIN64) || defined(__CYGWIN__)
+ pushq %rdi
+ subq $80, %rsp
+ movdqa %xmm6, 0(%rsp)
+ movdqa %xmm7, 16(%rsp)
+ movdqa %xmm8, 32(%rsp)
+ movdqa %xmm9, 48(%rsp)
+ movdqa %xmm10, 64(%rsp)
+ pushq %rsi
+ movq %rcx, %rdi
+ movq %rdx, %rsi
+ movq %r8, %rdx
+ movq %r9, %rcx
+#endif
+ subq $1032, %rsp
+
+ leaq 256(%rsi), %rax
+
+sha256d_ms_4way_xop_extend_loop1:
+ vmovdqa 3*16(%rsi), %xmm0
+ vmovdqa 2*16(%rax), %xmm3
+ vmovdqa 3*16(%rax), %xmm7
+ vmovdqa %xmm3, 2*16(%rsp)
+ vmovdqa %xmm7, 3*16(%rsp)
+ vpaddd %xmm0, %xmm7, %xmm7
+ vprotd $25, %xmm0, %xmm1
+ vprotd $14, %xmm0, %xmm2
+ vpsrld $3, %xmm0, %xmm0
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm2, %xmm0, %xmm0
+ vpaddd %xmm0, %xmm3, %xmm3
+ vmovdqa %xmm3, 2*16(%rax)
+ vmovdqa %xmm7, 3*16(%rax)
+
+ vmovdqa 4*16(%rax), %xmm0
+ vmovdqa %xmm0, 4*16(%rsp)
+ vprotd $15, %xmm3, %xmm1
+ vprotd $15, %xmm7, %xmm5
+ vprotd $13, %xmm3, %xmm2
+ vprotd $13, %xmm7, %xmm6
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm5, %xmm6, %xmm6
+ vpsrld $10, %xmm3, %xmm3
+ vpsrld $10, %xmm7, %xmm7
+ vpxor %xmm2, %xmm3, %xmm3
+ vpxor %xmm6, %xmm7, %xmm7
+ vpaddd %xmm0, %xmm3, %xmm3
+ vmovdqa %xmm3, 4*16(%rax)
+ vmovdqa %xmm7, 5*16(%rax)
+
+ vmovdqa 6*16(%rax), %xmm0
+ vmovdqa 7*16(%rax), %xmm4
+ vmovdqa %xmm0, 6*16(%rsp)
+ vmovdqa %xmm4, 7*16(%rsp)
+ vprotd $15, %xmm3, %xmm1
+ vprotd $15, %xmm7, %xmm5
+ vprotd $13, %xmm3, %xmm2
+ vprotd $13, %xmm7, %xmm6
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm5, %xmm6, %xmm6
+ vpsrld $10, %xmm3, %xmm3
+ vpsrld $10, %xmm7, %xmm7
+ vpxor %xmm2, %xmm3, %xmm3
+ vpxor %xmm6, %xmm7, %xmm7
+ vpaddd %xmm0, %xmm3, %xmm3
+ vpaddd %xmm4, %xmm7, %xmm7
+ vmovdqa %xmm3, 6*16(%rax)
+ vmovdqa %xmm7, 7*16(%rax)
+
+ vmovdqa 8*16(%rax), %xmm0
+ vmovdqa 2*16(%rax), %xmm4
+ vmovdqa %xmm0, 8*16(%rsp)
+ vprotd $15, %xmm3, %xmm1
+ vprotd $15, %xmm7, %xmm5
+ vprotd $13, %xmm3, %xmm2
+ vprotd $13, %xmm7, %xmm6
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm5, %xmm6, %xmm6
+ vpsrld $10, %xmm3, %xmm3
+ vpsrld $10, %xmm7, %xmm7
+ vpxor %xmm2, %xmm3, %xmm3
+ vpxor %xmm6, %xmm7, %xmm7
+ vpaddd %xmm0, %xmm3, %xmm3
+ vpaddd %xmm4, %xmm7, %xmm7
+ vmovdqa %xmm3, 8*16(%rax)
+ vmovdqa %xmm7, 9*16(%rax)
+
+ vprotd $15, %xmm3, %xmm1
+ vprotd $15, %xmm7, %xmm5
+ vprotd $13, %xmm3, %xmm2
+ vprotd $13, %xmm7, %xmm6
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm5, %xmm6, %xmm6
+ vpsrld $10, %xmm3, %xmm3
+ vpsrld $10, %xmm7, %xmm7
+ vpxor %xmm2, %xmm3, %xmm3
+ vpxor %xmm6, %xmm7, %xmm7
+ vpaddd 3*16(%rax), %xmm3, %xmm3
+ vpaddd 4*16(%rax), %xmm7, %xmm7
+ vmovdqa %xmm3, 10*16(%rax)
+ vmovdqa %xmm7, 11*16(%rax)
+
+ vprotd $15, %xmm3, %xmm1
+ vprotd $15, %xmm7, %xmm5
+ vprotd $13, %xmm3, %xmm2
+ vprotd $13, %xmm7, %xmm6
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm5, %xmm6, %xmm6
+ vpsrld $10, %xmm3, %xmm3
+ vpsrld $10, %xmm7, %xmm7
+ vpxor %xmm2, %xmm3, %xmm3
+ vpxor %xmm6, %xmm7, %xmm7
+ vpaddd 5*16(%rax), %xmm3, %xmm3
+ vpaddd 6*16(%rax), %xmm7, %xmm7
+ vmovdqa %xmm3, 12*16(%rax)
+ vmovdqa %xmm7, 13*16(%rax)
+
+ vmovdqa 14*16(%rax), %xmm0
+ vmovdqa 15*16(%rax), %xmm4
+ vmovdqa %xmm0, 14*16(%rsp)
+ vmovdqa %xmm4, 15*16(%rsp)
+ vprotd $15, %xmm3, %xmm1
+ vprotd $15, %xmm7, %xmm5
+ vprotd $13, %xmm3, %xmm2
+ vprotd $13, %xmm7, %xmm6
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm5, %xmm6, %xmm6
+ vpaddd 7*16(%rax), %xmm0, %xmm0
+ vpaddd 8*16(%rax), %xmm4, %xmm4
+ vpsrld $10, %xmm3, %xmm3
+ vpsrld $10, %xmm7, %xmm7
+ vpxor %xmm2, %xmm3, %xmm3
+ vpxor %xmm6, %xmm7, %xmm7
+ vpaddd %xmm0, %xmm3, %xmm3
+ vpaddd %xmm4, %xmm7, %xmm7
+ vmovdqa %xmm3, 14*16(%rax)
+ vmovdqa %xmm7, 15*16(%rax)
+
+sha256d_ms_4way_xop_extend_loop2:
+ sha256_xop_extend_doubleround 16
+ sha256_xop_extend_doubleround 18
+ sha256_xop_extend_doubleround 20
+ sha256_xop_extend_doubleround 22
+ sha256_xop_extend_doubleround 24
+ sha256_xop_extend_doubleround 26
+ sha256_xop_extend_doubleround 28
+ sha256_xop_extend_doubleround 30
+ sha256_xop_extend_doubleround 32
+ sha256_xop_extend_doubleround 34
+ sha256_xop_extend_doubleround 36
+ sha256_xop_extend_doubleround 38
+ sha256_xop_extend_doubleround 40
+ sha256_xop_extend_doubleround 42
+ jz sha256d_ms_4way_xop_extend_coda2
+ sha256_xop_extend_doubleround 44
+ sha256_xop_extend_doubleround 46
+
+ movdqa 0(%rcx), %xmm7
+ movdqa 16(%rcx), %xmm8
+ movdqa 32(%rcx), %xmm9
+ movdqa 48(%rcx), %xmm10
+ movdqa 64(%rcx), %xmm0
+ movdqa 80(%rcx), %xmm5
+ movdqa 96(%rcx), %xmm4
+ movdqa 112(%rcx), %xmm3
+
+ movq %rsi, %rax
+ leaq sha256_4k(%rip), %rcx
+ jmp sha256d_ms_4way_xop_main_loop1
+
+sha256d_ms_4way_xop_main_loop2:
+ sha256_xop_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
+ sha256_xop_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
+ sha256_xop_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
+sha256d_ms_4way_xop_main_loop1:
+ sha256_xop_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
+ sha256_xop_main_quadround 4
+ sha256_xop_main_quadround 8
+ sha256_xop_main_quadround 12
+ sha256_xop_main_quadround 16
+ sha256_xop_main_quadround 20
+ sha256_xop_main_quadround 24
+ sha256_xop_main_quadround 28
+ sha256_xop_main_quadround 32
+ sha256_xop_main_quadround 36
+ sha256_xop_main_quadround 40
+ sha256_xop_main_quadround 44
+ sha256_xop_main_quadround 48
+ sha256_xop_main_quadround 52
+ sha256_xop_main_round 56, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
+ jz sha256d_ms_4way_xop_finish
+ sha256_xop_main_round 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
+ sha256_xop_main_round 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
+ sha256_xop_main_round 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
+ sha256_xop_main_quadround 60
+
+ movdqa 2*16(%rsp), %xmm1
+ movdqa 3*16(%rsp), %xmm2
+ movdqa 4*16(%rsp), %xmm6
+ movdqa %xmm1, 18*16(%rsi)
+ movdqa %xmm2, 19*16(%rsi)
+ movdqa %xmm6, 20*16(%rsi)
+ movdqa 6*16(%rsp), %xmm1
+ movdqa 7*16(%rsp), %xmm2
+ movdqa 8*16(%rsp), %xmm6
+ movdqa %xmm1, 22*16(%rsi)
+ movdqa %xmm2, 23*16(%rsi)
+ movdqa %xmm6, 24*16(%rsi)
+ movdqa 14*16(%rsp), %xmm1
+ movdqa 15*16(%rsp), %xmm2
+ movdqa %xmm1, 30*16(%rsi)
+ movdqa %xmm2, 31*16(%rsi)
+
+ paddd 0(%rdx), %xmm7
+ paddd 16(%rdx), %xmm5
+ paddd 32(%rdx), %xmm4
+ paddd 48(%rdx), %xmm3
+ paddd 64(%rdx), %xmm0
+ paddd 80(%rdx), %xmm8
+ paddd 96(%rdx), %xmm9
+ paddd 112(%rdx), %xmm10
+
+ movdqa %xmm7, 0(%rsp)
+ movdqa %xmm5, 16(%rsp)
+ movdqa %xmm4, 32(%rsp)
+ movdqa %xmm3, 48(%rsp)
+ movdqa %xmm0, 64(%rsp)
+ movdqa %xmm8, 80(%rsp)
+ movdqa %xmm9, 96(%rsp)
+ movdqa %xmm10, 112(%rsp)
+
+ pxor %xmm0, %xmm0
+ movq $0x8000000000000100, %rax
+ movd %rax, %xmm1
+ pshufd $0x55, %xmm1, %xmm2
+ pshufd $0x00, %xmm1, %xmm1
+ movdqa %xmm2, 128(%rsp)
+ movdqa %xmm0, 144(%rsp)
+ movdqa %xmm0, 160(%rsp)
+ movdqa %xmm0, 176(%rsp)
+ movdqa %xmm0, 192(%rsp)
+ movdqa %xmm0, 208(%rsp)
+ movdqa %xmm0, 224(%rsp)
+ movdqa %xmm1, 240(%rsp)
+
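+ /* Same as the AVX path: ZF=1 selects the second-pass exits. */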
+ leaq 256(%rsp), %rax
+ cmpq %rax, %rax
+
+ vmovdqa -15*16(%rax), %xmm0
+ vmovdqa -14*16(%rax), %xmm4
+ vprotd $25, %xmm0, %xmm1
+ vprotd $25, %xmm4, %xmm5
+ vprotd $14, %xmm0, %xmm2
+ vprotd $14, %xmm4, %xmm6
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm5, %xmm6, %xmm6
+ vpsrld $3, %xmm0, %xmm8
+ vpsrld $3, %xmm4, %xmm4
+ vpxor %xmm2, %xmm8, %xmm8
+ vpxor %xmm6, %xmm4, %xmm4
+ vpaddd %xmm0, %xmm4, %xmm4
+ vpaddd -16*16(%rax), %xmm8, %xmm3
+ vpaddd sha256d_4preext2_17(%rip), %xmm4, %xmm7
+ vmovdqa %xmm3, 0*16(%rax)
+ vmovdqa %xmm7, 1*16(%rax)
+
+ sha256_xop_extend_doubleround 2
+ sha256_xop_extend_doubleround 4
+
+ vmovdqa -9*16(%rax), %xmm0
+ vprotd $25, %xmm0, %xmm1
+ vprotd $14, %xmm0, %xmm2
+ vpsrld $3, %xmm0, %xmm8
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm2, %xmm8, %xmm8
+ vpaddd sha256d_4preext2_23(%rip), %xmm0, %xmm4
+ vpaddd -10*16(%rax), %xmm8, %xmm0
+ vprotd $15, %xmm3, %xmm1
+ vprotd $15, %xmm7, %xmm5
+ vprotd $13, %xmm3, %xmm2
+ vprotd $13, %xmm7, %xmm6
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm5, %xmm6, %xmm6
+ vpaddd -1*16(%rax), %xmm0, %xmm0
+ vpaddd 0*16(%rax), %xmm4, %xmm4
+ vpsrld $10, %xmm3, %xmm3
+ vpsrld $10, %xmm7, %xmm7
+ vpxor %xmm2, %xmm3, %xmm3
+ vpxor %xmm6, %xmm7, %xmm7
+ vpaddd %xmm0, %xmm3, %xmm3
+ vpaddd %xmm4, %xmm7, %xmm7
+ vmovdqa %xmm3, 6*16(%rax)
+ vmovdqa %xmm7, 7*16(%rax)
+
+ vprotd $15, %xmm3, %xmm1
+ vprotd $15, %xmm7, %xmm5
+ vprotd $13, %xmm3, %xmm2
+ vprotd $13, %xmm7, %xmm6
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm5, %xmm6, %xmm6
+ vpsrld $10, %xmm3, %xmm3
+ vpsrld $10, %xmm7, %xmm7
+ vpxor %xmm2, %xmm3, %xmm3
+ vpxor %xmm6, %xmm7, %xmm7
+ vpaddd sha256d_4preext2_24(%rip), %xmm3, %xmm3
+ vpaddd 1*16(%rax), %xmm3, %xmm3
+ vpaddd 2*16(%rax), %xmm7, %xmm7
+ vmovdqa %xmm3, 8*16(%rax)
+ vmovdqa %xmm7, 9*16(%rax)
+
+ vprotd $15, %xmm3, %xmm1
+ vprotd $15, %xmm7, %xmm5
+ vprotd $13, %xmm3, %xmm2
+ vprotd $13, %xmm7, %xmm6
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm5, %xmm6, %xmm6
+ vpsrld $10, %xmm3, %xmm3
+ vpsrld $10, %xmm7, %xmm7
+ vpxor %xmm2, %xmm3, %xmm3
+ vpxor %xmm6, %xmm7, %xmm7
+ vpaddd 3*16(%rax), %xmm3, %xmm3
+ vpaddd 4*16(%rax), %xmm7, %xmm7
+ vmovdqa %xmm3, 10*16(%rax)
+ vmovdqa %xmm7, 11*16(%rax)
+
+ vprotd $15, %xmm3, %xmm1
+ vprotd $15, %xmm7, %xmm5
+ vprotd $13, %xmm3, %xmm2
+ vprotd $13, %xmm7, %xmm6
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm5, %xmm6, %xmm6
+ vpsrld $10, %xmm3, %xmm3
+ vpsrld $10, %xmm7, %xmm7
+ vpxor %xmm2, %xmm3, %xmm3
+ vpxor %xmm6, %xmm7, %xmm7
+ vpaddd 5*16(%rax), %xmm3, %xmm3
+ vpaddd 6*16(%rax), %xmm7, %xmm7
+ vmovdqa %xmm3, 12*16(%rax)
+ vmovdqa %xmm7, 13*16(%rax)
+
+ vmovdqa sha256d_4preext2_30(%rip), %xmm0
+ vmovdqa 0*16(%rax), %xmm4
+ vprotd $25, %xmm4, %xmm5
+ vprotd $14, %xmm4, %xmm6
+ vpxor %xmm5, %xmm6, %xmm6
+ vpsrld $3, %xmm4, %xmm4
+ vpxor %xmm6, %xmm4, %xmm4
+ vpaddd -1*16(%rax), %xmm4, %xmm4
+ vprotd $15, %xmm3, %xmm1
+ vprotd $15, %xmm7, %xmm5
+ vprotd $13, %xmm3, %xmm2
+ vprotd $13, %xmm7, %xmm6
+ vpxor %xmm1, %xmm2, %xmm2
+ vpxor %xmm5, %xmm6, %xmm6
+ vpaddd 7*16(%rax), %xmm0, %xmm0
+ vpaddd 8*16(%rax), %xmm4, %xmm4
+ vpsrld $10, %xmm3, %xmm3
+ vpsrld $10, %xmm7, %xmm7
+ vpxor %xmm2, %xmm3, %xmm3
+ vpxor %xmm6, %xmm7, %xmm7
+ vpaddd %xmm0, %xmm3, %xmm3
+ vpaddd %xmm4, %xmm7, %xmm7
+ vmovdqa %xmm3, 14*16(%rax)
+ vmovdqa %xmm7, 15*16(%rax)
+
+ jmp sha256d_ms_4way_xop_extend_loop2
+
+sha256d_ms_4way_xop_extend_coda2:
+ sha256_xop_extend_round 44
+
+ movdqa sha256_4h+0(%rip), %xmm7
+ movdqa sha256_4h+16(%rip), %xmm5
+ movdqa sha256_4h+32(%rip), %xmm4
+ movdqa sha256_4h+48(%rip), %xmm3
+ movdqa sha256_4h+64(%rip), %xmm0
+ movdqa sha256_4h+80(%rip), %xmm8
+ movdqa sha256_4h+96(%rip), %xmm9
+ movdqa sha256_4h+112(%rip), %xmm10
+
+ movq %rsp, %rax
+ leaq sha256_4k(%rip), %rcx
+ jmp sha256d_ms_4way_xop_main_loop2
+
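+ /* XOP version of the reduced main round (see the AVX macro above). */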
+.macro sha256_xop_main_round_red i, r0, r1, r2, r3, r4
+ vpaddd 16*\i(%rax), \r0, %xmm6
+ vpaddd 16*\i(%rcx), %xmm6, %xmm6
+ vpandn \r1, \r3, %xmm1
+ vpand \r3, \r2, %xmm2
+ vpxor %xmm2, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm6, %xmm6
+ vprotd $26, \r3, %xmm1
+ vprotd $21, \r3, %xmm2
+ vpxor %xmm1, %xmm2, %xmm2
+ vprotd $7, \r3, \r0
+ vpxor %xmm2, \r0, \r0
+ vpaddd \r0, %xmm6, %xmm6
+ vpaddd %xmm6, \r4, \r0
+.endm
+
+sha256d_ms_4way_xop_finish:
+ sha256_xop_main_round_red 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4
+ sha256_xop_main_round_red 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5
+ sha256_xop_main_round_red 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7
+ sha256_xop_main_round_red 60, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3
+
+ paddd sha256_4h+112(%rip), %xmm10
+ movdqa %xmm10, 112(%rdi)
+
+ addq $1032, %rsp
+#if defined(_WIN64) || defined(__CYGWIN__)
+ popq %rsi
+ movdqa 0(%rsp), %xmm6
+ movdqa 16(%rsp), %xmm7
+ movdqa 32(%rsp), %xmm8
+ movdqa 48(%rsp), %xmm9
+ movdqa 64(%rsp), %xmm10
+ addq $80, %rsp
+ popq %rdi
+#endif
+ ret
+
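+ /* Runtime dispatcher: selects the PadLock, XOP, AVX or SSE2 code path and */
+ /* returns 1 if the 4-way SIMD implementation should be used (0 when the */
+ /* VIA PadLock hash engine is chosen instead). */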
+ .text
+ .p2align 6
+ .globl sha256_use_4way
+ .globl _sha256_use_4way
+sha256_use_4way:
+_sha256_use_4way:
+ pushq %rbx
+ pushq %rcx
+ pushq %rdx
+
+ /* Check for VIA PadLock Hash Engine */
+ movl $0xc0000000, %eax
+ cpuid
+ cmpl $0xc0000001, %eax
+ jb sha256_use_4way_no_phe
+ movl $0xc0000001, %eax
+ cpuid
+ andl $0x00000c00, %edx
+ cmpl $0x00000c00, %edx
+ jne sha256_use_4way_no_phe
+ leaq sha256_transform_phe(%rip), %rdx
+ movq %rdx, sha256_transform_addr(%rip)
+ xorl %eax, %eax
+ jmp sha256_use_4way_exit
+sha256_use_4way_no_phe:
+ /* Check for AVX and OSXSAVE support */
+ movl $1, %eax
+ cpuid
+ andl $0x18000000, %ecx
+ cmpl $0x18000000, %ecx
+ jne sha256_use_4way_base
+ /* Check for XMM and YMM state support */
+ xorl %ecx, %ecx
+ xgetbv
+ andl $0x00000006, %eax
+ cmpl $0x00000006, %eax
+ jne sha256_use_4way_base
+ /* Check for XOP support */
+ movl $0x80000001, %eax
+ cpuid
+ andl $0x00000800, %ecx
+ jz sha256_use_4way_avx
+
+sha256_use_4way_xop:
+ leaq sha256d_ms_4way_xop(%rip), %rcx
+ leaq sha256_transform_4way_core_xop(%rip), %rdx
+ jmp sha256_use_4way_done
+
+sha256_use_4way_avx:
+ leaq sha256d_ms_4way_avx(%rip), %rcx
+ leaq sha256_transform_4way_core_avx(%rip), %rdx
+ jmp sha256_use_4way_done
+
+sha256_use_4way_base:
+ leaq sha256d_ms_4way_sse2(%rip), %rcx
+ leaq sha256_transform_4way_core_sse2(%rip), %rdx
+
+sha256_use_4way_done:
+ movq %rcx, sha256d_ms_4way_addr(%rip)
+ movq %rdx, sha256_transform_4way_core_addr(%rip)
+ movl $1, %eax
+sha256_use_4way_exit:
+ popq %rdx
+ popq %rcx
+ popq %rbx
+ ret
+
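+ /* 8-way AVX2 variant of sha256d_ms: same algorithm as the 4-way paths, */
+ /* processing eight lanes per 256-bit ymm register. */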
+ .text
+ .p2align 6
+ .globl sha256d_ms_8way
+ .globl _sha256d_ms_8way
+sha256d_ms_8way:
+_sha256d_ms_8way:
+sha256d_ms_8way_avx2:
+#if defined(_WIN64) || defined(__CYGWIN__)
+ pushq %rdi
+ subq $80, %rsp
+ vmovdqa %xmm6, 0(%rsp)
+ vmovdqa %xmm7, 16(%rsp)
+ vmovdqa %xmm8, 32(%rsp)
+ vmovdqa %xmm9, 48(%rsp)
+ vmovdqa %xmm10, 64(%rsp)
+ pushq %rsi
+ movq %rcx, %rdi
+ movq %rdx, %rsi
+ movq %r8, %rdx
+ movq %r9, %rcx
+#endif
+ pushq %rbp
+ movq %rsp, %rbp
+ subq $64*32, %rsp
+ andq $-128, %rsp
+
+ leaq 16*32(%rsi), %rax
+
+sha256d_ms_8way_avx2_extend_loop1:
+ vmovdqa 3*32(%rsi), %ymm0
+ vmovdqa 2*32(%rax), %ymm3
+ vmovdqa 3*32(%rax), %ymm7
+ vmovdqa %ymm3, 2*32(%rsp)
+ vmovdqa %ymm7, 3*32(%rsp)
+ vpaddd %ymm0, %ymm7, %ymm7
+ vpslld $14, %ymm0, %ymm2
+ vpsrld $3, %ymm0, %ymm0
+ vpsrld $4, %ymm0, %ymm1
+ vpxor %ymm1, %ymm0, %ymm0
+ vpxor %ymm2, %ymm0, %ymm0
+ vpsrld $11, %ymm1, %ymm1
+ vpslld $11, %ymm2, %ymm2
+ vpxor %ymm1, %ymm0, %ymm0
+ vpxor %ymm2, %ymm0, %ymm0
+ vpaddd %ymm0, %ymm3, %ymm3
+ vmovdqa %ymm3, 2*32(%rax)
+ vmovdqa %ymm7, 3*32(%rax)
+
+ vmovdqa 4*32(%rax), %ymm0
+ vmovdqa %ymm0, 4*32(%rsp)
+ vpslld $13, %ymm3, %ymm2
+ vpslld $13, %ymm7, %ymm6
+ vpsrld $10, %ymm3, %ymm3
+ vpsrld $10, %ymm7, %ymm7
+ vpsrld $7, %ymm3, %ymm1
+ vpsrld $7, %ymm7, %ymm5
+ vpxor %ymm1, %ymm3, %ymm3
+ vpxor %ymm5, %ymm7, %ymm7
+ vpsrld $2, %ymm1, %ymm1
+ vpsrld $2, %ymm5, %ymm5
+ vpxor %ymm2, %ymm3, %ymm3
+ vpxor %ymm6, %ymm7, %ymm7
+ vpslld $2, %ymm2, %ymm2
+ vpslld $2, %ymm6, %ymm6
+ vpxor %ymm1, %ymm3, %ymm3
+ vpxor %ymm5, %ymm7, %ymm7
+ vpxor %ymm2, %ymm3, %ymm3
+ vpxor %ymm6, %ymm7, %ymm7
+ vpaddd %ymm0, %ymm3, %ymm3
+ vmovdqa %ymm3, 4*32(%rax)
+ vmovdqa %ymm7, 5*32(%rax)
+
+ vmovdqa 6*32(%rax), %ymm0
+ vmovdqa 7*32(%rax), %ymm4
+ vmovdqa %ymm0, 6*32(%rsp)
+ vmovdqa %ymm4, 7*32(%rsp)
+ vpslld $13, %ymm3, %ymm2
+ vpslld $13, %ymm7, %ymm6
+ vpsrld $10, %ymm3, %ymm3
+ vpsrld $10, %ymm7, %ymm7
+ vpsrld $7, %ymm3, %ymm1
+ vpsrld $7, %ymm7, %ymm5
+ vpxor %ymm1, %ymm3, %ymm3
+ vpxor %ymm5, %ymm7, %ymm7
+ vpsrld $2, %ymm1, %ymm1
+ vpsrld $2, %ymm5, %ymm5
+ vpxor %ymm2, %ymm3, %ymm3
+ vpxor %ymm6, %ymm7, %ymm7
+ vpslld $2, %ymm2, %ymm2
+ vpslld $2, %ymm6, %ymm6
+ vpxor %ymm1, %ymm3, %ymm3
+ vpxor %ymm5, %ymm7, %ymm7
+ vpxor %ymm2, %ymm3, %ymm3
+ vpxor %ymm6, %ymm7, %ymm7
+ vpaddd %ymm0, %ymm3, %ymm3
+ vpaddd %ymm4, %ymm7, %ymm7
+ vmovdqa %ymm3, 6*32(%rax)
+ vmovdqa %ymm7, 7*32(%rax)
+
+ vmovdqa 8*32(%rax), %ymm0
+ vmovdqa 2*32(%rax), %ymm4
+ vmovdqa %ymm0, 8*32(%rsp)
+ vpslld $13, %ymm3, %ymm2
+ vpslld $13, %ymm7, %ymm6
+ vpsrld $10, %ymm3, %ymm3
+ vpsrld $10, %ymm7, %ymm7
+ vpsrld $7, %ymm3, %ymm1
+ vpsrld $7, %ymm7, %ymm5
+ vpxor %ymm1, %ymm3, %ymm3
+ vpxor %ymm5, %ymm7, %ymm7
+ vpsrld $2, %ymm1, %ymm1
+ vpsrld $2, %ymm5, %ymm5
+ vpxor %ymm2, %ymm3, %ymm3
+ vpxor %ymm6, %ymm7, %ymm7
+ vpslld $2, %ymm2, %ymm2
+ vpslld $2, %ymm6, %ymm6
+ vpxor %ymm1, %ymm3, %ymm3
+ vpxor %ymm5, %ymm7, %ymm7
+ vpxor %ymm2, %ymm3, %ymm3
+ vpxor %ymm6, %ymm7, %ymm7
+ vpaddd %ymm0, %ymm3, %ymm3
+ vpaddd %ymm4, %ymm7, %ymm7
+ vmovdqa %ymm3, 8*32(%rax)
+ vmovdqa %ymm7, 9*32(%rax)
+
+ vpslld $13, %ymm3, %ymm2
+ vpslld $13, %ymm7, %ymm6
+ vpsrld $10, %ymm3, %ymm3
+ vpsrld $10, %ymm7, %ymm7
+ vpsrld $7, %ymm3, %ymm1
+ vpsrld $7, %ymm7, %ymm5
+ vpxor %ymm1, %ymm3, %ymm3
+ vpxor %ymm5, %ymm7, %ymm7
+ vpsrld $2, %ymm1, %ymm1
+ vpsrld $2, %ymm5, %ymm5
+ vpxor %ymm2, %ymm3, %ymm3
+ vpxor %ymm6, %ymm7, %ymm7
+ vpslld $2, %ymm2, %ymm2
+ vpslld $2, %ymm6, %ymm6
+ vpxor %ymm1, %ymm3, %ymm3
+ vpxor %ymm5, %ymm7, %ymm7
+ vpxor %ymm2, %ymm3, %ymm3
+ vpxor %ymm6, %ymm7, %ymm7
+ vpaddd 3*32(%rax), %ymm3, %ymm3
+ vpaddd 4*32(%rax), %ymm7, %ymm7
+ vmovdqa %ymm3, 10*32(%rax)
+ vmovdqa %ymm7, 11*32(%rax)
+
+ vpslld $13, %ymm3, %ymm2
+ vpslld $13, %ymm7, %ymm6
+ vpsrld $10, %ymm3, %ymm3
+ vpsrld $10, %ymm7, %ymm7
+ vpsrld $7, %ymm3, %ymm1
+ vpsrld $7, %ymm7, %ymm5
+ vpxor %ymm1, %ymm3, %ymm3
+ vpxor %ymm5, %ymm7, %ymm7
+ vpsrld $2, %ymm1, %ymm1
+ vpsrld $2, %ymm5, %ymm5
+ vpxor %ymm2, %ymm3, %ymm3
+ vpxor %ymm6, %ymm7, %ymm7
+ vpslld $2, %ymm2, %ymm2
+ vpslld $2, %ymm6, %ymm6
+ vpxor %ymm1, %ymm3, %ymm3
+ vpxor %ymm5, %ymm7, %ymm7
+ vpxor %ymm2, %ymm3, %ymm3
+ vpxor %ymm6, %ymm7, %ymm7
+ vpaddd 5*32(%rax), %ymm3, %ymm3
+ vpaddd 6*32(%rax), %ymm7, %ymm7
+ vmovdqa %ymm3, 12*32(%rax)
+ vmovdqa %ymm7, 13*32(%rax)
+
+ vmovdqa 14*32(%rax), %ymm0
+ vmovdqa 15*32(%rax), %ymm4
+ vmovdqa %ymm0, 14*32(%rsp)
+ vmovdqa %ymm4, 15*32(%rsp)
+ vpslld $13, %ymm3, %ymm2
+ vpslld $13, %ymm7, %ymm6
+ vpsrld $10, %ymm3, %ymm3
+ vpsrld $10, %ymm7, %ymm7
+ vpaddd 7*32(%rax), %ymm0, %ymm0
+ vpaddd 8*32(%rax), %ymm4, %ymm4
+ vpsrld $7, %ymm3, %ymm1
+ vpsrld $7, %ymm7, %ymm5
+ vpxor %ymm1, %ymm3, %ymm3
+ vpxor %ymm5, %ymm7, %ymm7
+ vpsrld $2, %ymm1, %ymm1
+ vpsrld $2, %ymm5, %ymm5
+ vpxor %ymm2, %ymm3, %ymm3
+ vpxor %ymm6, %ymm7, %ymm7
+ vpslld $2, %ymm2, %ymm2
+ vpslld $2, %ymm6, %ymm6
+ vpxor %ymm1, %ymm3, %ymm3
+ vpxor %ymm5, %ymm7, %ymm7
+ vpxor %ymm2, %ymm3, %ymm3
+ vpxor %ymm6, %ymm7, %ymm7
+ vpaddd %ymm0, %ymm3, %ymm3
+ vpaddd %ymm4, %ymm7, %ymm7
+ vmovdqa %ymm3, 14*32(%rax)
+ vmovdqa %ymm7, 15*32(%rax)
+
+sha256d_ms_8way_avx2_extend_loop2:
+ sha256_avx2_extend_doubleround 16
+ sha256_avx2_extend_doubleround 18
+ sha256_avx2_extend_doubleround 20
+ sha256_avx2_extend_doubleround 22
+ sha256_avx2_extend_doubleround 24
+ sha256_avx2_extend_doubleround 26
+ sha256_avx2_extend_doubleround 28
+ sha256_avx2_extend_doubleround 30
+ sha256_avx2_extend_doubleround 32
+ sha256_avx2_extend_doubleround 34
+ sha256_avx2_extend_doubleround 36
+ sha256_avx2_extend_doubleround 38
+ sha256_avx2_extend_doubleround 40
+ sha256_avx2_extend_doubleround 42
+ jz sha256d_ms_8way_avx2_extend_coda2
+ sha256_avx2_extend_doubleround 44
+ sha256_avx2_extend_doubleround 46
+
+ vmovdqa 0(%rcx), %ymm7
+ vmovdqa 32(%rcx), %ymm8
+ vmovdqa 64(%rcx), %ymm9
+ vmovdqa 96(%rcx), %ymm10
+ vmovdqa 128(%rcx), %ymm0
+ vmovdqa 160(%rcx), %ymm5
+ vmovdqa 192(%rcx), %ymm4
+ vmovdqa 224(%rcx), %ymm3
+
+ movq %rsi, %rax
+ leaq sha256_8k(%rip), %rcx
+ jmp sha256d_ms_8way_avx2_main_loop1
+
+sha256d_ms_8way_avx2_main_loop2:
+ sha256_avx2_main_round 0, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7
+ sha256_avx2_main_round 1, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3
+ sha256_avx2_main_round 2, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4
+sha256d_ms_8way_avx2_main_loop1:
+ sha256_avx2_main_round 3, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5
+ sha256_avx2_main_quadround 4
+ sha256_avx2_main_quadround 8
+ sha256_avx2_main_quadround 12
+ sha256_avx2_main_quadround 16
+ sha256_avx2_main_quadround 20
+ sha256_avx2_main_quadround 24
+ sha256_avx2_main_quadround 28
+ sha256_avx2_main_quadround 32
+ sha256_avx2_main_quadround 36
+ sha256_avx2_main_quadround 40
+ sha256_avx2_main_quadround 44
+ sha256_avx2_main_quadround 48
+ sha256_avx2_main_quadround 52
+ sha256_avx2_main_round 56, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7
+ jz sha256d_ms_8way_avx2_finish
+ sha256_avx2_main_round 57, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3
+ sha256_avx2_main_round 58, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4
+ sha256_avx2_main_round 59, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5
+ sha256_avx2_main_quadround 60
+
+ vmovdqa 2*32(%rsp), %ymm1
+ vmovdqa 3*32(%rsp), %ymm2
+ vmovdqa 4*32(%rsp), %ymm6
+ vmovdqa %ymm1, 18*32(%rsi)
+ vmovdqa %ymm2, 19*32(%rsi)
+ vmovdqa %ymm6, 20*32(%rsi)
+ vmovdqa 6*32(%rsp), %ymm1
+ vmovdqa 7*32(%rsp), %ymm2
+ vmovdqa 8*32(%rsp), %ymm6
+ vmovdqa %ymm1, 22*32(%rsi)
+ vmovdqa %ymm2, 23*32(%rsi)
+ vmovdqa %ymm6, 24*32(%rsi)
+ vmovdqa 14*32(%rsp), %ymm1
+ vmovdqa 15*32(%rsp), %ymm2
+ vmovdqa %ymm1, 30*32(%rsi)
+ vmovdqa %ymm2, 31*32(%rsi)
+
+ vpaddd 0(%rdx), %ymm7, %ymm7
+ vpaddd 32(%rdx), %ymm5, %ymm5
+ vpaddd 64(%rdx), %ymm4, %ymm4
+ vpaddd 96(%rdx), %ymm3, %ymm3
+ vpaddd 128(%rdx), %ymm0, %ymm0
+ vpaddd 160(%rdx), %ymm8, %ymm8
+ vpaddd 192(%rdx), %ymm9, %ymm9
+ vpaddd 224(%rdx), %ymm10, %ymm10
+
+ vmovdqa %ymm7, 0(%rsp)
+ vmovdqa %ymm5, 32(%rsp)
+ vmovdqa %ymm4, 64(%rsp)
+ vmovdqa %ymm3, 96(%rsp)
+ vmovdqa %ymm0, 128(%rsp)
+ vmovdqa %ymm8, 160(%rsp)
+ vmovdqa %ymm9, 192(%rsp)
+ vmovdqa %ymm10, 224(%rsp)
+
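+ /* Second-block padding, as in the 4-way paths: word 8 = 0x80000000, */
+ /* words 9-14 = 0, word 15 = 256. */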
+ vpxor %ymm0, %ymm0, %ymm0
+ movq $0x8000000000000100, %rax
+ vmovd %rax, %xmm1
+ vinserti128 $1, %xmm1, %ymm1, %ymm1
+ vpshufd $0x55, %ymm1, %ymm2
+ vpshufd $0x00, %ymm1, %ymm1
+ vmovdqa %ymm2, 8*32(%rsp)
+ vmovdqa %ymm0, 9*32(%rsp)
+ vmovdqa %ymm0, 10*32(%rsp)
+ vmovdqa %ymm0, 11*32(%rsp)
+ vmovdqa %ymm0, 12*32(%rsp)
+ vmovdqa %ymm0, 13*32(%rsp)
+ vmovdqa %ymm0, 14*32(%rsp)
+ vmovdqa %ymm1, 15*32(%rsp)
+
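+ /* Same trick as the 4-way paths: ZF=1 selects the second-pass exits. */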
+ leaq 16*32(%rsp), %rax
+ cmpq %rax, %rax
+
+ vmovdqa -15*32(%rax), %ymm0
+ vmovdqa -14*32(%rax), %ymm4
+ vpslld $14, %ymm0, %ymm2
+ vpslld $14, %ymm4, %ymm6
+ vpsrld $3, %ymm0, %ymm8
+ vpsrld $3, %ymm4, %ymm4
+ vpsrld $7, %ymm0, %ymm1
+ vpsrld $4, %ymm4, %ymm5
+ vpxor %ymm1, %ymm8, %ymm8
+ vpxor %ymm5, %ymm4, %ymm4
+ vpsrld $11, %ymm1, %ymm1
+ vpsrld $11, %ymm5, %ymm5
+ vpxor %ymm2, %ymm8, %ymm8
+ vpxor %ymm6, %ymm4, %ymm4
+ vpslld $11, %ymm2, %ymm2
+ vpslld $11, %ymm6, %ymm6
+ vpxor %ymm1, %ymm8, %ymm8
+ vpxor %ymm5, %ymm4, %ymm4
+ vpxor %ymm2, %ymm8, %ymm8
+ vpxor %ymm6, %ymm4, %ymm4
+ vpaddd %ymm0, %ymm4, %ymm4
+ vpaddd -16*32(%rax), %ymm8, %ymm3
+ vpaddd sha256d_8preext2_17(%rip), %ymm4, %ymm7
+ vmovdqa %ymm3, 0*32(%rax)
+ vmovdqa %ymm7, 1*32(%rax)
+
+ sha256_avx2_extend_doubleround 2
+ sha256_avx2_extend_doubleround 4
+
+ vmovdqa -9*32(%rax), %ymm0
+ vpslld $14, %ymm0, %ymm2
+ vpsrld $3, %ymm0, %ymm8
+ vpsrld $7, %ymm0, %ymm1
+ vpxor %ymm1, %ymm8, %ymm8
+ vpxor %ymm2, %ymm8, %ymm8
+ vpsrld $11, %ymm1, %ymm1
+ vpslld $11, %ymm2, %ymm2
+ vpxor %ymm1, %ymm8, %ymm8
+ vpxor %ymm2, %ymm8, %ymm8
+ vpaddd sha256d_8preext2_23(%rip), %ymm0, %ymm4
+ vpaddd -10*32(%rax), %ymm8, %ymm0
+ vpslld $13, %ymm3, %ymm2
+ vpslld $13, %ymm7, %ymm6
+ vpsrld $10, %ymm3, %ymm3
+ vpsrld $10, %ymm7, %ymm7
+ vpaddd -1*32(%rax), %ymm0, %ymm0
+ vpaddd 0*32(%rax), %ymm4, %ymm4
+ vpsrld $7, %ymm3, %ymm1
+ vpsrld $7, %ymm7, %ymm5
+ vpxor %ymm1, %ymm3, %ymm3
+ vpxor %ymm5, %ymm7, %ymm7
+ vpsrld $2, %ymm1, %ymm1
+ vpsrld $2, %ymm5, %ymm5
+ vpxor %ymm2, %ymm3, %ymm3
+ vpxor %ymm6, %ymm7, %ymm7
+ vpslld $2, %ymm2, %ymm2
+ vpslld $2, %ymm6, %ymm6
+ vpxor %ymm1, %ymm3, %ymm3
+ vpxor %ymm5, %ymm7, %ymm7
+ vpxor %ymm2, %ymm3, %ymm3
+ vpxor %ymm6, %ymm7, %ymm7
+ vpaddd %ymm0, %ymm3, %ymm3
+ vpaddd %ymm4, %ymm7, %ymm7
+ vmovdqa %ymm3, 6*32(%rax)
+ vmovdqa %ymm7, 7*32(%rax)
+
+ vpslld $13, %ymm3, %ymm2
+ vpslld $13, %ymm7, %ymm6
+ vpsrld $10, %ymm3, %ymm3
+ vpsrld $10, %ymm7, %ymm7
+ vpsrld $7, %ymm3, %ymm1
+ vpsrld $7, %ymm7, %ymm5
+ vpxor %ymm1, %ymm3, %ymm3
+ vpxor %ymm5, %ymm7, %ymm7
+ vpsrld $2, %ymm1, %ymm1
+ vpsrld $2, %ymm5, %ymm5
+ vpxor %ymm2, %ymm3, %ymm3
+ vpxor %ymm6, %ymm7, %ymm7
+ vpslld $2, %ymm2, %ymm2
+ vpslld $2, %ymm6, %ymm6
+ vpxor %ymm1, %ymm3, %ymm3
+ vpxor %ymm5, %ymm7, %ymm7
+ vpxor %ymm2, %ymm3, %ymm3
+ vpxor %ymm6, %ymm7, %ymm7
+ vpaddd sha256d_8preext2_24(%rip), %ymm3, %ymm3
+ vpaddd 1*32(%rax), %ymm3, %ymm3
+ vpaddd 2*32(%rax), %ymm7, %ymm7
+ vmovdqa %ymm3, 8*32(%rax)
+ vmovdqa %ymm7, 9*32(%rax)
+
+ vpslld $13, %ymm3, %ymm2
+ vpslld $13, %ymm7, %ymm6
+ vpsrld $10, %ymm3, %ymm3
+ vpsrld $10, %ymm7, %ymm7
+ vpsrld $7, %ymm3, %ymm1
+ vpsrld $7, %ymm7, %ymm5
+ vpxor %ymm1, %ymm3, %ymm3
+ vpxor %ymm5, %ymm7, %ymm7
+ vpsrld $2, %ymm1, %ymm1
+ vpsrld $2, %ymm5, %ymm5
+ vpxor %ymm2, %ymm3, %ymm3
+ vpxor %ymm6, %ymm7, %ymm7
+ vpslld $2, %ymm2, %ymm2
+ vpslld $2, %ymm6, %ymm6
+ vpxor %ymm1, %ymm3, %ymm3
+ vpxor %ymm5, %ymm7, %ymm7
+ vpxor %ymm2, %ymm3, %ymm3
+ vpxor %ymm6, %ymm7, %ymm7
+ vpaddd 3*32(%rax), %ymm3, %ymm3
+ vpaddd 4*32(%rax), %ymm7, %ymm7
+ vmovdqa %ymm3, 10*32(%rax)
+ vmovdqa %ymm7, 11*32(%rax)
+
+ vpslld $13, %ymm3, %ymm2
+ vpslld $13, %ymm7, %ymm6
+ vpsrld $10, %ymm3, %ymm3
+ vpsrld $10, %ymm7, %ymm7
+ vpsrld $7, %ymm3, %ymm1
+ vpsrld $7, %ymm7, %ymm5
+ vpxor %ymm1, %ymm3, %ymm3
+ vpxor %ymm5, %ymm7, %ymm7
+ vpsrld $2, %ymm1, %ymm1
+ vpsrld $2, %ymm5, %ymm5
+ vpxor %ymm2, %ymm3, %ymm3
+ vpxor %ymm6, %ymm7, %ymm7
+ vpslld $2, %ymm2, %ymm2
+ vpslld $2, %ymm6, %ymm6
+ vpxor %ymm1, %ymm3, %ymm3
+ vpxor %ymm5, %ymm7, %ymm7
+ vpxor %ymm2, %ymm3, %ymm3
+ vpxor %ymm6, %ymm7, %ymm7
+ vpaddd 5*32(%rax), %ymm3, %ymm3
+ vpaddd 6*32(%rax), %ymm7, %ymm7
+ vmovdqa %ymm3, 12*32(%rax)
+ vmovdqa %ymm7, 13*32(%rax)
+
+ vmovdqa sha256d_8preext2_30(%rip), %ymm0
+ vmovdqa 0*32(%rax), %ymm4
+ vpslld $14, %ymm4, %ymm6
+ vpsrld $3, %ymm4, %ymm4
+ vpsrld $4, %ymm4, %ymm5
+ vpxor %ymm5, %ymm4, %ymm4
+ vpxor %ymm6, %ymm4, %ymm4
+ vpsrld $11, %ymm5, %ymm5
+ vpslld $11, %ymm6, %ymm6
+ vpxor %ymm5, %ymm4, %ymm4
+ vpxor %ymm6, %ymm4, %ymm4
+ vpaddd -1*32(%rax), %ymm4, %ymm4
+ vpslld $13, %ymm3, %ymm2
+ vpslld $13, %ymm7, %ymm6
+ vpsrld $10, %ymm3, %ymm3
+ vpsrld $10, %ymm7, %ymm7
+ vpaddd 7*32(%rax), %ymm0, %ymm0
+ vpaddd 8*32(%rax), %ymm4, %ymm4
+ vpsrld $7, %ymm3, %ymm1
+ vpsrld $7, %ymm7, %ymm5
+ vpxor %ymm1, %ymm3, %ymm3
+ vpxor %ymm5, %ymm7, %ymm7
+ vpsrld $2, %ymm1, %ymm1
+ vpsrld $2, %ymm5, %ymm5
+ vpxor %ymm2, %ymm3, %ymm3
+ vpxor %ymm6, %ymm7, %ymm7
+ vpslld $2, %ymm2, %ymm2
+ vpslld $2, %ymm6, %ymm6
+ vpxor %ymm1, %ymm3, %ymm3
+ vpxor %ymm5, %ymm7, %ymm7
+ vpxor %ymm2, %ymm3, %ymm3
+ vpxor %ymm6, %ymm7, %ymm7
+ vpaddd %ymm0, %ymm3, %ymm3
+ vpaddd %ymm4, %ymm7, %ymm7
+ vmovdqa %ymm3, 14*32(%rax)
+ vmovdqa %ymm7, 15*32(%rax)
+
+ jmp sha256d_ms_8way_avx2_extend_loop2
+
+sha256d_ms_8way_avx2_extend_coda2:
+ sha256_avx2_extend_round 44
+
+ vmovdqa sha256_8h+0(%rip), %ymm7
+ vmovdqa sha256_8h+32(%rip), %ymm5
+ vmovdqa sha256_8h+64(%rip), %ymm4
+ vmovdqa sha256_8h+96(%rip), %ymm3
+ vmovdqa sha256_8h+128(%rip), %ymm0
+ vmovdqa sha256_8h+160(%rip), %ymm8
+ vmovdqa sha256_8h+192(%rip), %ymm9
+ vmovdqa sha256_8h+224(%rip), %ymm10
+
+ movq %rsp, %rax
+ leaq sha256_8k(%rip), %rcx
+ jmp sha256d_ms_8way_avx2_main_loop2
+
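+ /* AVX2 version of the reduced main round (see the AVX macro above). */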
+.macro sha256_avx2_main_round_red i, r0, r1, r2, r3, r4
+ vpaddd 32*\i(%rax), \r0, %ymm6
+ vpaddd 32*\i(%rcx), %ymm6, %ymm6
+ vpandn \r1, \r3, %ymm1
+ vpand \r3, \r2, %ymm2
+ vpxor %ymm2, %ymm1, %ymm1
+ vpaddd %ymm1, %ymm6, %ymm6
+ vpslld $7, \r3, %ymm1
+ vpsrld $6, \r3, \r0
+ vpsrld $5, \r0, %ymm2
+ vpxor %ymm1, \r0, \r0
+ vpxor %ymm2, \r0, \r0
+ vpslld $14, %ymm1, %ymm1
+ vpsrld $14, %ymm2, %ymm2
+ vpxor %ymm1, \r0, \r0
+ vpxor %ymm2, \r0, \r0
+ vpslld $5, %ymm1, %ymm1
+ vpxor %ymm1, \r0, \r0
+ vpaddd \r0, %ymm6, %ymm6
+ vpaddd %ymm6, \r4, \r0
+.endm
+
+sha256d_ms_8way_avx2_finish:
+ sha256_avx2_main_round_red 57, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4
+ sha256_avx2_main_round_red 58, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5
+ sha256_avx2_main_round_red 59, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7
+ sha256_avx2_main_round_red 60, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3
+
+ vpaddd sha256_8h+224(%rip), %ymm10, %ymm10
+ vmovdqa %ymm10, 224(%rdi)
+
+ movq %rbp, %rsp
+ popq %rbp
+#if defined(_WIN64) || defined(__CYGWIN__)
+ popq %rsi
+ vmovdqa 0(%rsp), %xmm6
+ vmovdqa 16(%rsp), %xmm7
+ vmovdqa 32(%rsp), %xmm8
+ vmovdqa 48(%rsp), %xmm9
+ vmovdqa 64(%rsp), %xmm10
+ addq $80, %rsp
+ popq %rdi
+#endif
+ ret
+
+
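+ /* Returns 1 if AVX2 (with OS-enabled YMM state) is available, 0 otherwise. */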
+ .text
+ .p2align 6
+ .globl sha256_use_8way
+ .globl _sha256_use_8way
+sha256_use_8way:
+_sha256_use_8way:
+
+ pushq %rbx
+ /* Check for AVX and OSXSAVE support */
+ movl $1, %eax
+ cpuid
+ andl $0x18000000, %ecx
+ cmpl $0x18000000, %ecx
+ jne sha256_use_8way_no
+ /* Check for AVX2 support */
+ movl $7, %eax
+ xorl %ecx, %ecx
+ cpuid
+ andl $0x00000020, %ebx
+ cmpl $0x00000020, %ebx
+ jne sha256_use_8way_no
+ /* Check for XMM and YMM state support */
+ xorl %ecx, %ecx
+ xgetbv
+ andl $0x00000006, %eax
+ cmpl $0x00000006, %eax
+ jne sha256_use_8way_no
+
+sha256_use_8way_yes:
+ movl $1, %eax
+ jmp sha256_use_8way_done
+
+sha256_use_8way_no:
+ xorl %eax, %eax
+
+sha256_use_8way_done:
+ popq %rbx
+ ret
+
+#endif