From 961a3c4f1132805426c904e411bfdfafa31bda77 Mon Sep 17 00:00:00 2001
From: CryptoManiac
Date: Sat, 10 Oct 2015 18:31:45 -0700
Subject: [PATCH] RPC scaninput: replace Intel implementation of sha256 with the one from cpuminer.

The Intel implementation is still better in terms of performance, but it loses in terms of compatibility. I think we should consider switching back to it in the near future.
---
 novacoin-qt.pro                     |   21 +-
 src/crypto/sha2/asm/sha2-arm.S      | 1581 +++++++++++++
 src/crypto/sha2/asm/sha2-x86.S      | 1191 ++++++++++
 src/crypto/sha2/asm/sha2-x86_64.S   | 4164 +++++++++++++++++++++++++++++++++++
 src/crypto/sha2/asm/sha256_avx1.asm |  766 -------
 src/crypto/sha2/asm/sha256_sse4.asm |  726 ------
 src/kernel.cpp                      |  163 ++-
 src/makefile.bsd                    |   12 +
 src/makefile.linux-mingw            |   12 +
 src/makefile.mingw                  |   17 +-
 src/makefile.osx                    |   12 +
 src/makefile.unix                   |   12 +
 12 files changed, 7127 insertions(+), 1550 deletions(-)
 create mode 100644 src/crypto/sha2/asm/sha2-arm.S
 create mode 100644 src/crypto/sha2/asm/sha2-x86.S
 create mode 100644 src/crypto/sha2/asm/sha2-x86_64.S
 delete mode 100644 src/crypto/sha2/asm/sha256_avx1.asm
 delete mode 100644 src/crypto/sha2/asm/sha256_sse4.asm

diff --git a/novacoin-qt.pro b/novacoin-qt.pro
index ac3d5e2..22f6dbc 100644
--- a/novacoin-qt.pro
+++ b/novacoin-qt.pro
@@ -127,35 +127,24 @@ contains(USE_LEVELDB, 1) {
 # use: qmake "USE_ASM=1"
 contains(USE_ASM, 1) {
-    message(Using assembler scrypt implementation)
+    message(Using assembler scrypt & sha256 implementations)
+    DEFINES += USE_ASM
     SOURCES += src/crypto/scrypt/asm/scrypt-arm.S src/crypto/scrypt/asm/scrypt-x86.S src/crypto/scrypt/asm/scrypt-x86_64.S src/crypto/scrypt/asm/asm-wrapper.cpp
+    SOURCES += src/crypto/sha2/asm/sha2-arm.S src/crypto/sha2/asm/sha2-x86.S src/crypto/sha2/asm/sha2-x86_64.S
 } else {
     # use: qmake "USE_SSE2=1"
     contains(USE_SSE2, 1) {
-        message(Using SSE2 intrinsic scrypt implementation)
+        message(Using SSE2 intrinsic scrypt implementation & generic sha256 implementation)
         SOURCES += src/crypto/scrypt/intrin/scrypt-sse2.cpp
         DEFINES += USE_SSE2
         QMAKE_CXXFLAGS += -msse2
         QMAKE_CFLAGS += -msse2
     } else {
-        message(Using generic scrypt implementation)
+        message(Using generic scrypt & sha256 implementations)
         SOURCES += src/crypto/scrypt/generic/scrypt-generic.cpp
     }
 }
-contains(USE_YASM, 1) {
-    !win32 {
-        DEFINES += USE_YASM
-
-        LIBS += $$PWD/src/crypto/sha2/asm/obj/sha256_simd.a
-        gensha2.commands = cd $$PWD/src/crypto/sha2/asm && yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o obj/sha256_avx1.o sha256_avx1.asm && yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o obj/sha256_sse4.o sha256_sse4.asm && ar -rs obj/sha256_simd.a obj/*.o
-        gensha2.target = $$PWD/src/crypto/sha2/asm/obj/sha256_simd.a
-        gensha2.depends = FORCE
-        PRE_TARGETDEPS += $$PWD/src/crypto/sha2/asm/obj/sha256_simd.a
-        QMAKE_EXTRA_TARGETS += gensha2
-    }
-}
-
 # regenerate src/build.h
 !windows|contains(USE_BUILD_INFO, 1) {
     genbuild.depends = FORCE

diff --git a/src/crypto/sha2/asm/sha2-arm.S b/src/crypto/sha2/asm/sha2-arm.S
new file mode 100644
index 0000000..182a36c
--- /dev/null
+++ b/src/crypto/sha2/asm/sha2-arm.S
@@ -0,0 +1,1581 @@
+/*
+ * Copyright 2012 pooler@litecoinpool.org
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version. See COPYING for more details.
+ */ + +#if defined(__arm__) && defined(__APCS_32__) + +.macro sha256_k + .align 2 + .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +.endm + +.macro sha256_extend_doubleround_core i, rw, ra, rb, ry, rz + mov r12, \ry, ror #17 + add r11, r11, \ra + eor r12, r12, \ry, ror #19 + mov \ra, lr, ror #7 + eor r12, r12, \ry, lsr #10 + eor \ra, \ra, lr, ror #18 + add r12, r12, r11 + ldr r11, [\rw, #(\i+2)*4] + eor \ra, \ra, lr, lsr #3 + add \ra, \ra, r12 + + mov r12, \rz, ror #17 + str \ra, [\rw, #(\i+16)*4] + add lr, lr, \rb + eor r12, r12, \rz, ror #19 + mov \rb, r11, ror #7 + eor r12, r12, \rz, lsr #10 + eor \rb, \rb, r11, ror #18 + add lr, lr, r12 + eor \rb, \rb, r11, lsr #3 + add \rb, \rb, lr +.endm + +.macro sha256_extend_doubleround_head i, rw, ra, rb, ry, rz + ldr lr, [\rw, #(\i+1)*4] + sha256_extend_doubleround_core \i, \rw, \ra, \rb, \ry, \rz + ldr lr, [\rw, #(\i+3)*4] +.endm + +.macro sha256_extend_doubleround_body i, rw, ra, rb, ry, rz + str \rz, [\rw, #(\i+15)*4] + sha256_extend_doubleround_core \i, \rw, \ra, \rb, \ry, \rz + ldr lr, [\rw, #(\i+3)*4] +.endm + +.macro sha256_extend_doubleround_foot i, rw, ra, rb, ry, rz + str \rz, [\rw, #(\i+15)*4] + sha256_extend_doubleround_core \i, \rw, \ra, \rb, \ry, \rz + str \rb, [\rw, #(\i+17)*4] +.endm + +.macro sha256_main_round i, ka, rw, ra, rb, rc, rd, re, rf, rg, rh + ldr r12, [\rw, #(\i)*4] + and r3, \rf, \re + bic lr, \rg, \re + orr lr, lr, r3 + ldr r3, \ka + (\i)*4 + add \rh, \rh, lr + eor lr, \re, \re, ror #5 + add \rh, \rh, r12 + eor lr, lr, \re, ror #19 + add \rh, \rh, r3 + eor r3, \ra, \rb + add \rh, \rh, lr, ror #6 + + and r3, r3, \rc + eor r12, \ra, \ra, ror #11 + and lr, \ra, \rb + eor r12, r12, \ra, ror #20 + eor lr, lr, r3 + add r3, \rh, lr + add \rh, \rh, \rd + add \rd, r3, r12, ror #2 +.endm + +.macro sha256_main_quadround i, ka, rw + sha256_main_round \i+0, \ka, \rw, r4, r5, r6, r7, r8, r9, r10, r11 + sha256_main_round \i+1, \ka, \rw, r7, r4, r5, r6, r11, r8, r9, r10 + sha256_main_round \i+2, \ka, \rw, r6, r7, r4, r5, r10, r11, r8, r9 + sha256_main_round \i+3, \ka, \rw, r5, r6, r7, r4, r9, r10, r11, r8 +.endm + + + .text + .code 32 + .align 2 + .globl sha256_transform + .globl _sha256_transform +#ifdef __ELF__ + .type sha256_transform, %function +#endif +sha256_transform: +_sha256_transform: + stmfd sp!, {r4-r11, lr} + cmp r2, #0 + sub sp, sp, #64*4 + bne sha256_transform_swap + + ldmia r1!, {r4-r11} + stmia sp, {r4-r11} + add r3, sp, #8*4 + ldmia r1, {r4-r11} + stmia r3, {r4-r11} + b sha256_transform_extend + +.macro bswap rd, rn + eor r12, \rn, \rn, ror #16 + bic r12, r12, #0x00ff0000 + mov \rd, \rn, ror #8 + eor \rd, \rd, r12, lsr #8 +.endm + +sha256_transform_swap: + ldmia r1!, {r4-r11} + bswap r4, r4 + bswap 
r5, r5 + bswap r6, r6 + bswap r7, r7 + bswap r8, r8 + bswap r9, r9 + bswap r10, r10 + bswap r11, r11 + stmia sp, {r4-r11} + add r3, sp, #8*4 + ldmia r1, {r4-r11} + bswap r4, r4 + bswap r5, r5 + bswap r6, r6 + bswap r7, r7 + bswap r8, r8 + bswap r9, r9 + bswap r10, r10 + bswap r11, r11 + stmia r3, {r4-r11} + +sha256_transform_extend: + add r12, sp, #9*4 + ldr r11, [sp, #0*4] + ldmia r12, {r4-r10} + sha256_extend_doubleround_head 0, sp, r4, r5, r9, r10 + sha256_extend_doubleround_body 2, sp, r6, r7, r4, r5 + sha256_extend_doubleround_body 4, sp, r8, r9, r6, r7 + sha256_extend_doubleround_body 6, sp, r10, r4, r8, r9 + sha256_extend_doubleround_body 8, sp, r5, r6, r10, r4 + sha256_extend_doubleround_body 10, sp, r7, r8, r5, r6 + sha256_extend_doubleround_body 12, sp, r9, r10, r7, r8 + sha256_extend_doubleround_body 14, sp, r4, r5, r9, r10 + sha256_extend_doubleround_body 16, sp, r6, r7, r4, r5 + sha256_extend_doubleround_body 18, sp, r8, r9, r6, r7 + sha256_extend_doubleround_body 20, sp, r10, r4, r8, r9 + sha256_extend_doubleround_body 22, sp, r5, r6, r10, r4 + sha256_extend_doubleround_body 24, sp, r7, r8, r5, r6 + sha256_extend_doubleround_body 26, sp, r9, r10, r7, r8 + sha256_extend_doubleround_body 28, sp, r4, r5, r9, r10 + sha256_extend_doubleround_body 30, sp, r6, r7, r4, r5 + sha256_extend_doubleround_body 32, sp, r8, r9, r6, r7 + sha256_extend_doubleround_body 34, sp, r10, r4, r8, r9 + sha256_extend_doubleround_body 36, sp, r5, r6, r10, r4 + sha256_extend_doubleround_body 38, sp, r7, r8, r5, r6 + sha256_extend_doubleround_body 40, sp, r9, r10, r7, r8 + sha256_extend_doubleround_body 42, sp, r4, r5, r9, r10 + sha256_extend_doubleround_body 44, sp, r6, r7, r4, r5 + sha256_extend_doubleround_foot 46, sp, r8, r9, r6, r7 + + ldmia r0, {r4-r11} + sha256_main_quadround 0, sha256_transform_k, sp + sha256_main_quadround 4, sha256_transform_k, sp + sha256_main_quadround 8, sha256_transform_k, sp + sha256_main_quadround 12, sha256_transform_k, sp + sha256_main_quadround 16, sha256_transform_k, sp + sha256_main_quadround 20, sha256_transform_k, sp + sha256_main_quadround 24, sha256_transform_k, sp + sha256_main_quadround 28, sha256_transform_k, sp + b sha256_transform_k_over +sha256_transform_k: + sha256_k +sha256_transform_k_over: + sha256_main_quadround 32, sha256_transform_k, sp + sha256_main_quadround 36, sha256_transform_k, sp + sha256_main_quadround 40, sha256_transform_k, sp + sha256_main_quadround 44, sha256_transform_k, sp + sha256_main_quadround 48, sha256_transform_k, sp + sha256_main_quadround 52, sha256_transform_k, sp + sha256_main_quadround 56, sha256_transform_k, sp + sha256_main_quadround 60, sha256_transform_k, sp + + ldmia r0, {r1, r2, r3, r12} + add r4, r4, r1 + add r5, r5, r2 + add r6, r6, r3 + add r7, r7, r12 + stmia r0!, {r4-r7} + ldmia r0, {r1, r2, r3, r12} + add r8, r8, r1 + add r9, r9, r2 + add r10, r10, r3 + add r11, r11, r12 + stmia r0, {r8-r11} + + add sp, sp, #64*4 +#ifdef __thumb__ + ldmfd sp!, {r4-r11, lr} + bx lr +#else + ldmfd sp!, {r4-r11, pc} +#endif + + + .text + .code 32 + .align 2 + .globl sha256d_ms + .globl _sha256d_ms +#ifdef __ELF__ + .type sha256d_ms, %function +#endif +sha256d_ms: +_sha256d_ms: + stmfd sp!, {r4-r11, lr} + sub sp, sp, #64*4 + + cmp r0, r0 + + ldr lr, [r1, #3*4] + ldr r6, [r1, #18*4] + ldr r7, [r1, #19*4] + + mov r12, lr, ror #7 + str r6, [sp, #18*4] + eor r12, r12, lr, ror #18 + str r7, [sp, #19*4] + eor r12, r12, lr, lsr #3 + ldr r8, [r1, #20*4] + add r6, r6, r12 + ldr r10, [r1, #22*4] + add r7, r7, lr + str r6, [r1, #18*4] + + mov 
r12, r6, ror #17 + str r7, [r1, #19*4] + eor r12, r12, r6, ror #19 + str r8, [sp, #20*4] + eor r12, r12, r6, lsr #10 + ldr r4, [r1, #23*4] + add r8, r8, r12 + ldr r5, [r1, #24*4] + + mov r9, r7, ror #17 + str r8, [r1, #20*4] + eor r9, r9, r7, ror #19 + str r10, [sp, #21*4] + eor r9, r9, r7, lsr #10 + str r4, [sp, #22*4] + + mov r12, r8, ror #17 + str r9, [r1, #21*4] + eor r12, r12, r8, ror #19 + str r5, [sp, #23*4] + eor r12, r12, r8, lsr #10 + mov lr, r9, ror #17 + add r10, r10, r12 + ldr r11, [r1, #30*4] + + eor lr, lr, r9, ror #19 + str r10, [r1, #22*4] + eor lr, lr, r9, lsr #10 + str r11, [sp, #24*4] + add r4, r4, lr + + mov r12, r10, ror #17 + str r4, [r1, #23*4] + eor r12, r12, r10, ror #19 + mov lr, r4, ror #17 + eor r12, r12, r10, lsr #10 + eor lr, lr, r4, ror #19 + add r5, r5, r12 + eor lr, lr, r4, lsr #10 + str r5, [r1, #24*4] + add r6, r6, lr + + mov r12, r5, ror #17 + str r6, [r1, #25*4] + eor r12, r12, r5, ror #19 + mov lr, r6, ror #17 + eor r12, r12, r5, lsr #10 + eor lr, lr, r6, ror #19 + add r7, r7, r12 + eor lr, lr, r6, lsr #10 + str r7, [r1, #26*4] + add r8, r8, lr + + mov r12, r7, ror #17 + str r8, [r1, #27*4] + eor r12, r12, r7, ror #19 + mov lr, r8, ror #17 + eor r12, r12, r7, lsr #10 + eor lr, lr, r8, ror #19 + add r9, r9, r12 + eor lr, lr, r8, lsr #10 + str r9, [r1, #28*4] + add r10, r10, lr + + ldr lr, [r1, #31*4] + mov r12, r9, ror #17 + str r10, [r1, #29*4] + eor r12, r12, r9, ror #19 + str lr, [sp, #25*4] + eor r12, r12, r9, lsr #10 + add r11, r11, r12 + add r5, r5, lr + mov r12, r10, ror #17 + add r4, r4, r11 + + ldr r11, [r1, #16*4] + eor r12, r12, r10, ror #19 + str r4, [r1, #30*4] + eor r12, r12, r10, lsr #10 + add r5, r5, r12 + ldr lr, [r1, #17*4] + +sha256d_ms_extend_loop2: + sha256_extend_doubleround_body 16, r1, r6, r7, r4, r5 + sha256_extend_doubleround_body 18, r1, r8, r9, r6, r7 + sha256_extend_doubleround_body 20, r1, r10, r4, r8, r9 + sha256_extend_doubleround_body 22, r1, r5, r6, r10, r4 + sha256_extend_doubleround_body 24, r1, r7, r8, r5, r6 + sha256_extend_doubleround_body 26, r1, r9, r10, r7, r8 + sha256_extend_doubleround_body 28, r1, r4, r5, r9, r10 + sha256_extend_doubleround_body 30, r1, r6, r7, r4, r5 + sha256_extend_doubleround_body 32, r1, r8, r9, r6, r7 + sha256_extend_doubleround_body 34, r1, r10, r4, r8, r9 + sha256_extend_doubleround_body 36, r1, r5, r6, r10, r4 + sha256_extend_doubleround_body 38, r1, r7, r8, r5, r6 + sha256_extend_doubleround_body 40, r1, r9, r10, r7, r8 + sha256_extend_doubleround_body 42, r1, r4, r5, r9, r10 + bne sha256d_ms_extend_coda2 + sha256_extend_doubleround_body 44, r1, r6, r7, r4, r5 + sha256_extend_doubleround_foot 46, r1, r8, r9, r6, r7 + + ldr r4, [r3, #0*4] + ldr r9, [r3, #1*4] + ldr r10, [r3, #2*4] + ldr r11, [r3, #3*4] + ldr r8, [r3, #4*4] + ldr r5, [r3, #5*4] + ldr r6, [r3, #6*4] + ldr r7, [r3, #7*4] + b sha256d_ms_main_loop1 + +sha256d_ms_main_loop2: + sha256_main_round 0, sha256d_ms_k, r1, r4, r5, r6, r7, r8, r9, r10, r11 + sha256_main_round 1, sha256d_ms_k, r1, r7, r4, r5, r6, r11, r8, r9, r10 + sha256_main_round 2, sha256d_ms_k, r1, r6, r7, r4, r5, r10, r11, r8, r9 +sha256d_ms_main_loop1: + sha256_main_round 3, sha256d_ms_k, r1, r5, r6, r7, r4, r9, r10, r11, r8 + sha256_main_quadround 4, sha256d_ms_k, r1 + sha256_main_quadround 8, sha256d_ms_k, r1 + sha256_main_quadround 12, sha256d_ms_k, r1 + sha256_main_quadround 16, sha256d_ms_k, r1 + sha256_main_quadround 20, sha256d_ms_k, r1 + sha256_main_quadround 24, sha256d_ms_k, r1 + sha256_main_quadround 28, sha256d_ms_k, r1 + b sha256d_ms_k_over 
+sha256d_ms_k: + sha256_k +sha256d_ms_k_over: + sha256_main_quadround 32, sha256d_ms_k, r1 + sha256_main_quadround 36, sha256d_ms_k, r1 + sha256_main_quadround 40, sha256d_ms_k, r1 + sha256_main_quadround 44, sha256d_ms_k, r1 + sha256_main_quadround 48, sha256d_ms_k, r1 + sha256_main_quadround 52, sha256d_ms_k, r1 + sha256_main_round 56, sha256d_ms_k, r1, r4, r5, r6, r7, r8, r9, r10, r11 + bne sha256d_ms_finish + sha256_main_round 57, sha256d_ms_k, r1, r7, r4, r5, r6, r11, r8, r9, r10 + sha256_main_round 58, sha256d_ms_k, r1, r6, r7, r4, r5, r10, r11, r8, r9 + sha256_main_round 59, sha256d_ms_k, r1, r5, r6, r7, r4, r9, r10, r11, r8 + sha256_main_quadround 60, sha256d_ms_k, r1 + + ldmia r2!, {r3, r12, lr} + add r4, r4, r3 + add r5, r5, r12 + add r6, r6, lr + stmia sp, {r4-r6} + ldmia r2, {r3, r4, r5, r6, r12} + add lr, sp, #3*4 + add r7, r7, r3 + add r8, r8, r4 + add r9, r9, r5 + add r10, r10, r6 + add r11, r11, r12 + add r12, sp, #18*4 + stmia lr!, {r7-r11} + + ldmia r12, {r4-r11} + str r4, [r1, #18*4] + str r5, [r1, #19*4] + str r6, [r1, #20*4] + str r7, [r1, #22*4] + str r8, [r1, #23*4] + str r9, [r1, #24*4] + str r10, [r1, #30*4] + str r11, [r1, #31*4] + + mov r3, #0x80000000 + mov r4, #0 + mov r5, #0 + mov r6, #0 + mov r7, #0 + mov r8, #0 + mov r9, #0 + mov r10, #0x00000100 + stmia lr, {r3-r10} + + ldr lr, [sp, #1*4] + movs r1, sp + ldr r4, [sp, #0*4] + + ldr r11, [sp, #2*4] + mov r12, lr, ror #7 + eor r12, r12, lr, ror #18 + add r5, lr, #0x00a00000 + eor r12, r12, lr, lsr #3 + mov lr, r11, ror #7 + add r4, r4, r12 + eor lr, lr, r11, ror #18 + str r4, [sp, #16*4] + eor lr, lr, r11, lsr #3 + mov r12, r4, ror #17 + add r5, r5, lr + ldr lr, [sp, #3*4] + + str r5, [sp, #17*4] + eor r12, r12, r4, ror #19 + mov r6, lr, ror #7 + eor r12, r12, r4, lsr #10 + eor r6, r6, lr, ror #18 + add r11, r11, r12 + eor r6, r6, lr, lsr #3 + mov r12, r5, ror #17 + add r6, r6, r11 + ldr r11, [sp, #4*4] + + str r6, [sp, #18*4] + eor r12, r12, r5, ror #19 + mov r7, r11, ror #7 + eor r12, r12, r5, lsr #10 + eor r7, r7, r11, ror #18 + add lr, lr, r12 + eor r7, r7, r11, lsr #3 + mov r12, r6, ror #17 + add r7, r7, lr + ldr lr, [sp, #5*4] + + str r7, [sp, #19*4] + eor r12, r12, r6, ror #19 + mov r8, lr, ror #7 + eor r12, r12, r6, lsr #10 + eor r8, r8, lr, ror #18 + add r11, r11, r12 + eor r8, r8, lr, lsr #3 + mov r12, r7, ror #17 + add r8, r8, r11 + ldr r11, [sp, #6*4] + + str r8, [sp, #20*4] + eor r12, r12, r7, ror #19 + mov r9, r11, ror #7 + eor r12, r12, r7, lsr #10 + eor r9, r9, r11, ror #18 + add lr, lr, r12 + eor r9, r9, r11, lsr #3 + mov r12, r8, ror #17 + add r9, r9, lr + ldr lr, [sp, #7*4] + + str r9, [sp, #21*4] + eor r12, r12, r8, ror #19 + mov r10, lr, ror #7 + eor r12, r12, r8, lsr #10 + eor r10, r10, lr, ror #18 + add r11, r11, r12 + eor r10, r10, lr, lsr #3 + mov r12, r9, ror #17 + add r11, r11, #0x00000100 + add lr, lr, r4 + add r10, r10, r11 + + eor r12, r12, r9, ror #19 + str r10, [sp, #22*4] + add lr, lr, #0x11000000 + eor r12, r12, r9, lsr #10 + add lr, lr, r12 + mov r12, r10, ror #17 + add r4, lr, #0x00002000 + eor r12, r12, r10, ror #19 + str r4, [sp, #23*4] + add r5, r5, #0x80000000 + eor r12, r12, r10, lsr #10 + add r5, r5, r12 + + mov r12, r4, ror #17 + str r5, [sp, #24*4] + eor r12, r12, r4, ror #19 + mov r11, r5, ror #17 + eor r12, r12, r4, lsr #10 + eor r11, r11, r5, ror #19 + add r6, r6, r12 + eor r11, r11, r5, lsr #10 + str r6, [sp, #25*4] + add r7, r7, r11 + + mov r12, r6, ror #17 + str r7, [sp, #26*4] + eor r12, r12, r6, ror #19 + mov r11, r7, ror #17 + eor r12, r12, r6, lsr #10 + eor 
r11, r11, r7, ror #19 + add r8, r8, r12 + eor r11, r11, r7, lsr #10 + str r8, [sp, #27*4] + add r9, r9, r11 + + mov lr, r8, ror #17 + mov r12, r9, ror #17 + str r9, [sp, #28*4] + add r4, r4, #0x00400000 + eor lr, lr, r8, ror #19 + eor r12, r12, r9, ror #19 + eor lr, lr, r8, lsr #10 + eor r12, r12, r9, lsr #10 + add r4, r4, #0x00000022 + add r10, r10, lr + add r4, r4, r12 + ldr r11, [sp, #16*4] + + add r5, r5, #0x00000100 + str r4, [sp, #30*4] + mov lr, r11, ror #7 + str r10, [sp, #29*4] + mov r12, r10, ror #17 + eor lr, lr, r11, ror #18 + eor r12, r12, r10, ror #19 + eor lr, lr, r11, lsr #3 + eor r12, r12, r10, lsr #10 + add r5, r5, lr + ldr lr, [r1, #17*4] + add r5, r5, r12 + + b sha256d_ms_extend_loop2 + +sha256d_ms_extend_coda2: + str r5, [r1, #(44+15)*4] + mov r12, r4, ror #17 + add r11, r11, r6 + mov r6, lr, ror #7 + eor r12, r12, r4, ror #19 + eor r6, r6, lr, ror #18 + eor r12, r12, r4, lsr #10 + eor r6, r6, lr, lsr #3 + add r12, r12, r11 + add r6, r6, r12 + str r6, [r1, #(44+16)*4] + + adr r2, sha256d_ms_h + ldmia r2, {r4-r11} + b sha256d_ms_main_loop2 + +sha256d_ms_h: + .long 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a + .long 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 + +.macro sha256_main_round_red i, ka, rw, rd, re, rf, rg, rh + ldr r12, [\rw, #(\i)*4] + and r3, \rf, \re + bic lr, \rg, \re + add \rh, \rh, \rd + orr lr, lr, r3 + ldr r3, \ka + (\i)*4 + add \rh, \rh, lr + eor lr, \re, \re, ror #5 + add \rh, \rh, r12 + eor lr, lr, \re, ror #19 + add \rh, \rh, r3 + add \rh, \rh, lr, ror #6 +.endm + +sha256d_ms_finish: + sha256_main_round_red 57, sha256d_ms_k, r1, r6, r11, r8, r9, r10 + sha256_main_round_red 58, sha256d_ms_k, r1, r5, r10, r11, r8, r9 + sha256_main_round_red 59, sha256d_ms_k, r1, r4, r9, r10, r11, r8 + ldr r5, [r2, #7*4] + sha256_main_round_red 60, sha256d_ms_k, r1, r7, r8, r9, r10, r11 + + add r11, r11, r5 + str r11, [r0, #7*4] + + add sp, sp, #64*4 +#ifdef __thumb__ + ldmfd sp!, {r4-r11, lr} + bx lr +#else + ldmfd sp!, {r4-r11, pc} +#endif + + +#ifdef __ARM_NEON__ + + .text + .code 32 + .align 2 + .globl sha256_init_4way + .globl _sha256_init_4way +#ifdef __ELF__ + .type sha256_init_4way, %function +#endif +sha256_init_4way: +_sha256_init_4way: + adr r12, sha256_4h + vldmia r12, {q8-q15} + vstmia r0, {q8-q15} + bx lr + .align 4 +sha256_4h: + .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667 + .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85 + .long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372 + .long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a + .long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f + .long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c + .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab + .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19 + +.macro sha256_4k + .long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98 + .long 0x71374491, 0x71374491, 0x71374491, 0x71374491 + .long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf + .long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5 + .long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b + .long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1 + .long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4 + .long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5 + .long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98 + .long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01 + .long 0x243185be, 0x243185be, 0x243185be, 0x243185be + .long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3 + .long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74 + .long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe + .long 
0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7 + .long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174 + .long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1 + .long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786 + .long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6 + .long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc + .long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f + .long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa + .long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc + .long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da + .long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152 + .long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d + .long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8 + .long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7 + .long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3 + .long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147 + .long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351 + .long 0x14292967, 0x14292967, 0x14292967, 0x14292967 + .long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85 + .long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138 + .long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc + .long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13 + .long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354 + .long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb + .long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e + .long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85 + .long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1 + .long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b + .long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70 + .long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3 + .long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819 + .long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624 + .long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585 + .long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070 + .long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116 + .long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08 + .long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c + .long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5 + .long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3 + .long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a + .long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f + .long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3 + .long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee + .long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f + .long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814 + .long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208 + .long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa + .long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb + .long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7 + .long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2 +.endm + +.macro sha256_4way_extend_doubleround_core i, rr, rw, ra, rb, ry, rz + vadd.u32 q5, q5, \ra + veor.u32 q4, q4, q0 + vshr.u32 q0, \ry, #19 + vshl.u32 q1, \ry, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 \ra, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 \ra, \ra, q0 + vshr.u32 q1, \ry, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 \ra, \ra, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 \ra, \ra, q1 + vadd.u32 q4, q4, q5 + veor.u32 \ra, \ra, q0 + vld1.u32 {q5}, [\rr]! + vadd.u32 \ra, \ra, q4 + + vshr.u32 q4, \rz, #17 + vshl.u32 q0, \rz, #32-17 + vadd.u32 q6, q6, \rb + vst1.u32 {\ra}, [\rw]! 
+ veor.u32 q4, q4, q0 + vshr.u32 q0, \rz, #19 + vshl.u32 q1, \rz, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 \rb, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, \rz, #10 + veor.u32 \rb, \rb, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 \rb, \rb, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 \rb, \rb, q1 + vadd.u32 q1, q6, q4 + veor.u32 \rb, \rb, q0 +.endm + +.macro sha256_4way_extend_doubleround_head i, rr, rw, ra, rb, ry, rz + vld1.u32 {q6}, [\rr]! + vshr.u32 q4, \ry, #17 + vshl.u32 q0, \ry, #32-17 + sha256_4way_extend_doubleround_core \i, \rr, \rw, \ra, \rb, \ry, \rz + vld1.u32 {q6}, [\rr]! + vadd.u32 \rb, \rb, q1 +.endm + +.macro sha256_4way_extend_doubleround_body i, rr, rw, ra, rb, ry, rz + vshr.u32 q4, \ry, #17 + vshl.u32 q0, \ry, #32-17 + vst1.u32 {\rz}, [\rw]! + sha256_4way_extend_doubleround_core \i, \rr, \rw, \ra, \rb, \ry, \rz + vld1.u32 {q6}, [\rr]! + vadd.u32 \rb, \rb, q1 +.endm + +.macro sha256_4way_extend_doubleround_foot i, rr, rw, ra, rb, ry, rz + vshr.u32 q4, \ry, #17 + vshl.u32 q0, \ry, #32-17 + vst1.u32 {\rz}, [\rw]! + sha256_4way_extend_doubleround_core \i, \rr, \rw, \ra, \rb, \ry, \rz + vadd.u32 \rb, \rb, q1 + vst1.u32 {\rb}, [\rw]! +.endm + +.macro sha256_4way_main_round i, rk, rw, ra, rb, rc, rd, re, rf, rg, rh + vld1.u32 {q8}, [\rw]! + vand.u32 q9, \rf, \re + vbic.u32 q10, \rg, \re + vshr.u32 q11, \re, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [\rk]! + vadd.u32 \rh, \rh, q10 + vshl.u32 q12, \re, #32-5 + veor.u32 q10, \re, q11 + vshr.u32 q11, \re, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, \re, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 \rh, \rh, q8 + veor.u32 q10, q10, q12 + vadd.u32 \rh, \rh, q9 + veor.u32 q9, \ra, \rb + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 \rh, \rh, q11 + + vshr.u32 q11, \ra, #11 + vshl.u32 q12, \ra, #32-11 + veor.u32 q8, \ra, q11 + vand.u32 q10, \ra, \rb + veor.u32 q8, q8, q12 + vshr.u32 q11, \ra, #20 + vshl.u32 q12, \ra, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, \rc + veor.u32 q8, q8, q12 + vadd.u32 \rh, \rh, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, \rh, q10 + vadd.u32 q12, q12, q11 + vadd.u32 \rh, \rh, \rd + vadd.u32 \rd, q9, q12 +.endm + +.macro sha256_4way_main_quadround i, rk, rw + sha256_4way_main_round \i+0, \rk, \rw, q0, q1, q2, q3, q4, q5, q6, q7 + sha256_4way_main_round \i+1, \rk, \rw, q3, q0, q1, q2, q7, q4, q5, q6 + sha256_4way_main_round \i+2, \rk, \rw, q2, q3, q0, q1, q6, q7, q4, q5 + sha256_4way_main_round \i+3, \rk, \rw, q1, q2, q3, q0, q5, q6, q7, q4 +.endm + + + .text + .code 32 + .align 2 + .globl sha256_transform_4way + .globl _sha256_transform_4way +#ifdef __ELF__ + .type sha256_transform_4way, %function +#endif +sha256_transform_4way: +_sha256_transform_4way: + stmfd sp!, {r4, lr} + vpush {q4-q7} + mov r12, sp + sub sp, sp, #64*16 + bic sp, sp, #63 + cmp r2, #0 + bne sha256_transform_4way_swap + + vldmia r1!, {q0-q7} + vstmia sp, {q0-q7} + add r3, sp, #8*16 + vldmia r1, {q8-q15} + vstmia r3, {q8-q15} + b sha256_transform_4way_extend + +sha256_transform_4way_swap: + vldmia r1!, {q0-q7} + vrev32.8 q0, q0 + vrev32.8 q1, q1 + vrev32.8 q2, q2 + vrev32.8 q3, q3 + vldmia r1, {q8-q15} + vrev32.8 q4, q4 + vrev32.8 q5, q5 + vrev32.8 q6, q6 + vrev32.8 q7, q7 + vstmia sp, {q0-q7} + vrev32.8 q8, q8 + vrev32.8 q9, q9 + vrev32.8 q10, q10 + vrev32.8 q11, q11 + vrev32.8 q12, q12 + vrev32.8 q13, q13 + vrev32.8 q14, q14 + vrev32.8 q15, q15 + add r3, sp, #8*16 + vstmia r3, {q8-q15} + 
+sha256_transform_4way_extend: + add r1, sp, #1*16 + add r2, sp, #16*16 + vmov.u32 q5, q0 + sha256_4way_extend_doubleround_head 0, r1, r2, q9, q10, q14, q15 + sha256_4way_extend_doubleround_body 2, r1, r2, q11, q12, q9, q10 + sha256_4way_extend_doubleround_body 4, r1, r2, q13, q14, q11, q12 + sha256_4way_extend_doubleround_body 6, r1, r2, q15, q9, q13, q14 + sha256_4way_extend_doubleround_body 8, r1, r2, q10, q11, q15, q9 + sha256_4way_extend_doubleround_body 10, r1, r2, q12, q13, q10, q11 + sha256_4way_extend_doubleround_body 12, r1, r2, q14, q15, q12, q13 + sha256_4way_extend_doubleround_body 14, r1, r2, q9, q10, q14, q15 + sha256_4way_extend_doubleround_body 16, r1, r2, q11, q12, q9, q10 + sha256_4way_extend_doubleround_body 18, r1, r2, q13, q14, q11, q12 + sha256_4way_extend_doubleround_body 20, r1, r2, q15, q9, q13, q14 + sha256_4way_extend_doubleround_body 22, r1, r2, q10, q11, q15, q9 + sha256_4way_extend_doubleround_body 24, r1, r2, q12, q13, q10, q11 + sha256_4way_extend_doubleround_body 26, r1, r2, q14, q15, q12, q13 + sha256_4way_extend_doubleround_body 28, r1, r2, q9, q10, q14, q15 + sha256_4way_extend_doubleround_body 30, r1, r2, q11, q12, q9, q10 + sha256_4way_extend_doubleround_body 32, r1, r2, q13, q14, q11, q12 + sha256_4way_extend_doubleround_body 34, r1, r2, q15, q9, q13, q14 + sha256_4way_extend_doubleround_body 36, r1, r2, q10, q11, q15, q9 + sha256_4way_extend_doubleround_body 38, r1, r2, q12, q13, q10, q11 + sha256_4way_extend_doubleround_body 40, r1, r2, q14, q15, q12, q13 + sha256_4way_extend_doubleround_body 42, r1, r2, q9, q10, q14, q15 + sha256_4way_extend_doubleround_body 44, r1, r2, q11, q12, q9, q10 + sha256_4way_extend_doubleround_foot 46, r1, r2, q13, q14, q11, q12 + + vldmia r0, {q0-q7} + adr r4, sha256_transform_4way_4k + b sha256_transform_4way_4k_over + .align 4 +sha256_transform_4way_4k: + sha256_4k +sha256_transform_4way_4k_over: + sha256_4way_main_quadround 0, r4, sp + sha256_4way_main_quadround 4, r4, sp + sha256_4way_main_quadround 8, r4, sp + sha256_4way_main_quadround 12, r4, sp + sha256_4way_main_quadround 16, r4, sp + sha256_4way_main_quadround 20, r4, sp + sha256_4way_main_quadround 24, r4, sp + sha256_4way_main_quadround 28, r4, sp + sha256_4way_main_quadround 32, r4, sp + sha256_4way_main_quadround 36, r4, sp + sha256_4way_main_quadround 40, r4, sp + sha256_4way_main_quadround 44, r4, sp + sha256_4way_main_quadround 48, r4, sp + sha256_4way_main_quadround 52, r4, sp + sha256_4way_main_quadround 56, r4, sp + sha256_4way_main_quadround 60, r4, sp + + vldmia r0, {q8-q15} + vadd.u32 q0, q0, q8 + vadd.u32 q1, q1, q9 + vadd.u32 q2, q2, q10 + vadd.u32 q3, q3, q11 + vadd.u32 q4, q4, q12 + vadd.u32 q5, q5, q13 + vadd.u32 q6, q6, q14 + vadd.u32 q7, q7, q15 + vstmia r0, {q0-q7} + + mov sp, r12 + vpop {q4-q7} + ldmfd sp!, {r4, pc} + + + .text + .code 32 + .align 2 + .globl sha256d_ms_4way + .globl _sha256d_ms_4way +#ifdef __ELF__ + .type sha256d_ms_4way, %function +#endif +sha256d_ms_4way: +_sha256d_ms_4way: + stmfd sp!, {r4, lr} + vpush {q4-q7} + mov r12, sp + sub sp, sp, #64*16 + bic sp, sp, #63 + + add r4, r1, #3*16 + vld1.u32 {q6}, [r4]! 
+ add r1, r1, #18*16 + vldmia r1, {q11-q13} + cmp r0, r0 + + vshr.u32 q10, q6, #7 + vshl.u32 q0, q6, #32-7 + vshr.u32 q1, q6, #18 + veor.u32 q10, q10, q0 + vshl.u32 q0, q6, #32-18 + veor.u32 q10, q10, q1 + vshr.u32 q1, q6, #3 + veor.u32 q10, q10, q0 + vstmia sp!, {q11-q13} + veor.u32 q4, q10, q1 + vadd.u32 q12, q12, q6 + vadd.u32 q11, q11, q4 + + vshr.u32 q14, q12, #17 + vshr.u32 q4, q11, #17 + vshl.u32 q0, q11, #32-17 + vst1.u32 {q11}, [r1]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q11, #19 + vshl.u32 q1, q11, #32-19 + veor.u32 q4, q4, q0 + vst1.u32 {q12}, [r1]! + veor.u32 q4, q4, q1 + vshr.u32 q1, q11, #10 + vshl.u32 q0, q12, #32-17 + veor.u32 q4, q4, q1 + veor.u32 q14, q14, q0 + vadd.u32 q13, q13, q4 + vshr.u32 q0, q12, #19 + vshl.u32 q1, q12, #32-19 + veor.u32 q14, q14, q0 + vst1.u32 {q13}, [r1]! + veor.u32 q14, q14, q1 + vshr.u32 q1, q12, #10 + + vshr.u32 q4, q13, #17 + vshl.u32 q0, q13, #32-17 + veor.u32 q14, q14, q1 + veor.u32 q4, q4, q0 + vshr.u32 q0, q13, #19 + vshl.u32 q1, q13, #32-19 + veor.u32 q4, q4, q0 + vst1.u32 {q14}, [r1]! + veor.u32 q4, q4, q1 + vshr.u32 q1, q13, #10 + vld1.u32 {q15}, [r1] + veor.u32 q4, q4, q1 + vst1.u32 {q15}, [sp]! + vadd.u32 q15, q15, q4 + vshr.u32 q4, q14, #17 + vshl.u32 q0, q14, #32-17 + vshl.u32 q1, q14, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q0, q14, #19 + vst1.u32 {q15}, [r1]! + veor.u32 q4, q4, q0 + vld1.u32 {q9}, [r1] + veor.u32 q4, q4, q1 + vshr.u32 q1, q14, #10 + vst1.u32 {q9}, [sp]! + veor.u32 q5, q4, q1 + + vshr.u32 q4, q15, #17 + vadd.u32 q9, q9, q5 + vshl.u32 q0, q15, #32-17 + vshl.u32 q1, q15, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q0, q15, #19 + vst1.u32 {q9}, [r1]! + veor.u32 q4, q4, q0 + vld1.u32 {q10}, [r1] + veor.u32 q4, q4, q1 + vshr.u32 q1, q15, #10 + vst1.u32 {q10}, [sp]! + veor.u32 q4, q4, q1 + vshl.u32 q0, q9, #32-17 + vadd.u32 q10, q10, q4 + vshr.u32 q4, q9, #17 + vshl.u32 q1, q9, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q0, q9, #19 + veor.u32 q4, q4, q1 + vshr.u32 q1, q9, #10 + veor.u32 q4, q4, q0 + vst1.u32 {q10}, [r1]! + veor.u32 q5, q4, q1 + + vshr.u32 q4, q10, #17 + vshl.u32 q0, q10, #32-17 + vadd.u32 q11, q11, q5 + veor.u32 q4, q4, q0 + vshr.u32 q0, q10, #19 + vshl.u32 q1, q10, #32-19 + veor.u32 q4, q4, q0 + vst1.u32 {q11}, [r1]! + veor.u32 q4, q4, q1 + vshr.u32 q1, q10, #10 + vshl.u32 q0, q11, #32-17 + veor.u32 q2, q4, q1 + vshr.u32 q4, q11, #17 + vadd.u32 q12, q12, q2 + vshl.u32 q1, q11, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q0, q11, #19 + veor.u32 q4, q4, q1 + vshr.u32 q1, q11, #10 + veor.u32 q4, q4, q0 + vst1.u32 {q12}, [r1]! + veor.u32 q5, q4, q1 + + vshr.u32 q4, q12, #17 + vshl.u32 q0, q12, #32-17 + vadd.u32 q13, q13, q5 + veor.u32 q4, q4, q0 + vshr.u32 q0, q12, #19 + vshl.u32 q1, q12, #32-19 + veor.u32 q4, q4, q0 + vst1.u32 {q13}, [r1]! + veor.u32 q4, q4, q1 + vshr.u32 q1, q12, #10 + vshl.u32 q0, q13, #32-17 + veor.u32 q2, q4, q1 + vshr.u32 q4, q13, #17 + vadd.u32 q14, q14, q2 + vshl.u32 q1, q13, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q0, q13, #19 + veor.u32 q4, q4, q1 + vshr.u32 q1, q13, #10 + veor.u32 q4, q4, q0 + vst1.u32 {q14}, [r1]! + veor.u32 q5, q4, q1 + add r4, r4, #12*16 + + vshr.u32 q4, q14, #17 + vshl.u32 q0, q14, #32-17 + vadd.u32 q15, q15, q5 + veor.u32 q4, q4, q0 + vshr.u32 q0, q14, #19 + vshl.u32 q1, q14, #32-19 + veor.u32 q4, q4, q0 + vst1.u32 {q15}, [r1]! + veor.u32 q4, q4, q1 + vshr.u32 q1, q14, #10 + vld1.u32 {q2}, [r1] + veor.u32 q4, q4, q1 + vshl.u32 q0, q15, #32-17 + vadd.u32 q9, q9, q4 + vst1.u32 {q2}, [sp]! 
+ vadd.u32 q9, q9, q2 + vshr.u32 q4, q15, #17 + vshr.u32 q2, q15, #19 + veor.u32 q4, q4, q0 + vst1.u32 {q9}, [r1]! + vshl.u32 q1, q15, #32-19 + veor.u32 q4, q4, q2 + vshr.u32 q0, q15, #10 + veor.u32 q4, q4, q1 + vld1.u32 {q5-q6}, [r4]! + veor.u32 q4, q4, q0 + vld1.u32 {q2}, [r1] + vadd.u32 q10, q10, q4 + vst1.u32 {q2}, [sp]! + vadd.u32 q10, q10, q2 + + sub sp, sp, #8*16 + +sha256d_ms_4way_extend_loop2: + sha256_4way_extend_doubleround_body 16, r4, r1, q11, q12, q9, q10 + sha256_4way_extend_doubleround_body 18, r4, r1, q13, q14, q11, q12 + sha256_4way_extend_doubleround_body 20, r4, r1, q15, q9, q13, q14 + sha256_4way_extend_doubleround_body 22, r4, r1, q10, q11, q15, q9 + sha256_4way_extend_doubleround_body 24, r4, r1, q12, q13, q10, q11 + sha256_4way_extend_doubleround_body 26, r4, r1, q14, q15, q12, q13 + sha256_4way_extend_doubleround_body 28, r4, r1, q9, q10, q14, q15 + sha256_4way_extend_doubleround_body 30, r4, r1, q11, q12, q9, q10 + sha256_4way_extend_doubleround_body 32, r4, r1, q13, q14, q11, q12 + sha256_4way_extend_doubleround_body 34, r4, r1, q15, q9, q13, q14 + sha256_4way_extend_doubleround_body 36, r4, r1, q10, q11, q15, q9 + sha256_4way_extend_doubleround_body 38, r4, r1, q12, q13, q10, q11 + sha256_4way_extend_doubleround_body 40, r4, r1, q14, q15, q12, q13 + sha256_4way_extend_doubleround_body 42, r4, r1, q9, q10, q14, q15 + sha256_4way_extend_doubleround_body 44, r4, r1, q11, q12, q9, q10 + sha256_4way_extend_doubleround_foot 46, r4, r1, q13, q14, q11, q12 + bne sha256d_ms_4way_extend_coda2 + + vldmia r3!, {q4-q7} + vldmia r3, {q0-q3} + vswp q0, q4 + adr r3, sha256d_ms_4way_4k+3*16 + sub r1, r1, #(64-3)*16 + b sha256d_ms_4way_main_loop1 + + .align 4 +sha256d_ms_4way_4k: + sha256_4k + +sha256d_ms_4way_main_loop2: + sha256_4way_main_round 0, r3, r1, q0, q1, q2, q3, q4, q5, q6, q7 + sha256_4way_main_round 1, r3, r1, q3, q0, q1, q2, q7, q4, q5, q6 + sha256_4way_main_round 2, r3, r1, q2, q3, q0, q1, q6, q7, q4, q5 +sha256d_ms_4way_main_loop1: + sha256_4way_main_round 3, r3, r1, q1, q2, q3, q0, q5, q6, q7, q4 + sha256_4way_main_quadround 4, r3, r1 + sha256_4way_main_quadround 8, r3, r1 + sha256_4way_main_quadround 12, r3, r1 + sha256_4way_main_quadround 16, r3, r1 + sha256_4way_main_quadround 20, r3, r1 + sha256_4way_main_quadround 24, r3, r1 + sha256_4way_main_quadround 28, r3, r1 + sha256_4way_main_quadround 32, r3, r1 + sha256_4way_main_quadround 36, r3, r1 + sha256_4way_main_quadround 40, r3, r1 + sha256_4way_main_quadround 44, r3, r1 + sha256_4way_main_quadround 48, r3, r1 + sha256_4way_main_quadround 52, r3, r1 + sha256_4way_main_round 56, r3, r1, q0, q1, q2, q3, q4, q5, q6, q7 + bne sha256d_ms_4way_finish + sha256_4way_main_round 57, r3, r1, q3, q0, q1, q2, q7, q4, q5, q6 + sha256_4way_main_round 58, r3, r1, q2, q3, q0, q1, q6, q7, q4, q5 + sha256_4way_main_round 59, r3, r1, q1, q2, q3, q0, q5, q6, q7, q4 + sha256_4way_main_quadround 60, r3, r1 + + vldmia r2, {q8-q15} + vadd.u32 q0, q0, q8 + vadd.u32 q1, q1, q9 + vadd.u32 q2, q2, q10 + vadd.u32 q3, q3, q11 + vadd.u32 q4, q4, q12 + vadd.u32 q5, q5, q13 + vadd.u32 q6, q6, q14 + vadd.u32 q7, q7, q15 + + vldmia sp, {q8-q15} + sub r1, r1, #(64-18)*16 + vstmia r1, {q8-q10} + add r1, r1, #4*16 + vstmia r1, {q11-q13} + add r1, r1, #8*16 + vstmia r1, {q14-q15} + + vstmia sp, {q0-q7} + vmov.u32 q8, #0x80000000 + vmov.u32 q9, #0 + vmov.u32 q10, #0 + vmov.u32 q11, #0 + vmov.u32 q12, #0 + vmov.u32 q13, #0 + vmov.u32 q14, #0 + vmov.u32 q15, #0x00000100 + add r1, sp, #8*16 + vstmia r1!, {q8-q15} + adds r4, sp, #2*16 + + vshr.u32 q9, 
q1, #7 + vshl.u32 q2, q1, #32-7 + vshr.u32 q4, q1, #18 + veor.u32 q9, q9, q2 + vshl.u32 q3, q1, #32-18 + veor.u32 q9, q9, q4 + vshr.u32 q2, q1, #3 + veor.u32 q9, q9, q3 + vld1.u32 {q5}, [r4]! + veor.u32 q9, q9, q2 + vmov.u32 q7, #0x00a00000 + vadd.u32 q9, q9, q0 + vshr.u32 q10, q5, #7 + vshl.u32 q0, q5, #32-7 + vshl.u32 q3, q5, #32-18 + veor.u32 q10, q10, q0 + vshr.u32 q0, q5, #18 + veor.u32 q10, q10, q3 + vst1.u32 {q9}, [r1]! + vadd.u32 q3, q1, q7 + veor.u32 q10, q10, q0 + vshr.u32 q0, q5, #3 + vld1.u32 {q6}, [r4]! + veor.u32 q10, q10, q0 + + vshr.u32 q4, q9, #17 + vshl.u32 q0, q9, #32-17 + vadd.u32 q10, q10, q3 + veor.u32 q4, q4, q0 + vshr.u32 q0, q9, #19 + vshl.u32 q1, q9, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q11, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q11, q11, q0 + vshr.u32 q1, q9, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q11, q11, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q11, q11, q1 + vadd.u32 q4, q4, q5 + veor.u32 q11, q11, q0 + vld1.u32 {q5}, [r4]! + vadd.u32 q11, q11, q4 + vshr.u32 q4, q10, #17 + vshl.u32 q0, q10, #32-17 + vst1.u32 {q10}, [r1]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q10, #19 + vshl.u32 q1, q10, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q12, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q10, #10 + veor.u32 q12, q12, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q12, q12, q0 + vshl.u32 q1, q5, #32-18 + vst1.u32 {q11}, [r1]! + veor.u32 q12, q12, q1 + vshr.u32 q0, q5, #3 + vadd.u32 q1, q6, q4 + veor.u32 q12, q12, q0 + + vshr.u32 q4, q11, #17 + vshl.u32 q0, q11, #32-17 + vadd.u32 q12, q12, q1 + vld1.u32 {q6}, [r4]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q11, #19 + vshl.u32 q1, q11, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q13, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q13, q13, q0 + vshr.u32 q1, q11, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q13, q13, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q13, q13, q1 + vadd.u32 q4, q4, q5 + veor.u32 q13, q13, q0 + vld1.u32 {q5}, [r4]! + vadd.u32 q13, q13, q4 + vshr.u32 q4, q12, #17 + vshl.u32 q0, q12, #32-17 + vst1.u32 {q12}, [r1]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q12, #19 + vshl.u32 q1, q12, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q14, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q12, #10 + veor.u32 q14, q14, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q14, q14, q0 + vshl.u32 q1, q5, #32-18 + vst1.u32 {q13}, [r1]! + veor.u32 q14, q14, q1 + vshr.u32 q0, q5, #3 + vadd.u32 q1, q6, q4 + veor.u32 q14, q14, q0 + + vshr.u32 q4, q13, #17 + vshl.u32 q0, q13, #32-17 + vadd.u32 q14, q14, q1 + vld1.u32 {q6}, [r4]! + vadd.u32 q5, q5, q15 + veor.u32 q4, q4, q0 + vshr.u32 q0, q13, #19 + vshl.u32 q1, q13, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q15, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q15, q15, q0 + vshr.u32 q1, q13, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q15, q15, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q15, q15, q1 + vadd.u32 q4, q4, q5 + veor.u32 q15, q15, q0 + vmov.u32 q5, #0x80000000 + vadd.u32 q15, q15, q4 + vshr.u32 q4, q14, #17 + vshl.u32 q0, q14, #32-17 + vadd.u32 q6, q6, q9 + vst1.u32 {q14}, [r1]! + vmov.u32 q7, #0x11000000 + veor.u32 q4, q4, q0 + vshr.u32 q0, q14, #19 + vshl.u32 q1, q14, #32-19 + vadd.u32 q6, q6, q7 + vmov.u32 q2, #0x00002000 + veor.u32 q4, q4, q0 + vst1.u32 {q15}, [r1]! 
+ veor.u32 q4, q4, q1 + vshr.u32 q1, q14, #10 + vadd.u32 q6, q6, q2 + veor.u32 q1, q4, q1 + add r4, r4, #8*16 + + vshr.u32 q4, q15, #17 + vshl.u32 q0, q15, #32-17 + vadd.u32 q9, q6, q1 + veor.u32 q4, q4, q0 + vshr.u32 q0, q15, #19 + vshl.u32 q1, q15, #32-19 + veor.u32 q4, q4, q0 + vst1.u32 {q9}, [r1]! + vadd.u32 q5, q5, q10 + veor.u32 q4, q4, q1 + vshr.u32 q1, q15, #10 + vshl.u32 q0, q9, #32-17 + veor.u32 q10, q4, q1 + vshr.u32 q4, q9, #17 + vadd.u32 q10, q10, q5 + veor.u32 q4, q4, q0 + vshr.u32 q0, q9, #19 + vshl.u32 q1, q9, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q0, q9, #10 + veor.u32 q4, q4, q1 + vst1.u32 {q10}, [r1]! + veor.u32 q1, q4, q0 + + vshr.u32 q4, q10, #17 + vshl.u32 q0, q10, #32-17 + vadd.u32 q11, q11, q1 + veor.u32 q4, q4, q0 + vshr.u32 q0, q10, #19 + vshl.u32 q1, q10, #32-19 + veor.u32 q4, q4, q0 + vst1.u32 {q11}, [r1]! + veor.u32 q4, q4, q1 + vshr.u32 q1, q10, #10 + vshl.u32 q0, q11, #32-17 + veor.u32 q1, q4, q1 + vshr.u32 q4, q11, #17 + vadd.u32 q12, q12, q1 + veor.u32 q4, q4, q0 + vshr.u32 q0, q11, #19 + vshl.u32 q1, q11, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q0, q11, #10 + veor.u32 q4, q4, q1 + vst1.u32 {q12}, [r1]! + veor.u32 q1, q4, q0 + + vshr.u32 q4, q12, #17 + vshl.u32 q0, q12, #32-17 + vadd.u32 q13, q13, q1 + veor.u32 q4, q4, q0 + vshr.u32 q0, q12, #19 + vshl.u32 q1, q12, #32-19 + veor.u32 q4, q4, q0 + vst1.u32 {q13}, [r1]! + veor.u32 q4, q4, q1 + vshr.u32 q1, q12, #10 + vshl.u32 q0, q13, #32-17 + veor.u32 q1, q4, q1 + vshr.u32 q4, q13, #17 + vadd.u32 q14, q14, q1 + veor.u32 q4, q4, q0 + vshr.u32 q0, q13, #19 + vshl.u32 q1, q13, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q0, q13, #10 + veor.u32 q4, q4, q1 + vst1.u32 {q14}, [r1]! + veor.u32 q4, q4, q0 + vmov.u32 q6, #0x00000100 + vadd.u32 q15, q15, q4 + + vshr.u32 q4, q14, #17 + vshl.u32 q0, q14, #32-17 + vmov.u32 q7, #0x00400000 + vst1.u32 {q15}, [r1]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q14, #19 + vshl.u32 q1, q14, #32-19 + veor.u32 q4, q4, q0 + vadd.u32 q9, q9, q7 + veor.u32 q4, q4, q1 + vshr.u32 q1, q14, #10 + vmov.u32 q2, #0x00000022 + veor.u32 q4, q4, q1 + vadd.u32 q9, q9, q2 + vld1.u32 {q5}, [r4]! + vadd.u32 q9, q9, q4 + vshr.u32 q4, q15, #17 + vshl.u32 q0, q15, #32-17 + vadd.u32 q6, q6, q10 + vst1.u32 {q9}, [r1]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q15, #19 + vshl.u32 q1, q15, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q10, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q15, #10 + veor.u32 q10, q10, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q10, q10, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q10, q10, q1 + vadd.u32 q1, q6, q4 + veor.u32 q10, q10, q0 + vld1.u32 {q6}, [r4]! + vadd.u32 q10, q10, q1 + + b sha256d_ms_4way_extend_loop2 + + .align 4 +sha256d_ms_4way_4h: + .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667 + .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85 + .long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372 + .long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a + .long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f + .long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c + .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab + .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19 + +sha256d_ms_4way_extend_coda2: + adr r4, sha256d_ms_4way_4h + mov r1, sp + vldmia r4, {q0-q7} + vmov.u32 q15, q7 + sub r3, r3, #64*16 + b sha256d_ms_4way_main_loop2 + +.macro sha256_4way_main_round_red i, rk, rw, rd, re, rf, rg, rh + vld1.u32 {q8}, [\rw]! 
+ vand.u32 q9, \rf, \re + vbic.u32 q10, \rg, \re + vshr.u32 q11, \re, #5 + vorr.u32 q10, q10, q9 + vshl.u32 q12, \re, #32-5 + vadd.u32 \rh, \rh, q10 + veor.u32 q10, \re, q11 + vshr.u32 q11, \re, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, \re, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 \rh, \rh, q8 + veor.u32 q10, q10, q12 + vld1.u32 {q9}, [\rk]! + vadd.u32 \rh, \rh, \rd + vshr.u32 q11, q10, #6 + vadd.u32 \rh, \rh, q9 + vshl.u32 q13, q10, #32-6 + vadd.u32 \rh, \rh, q11 + vadd.u32 \rh, \rh, q13 +.endm + +sha256d_ms_4way_finish: + sha256_4way_main_round_red 57, r3, r1, q2, q7, q4, q5, q6 + sha256_4way_main_round_red 58, r3, r1, q1, q6, q7, q4, q5 + sha256_4way_main_round_red 59, r3, r1, q0, q5, q6, q7, q4 + sha256_4way_main_round_red 60, r3, r1, q3, q4, q5, q6, q7 + + vadd.u32 q7, q7, q15 + add r0, r0, #7*16 + vst1.u32 {q7}, [r0] + + mov sp, r12 + vpop {q4-q7} + ldmfd sp!, {r4, pc} + + + .text + .code 32 + .align 2 + .globl sha256_use_4way + .globl _sha256_use_4way +#ifdef __ELF__ + .type sha256_use_4way, %function +#endif +sha256_use_4way: +_sha256_use_4way: + mov r0, #1 + bx lr + +#endif /* __ARM_NEON__ */ + +#endif diff --git a/src/crypto/sha2/asm/sha2-x86.S b/src/crypto/sha2/asm/sha2-x86.S new file mode 100644 index 0000000..65f2062 --- /dev/null +++ b/src/crypto/sha2/asm/sha2-x86.S @@ -0,0 +1,1191 @@ +/* + * Copyright 2012 pooler@litecoinpool.org + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. See COPYING for more details. + */ + +#if defined(__linux__) && defined(__ELF__) + .section .note.GNU-stack,"",%progbits +#endif + +#if defined(__i386__) + + .data + .p2align 7 +sha256_4h: + .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667 + .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85 + .long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372 + .long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a + .long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f + .long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c + .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab + .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19 + + .data + .p2align 7 +sha256_4k: + .long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98 + .long 0x71374491, 0x71374491, 0x71374491, 0x71374491 + .long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf + .long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5 + .long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b + .long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1 + .long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4 + .long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5 + .long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98 + .long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01 + .long 0x243185be, 0x243185be, 0x243185be, 0x243185be + .long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3 + .long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74 + .long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe + .long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7 + .long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174 + .long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1 + .long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786 + .long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6 + .long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc + .long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f + .long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa + .long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 
0x5cb0a9dc + .long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da + .long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152 + .long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d + .long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8 + .long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7 + .long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3 + .long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147 + .long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351 + .long 0x14292967, 0x14292967, 0x14292967, 0x14292967 + .long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85 + .long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138 + .long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc + .long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13 + .long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354 + .long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb + .long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e + .long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85 + .long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1 + .long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b + .long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70 + .long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3 + .long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819 + .long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624 + .long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585 + .long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070 + .long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116 + .long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08 + .long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c + .long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5 + .long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3 + .long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a + .long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f + .long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3 + .long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee + .long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f + .long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814 + .long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208 + .long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa + .long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb + .long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7 + .long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2 + + .data + .p2align 6 +sha256d_4preext2_15: + .long 0x00000100, 0x00000100, 0x00000100, 0x00000100 +sha256d_4preext2_17: + .long 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000 +sha256d_4preext2_23: + .long 0x11002000, 0x11002000, 0x11002000, 0x11002000 +sha256d_4preext2_24: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 +sha256d_4preext2_30: + .long 0x00400022, 0x00400022, 0x00400022, 0x00400022 + + + .text + .p2align 5 + .globl sha256_init_4way + .globl _sha256_init_4way +sha256_init_4way: +_sha256_init_4way: + movl 4(%esp), %edx + movdqa sha256_4h+0, %xmm0 + movdqa sha256_4h+16, %xmm1 + movdqa sha256_4h+32, %xmm2 + movdqa sha256_4h+48, %xmm3 + movdqu %xmm0, 0(%edx) + movdqu %xmm1, 16(%edx) + movdqu %xmm2, 32(%edx) + movdqu %xmm3, 48(%edx) + movdqa sha256_4h+64, %xmm0 + movdqa sha256_4h+80, %xmm1 + movdqa sha256_4h+96, %xmm2 + movdqa sha256_4h+112, %xmm3 + movdqu %xmm0, 64(%edx) + movdqu %xmm1, 80(%edx) + movdqu %xmm2, 96(%edx) + movdqu %xmm3, 112(%edx) + ret + + +.macro sha256_sse2_extend_round i + movdqa (\i-15)*16(%eax), %xmm0 + movdqa %xmm0, %xmm2 + psrld $3, %xmm0 + movdqa %xmm0, %xmm1 + pslld $14, %xmm2 + psrld $4, %xmm1 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + psrld $11, %xmm1 + pslld $11, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + paddd 
(\i-16)*16(%eax), %xmm0 + paddd (\i-7)*16(%eax), %xmm0 + + movdqa %xmm3, %xmm2 + psrld $10, %xmm3 + pslld $13, %xmm2 + movdqa %xmm3, %xmm1 + psrld $7, %xmm1 + pxor %xmm1, %xmm3 + pxor %xmm2, %xmm3 + psrld $2, %xmm1 + pslld $2, %xmm2 + pxor %xmm1, %xmm3 + pxor %xmm2, %xmm3 + paddd %xmm0, %xmm3 + movdqa %xmm3, \i*16(%eax) +.endm + +.macro sha256_sse2_extend_doubleround i + movdqa (\i-15)*16(%eax), %xmm0 + movdqa (\i-14)*16(%eax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (\i-16)*16(%eax), %xmm0 + paddd (\i-15)*16(%eax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (\i-7)*16(%eax), %xmm0 + paddd (\i-6)*16(%eax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, \i*16(%eax) + movdqa %xmm7, (\i+1)*16(%eax) +.endm + +.macro sha256_sse2_main_round i + movdqa 16*(\i)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(\i)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 +.endm + +.macro sha256_sse2_main_quadround i + sha256_sse2_main_round \i+0 + sha256_sse2_main_round \i+1 + sha256_sse2_main_round \i+2 + sha256_sse2_main_round \i+3 +.endm + + +.macro p2bswap_esi_esp i + movdqu \i*16(%esi), %xmm0 + movdqu (\i+1)*16(%esi), %xmm2 + pshuflw $0xb1, %xmm0, %xmm0 + pshuflw $0xb1, %xmm2, %xmm2 + pshufhw $0xb1, %xmm0, %xmm0 + pshufhw $0xb1, %xmm2, %xmm2 + movdqa %xmm0, %xmm1 + movdqa %xmm2, %xmm3 + psrlw $8, %xmm1 + psrlw $8, %xmm3 + psllw $8, %xmm0 + psllw $8, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm3, %xmm2 + movdqa %xmm0, (\i+3)*16(%esp) + movdqa %xmm2, (\i+4)*16(%esp) +.endm + + .text + .p2align 5 + .globl sha256_transform_4way + .globl _sha256_transform_4way +sha256_transform_4way: +_sha256_transform_4way: + pushl %edi + pushl %esi + movl 12(%esp), %edi + movl 16(%esp), %esi + 
movl 20(%esp), %ecx + movl %esp, %edx + subl $67*16, %esp + andl $-128, %esp + + testl %ecx, %ecx + jnz sha256_transform_4way_swap + + movdqu 0*16(%esi), %xmm0 + movdqu 1*16(%esi), %xmm1 + movdqu 2*16(%esi), %xmm2 + movdqu 3*16(%esi), %xmm3 + movdqu 4*16(%esi), %xmm4 + movdqu 5*16(%esi), %xmm5 + movdqu 6*16(%esi), %xmm6 + movdqu 7*16(%esi), %xmm7 + movdqa %xmm0, 3*16(%esp) + movdqa %xmm1, 4*16(%esp) + movdqa %xmm2, 5*16(%esp) + movdqa %xmm3, 6*16(%esp) + movdqa %xmm4, 7*16(%esp) + movdqa %xmm5, 8*16(%esp) + movdqa %xmm6, 9*16(%esp) + movdqa %xmm7, 10*16(%esp) + movdqu 8*16(%esi), %xmm0 + movdqu 9*16(%esi), %xmm1 + movdqu 10*16(%esi), %xmm2 + movdqu 11*16(%esi), %xmm3 + movdqu 12*16(%esi), %xmm4 + movdqu 13*16(%esi), %xmm5 + movdqu 14*16(%esi), %xmm6 + movdqu 15*16(%esi), %xmm7 + movdqa %xmm0, 11*16(%esp) + movdqa %xmm1, 12*16(%esp) + movdqa %xmm2, 13*16(%esp) + movdqa %xmm3, 14*16(%esp) + movdqa %xmm4, 15*16(%esp) + movdqa %xmm5, 16*16(%esp) + movdqa %xmm6, 17*16(%esp) + movdqa %xmm7, 18*16(%esp) + jmp sha256_transform_4way_extend + + .p2align 5 +sha256_transform_4way_swap: + p2bswap_esi_esp 0 + p2bswap_esi_esp 2 + p2bswap_esi_esp 4 + p2bswap_esi_esp 6 + p2bswap_esi_esp 8 + p2bswap_esi_esp 10 + p2bswap_esi_esp 12 + p2bswap_esi_esp 14 + +sha256_transform_4way_extend: + leal 19*16(%esp), %ecx + leal 48*16(%ecx), %eax + movdqa -2*16(%ecx), %xmm3 + movdqa -1*16(%ecx), %xmm7 +sha256_transform_4way_extend_loop: + movdqa -15*16(%ecx), %xmm0 + movdqa -14*16(%ecx), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd -16*16(%ecx), %xmm0 + paddd -15*16(%ecx), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd -7*16(%ecx), %xmm0 + paddd -6*16(%ecx), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, (%ecx) + movdqa %xmm7, 16(%ecx) + addl $2*16, %ecx + cmpl %ecx, %eax + jne sha256_transform_4way_extend_loop + + movdqu 0(%edi), %xmm7 + movdqu 16(%edi), %xmm5 + movdqu 32(%edi), %xmm4 + movdqu 48(%edi), %xmm3 + movdqu 64(%edi), %xmm0 + movdqu 80(%edi), %xmm1 + movdqu 96(%edi), %xmm2 + movdqu 112(%edi), %xmm6 + movdqa %xmm1, 0(%esp) + movdqa %xmm2, 16(%esp) + movdqa %xmm6, 32(%esp) + + xorl %eax, %eax +sha256_transform_4way_main_loop: + movdqa 3*16(%esp, %eax), %xmm6 + paddd sha256_4k(%eax), %xmm6 + paddd 32(%esp), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $5, %xmm1 + pxor %xmm1, %xmm0 + 
paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + paddd %xmm6, %xmm0 + + movdqa %xmm5, %xmm1 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + pand %xmm5, %xmm2 + pand %xmm7, %xmm4 + pand %xmm7, %xmm1 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pxor %xmm1, %xmm7 + pslld $9, %xmm2 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pxor %xmm1, %xmm7 + pslld $11, %xmm2 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + + addl $16, %eax + cmpl $16*64, %eax + jne sha256_transform_4way_main_loop + + movdqu 0(%edi), %xmm1 + movdqu 16(%edi), %xmm2 + paddd %xmm1, %xmm7 + paddd %xmm2, %xmm5 + movdqu 32(%edi), %xmm1 + movdqu 48(%edi), %xmm2 + paddd %xmm1, %xmm4 + paddd %xmm2, %xmm3 + + movdqu %xmm7, 0(%edi) + movdqu %xmm5, 16(%edi) + movdqu %xmm4, 32(%edi) + movdqu %xmm3, 48(%edi) + + movdqu 64(%edi), %xmm1 + movdqu 80(%edi), %xmm2 + movdqu 96(%edi), %xmm6 + movdqu 112(%edi), %xmm7 + paddd %xmm1, %xmm0 + paddd 0(%esp), %xmm2 + paddd 16(%esp), %xmm6 + paddd 32(%esp), %xmm7 + + movdqu %xmm0, 64(%edi) + movdqu %xmm2, 80(%edi) + movdqu %xmm6, 96(%edi) + movdqu %xmm7, 112(%edi) + + movl %edx, %esp + popl %esi + popl %edi + ret + + + .text + .p2align 5 + .globl sha256d_ms_4way + .globl _sha256d_ms_4way +sha256d_ms_4way: +_sha256d_ms_4way: + pushl %edi + pushl %esi + pushl %ebp + movl 16(%esp), %edi + movl 20(%esp), %esi + movl 24(%esp), %edx + movl 28(%esp), %ecx + movl %esp, %ebp + subl $67*16, %esp + andl $-128, %esp + + leal 256(%esi), %eax + +sha256d_ms_4way_extend_loop1: + movdqa 3*16(%esi), %xmm0 + movdqa 2*16(%eax), %xmm3 + movdqa 3*16(%eax), %xmm7 + movdqa %xmm3, 5*16(%esp) + movdqa %xmm7, 6*16(%esp) + movdqa %xmm0, %xmm2 + paddd %xmm0, %xmm7 + psrld $3, %xmm0 + movdqa %xmm0, %xmm1 + pslld $14, %xmm2 + psrld $4, %xmm1 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + psrld $11, %xmm1 + pslld $11, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + paddd %xmm0, %xmm3 + movdqa %xmm3, 2*16(%eax) + movdqa %xmm7, 3*16(%eax) + + movdqa 4*16(%eax), %xmm0 + movdqa %xmm0, 7*16(%esp) + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + movdqa %xmm3, 4*16(%eax) + movdqa %xmm7, 5*16(%eax) + + movdqa 6*16(%eax), %xmm0 + movdqa 7*16(%eax), %xmm4 + movdqa %xmm0, 9*16(%esp) + movdqa %xmm4, 10*16(%esp) + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 6*16(%eax) + movdqa %xmm7, 7*16(%eax) + + movdqa 8*16(%eax), %xmm0 + movdqa 2*16(%eax), %xmm4 + movdqa %xmm0, 11*16(%esp) + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, 
%xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 8*16(%eax) + movdqa %xmm7, 9*16(%eax) + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd 3*16(%eax), %xmm3 + paddd 4*16(%eax), %xmm7 + movdqa %xmm3, 10*16(%eax) + movdqa %xmm7, 11*16(%eax) + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd 5*16(%eax), %xmm3 + paddd 6*16(%eax), %xmm7 + movdqa %xmm3, 12*16(%eax) + movdqa %xmm7, 13*16(%eax) + + movdqa 14*16(%eax), %xmm0 + movdqa 15*16(%eax), %xmm4 + movdqa %xmm0, 17*16(%esp) + movdqa %xmm4, 18*16(%esp) + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + paddd 7*16(%eax), %xmm0 + paddd 8*16(%eax), %xmm4 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 14*16(%eax) + movdqa %xmm7, 15*16(%eax) + +sha256d_ms_4way_extend_loop2: + sha256_sse2_extend_doubleround 16 + sha256_sse2_extend_doubleround 18 + sha256_sse2_extend_doubleround 20 + sha256_sse2_extend_doubleround 22 + sha256_sse2_extend_doubleround 24 + sha256_sse2_extend_doubleround 26 + sha256_sse2_extend_doubleround 28 + sha256_sse2_extend_doubleround 30 + sha256_sse2_extend_doubleround 32 + sha256_sse2_extend_doubleround 34 + sha256_sse2_extend_doubleround 36 + sha256_sse2_extend_doubleround 38 + sha256_sse2_extend_doubleround 40 + sha256_sse2_extend_doubleround 42 + jz sha256d_ms_4way_extend_coda2 + sha256_sse2_extend_doubleround 44 + sha256_sse2_extend_doubleround 46 + + movdqa 0(%ecx), %xmm3 + movdqa 16(%ecx), %xmm0 + movdqa 32(%ecx), %xmm1 + movdqa 48(%ecx), %xmm2 + movdqa 64(%ecx), %xmm6 + movdqa 80(%ecx), %xmm7 + movdqa 96(%ecx), %xmm5 + movdqa 112(%ecx), %xmm4 + movdqa %xmm1, 0(%esp) + movdqa %xmm2, 16(%esp) + movdqa %xmm6, 32(%esp) + + movl %esi, %eax + jmp sha256d_ms_4way_main_loop1 + +sha256d_ms_4way_main_loop2: + sha256_sse2_main_round 0 + sha256_sse2_main_round 1 + sha256_sse2_main_round 2 +sha256d_ms_4way_main_loop1: + sha256_sse2_main_round 3 + sha256_sse2_main_quadround 4 + sha256_sse2_main_quadround 8 + sha256_sse2_main_quadround 12 + sha256_sse2_main_quadround 16 + sha256_sse2_main_quadround 20 + sha256_sse2_main_quadround 24 + sha256_sse2_main_quadround 28 + sha256_sse2_main_quadround 32 + sha256_sse2_main_quadround 
36 + sha256_sse2_main_quadround 40 + sha256_sse2_main_quadround 44 + sha256_sse2_main_quadround 48 + sha256_sse2_main_quadround 52 + sha256_sse2_main_round 56 + jz sha256d_ms_4way_finish + sha256_sse2_main_round 57 + sha256_sse2_main_round 58 + sha256_sse2_main_round 59 + sha256_sse2_main_quadround 60 + + movdqa 5*16(%esp), %xmm1 + movdqa 6*16(%esp), %xmm2 + movdqa 7*16(%esp), %xmm6 + movdqa %xmm1, 18*16(%esi) + movdqa %xmm2, 19*16(%esi) + movdqa %xmm6, 20*16(%esi) + movdqa 9*16(%esp), %xmm1 + movdqa 10*16(%esp), %xmm2 + movdqa 11*16(%esp), %xmm6 + movdqa %xmm1, 22*16(%esi) + movdqa %xmm2, 23*16(%esi) + movdqa %xmm6, 24*16(%esi) + movdqa 17*16(%esp), %xmm1 + movdqa 18*16(%esp), %xmm2 + movdqa %xmm1, 30*16(%esi) + movdqa %xmm2, 31*16(%esi) + + movdqa 0(%esp), %xmm1 + movdqa 16(%esp), %xmm2 + movdqa 32(%esp), %xmm6 + paddd 0(%edx), %xmm7 + paddd 16(%edx), %xmm5 + paddd 32(%edx), %xmm4 + paddd 48(%edx), %xmm3 + paddd 64(%edx), %xmm0 + paddd 80(%edx), %xmm1 + paddd 96(%edx), %xmm2 + paddd 112(%edx), %xmm6 + + movdqa %xmm7, 48+0(%esp) + movdqa %xmm5, 48+16(%esp) + movdqa %xmm4, 48+32(%esp) + movdqa %xmm3, 48+48(%esp) + movdqa %xmm0, 48+64(%esp) + movdqa %xmm1, 48+80(%esp) + movdqa %xmm2, 48+96(%esp) + movdqa %xmm6, 48+112(%esp) + + movdqa sha256d_4preext2_15, %xmm1 + movdqa sha256d_4preext2_24, %xmm2 + pxor %xmm0, %xmm0 + movdqa %xmm2, 48+128(%esp) + movdqa %xmm0, 48+144(%esp) + movdqa %xmm0, 48+160(%esp) + movdqa %xmm0, 48+176(%esp) + movdqa %xmm0, 48+192(%esp) + movdqa %xmm0, 48+208(%esp) + movdqa %xmm0, 48+224(%esp) + movdqa %xmm1, 48+240(%esp) + + leal 19*16(%esp), %eax + cmpl %eax, %eax + + movdqa -15*16(%eax), %xmm0 + movdqa -14*16(%eax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + paddd -16*16(%eax), %xmm0 + paddd -15*16(%eax), %xmm4 + paddd sha256d_4preext2_17, %xmm4 + movdqa %xmm0, %xmm3 + movdqa %xmm4, %xmm7 + movdqa %xmm3, 0*16(%eax) + movdqa %xmm7, 1*16(%eax) + + sha256_sse2_extend_doubleround 2 + sha256_sse2_extend_doubleround 4 + + movdqa -9*16(%eax), %xmm0 + movdqa sha256d_4preext2_23, %xmm4 + movdqa %xmm0, %xmm2 + psrld $3, %xmm0 + movdqa %xmm0, %xmm1 + pslld $14, %xmm2 + psrld $4, %xmm1 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + psrld $11, %xmm1 + pslld $11, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + paddd -10*16(%eax), %xmm0 + paddd -9*16(%eax), %xmm4 + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + paddd -1*16(%eax), %xmm0 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + paddd 0*16(%eax), %xmm4 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 6*16(%eax) + movdqa %xmm7, 7*16(%eax) + + movdqa sha256d_4preext2_24, %xmm0 + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + paddd 1*16(%eax), %xmm0 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + 
pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + paddd 2*16(%eax), %xmm7 + movdqa %xmm3, 8*16(%eax) + movdqa %xmm7, 9*16(%eax) + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd 3*16(%eax), %xmm3 + paddd 4*16(%eax), %xmm7 + movdqa %xmm3, 10*16(%eax) + movdqa %xmm7, 11*16(%eax) + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd 5*16(%eax), %xmm3 + paddd 6*16(%eax), %xmm7 + movdqa %xmm3, 12*16(%eax) + movdqa %xmm7, 13*16(%eax) + + movdqa sha256d_4preext2_30, %xmm0 + movdqa 0*16(%eax), %xmm4 + movdqa %xmm4, %xmm6 + psrld $3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $14, %xmm6 + psrld $4, %xmm5 + pxor %xmm5, %xmm4 + pxor %xmm6, %xmm4 + psrld $11, %xmm5 + pslld $11, %xmm6 + pxor %xmm5, %xmm4 + pxor %xmm6, %xmm4 + paddd -1*16(%eax), %xmm4 + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + paddd 7*16(%eax), %xmm0 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + paddd 8*16(%eax), %xmm4 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 14*16(%eax) + movdqa %xmm7, 15*16(%eax) + + jmp sha256d_ms_4way_extend_loop2 + +sha256d_ms_4way_extend_coda2: + sha256_sse2_extend_round 44 + + movdqa sha256_4h+0, %xmm7 + movdqa sha256_4h+16, %xmm5 + movdqa sha256_4h+32, %xmm4 + movdqa sha256_4h+48, %xmm3 + movdqa sha256_4h+64, %xmm0 + movdqa sha256_4h+80, %xmm1 + movdqa sha256_4h+96, %xmm2 + movdqa sha256_4h+112, %xmm6 + movdqa %xmm1, 0(%esp) + movdqa %xmm2, 16(%esp) + movdqa %xmm6, 32(%esp) + + leal 48(%esp), %eax + jmp sha256d_ms_4way_main_loop2 + +.macro sha256_sse2_main_round_red i, r7 + movdqa 16*(\i)(%eax), %xmm6 + paddd 16*(\i)+sha256_4k, %xmm6 + paddd 32(%esp), %xmm6 + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + paddd \r7, %xmm6 + pandn %xmm2, %xmm1 + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + paddd %xmm1, %xmm6 + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $5, %xmm1 + pxor %xmm1, %xmm0 + paddd %xmm6, %xmm0 +.endm + +sha256d_ms_4way_finish: + sha256_sse2_main_round_red 57, %xmm3 + sha256_sse2_main_round_red 58, %xmm4 + 
sha256_sse2_main_round_red 59, %xmm5 + sha256_sse2_main_round_red 60, %xmm7 + + paddd sha256_4h+112, %xmm0 + movdqa %xmm0, 112(%edi) + + movl %ebp, %esp + popl %ebp + popl %esi + popl %edi + ret + + + .text + .p2align 5 + .globl sha256_use_4way + .globl _sha256_use_4way +sha256_use_4way: +_sha256_use_4way: + pushl %ebx + + /* Check for SSE2 availability */ + movl $1, %eax + cpuid + andl $0x04000000, %edx + jnz sha256_use_4way_sse2 + xorl %eax, %eax + popl %ebx + ret + +sha256_use_4way_sse2: + movl $1, %eax + popl %ebx + ret + +#endif diff --git a/src/crypto/sha2/asm/sha2-x86_64.S b/src/crypto/sha2/asm/sha2-x86_64.S new file mode 100644 index 0000000..9f3974b --- /dev/null +++ b/src/crypto/sha2/asm/sha2-x86_64.S @@ -0,0 +1,4164 @@ +/* + * Copyright 2012-2015 pooler@litecoinpool.org + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. See COPYING for more details. + */ + +#if defined(__linux__) && defined(__ELF__) + .section .note.GNU-stack,"",%progbits +#endif + +#if defined(__x86_64__) + .data + .p2align 4 +sha256_h: + .long 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a + .long 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 + + .data + .p2align 6 +sha256_k: + .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + +bswap_xmm_mask: + .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f + + +.macro sha256_mixed_quadround ra, rb, rc, rd, re, rf, rg, rh, x0, x1, x2, x3 + movdqa \x3, %xmm4 + movl \re, %eax + movdqa \x2, %xmm6 + rorl $(25-11), %eax + movl \ra, %ebx + pslldq $12, %xmm4 + rorl $(22-13), %ebx + psrldq $4, %xmm6 + xorl \re, %eax + movl \rf, %ecx + rorl $(11-6), %eax + pxor %xmm6, %xmm4 + movdqa \x1, %xmm5 + xorl \ra, %ebx + xorl \rg, %ecx + xorl \re, %eax + paddd \x0, %xmm4 + movdqa \x0, %xmm7 + andl \re, %ecx + rorl $(13-2), %ebx + xorl \ra, %ebx + pslldq $12, %xmm5 + psrldq $4, %xmm7 + rorl $6, %eax + xorl \rg, %ecx + pxor %xmm7, %xmm5 + rorl $2, %ebx + addl %eax, %ecx + addl (%rsp) , %ecx + movdqa %xmm5, %xmm6 + movl \ra, %eax + addl %ecx, \rh + movl \ra, %ecx + movdqa %xmm5, %xmm7 + orl \rc, %eax + addl \rh, \rd + andl \rc, %ecx + pslld $(32-7), %xmm5 + psrld $7, %xmm6 + andl \rb, %eax + addl %ebx, \rh + orl %ecx, %eax + por %xmm6, %xmm5 + addl %eax, \rh + + movl \rd, %eax + movdqa %xmm7, %xmm6 + movl \rh, %ebx + rorl $(25-11), %eax + xorl \rd, %eax + movdqa %xmm7, %xmm8 + movl \re, %ecx + rorl $(22-13), %ebx + xorl \rh, %ebx + pslld $(32-18), %xmm7 + rorl $(11-6), %eax + xorl \rf, %ecx + rorl $(13-2), %ebx + psrld $18, %xmm6 + xorl \rd, %eax + andl \rd, %ecx + rorl 
$6, %eax + pxor %xmm7, %xmm5 + xorl \rh, %ebx + xorl \rf, %ecx + psrld $3, %xmm8 + addl %eax, %ecx + addl 1*4(%rsp), %ecx + rorl $2, %ebx + pxor %xmm6, %xmm5 + movl \rh, %eax + addl %ecx, \rg + movl \rh, %ecx + pxor %xmm8, %xmm5 + orl \rb, %eax + addl \rg, \rc + andl \rb, %ecx + pshufd $0xfa, \x3, %xmm6 + andl \ra, %eax + addl %ebx, \rg + paddd %xmm5, %xmm4 + orl %ecx, %eax + addl %eax, \rg + + movl \rc, %eax + movdqa %xmm6, %xmm7 + movl \rg, %ebx + rorl $(25-11), %eax + xorl \rc, %eax + movdqa %xmm6, %xmm8 + rorl $(22-13), %ebx + movl \rd, %ecx + xorl \rg, %ebx + psrlq $17, %xmm6 + psrlq $19, %xmm7 + rorl $(11-6), %eax + xorl \re, %ecx + xorl \rc, %eax + psrld $10, %xmm8 + pxor %xmm7, %xmm6 + andl \rc, %ecx + rorl $(13-2), %ebx + xorl \rg, %ebx + pxor %xmm6, %xmm8 + xorl \re, %ecx + rorl $6, %eax + addl %eax, %ecx + pshufd $0x8f, %xmm8, %xmm8 + rorl $2, %ebx + addl 2*4(%rsp), %ecx + movl \rg, %eax + psrldq $8, %xmm8 + addl %ecx, \rf + movl \rg, %ecx + orl \ra, %eax + paddd %xmm8, %xmm4 + addl \rf, \rb + andl \ra, %ecx + andl \rh, %eax + pshufd $0x50, %xmm4, %xmm6 + addl %ebx, \rf + orl %ecx, %eax + addl %eax, \rf + + movdqa %xmm6, %xmm7 + movl \rb, %eax + rorl $(25-11), %eax + movl \rf, %ebx + movdqa %xmm6, \x0 + rorl $(22-13), %ebx + xorl \rb, %eax + movl \rc, %ecx + psrlq $17, %xmm6 + rorl $(11-6), %eax + xorl \rf, %ebx + xorl \rd, %ecx + psrlq $19, %xmm7 + xorl \rb, %eax + andl \rb, %ecx + rorl $(13-2), %ebx + psrld $10, \x0 + xorl \rf, %ebx + rorl $6, %eax + pxor %xmm7, %xmm6 + xorl \rd, %ecx + rorl $2, %ebx + addl %eax, %ecx + pxor %xmm6, \x0 + addl 3*4(%rsp), %ecx + movl \rf, %eax + addl %ecx, \re + pshufd $0xf8, \x0, \x0 + movl \rf, %ecx + orl \rh, %eax + addl \re, \ra + pslldq $8, \x0 + andl \rh, %ecx + andl \rg, %eax + paddd %xmm4, \x0 + addl %ebx, \re + orl %ecx, %eax + addl %eax, \re +.endm + +.macro sha256_main_round i, ra, rb, rc, rd, re, rf, rg, rh + movl \re, %eax + rorl $(25-11), %eax + movl \ra, %ebx + xorl \re, %eax + rorl $(22-13), %ebx + movl \rf, %ecx + xorl \ra, %ebx + rorl $(11-6), %eax + xorl \rg, %ecx + xorl \re, %eax + rorl $(13-2), %ebx + andl \re, %ecx + xorl \ra, %ebx + rorl $6, %eax + xorl \rg, %ecx + addl %eax, %ecx + rorl $2, %ebx + addl \i*4(%rsp), %ecx + movl \ra, %eax + addl %ecx, \rh + movl \ra, %ecx + orl \rc, %eax + addl \rh, \rd + andl \rc, %ecx + andl \rb, %eax + addl %ebx, \rh + orl %ecx, %eax + addl %eax, \rh +.endm + + + .text + .p2align 6 +sha256_transform_sse2: + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 +#if defined(_WIN64) || defined(__CYGWIN__) + pushq %rdi + pushq %rsi + subq $5*16, %rsp + movdqa %xmm6, 1*16(%rsp) + movdqa %xmm7, 2*16(%rsp) + movdqa %xmm8, 3*16(%rsp) + movdqa %xmm9, 4*16(%rsp) + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#else + subq $16, %rsp +#endif + + movl 0*4(%rdi), %r8d + movl 1*4(%rdi), %r9d + movl 2*4(%rdi), %r10d + movl 3*4(%rdi), %r11d + movl 4*4(%rdi), %r12d + movl 5*4(%rdi), %r13d + movl 6*4(%rdi), %r14d + movl 7*4(%rdi), %r15d + + testq %rdx, %rdx + jnz sha256_transform_sse2_swap + + movdqu 0*16(%rsi), %xmm0 + movdqu 1*16(%rsi), %xmm1 + movdqu 2*16(%rsi), %xmm2 + movdqu 3*16(%rsi), %xmm3 + jmp sha256_transform_sse2_core + +sha256_transform_sse2_swap: + movdqu 0*16(%rsi), %xmm0 + movdqu 1*16(%rsi), %xmm1 + movdqu 2*16(%rsi), %xmm2 + movdqu 3*16(%rsi), %xmm3 + pshuflw $0xb1, %xmm0, %xmm0 + pshuflw $0xb1, %xmm1, %xmm1 + pshuflw $0xb1, %xmm2, %xmm2 + pshuflw $0xb1, %xmm3, %xmm3 + pshufhw $0xb1, %xmm0, %xmm0 + pshufhw $0xb1, %xmm1, %xmm1 + pshufhw $0xb1, %xmm2, %xmm2 + pshufhw $0xb1, 
%xmm3, %xmm3 + movdqa %xmm0, %xmm4 + movdqa %xmm1, %xmm5 + movdqa %xmm2, %xmm6 + movdqa %xmm3, %xmm7 + psrlw $8, %xmm4 + psrlw $8, %xmm5 + psrlw $8, %xmm6 + psrlw $8, %xmm7 + psllw $8, %xmm0 + psllw $8, %xmm1 + psllw $8, %xmm2 + psllw $8, %xmm3 + pxor %xmm4, %xmm0 + pxor %xmm5, %xmm1 + pxor %xmm6, %xmm2 + pxor %xmm7, %xmm3 + +sha256_transform_sse2_core: + leaq sha256_k(%rip), %rdx + movq $48, %rsi + .p2align 4 +sha256_transform_sse2_loop: + movdqa 0*16(%rdx), %xmm9 + paddd %xmm0, %xmm9 + movdqa %xmm9, (%rsp) + sha256_mixed_quadround %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %xmm0, %xmm1, %xmm2, %xmm3 + movdqa 1*16(%rdx), %xmm9 + paddd %xmm1, %xmm9 + movdqa %xmm9, (%rsp) + sha256_mixed_quadround %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %xmm1, %xmm2, %xmm3, %xmm0 + movdqa 2*16(%rdx), %xmm9 + paddd %xmm2, %xmm9 + movdqa %xmm9, (%rsp) + sha256_mixed_quadround %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %xmm2, %xmm3, %xmm0, %xmm1 + movdqa 3*16(%rdx), %xmm9 + paddd %xmm3, %xmm9 + movdqa %xmm9, (%rsp) + addq $4*16, %rdx + sha256_mixed_quadround %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %xmm3, %xmm0, %xmm1, %xmm2 + + subq $16, %rsi + jne sha256_transform_sse2_loop + + paddd 0*16(%rdx), %xmm0 + movdqa %xmm0, (%rsp) + sha256_main_round 0, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d + sha256_main_round 1, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d + sha256_main_round 2, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d + sha256_main_round 3, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d + paddd 1*16(%rdx), %xmm1 + movdqa %xmm1, (%rsp) + sha256_main_round 0, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d + sha256_main_round 1, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d + sha256_main_round 2, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d + sha256_main_round 3, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d + paddd 2*16(%rdx), %xmm2 + movdqa %xmm2, (%rsp) + sha256_main_round 0, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d + sha256_main_round 1, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d + sha256_main_round 2, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d + sha256_main_round 3, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d + paddd 3*16(%rdx), %xmm3 + movdqa %xmm3, (%rsp) + sha256_main_round 0, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d + sha256_main_round 1, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d + sha256_main_round 2, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d + sha256_main_round 3, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d + + addl %r8d, 0*4(%rdi) + addl %r9d, 1*4(%rdi) + addl %r10d, 2*4(%rdi) + addl %r11d, 3*4(%rdi) + addl %r12d, 4*4(%rdi) + addl %r13d, 5*4(%rdi) + addl %r14d, 6*4(%rdi) + addl %r15d, 7*4(%rdi) + +#if defined(_WIN64) || defined(__CYGWIN__) + movdqa 1*16(%rsp), %xmm6 + movdqa 2*16(%rsp), %xmm7 + movdqa 3*16(%rsp), %xmm8 + movdqa 4*16(%rsp), %xmm9 + addq $5*16, %rsp + popq %rsi + popq %rdi +#else + addq $16, %rsp +#endif + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + ret + + + .text + .p2align 6 +sha256_transform_phe: +#if defined(_WIN64) || defined(__CYGWIN__) + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + movq %rsp, %r8 + subq $64, %rsp + andq $-64, %rsp + + testq %rdx, %rdx + jnz sha256_transform_phe_noswap + + movl 0*4(%rsi), %eax + movl 1*4(%rsi), %ecx + movl 2*4(%rsi), %edx + movl 3*4(%rsi), %r9d + bswapl %eax + bswapl %ecx + bswapl %edx + bswapl 
%r9d + movl %eax, 0*4(%rsp) + movl %ecx, 1*4(%rsp) + movl %edx, 2*4(%rsp) + movl %r9d, 3*4(%rsp) + movl 4*4(%rsi), %eax + movl 5*4(%rsi), %ecx + movl 6*4(%rsi), %edx + movl 7*4(%rsi), %r9d + bswapl %eax + bswapl %ecx + bswapl %edx + bswapl %r9d + movl %eax, 4*4(%rsp) + movl %ecx, 5*4(%rsp) + movl %edx, 6*4(%rsp) + movl %r9d, 7*4(%rsp) + + movdqu 2*16(%rsi), %xmm0 + movdqu 3*16(%rsi), %xmm2 + pshuflw $0xb1, %xmm0, %xmm0 + pshuflw $0xb1, %xmm2, %xmm2 + pshufhw $0xb1, %xmm0, %xmm0 + pshufhw $0xb1, %xmm2, %xmm2 + movdqa %xmm0, %xmm1 + movdqa %xmm2, %xmm3 + psrlw $8, %xmm1 + psrlw $8, %xmm3 + psllw $8, %xmm0 + psllw $8, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm3, %xmm2 + movdqa %xmm0, 2*16(%rsp) + movdqa %xmm2, 3*16(%rsp) + + jmp sha256_transform_phe_core + +sha256_transform_phe_noswap: + movdqu 0*16(%rsi), %xmm0 + movdqu 1*16(%rsi), %xmm1 + movdqu 2*16(%rsi), %xmm2 + movdqu 3*16(%rsi), %xmm3 + movdqa %xmm0, 0*16(%rsp) + movdqa %xmm1, 1*16(%rsp) + movdqa %xmm2, 2*16(%rsp) + movdqa %xmm3, 3*16(%rsp) + +sha256_transform_phe_core: + movq %rsp, %rsi + movq $-1, %rax + movq $1, %rcx + /* rep xsha256 */ + .byte 0xf3, 0x0f, 0xa6, 0xd0 + + movq %r8, %rsp +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rsi + popq %rdi +#endif + ret + + + .data + .p2align 3 +sha256_transform_addr: + .quad sha256_transform_sse2 + + .text + .p2align 3 + .globl sha256_transform + .globl _sha256_transform +sha256_transform: +_sha256_transform: + jmp *sha256_transform_addr(%rip) + + + .text + .p2align 6 + .globl sha256d_ms + .globl _sha256d_ms +sha256d_ms: +_sha256d_ms: +#if defined(_WIN64) || defined(__CYGWIN__) + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + movq %rsp, %r8 + subq $32, %rsp + andq $-32, %rsp + + movdqa 0*16(%rdx), %xmm0 + movdqa 1*16(%rdx), %xmm1 + movdqa %xmm0, 0*16(%rdi) + movdqa %xmm1, 1*16(%rdi) + + movl 0*4(%rsi), %eax + movl 1*4(%rsi), %ecx + movl 2*4(%rsi), %edx + movl 3*4(%rsi), %r9d + bswapl %eax + bswapl %ecx + bswapl %edx + bswapl %r9d + movl %eax, 0*4(%rsp) + movl %ecx, 1*4(%rsp) + movl %edx, 2*4(%rsp) + movl %r9d, 3*4(%rsp) + + movq %rsp, %rsi + movl $64, %eax + movl $80, %ecx + /* rep xsha256 */ + .byte 0xf3, 0x0f, 0xa6, 0xd0 + + movdqa bswap_xmm_mask(%rip), %xmm1 + movdqa 0*16(%rdi), %xmm0 + movdqa 1*16(%rdi), %xmm2 + pshufb %xmm1, %xmm0 + pshufb %xmm1, %xmm2 + movdqa %xmm0, 0*16(%rsp) + movdqa %xmm2, 1*16(%rsp) + + movdqa sha256_h+0*16(%rip), %xmm0 + movdqa sha256_h+1*16(%rip), %xmm1 + movdqa %xmm0, 0*16(%rdi) + movdqa %xmm1, 1*16(%rdi) + + movq %rsp, %rsi + xorq %rax, %rax + movl $32, %ecx + /* rep xsha256 */ + .byte 0xf3, 0x0f, 0xa6, 0xd0 + + movq %r8, %rsp +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rsi + popq %rdi +#endif + ret + + + .data + .p2align 7 +sha256_4h: + .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667 + .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85 + .long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372 + .long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a + .long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f + .long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c + .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab + .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19 + + .data + .p2align 7 +sha256_4k: + .long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98 + .long 0x71374491, 0x71374491, 0x71374491, 0x71374491 + .long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf + .long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5 + .long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b + .long 0x59f111f1, 0x59f111f1, 
0x59f111f1, 0x59f111f1 + .long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4 + .long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5 + .long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98 + .long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01 + .long 0x243185be, 0x243185be, 0x243185be, 0x243185be + .long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3 + .long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74 + .long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe + .long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7 + .long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174 + .long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1 + .long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786 + .long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6 + .long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc + .long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f + .long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa + .long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc + .long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da + .long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152 + .long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d + .long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8 + .long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7 + .long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3 + .long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147 + .long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351 + .long 0x14292967, 0x14292967, 0x14292967, 0x14292967 + .long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85 + .long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138 + .long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc + .long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13 + .long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354 + .long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb + .long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e + .long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85 + .long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1 + .long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b + .long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70 + .long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3 + .long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819 + .long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624 + .long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585 + .long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070 + .long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116 + .long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08 + .long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c + .long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5 + .long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3 + .long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a + .long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f + .long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3 + .long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee + .long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f + .long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814 + .long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208 + .long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa + .long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb + .long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7 + .long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2 + + .data + .p2align 6 +sha256d_4preext2_17: + .long 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000 +sha256d_4preext2_23: + .long 0x11002000, 0x11002000, 0x11002000, 0x11002000 +sha256d_4preext2_24: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 +sha256d_4preext2_30: + .long 0x00400022, 0x00400022, 0x00400022, 0x00400022 + + .data 
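+ /* initial state and round constants replicated 8-wide for the AVX2 8-way code paths */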
+ .p2align 7 +sha256_8h: + .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667 + .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85 + .long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372 + .long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a + .long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f + .long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c + .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab + .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19 + + .data + .p2align 7 +sha256_8k: + .long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98 + .long 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491 + .long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf + .long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5 + .long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b + .long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1 + .long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4 + .long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5 + .long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98 + .long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01 + .long 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be + .long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3 + .long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74 + .long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe + .long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7 + .long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174 + .long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1 + .long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786 + .long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6 + .long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc + .long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f + .long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa + .long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc + .long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da + .long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152 + .long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 
0xa831c66d + .long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8 + .long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7 + .long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3 + .long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147 + .long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351 + .long 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967 + .long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85 + .long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138 + .long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc + .long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13 + .long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354 + .long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb + .long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e + .long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85 + .long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1 + .long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b + .long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70 + .long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3 + .long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819 + .long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624 + .long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585 + .long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070 + .long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116 + .long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08 + .long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c + .long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5 + .long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3 + .long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a + .long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f + .long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3 + .long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee + .long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f + .long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814 + .long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208 + .long 0x90befffa, 0x90befffa, 
0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa + .long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb + .long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7 + .long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2 + + .data + .p2align 6 +sha256d_8preext2_17: + .long 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000 +sha256d_8preext2_23: + .long 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000 +sha256d_8preext2_24: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 +sha256d_8preext2_30: + .long 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022 + + .text + .p2align 6 + .globl sha256_init_4way + .globl _sha256_init_4way +sha256_init_4way: +_sha256_init_4way: +#if defined(_WIN64) || defined(__CYGWIN__) + pushq %rdi + movq %rcx, %rdi +#endif + movdqa sha256_4h+0(%rip), %xmm0 + movdqa sha256_4h+16(%rip), %xmm1 + movdqa sha256_4h+32(%rip), %xmm2 + movdqa sha256_4h+48(%rip), %xmm3 + movdqu %xmm0, 0(%rdi) + movdqu %xmm1, 16(%rdi) + movdqu %xmm2, 32(%rdi) + movdqu %xmm3, 48(%rdi) + movdqa sha256_4h+64(%rip), %xmm0 + movdqa sha256_4h+80(%rip), %xmm1 + movdqa sha256_4h+96(%rip), %xmm2 + movdqa sha256_4h+112(%rip), %xmm3 + movdqu %xmm0, 64(%rdi) + movdqu %xmm1, 80(%rdi) + movdqu %xmm2, 96(%rdi) + movdqu %xmm3, 112(%rdi) +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rdi +#endif + ret + + .text + .p2align 6 + .globl sha256_init_8way + .globl _sha256_init_8way +sha256_init_8way: +_sha256_init_8way: +#if defined(_WIN64) || defined(__CYGWIN__) + pushq %rdi + movq %rcx, %rdi +#endif + vpbroadcastd sha256_4h+0(%rip), %ymm0 + vpbroadcastd sha256_4h+16(%rip), %ymm1 + vpbroadcastd sha256_4h+32(%rip), %ymm2 + vpbroadcastd sha256_4h+48(%rip), %ymm3 + vmovdqu %ymm0, 0*32(%rdi) + vmovdqu %ymm1, 1*32(%rdi) + vmovdqu %ymm2, 2*32(%rdi) + vmovdqu %ymm3, 3*32(%rdi) + vpbroadcastd sha256_4h+64(%rip), %ymm0 + vpbroadcastd sha256_4h+80(%rip), %ymm1 + vpbroadcastd sha256_4h+96(%rip), %ymm2 + vpbroadcastd sha256_4h+112(%rip), %ymm3 + vmovdqu %ymm0, 4*32(%rdi) + vmovdqu %ymm1, 5*32(%rdi) + vmovdqu %ymm2, 6*32(%rdi) + vmovdqu %ymm3, 7*32(%rdi) +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rdi +#endif + ret + +.macro sha256_sse2_extend_round i + movdqa (\i-15)*16(%rax), %xmm0 + movdqa %xmm0, %xmm2 + psrld $3, %xmm0 + movdqa %xmm0, %xmm1 + pslld $14, %xmm2 + psrld $4, %xmm1 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + psrld $11, %xmm1 + pslld $11, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + paddd (\i-16)*16(%rax), %xmm0 + paddd (\i-7)*16(%rax), %xmm0 + + movdqa %xmm3, %xmm2 + psrld $10, %xmm3 + pslld $13, %xmm2 + movdqa %xmm3, %xmm1 + psrld $7, %xmm1 + pxor %xmm1, %xmm3 + pxor %xmm2, %xmm3 + psrld $2, %xmm1 + pslld $2, %xmm2 + pxor %xmm1, %xmm3 + pxor %xmm2, %xmm3 + paddd %xmm0, %xmm3 + movdqa %xmm3, \i*16(%rax) +.endm + +.macro sha256_sse2_extend_doubleround i + movdqa (\i-15)*16(%rax), %xmm0 + movdqa (\i-14)*16(%rax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, 
%xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (\i-16)*16(%rax), %xmm0 + paddd (\i-15)*16(%rax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (\i-7)*16(%rax), %xmm0 + paddd (\i-6)*16(%rax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, \i*16(%rax) + movdqa %xmm7, (\i+1)*16(%rax) +.endm + +.macro sha256_sse2_main_round i + movdqa 16*(\i)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(\i)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 +.endm + +.macro sha256_sse2_main_quadround i + sha256_sse2_main_round \i+0 + sha256_sse2_main_round \i+1 + sha256_sse2_main_round \i+2 + sha256_sse2_main_round \i+3 +.endm + + +.macro sha256_avx_extend_round i + vmovdqa (\i-15)*16(%rax), %xmm0 + vpslld $14, %xmm0, %xmm2 + vpsrld $3, %xmm0, %xmm0 + vpsrld $4, %xmm0, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpsrld $11, %xmm1, %xmm1 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd (\i-16)*16(%rax), %xmm0, %xmm0 + vpaddd (\i-7)*16(%rax), %xmm0, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpsrld $10, %xmm3, %xmm3 + vpsrld $7, %xmm3, %xmm1 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm2, %xmm3, %xmm3 + vpsrld $2, %xmm1, %xmm1 + vpslld $2, %xmm2, %xmm2 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm0, %xmm3, %xmm3 + vmovdqa %xmm3, \i*16(%rax) +.endm + +.macro sha256_avx_extend_doubleround i + vmovdqa (\i-15)*16(%rax), %xmm0 + vmovdqa (\i-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (\i-16)*16(%rax), %xmm8, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 
+ vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + + vpaddd (\i-7)*16(%rax), %xmm0, %xmm0 + vpaddd (\i-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, \i*16(%rax) + vmovdqa %xmm7, (\i+1)*16(%rax) +.endm + +.macro sha256_avx_main_round i, r0, r1, r2, r3, r4, r5, r6, r7 + vpaddd 16*(\i)(%rax), \r0, %xmm6 + vpaddd 16*(\i)(%rcx), %xmm6, %xmm6 + + vpandn \r1, \r3, %xmm1 + vpand \r3, \r2, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, \r3, %xmm1 + vpsrld $6, \r3, \r0 + vpsrld $5, \r0, %xmm2 + vpxor %xmm1, \r0, \r0 + vpxor %xmm2, \r0, \r0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, \r0, \r0 + vpxor %xmm2, \r0, \r0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, \r0, \r0 + vpaddd \r0, %xmm6, %xmm6 + vpaddd %xmm6, \r4, \r0 + + vpand \r6, \r5, %xmm2 + vpand \r7, \r5, \r4 + vpand \r7, \r6, %xmm1 + vpxor \r4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, \r7, %xmm2 + vpsrld $2, \r7, \r4 + vpsrld $11, \r4, %xmm1 + vpxor %xmm2, \r4, \r4 + vpxor %xmm1, \r4, \r4 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, \r4, \r4 + vpxor %xmm1, \r4, \r4 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, \r4, \r4 + vpaddd %xmm6, \r4, \r4 +.endm + +.macro sha256_avx_main_quadround i + sha256_avx_main_round \i+0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 + sha256_avx_main_round \i+1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 + sha256_avx_main_round \i+2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 + sha256_avx_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 +.endm + + +.macro sha256_avx2_extend_round i + vmovdqa (\i-15)*32(%rax), %ymm0 + vpslld $14, %ymm0, %ymm2 + vpsrld $3, %ymm0, %ymm0 + vpsrld $4, %ymm0, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpsrld $11, %ymm1, %ymm1 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpaddd (\i-16)*32(%rax), %ymm0, %ymm0 + vpaddd (\i-7)*32(%rax), %ymm0, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpsrld $10, %ymm3, %ymm3 + vpsrld $7, %ymm3, %ymm1 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm2, %ymm3, %ymm3 + vpsrld $2, %ymm1, %ymm1 + vpslld $2, %ymm2, %ymm2 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm2, %ymm3, %ymm3 + vpaddd %ymm0, %ymm3, %ymm3 + vmovdqa %ymm3, \i*32(%rax) +.endm + +.macro sha256_avx2_extend_doubleround i + vmovdqa (\i-15)*32(%rax), %ymm0 + vmovdqa (\i-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (\i-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 
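+ /* sigma1 of the previous two schedule words (%ymm3/%ymm7) is interleaved with the w[i-7]/w[i-6] additions below */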
+ + vpaddd (\i-7)*32(%rax), %ymm0, %ymm0 + vpaddd (\i-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, \i*32(%rax) + vmovdqa %ymm7, (\i+1)*32(%rax) +.endm + +.macro sha256_avx2_main_round i, r0, r1, r2, r3, r4, r5, r6, r7 + vpaddd 32*(\i)(%rax), \r0, %ymm6 + vpaddd 32*(\i)(%rcx), %ymm6, %ymm6 + + vpandn \r1, \r3, %ymm1 + vpand \r3, \r2, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, \r3, %ymm1 + vpsrld $6, \r3, \r0 + vpsrld $5, \r0, %ymm2 + vpxor %ymm1, \r0, \r0 + vpxor %ymm2, \r0, \r0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, \r0, \r0 + vpxor %ymm2, \r0, \r0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, \r0, \r0 + vpaddd \r0, %ymm6, %ymm6 + vpaddd %ymm6, \r4, \r0 + + vpand \r6, \r5, %ymm2 + vpand \r7, \r5, \r4 + vpand \r7, \r6, %ymm1 + vpxor \r4, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, \r7, %ymm2 + vpsrld $2, \r7, \r4 + vpsrld $11, \r4, %ymm1 + vpxor %ymm2, \r4, \r4 + vpxor %ymm1, \r4, \r4 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, \r4, \r4 + vpxor %ymm1, \r4, \r4 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, \r4, \r4 + vpaddd %ymm6, \r4, \r4 +.endm + +.macro sha256_avx2_main_quadround i + sha256_avx2_main_round \i+0, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7 + sha256_avx2_main_round \i+1, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3 + sha256_avx2_main_round \i+2, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4 + sha256_avx2_main_round \i+3, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5 +.endm + +.macro sha256_xop_extend_round i + vmovdqa (\i-15)*16(%rax), %xmm0 + vprotd $25, %xmm0, %xmm1 + vprotd $14, %xmm0, %xmm2 + vpsrld $3, %xmm0, %xmm0 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm2, %xmm0, %xmm0 + + vpaddd (\i-16)*16(%rax), %xmm0, %xmm0 + vpaddd (\i-7)*16(%rax), %xmm0, %xmm0 + + vprotd $15, %xmm3, %xmm1 + vprotd $13, %xmm3, %xmm2 + vpsrld $10, %xmm3, %xmm3 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm0, %xmm3, %xmm3 + vmovdqa %xmm3, \i*16(%rax) +.endm + +.macro sha256_xop_extend_doubleround i + vmovdqa (\i-15)*16(%rax), %xmm0 + vmovdqa (\i-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (\i-16)*16(%rax), %xmm0, %xmm0 + vpaddd (\i-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (\i-7)*16(%rax), %xmm0, %xmm0 + vpaddd (\i-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, \i*16(%rax) + vmovdqa %xmm7, (\i+1)*16(%rax) +.endm + +.macro sha256_xop_main_round i, r0, r1, r2, r3, r4, r5, r6, r7 + vpaddd 16*(\i)(%rax), \r0, 
%xmm6 + vpaddd 16*(\i)(%rcx), %xmm6, %xmm6 + + vpandn \r1, \r3, %xmm1 + vpand \r3, \r2, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, \r3, %xmm1 + vprotd $21, \r3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, \r3, \r0 + vpxor %xmm2, \r0, \r0 + vpaddd \r0, %xmm6, %xmm6 + vpaddd %xmm6, \r4, \r0 + + vpand \r6, \r5, %xmm2 + vpand \r7, \r5, \r4 + vpand \r7, \r6, %xmm1 + vpxor \r4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, \r7, %xmm1 + vprotd $19, \r7, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, \r7, \r4 + vpxor %xmm2, \r4, \r4 + vpaddd %xmm6, \r4, \r4 +.endm + +.macro sha256_xop_main_quadround i + sha256_xop_main_round \i+0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 + sha256_xop_main_round \i+1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 + sha256_xop_main_round \i+2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 + sha256_xop_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 +.endm + + .text + .p2align 6 +sha256_transform_4way_core_sse2: + leaq 256(%rsp), %rcx + leaq 48*16(%rcx), %rax + movdqa -2*16(%rcx), %xmm3 + movdqa -1*16(%rcx), %xmm7 +sha256_transform_4way_sse2_extend_loop: + movdqa -15*16(%rcx), %xmm0 + movdqa -14*16(%rcx), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd -16*16(%rcx), %xmm0 + paddd -15*16(%rcx), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd -7*16(%rcx), %xmm0 + paddd -6*16(%rcx), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, (%rcx) + movdqa %xmm7, 16(%rcx) + addq $2*16, %rcx + cmpq %rcx, %rax + jne sha256_transform_4way_sse2_extend_loop + + movdqu 0(%rdi), %xmm7 + movdqu 16(%rdi), %xmm5 + movdqu 32(%rdi), %xmm4 + movdqu 48(%rdi), %xmm3 + movdqu 64(%rdi), %xmm0 + movdqu 80(%rdi), %xmm8 + movdqu 96(%rdi), %xmm9 + movdqu 112(%rdi), %xmm10 + + leaq sha256_4k(%rip), %rcx + xorq %rax, %rax +sha256_transform_4way_sse2_main_loop: + movdqa (%rsp, %rax), %xmm6 + paddd (%rcx, %rax), %xmm6 + paddd %xmm10, %xmm6 + + movdqa %xmm0, %xmm1 + movdqa %xmm9, %xmm2 + pandn %xmm2, %xmm1 + + movdqa %xmm2, %xmm10 + movdqa %xmm8, %xmm2 + movdqa %xmm2, %xmm9 + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, %xmm8 + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $5, %xmm1 + pxor %xmm1, %xmm0 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + paddd %xmm6, %xmm0 + + movdqa %xmm5, %xmm1 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + pand %xmm5, %xmm2 + pand %xmm7, %xmm4 + pand %xmm7, %xmm1 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, 
%xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pxor %xmm1, %xmm7 + pslld $9, %xmm2 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pxor %xmm1, %xmm7 + pslld $11, %xmm2 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + + addq $16, %rax + cmpq $16*64, %rax + jne sha256_transform_4way_sse2_main_loop + jmp sha256_transform_4way_finish + + .text + .p2align 6 +sha256_transform_4way_core_avx: + leaq 256(%rsp), %rax + movdqa -2*16(%rax), %xmm3 + movdqa -1*16(%rax), %xmm7 + sha256_avx_extend_doubleround 0 + sha256_avx_extend_doubleround 2 + sha256_avx_extend_doubleround 4 + sha256_avx_extend_doubleround 6 + sha256_avx_extend_doubleround 8 + sha256_avx_extend_doubleround 10 + sha256_avx_extend_doubleround 12 + sha256_avx_extend_doubleround 14 + sha256_avx_extend_doubleround 16 + sha256_avx_extend_doubleround 18 + sha256_avx_extend_doubleround 20 + sha256_avx_extend_doubleround 22 + sha256_avx_extend_doubleround 24 + sha256_avx_extend_doubleround 26 + sha256_avx_extend_doubleround 28 + sha256_avx_extend_doubleround 30 + sha256_avx_extend_doubleround 32 + sha256_avx_extend_doubleround 34 + sha256_avx_extend_doubleround 36 + sha256_avx_extend_doubleround 38 + sha256_avx_extend_doubleround 40 + sha256_avx_extend_doubleround 42 + sha256_avx_extend_doubleround 44 + sha256_avx_extend_doubleround 46 + movdqu 0(%rdi), %xmm7 + movdqu 16(%rdi), %xmm5 + movdqu 32(%rdi), %xmm4 + movdqu 48(%rdi), %xmm3 + movdqu 64(%rdi), %xmm0 + movdqu 80(%rdi), %xmm8 + movdqu 96(%rdi), %xmm9 + movdqu 112(%rdi), %xmm10 + movq %rsp, %rax + leaq sha256_4k(%rip), %rcx + sha256_avx_main_quadround 0 + sha256_avx_main_quadround 4 + sha256_avx_main_quadround 8 + sha256_avx_main_quadround 12 + sha256_avx_main_quadround 16 + sha256_avx_main_quadround 20 + sha256_avx_main_quadround 24 + sha256_avx_main_quadround 28 + sha256_avx_main_quadround 32 + sha256_avx_main_quadround 36 + sha256_avx_main_quadround 40 + sha256_avx_main_quadround 44 + sha256_avx_main_quadround 48 + sha256_avx_main_quadround 52 + sha256_avx_main_quadround 56 + sha256_avx_main_quadround 60 + jmp sha256_transform_4way_finish + + .text + .p2align 6 +sha256_transform_4way_core_xop: + leaq 256(%rsp), %rax + movdqa -2*16(%rax), %xmm3 + movdqa -1*16(%rax), %xmm7 + sha256_xop_extend_doubleround 0 + sha256_xop_extend_doubleround 2 + sha256_xop_extend_doubleround 4 + sha256_xop_extend_doubleround 6 + sha256_xop_extend_doubleround 8 + sha256_xop_extend_doubleround 10 + sha256_xop_extend_doubleround 12 + sha256_xop_extend_doubleround 14 + sha256_xop_extend_doubleround 16 + sha256_xop_extend_doubleround 18 + sha256_xop_extend_doubleround 20 + sha256_xop_extend_doubleround 22 + sha256_xop_extend_doubleround 24 + sha256_xop_extend_doubleround 26 + sha256_xop_extend_doubleround 28 + sha256_xop_extend_doubleround 30 + sha256_xop_extend_doubleround 32 + sha256_xop_extend_doubleround 34 + sha256_xop_extend_doubleround 36 + sha256_xop_extend_doubleround 38 + sha256_xop_extend_doubleround 40 + sha256_xop_extend_doubleround 42 + sha256_xop_extend_doubleround 44 + sha256_xop_extend_doubleround 46 + movdqu 0(%rdi), %xmm7 + movdqu 16(%rdi), %xmm5 + movdqu 32(%rdi), %xmm4 + movdqu 48(%rdi), %xmm3 + movdqu 64(%rdi), %xmm0 + movdqu 80(%rdi), %xmm8 + movdqu 96(%rdi), %xmm9 + movdqu 112(%rdi), %xmm10 + movq %rsp, %rax + leaq sha256_4k(%rip), %rcx + sha256_xop_main_quadround 0 + sha256_xop_main_quadround 4 + sha256_xop_main_quadround 8 + sha256_xop_main_quadround 12 + sha256_xop_main_quadround 
16 + sha256_xop_main_quadround 20 + sha256_xop_main_quadround 24 + sha256_xop_main_quadround 28 + sha256_xop_main_quadround 32 + sha256_xop_main_quadround 36 + sha256_xop_main_quadround 40 + sha256_xop_main_quadround 44 + sha256_xop_main_quadround 48 + sha256_xop_main_quadround 52 + sha256_xop_main_quadround 56 + sha256_xop_main_quadround 60 + jmp sha256_transform_4way_finish + + .data + .p2align 3 +sha256_transform_4way_core_addr: + .quad 0x0 + +.macro p2bswap_rsi_rsp i + movdqu \i*16(%rsi), %xmm0 + movdqu (\i+1)*16(%rsi), %xmm2 + pshuflw $0xb1, %xmm0, %xmm0 + pshuflw $0xb1, %xmm2, %xmm2 + pshufhw $0xb1, %xmm0, %xmm0 + pshufhw $0xb1, %xmm2, %xmm2 + movdqa %xmm0, %xmm1 + movdqa %xmm2, %xmm3 + psrlw $8, %xmm1 + psrlw $8, %xmm3 + psllw $8, %xmm0 + psllw $8, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm3, %xmm2 + movdqa %xmm0, \i*16(%rsp) + movdqa %xmm2, (\i+1)*16(%rsp) +.endm + + .text + .p2align 6 + .globl sha256_transform_4way + .globl _sha256_transform_4way +sha256_transform_4way: +_sha256_transform_4way: +#if defined(_WIN64) || defined(__CYGWIN__) + pushq %rdi + subq $96, %rsp + movdqa %xmm6, 0(%rsp) + movdqa %xmm7, 16(%rsp) + movdqa %xmm8, 32(%rsp) + movdqa %xmm9, 48(%rsp) + movdqa %xmm10, 64(%rsp) + movdqa %xmm11, 80(%rsp) + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + movq %rsp, %r8 + subq $1032, %rsp + andq $-128, %rsp + + testq %rdx, %rdx + jnz sha256_transform_4way_swap + + movdqu 0*16(%rsi), %xmm0 + movdqu 1*16(%rsi), %xmm1 + movdqu 2*16(%rsi), %xmm2 + movdqu 3*16(%rsi), %xmm3 + movdqu 4*16(%rsi), %xmm4 + movdqu 5*16(%rsi), %xmm5 + movdqu 6*16(%rsi), %xmm6 + movdqu 7*16(%rsi), %xmm7 + movdqa %xmm0, 0*16(%rsp) + movdqa %xmm1, 1*16(%rsp) + movdqa %xmm2, 2*16(%rsp) + movdqa %xmm3, 3*16(%rsp) + movdqa %xmm4, 4*16(%rsp) + movdqa %xmm5, 5*16(%rsp) + movdqa %xmm6, 6*16(%rsp) + movdqa %xmm7, 7*16(%rsp) + movdqu 8*16(%rsi), %xmm0 + movdqu 9*16(%rsi), %xmm1 + movdqu 10*16(%rsi), %xmm2 + movdqu 11*16(%rsi), %xmm3 + movdqu 12*16(%rsi), %xmm4 + movdqu 13*16(%rsi), %xmm5 + movdqu 14*16(%rsi), %xmm6 + movdqu 15*16(%rsi), %xmm7 + movdqa %xmm0, 8*16(%rsp) + movdqa %xmm1, 9*16(%rsp) + movdqa %xmm2, 10*16(%rsp) + movdqa %xmm3, 11*16(%rsp) + movdqa %xmm4, 12*16(%rsp) + movdqa %xmm5, 13*16(%rsp) + movdqa %xmm6, 14*16(%rsp) + movdqa %xmm7, 15*16(%rsp) + jmp *sha256_transform_4way_core_addr(%rip) + + .p2align 6 +sha256_transform_4way_swap: + p2bswap_rsi_rsp 0 + p2bswap_rsi_rsp 2 + p2bswap_rsi_rsp 4 + p2bswap_rsi_rsp 6 + p2bswap_rsi_rsp 8 + p2bswap_rsi_rsp 10 + p2bswap_rsi_rsp 12 + p2bswap_rsi_rsp 14 + jmp *sha256_transform_4way_core_addr(%rip) + + .p2align 6 +sha256_transform_4way_finish: + movdqu 0(%rdi), %xmm2 + movdqu 16(%rdi), %xmm6 + movdqu 32(%rdi), %xmm11 + movdqu 48(%rdi), %xmm1 + paddd %xmm2, %xmm7 + paddd %xmm6, %xmm5 + paddd %xmm11, %xmm4 + paddd %xmm1, %xmm3 + movdqu 64(%rdi), %xmm2 + movdqu 80(%rdi), %xmm6 + movdqu 96(%rdi), %xmm11 + movdqu 112(%rdi), %xmm1 + paddd %xmm2, %xmm0 + paddd %xmm6, %xmm8 + paddd %xmm11, %xmm9 + paddd %xmm1, %xmm10 + + movdqu %xmm7, 0(%rdi) + movdqu %xmm5, 16(%rdi) + movdqu %xmm4, 32(%rdi) + movdqu %xmm3, 48(%rdi) + movdqu %xmm0, 64(%rdi) + movdqu %xmm8, 80(%rdi) + movdqu %xmm9, 96(%rdi) + movdqu %xmm10, 112(%rdi) + + movq %r8, %rsp +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rsi + movdqa 0(%rsp), %xmm6 + movdqa 16(%rsp), %xmm7 + movdqa 32(%rsp), %xmm8 + movdqa 48(%rsp), %xmm9 + movdqa 64(%rsp), %xmm10 + movdqa 80(%rsp), %xmm11 + addq $96, %rsp + popq %rdi +#endif + ret + + .text + .p2align 6 +sha256_transform_8way_core_avx2: + leaq 
8*64(%rsp), %rax + vmovdqa -2*32(%rax), %ymm3 + vmovdqa -1*32(%rax), %ymm7 + sha256_avx2_extend_doubleround 0 + sha256_avx2_extend_doubleround 2 + sha256_avx2_extend_doubleround 4 + sha256_avx2_extend_doubleround 6 + sha256_avx2_extend_doubleround 8 + sha256_avx2_extend_doubleround 10 + sha256_avx2_extend_doubleround 12 + sha256_avx2_extend_doubleround 14 + sha256_avx2_extend_doubleround 16 + sha256_avx2_extend_doubleround 18 + sha256_avx2_extend_doubleround 20 + sha256_avx2_extend_doubleround 22 + sha256_avx2_extend_doubleround 24 + sha256_avx2_extend_doubleround 26 + sha256_avx2_extend_doubleround 28 + sha256_avx2_extend_doubleround 30 + sha256_avx2_extend_doubleround 32 + sha256_avx2_extend_doubleround 34 + sha256_avx2_extend_doubleround 36 + sha256_avx2_extend_doubleround 38 + sha256_avx2_extend_doubleround 40 + sha256_avx2_extend_doubleround 42 + sha256_avx2_extend_doubleround 44 + sha256_avx2_extend_doubleround 46 + vmovdqu 0*32(%rdi), %ymm7 + vmovdqu 1*32(%rdi), %ymm5 + vmovdqu 2*32(%rdi), %ymm4 + vmovdqu 3*32(%rdi), %ymm3 + vmovdqu 4*32(%rdi), %ymm0 + vmovdqu 5*32(%rdi), %ymm8 + vmovdqu 6*32(%rdi), %ymm9 + vmovdqu 7*32(%rdi), %ymm10 + movq %rsp, %rax + leaq sha256_8k(%rip), %rcx + sha256_avx2_main_quadround 0 + sha256_avx2_main_quadround 4 + sha256_avx2_main_quadround 8 + sha256_avx2_main_quadround 12 + sha256_avx2_main_quadround 16 + sha256_avx2_main_quadround 20 + sha256_avx2_main_quadround 24 + sha256_avx2_main_quadround 28 + sha256_avx2_main_quadround 32 + sha256_avx2_main_quadround 36 + sha256_avx2_main_quadround 40 + sha256_avx2_main_quadround 44 + sha256_avx2_main_quadround 48 + sha256_avx2_main_quadround 52 + sha256_avx2_main_quadround 56 + sha256_avx2_main_quadround 60 + jmp sha256_transform_8way_finish + +.macro p2bswap_avx2_rsi_rsp i + vmovdqu \i*32(%rsi), %ymm0 + vmovdqu (\i+1)*32(%rsi), %ymm2 + vpshuflw $0xb1, %ymm0, %ymm0 + vpshuflw $0xb1, %ymm2, %ymm2 + vpshufhw $0xb1, %ymm0, %ymm0 + vpshufhw $0xb1, %ymm2, %ymm2 + vpsrlw $8, %ymm0, %ymm1 + vpsrlw $8, %ymm2, %ymm3 + vpsllw $8, %ymm0, %ymm0 + vpsllw $8, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm3, %ymm2, %ymm2 + vmovdqa %ymm0, \i*32(%rsp) + vmovdqa %ymm2, (\i+1)*32(%rsp) +.endm + + .text + .p2align 6 + .globl sha256_transform_8way + .globl _sha256_transform_8way +sha256_transform_8way: +_sha256_transform_8way: +#if defined(_WIN64) || defined(__CYGWIN__) + pushq %rdi + subq $96, %rsp + vmovdqa %xmm6, 0(%rsp) + vmovdqa %xmm7, 16(%rsp) + vmovdqa %xmm8, 32(%rsp) + vmovdqa %xmm9, 48(%rsp) + vmovdqa %xmm10, 64(%rsp) + vmovdqa %xmm11, 80(%rsp) + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + movq %rsp, %r8 + subq $64*32, %rsp + andq $-128, %rsp + + testq %rdx, %rdx + jnz sha256_transform_8way_swap + + vmovdqu 0*32(%rsi), %ymm0 + vmovdqu 1*32(%rsi), %ymm1 + vmovdqu 2*32(%rsi), %ymm2 + vmovdqu 3*32(%rsi), %ymm3 + vmovdqu 4*32(%rsi), %ymm4 + vmovdqu 5*32(%rsi), %ymm5 + vmovdqu 6*32(%rsi), %ymm6 + vmovdqu 7*32(%rsi), %ymm7 + vmovdqa %ymm0, 0*32(%rsp) + vmovdqa %ymm1, 1*32(%rsp) + vmovdqa %ymm2, 2*32(%rsp) + vmovdqa %ymm3, 3*32(%rsp) + vmovdqa %ymm4, 4*32(%rsp) + vmovdqa %ymm5, 5*32(%rsp) + vmovdqa %ymm6, 6*32(%rsp) + vmovdqa %ymm7, 7*32(%rsp) + vmovdqu 8*32(%rsi), %ymm0 + vmovdqu 9*32(%rsi), %ymm1 + vmovdqu 10*32(%rsi), %ymm2 + vmovdqu 11*32(%rsi), %ymm3 + vmovdqu 12*32(%rsi), %ymm4 + vmovdqu 13*32(%rsi), %ymm5 + vmovdqu 14*32(%rsi), %ymm6 + vmovdqu 15*32(%rsi), %ymm7 + vmovdqa %ymm0, 8*32(%rsp) + vmovdqa %ymm1, 9*32(%rsp) + vmovdqa %ymm2, 10*32(%rsp) + vmovdqa %ymm3, 11*32(%rsp) + vmovdqa 
%ymm4, 12*32(%rsp) + vmovdqa %ymm5, 13*32(%rsp) + vmovdqa %ymm6, 14*32(%rsp) + vmovdqa %ymm7, 15*32(%rsp) + jmp sha256_transform_8way_core_avx2 + + .p2align 6 +sha256_transform_8way_swap: + p2bswap_avx2_rsi_rsp 0 + p2bswap_avx2_rsi_rsp 2 + p2bswap_avx2_rsi_rsp 4 + p2bswap_avx2_rsi_rsp 6 + p2bswap_avx2_rsi_rsp 8 + p2bswap_avx2_rsi_rsp 10 + p2bswap_avx2_rsi_rsp 12 + p2bswap_avx2_rsi_rsp 14 + jmp sha256_transform_8way_core_avx2 + + .p2align 6 +sha256_transform_8way_finish: + vmovdqu 0*32(%rdi), %ymm2 + vmovdqu 1*32(%rdi), %ymm6 + vmovdqu 2*32(%rdi), %ymm11 + vmovdqu 3*32(%rdi), %ymm1 + vpaddd %ymm2, %ymm7, %ymm7 + vpaddd %ymm6, %ymm5, %ymm5 + vpaddd %ymm11, %ymm4, %ymm4 + vpaddd %ymm1, %ymm3, %ymm3 + vmovdqu 4*32(%rdi), %ymm2 + vmovdqu 5*32(%rdi), %ymm6 + vmovdqu 6*32(%rdi), %ymm11 + vmovdqu 7*32(%rdi), %ymm1 + vpaddd %ymm2, %ymm0, %ymm0 + vpaddd %ymm6, %ymm8, %ymm8 + vpaddd %ymm11, %ymm9, %ymm9 + vpaddd %ymm1, %ymm10, %ymm10 + + vmovdqu %ymm7, 0*32(%rdi) + vmovdqu %ymm5, 1*32(%rdi) + vmovdqu %ymm4, 2*32(%rdi) + vmovdqu %ymm3, 3*32(%rdi) + vmovdqu %ymm0, 4*32(%rdi) + vmovdqu %ymm8, 5*32(%rdi) + vmovdqu %ymm9, 6*32(%rdi) + vmovdqu %ymm10, 7*32(%rdi) + + movq %r8, %rsp +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rsi + vmovdqa 0(%rsp), %xmm6 + vmovdqa 16(%rsp), %xmm7 + vmovdqa 32(%rsp), %xmm8 + vmovdqa 48(%rsp), %xmm9 + vmovdqa 64(%rsp), %xmm10 + vmovdqa 80(%rsp), %xmm11 + addq $96, %rsp + popq %rdi +#endif + ret + + + .data + .p2align 3 +sha256d_ms_4way_addr: + .quad 0x0 + + .text + .p2align 6 + .globl sha256d_ms_4way + .globl _sha256d_ms_4way +sha256d_ms_4way: +_sha256d_ms_4way: + jmp *sha256d_ms_4way_addr(%rip) + + + .p2align 6 +sha256d_ms_4way_sse2: +#if defined(_WIN64) || defined(__CYGWIN__) + pushq %rdi + subq $32, %rsp + movdqa %xmm6, 0(%rsp) + movdqa %xmm7, 16(%rsp) + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + subq $8+67*16, %rsp + + leaq 256(%rsi), %rax + +sha256d_ms_4way_sse2_extend_loop1: + movdqa 3*16(%rsi), %xmm0 + movdqa 2*16(%rax), %xmm3 + movdqa 3*16(%rax), %xmm7 + movdqa %xmm3, 5*16(%rsp) + movdqa %xmm7, 6*16(%rsp) + movdqa %xmm0, %xmm2 + paddd %xmm0, %xmm7 + psrld $3, %xmm0 + movdqa %xmm0, %xmm1 + pslld $14, %xmm2 + psrld $4, %xmm1 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + psrld $11, %xmm1 + pslld $11, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + paddd %xmm0, %xmm3 + movdqa %xmm3, 2*16(%rax) + movdqa %xmm7, 3*16(%rax) + + movdqa 4*16(%rax), %xmm0 + movdqa %xmm0, 7*16(%rsp) + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + movdqa %xmm3, 4*16(%rax) + movdqa %xmm7, 5*16(%rax) + + movdqa 6*16(%rax), %xmm0 + movdqa 7*16(%rax), %xmm4 + movdqa %xmm0, 9*16(%rsp) + movdqa %xmm4, 10*16(%rsp) + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, 
%xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 6*16(%rax) + movdqa %xmm7, 7*16(%rax) + + movdqa 8*16(%rax), %xmm0 + movdqa 2*16(%rax), %xmm4 + movdqa %xmm0, 11*16(%rsp) + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 8*16(%rax) + movdqa %xmm7, 9*16(%rax) + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd 3*16(%rax), %xmm3 + paddd 4*16(%rax), %xmm7 + movdqa %xmm3, 10*16(%rax) + movdqa %xmm7, 11*16(%rax) + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd 5*16(%rax), %xmm3 + paddd 6*16(%rax), %xmm7 + movdqa %xmm3, 12*16(%rax) + movdqa %xmm7, 13*16(%rax) + + movdqa 14*16(%rax), %xmm0 + movdqa 15*16(%rax), %xmm4 + movdqa %xmm0, 17*16(%rsp) + movdqa %xmm4, 18*16(%rsp) + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + paddd 7*16(%rax), %xmm0 + paddd 8*16(%rax), %xmm4 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 14*16(%rax) + movdqa %xmm7, 15*16(%rax) + +sha256d_ms_4way_sse2_extend_loop2: + sha256_sse2_extend_doubleround 16 + sha256_sse2_extend_doubleround 18 + sha256_sse2_extend_doubleround 20 + sha256_sse2_extend_doubleround 22 + sha256_sse2_extend_doubleround 24 + sha256_sse2_extend_doubleround 26 + sha256_sse2_extend_doubleround 28 + sha256_sse2_extend_doubleround 30 + sha256_sse2_extend_doubleround 32 + sha256_sse2_extend_doubleround 34 + sha256_sse2_extend_doubleround 36 + sha256_sse2_extend_doubleround 38 + sha256_sse2_extend_doubleround 40 + sha256_sse2_extend_doubleround 42 + jz sha256d_ms_4way_sse2_extend_coda2 + sha256_sse2_extend_doubleround 44 + sha256_sse2_extend_doubleround 46 + + movdqa 0(%rcx), %xmm3 + movdqa 16(%rcx), %xmm0 + movdqa 32(%rcx), %xmm1 + movdqa 48(%rcx), %xmm2 + movdqa 64(%rcx), %xmm6 + movdqa 80(%rcx), %xmm7 + movdqa 96(%rcx), %xmm5 + movdqa 112(%rcx), %xmm4 + movdqa %xmm1, 0(%rsp) + movdqa %xmm2, 16(%rsp) + movdqa %xmm6, 32(%rsp) + + movq %rsi, %rax + leaq sha256_4k(%rip), %rcx + jmp sha256d_ms_4way_sse2_main_loop1 + +sha256d_ms_4way_sse2_main_loop2: + sha256_sse2_main_round 0 + 
sha256_sse2_main_round 1 + sha256_sse2_main_round 2 +sha256d_ms_4way_sse2_main_loop1: + sha256_sse2_main_round 3 + sha256_sse2_main_quadround 4 + sha256_sse2_main_quadround 8 + sha256_sse2_main_quadround 12 + sha256_sse2_main_quadround 16 + sha256_sse2_main_quadround 20 + sha256_sse2_main_quadround 24 + sha256_sse2_main_quadround 28 + sha256_sse2_main_quadround 32 + sha256_sse2_main_quadround 36 + sha256_sse2_main_quadround 40 + sha256_sse2_main_quadround 44 + sha256_sse2_main_quadround 48 + sha256_sse2_main_quadround 52 + sha256_sse2_main_round 56 + jz sha256d_ms_4way_sse2_finish + sha256_sse2_main_round 57 + sha256_sse2_main_round 58 + sha256_sse2_main_round 59 + sha256_sse2_main_quadround 60 + + movdqa 5*16(%rsp), %xmm1 + movdqa 6*16(%rsp), %xmm2 + movdqa 7*16(%rsp), %xmm6 + movdqa %xmm1, 18*16(%rsi) + movdqa %xmm2, 19*16(%rsi) + movdqa %xmm6, 20*16(%rsi) + movdqa 9*16(%rsp), %xmm1 + movdqa 10*16(%rsp), %xmm2 + movdqa 11*16(%rsp), %xmm6 + movdqa %xmm1, 22*16(%rsi) + movdqa %xmm2, 23*16(%rsi) + movdqa %xmm6, 24*16(%rsi) + movdqa 17*16(%rsp), %xmm1 + movdqa 18*16(%rsp), %xmm2 + movdqa %xmm1, 30*16(%rsi) + movdqa %xmm2, 31*16(%rsi) + + movdqa 0(%rsp), %xmm1 + movdqa 16(%rsp), %xmm2 + movdqa 32(%rsp), %xmm6 + paddd 0(%rdx), %xmm7 + paddd 16(%rdx), %xmm5 + paddd 32(%rdx), %xmm4 + paddd 48(%rdx), %xmm3 + paddd 64(%rdx), %xmm0 + paddd 80(%rdx), %xmm1 + paddd 96(%rdx), %xmm2 + paddd 112(%rdx), %xmm6 + + movdqa %xmm7, 48+0(%rsp) + movdqa %xmm5, 48+16(%rsp) + movdqa %xmm4, 48+32(%rsp) + movdqa %xmm3, 48+48(%rsp) + movdqa %xmm0, 48+64(%rsp) + movdqa %xmm1, 48+80(%rsp) + movdqa %xmm2, 48+96(%rsp) + movdqa %xmm6, 48+112(%rsp) + + pxor %xmm0, %xmm0 + movq $0x8000000000000100, %rax + movd %rax, %xmm1 + pshufd $0x55, %xmm1, %xmm2 + pshufd $0x00, %xmm1, %xmm1 + movdqa %xmm2, 48+128(%rsp) + movdqa %xmm0, 48+144(%rsp) + movdqa %xmm0, 48+160(%rsp) + movdqa %xmm0, 48+176(%rsp) + movdqa %xmm0, 48+192(%rsp) + movdqa %xmm0, 48+208(%rsp) + movdqa %xmm0, 48+224(%rsp) + movdqa %xmm1, 48+240(%rsp) + + leaq 19*16(%rsp), %rax + cmpq %rax, %rax + + movdqa -15*16(%rax), %xmm0 + movdqa -14*16(%rax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + paddd -16*16(%rax), %xmm0 + paddd -15*16(%rax), %xmm4 + paddd sha256d_4preext2_17(%rip), %xmm4 + movdqa %xmm0, %xmm3 + movdqa %xmm4, %xmm7 + movdqa %xmm3, 0*16(%rax) + movdqa %xmm7, 1*16(%rax) + + sha256_sse2_extend_doubleround 2 + sha256_sse2_extend_doubleround 4 + + movdqa -9*16(%rax), %xmm0 + movdqa sha256d_4preext2_23(%rip), %xmm4 + movdqa %xmm0, %xmm2 + psrld $3, %xmm0 + movdqa %xmm0, %xmm1 + pslld $14, %xmm2 + psrld $4, %xmm1 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + psrld $11, %xmm1 + pslld $11, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + paddd -10*16(%rax), %xmm0 + paddd -9*16(%rax), %xmm4 + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + paddd -1*16(%rax), %xmm0 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + paddd 0*16(%rax), %xmm4 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld 
$2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 6*16(%rax) + movdqa %xmm7, 7*16(%rax) + + movdqa sha256d_4preext2_24(%rip), %xmm0 + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + paddd 1*16(%rax), %xmm0 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + paddd 2*16(%rax), %xmm7 + movdqa %xmm3, 8*16(%rax) + movdqa %xmm7, 9*16(%rax) + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd 3*16(%rax), %xmm3 + paddd 4*16(%rax), %xmm7 + movdqa %xmm3, 10*16(%rax) + movdqa %xmm7, 11*16(%rax) + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd 5*16(%rax), %xmm3 + paddd 6*16(%rax), %xmm7 + movdqa %xmm3, 12*16(%rax) + movdqa %xmm7, 13*16(%rax) + + movdqa sha256d_4preext2_30(%rip), %xmm0 + movdqa 0*16(%rax), %xmm4 + movdqa %xmm4, %xmm6 + psrld $3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $14, %xmm6 + psrld $4, %xmm5 + pxor %xmm5, %xmm4 + pxor %xmm6, %xmm4 + psrld $11, %xmm5 + pslld $11, %xmm6 + pxor %xmm5, %xmm4 + pxor %xmm6, %xmm4 + paddd -1*16(%rax), %xmm4 + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + paddd 7*16(%rax), %xmm0 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + paddd 8*16(%rax), %xmm4 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 14*16(%rax) + movdqa %xmm7, 15*16(%rax) + + jmp sha256d_ms_4way_sse2_extend_loop2 + +sha256d_ms_4way_sse2_extend_coda2: + sha256_sse2_extend_round 44 + + movdqa sha256_4h+0(%rip), %xmm7 + movdqa sha256_4h+16(%rip), %xmm5 + movdqa sha256_4h+32(%rip), %xmm4 + movdqa sha256_4h+48(%rip), %xmm3 + movdqa sha256_4h+64(%rip), %xmm0 + movdqa sha256_4h+80(%rip), %xmm1 + movdqa sha256_4h+96(%rip), %xmm2 + movdqa sha256_4h+112(%rip), %xmm6 + movdqa %xmm1, 0(%rsp) + movdqa %xmm2, 16(%rsp) + movdqa %xmm6, 32(%rsp) + + leaq 48(%rsp), %rax + leaq sha256_4k(%rip), %rcx + jmp sha256d_ms_4way_sse2_main_loop2 + +.macro sha256_sse2_main_round_red i, r7 + movdqa 16*\i(%rax), %xmm6 + paddd 16*\i(%rcx), %xmm6 + paddd 32(%rsp), %xmm6 + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + paddd \r7, %xmm6 + pandn %xmm2, %xmm1 + movdqa %xmm2, 
32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + paddd %xmm1, %xmm6 + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $5, %xmm1 + pxor %xmm1, %xmm0 + paddd %xmm6, %xmm0 +.endm + +sha256d_ms_4way_sse2_finish: + sha256_sse2_main_round_red 57, %xmm3 + sha256_sse2_main_round_red 58, %xmm4 + sha256_sse2_main_round_red 59, %xmm5 + sha256_sse2_main_round_red 60, %xmm7 + + paddd sha256_4h+112(%rip), %xmm0 + movdqa %xmm0, 112(%rdi) + + addq $8+67*16, %rsp +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rsi + movdqa 0(%rsp), %xmm6 + movdqa 16(%rsp), %xmm7 + addq $32, %rsp + popq %rdi +#endif + ret + + + .p2align 6 +sha256d_ms_4way_avx: +#if defined(_WIN64) || defined(__CYGWIN__) + pushq %rdi + subq $80, %rsp + movdqa %xmm6, 0(%rsp) + movdqa %xmm7, 16(%rsp) + movdqa %xmm8, 32(%rsp) + movdqa %xmm9, 48(%rsp) + movdqa %xmm10, 64(%rsp) + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + subq $1032, %rsp + + leaq 256(%rsi), %rax + +sha256d_ms_4way_avx_extend_loop1: + vmovdqa 3*16(%rsi), %xmm0 + vmovdqa 2*16(%rax), %xmm3 + vmovdqa 3*16(%rax), %xmm7 + vmovdqa %xmm3, 2*16(%rsp) + vmovdqa %xmm7, 3*16(%rsp) + vpaddd %xmm0, %xmm7, %xmm7 + vpslld $14, %xmm0, %xmm2 + vpsrld $3, %xmm0, %xmm0 + vpsrld $4, %xmm0, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpsrld $11, %xmm1, %xmm1 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm3, %xmm3 + vmovdqa %xmm3, 2*16(%rax) + vmovdqa %xmm7, 3*16(%rax) + + vmovdqa 4*16(%rax), %xmm0 + vmovdqa %xmm0, 4*16(%rsp) + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vmovdqa %xmm3, 4*16(%rax) + vmovdqa %xmm7, 5*16(%rax) + + vmovdqa 6*16(%rax), %xmm0 + vmovdqa 7*16(%rax), %xmm4 + vmovdqa %xmm0, 6*16(%rsp) + vmovdqa %xmm4, 7*16(%rsp) + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 6*16(%rax) + vmovdqa %xmm7, 7*16(%rax) + + vmovdqa 8*16(%rax), %xmm0 + vmovdqa 2*16(%rax), %xmm4 + vmovdqa %xmm0, 8*16(%rsp) + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, 
%xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 8*16(%rax) + vmovdqa %xmm7, 9*16(%rax) + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd 3*16(%rax), %xmm3, %xmm3 + vpaddd 4*16(%rax), %xmm7, %xmm7 + vmovdqa %xmm3, 10*16(%rax) + vmovdqa %xmm7, 11*16(%rax) + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd 5*16(%rax), %xmm3, %xmm3 + vpaddd 6*16(%rax), %xmm7, %xmm7 + vmovdqa %xmm3, 12*16(%rax) + vmovdqa %xmm7, 13*16(%rax) + + vmovdqa 14*16(%rax), %xmm0 + vmovdqa 15*16(%rax), %xmm4 + vmovdqa %xmm0, 14*16(%rsp) + vmovdqa %xmm4, 15*16(%rsp) + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpaddd 7*16(%rax), %xmm0, %xmm0 + vpaddd 8*16(%rax), %xmm4, %xmm4 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 14*16(%rax) + vmovdqa %xmm7, 15*16(%rax) + +sha256d_ms_4way_avx_extend_loop2: + sha256_avx_extend_doubleround 16 + sha256_avx_extend_doubleround 18 + sha256_avx_extend_doubleround 20 + sha256_avx_extend_doubleround 22 + sha256_avx_extend_doubleround 24 + sha256_avx_extend_doubleround 26 + sha256_avx_extend_doubleround 28 + sha256_avx_extend_doubleround 30 + sha256_avx_extend_doubleround 32 + sha256_avx_extend_doubleround 34 + sha256_avx_extend_doubleround 36 + sha256_avx_extend_doubleround 38 + sha256_avx_extend_doubleround 40 + sha256_avx_extend_doubleround 42 + jz sha256d_ms_4way_avx_extend_coda2 + sha256_avx_extend_doubleround 44 + sha256_avx_extend_doubleround 46 + + movdqa 0(%rcx), %xmm7 + movdqa 16(%rcx), %xmm8 + movdqa 32(%rcx), %xmm9 + movdqa 48(%rcx), %xmm10 + movdqa 64(%rcx), %xmm0 + movdqa 80(%rcx), %xmm5 + movdqa 96(%rcx), %xmm4 + movdqa 112(%rcx), %xmm3 + + movq %rsi, %rax + leaq sha256_4k(%rip), %rcx + jmp sha256d_ms_4way_avx_main_loop1 + +sha256d_ms_4way_avx_main_loop2: + sha256_avx_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 + sha256_avx_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 + sha256_avx_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 +sha256d_ms_4way_avx_main_loop1: + sha256_avx_main_round 3, %xmm0, %xmm10, %xmm9, 
%xmm8, %xmm7, %xmm3, %xmm4, %xmm5 + sha256_avx_main_quadround 4 + sha256_avx_main_quadround 8 + sha256_avx_main_quadround 12 + sha256_avx_main_quadround 16 + sha256_avx_main_quadround 20 + sha256_avx_main_quadround 24 + sha256_avx_main_quadround 28 + sha256_avx_main_quadround 32 + sha256_avx_main_quadround 36 + sha256_avx_main_quadround 40 + sha256_avx_main_quadround 44 + sha256_avx_main_quadround 48 + sha256_avx_main_quadround 52 + sha256_avx_main_round 56, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 + jz sha256d_ms_4way_avx_finish + sha256_avx_main_round 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 + sha256_avx_main_round 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 + sha256_avx_main_round 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 + sha256_avx_main_quadround 60 + + movdqa 2*16(%rsp), %xmm1 + movdqa 3*16(%rsp), %xmm2 + movdqa 4*16(%rsp), %xmm6 + movdqa %xmm1, 18*16(%rsi) + movdqa %xmm2, 19*16(%rsi) + movdqa %xmm6, 20*16(%rsi) + movdqa 6*16(%rsp), %xmm1 + movdqa 7*16(%rsp), %xmm2 + movdqa 8*16(%rsp), %xmm6 + movdqa %xmm1, 22*16(%rsi) + movdqa %xmm2, 23*16(%rsi) + movdqa %xmm6, 24*16(%rsi) + movdqa 14*16(%rsp), %xmm1 + movdqa 15*16(%rsp), %xmm2 + movdqa %xmm1, 30*16(%rsi) + movdqa %xmm2, 31*16(%rsi) + + paddd 0(%rdx), %xmm7 + paddd 16(%rdx), %xmm5 + paddd 32(%rdx), %xmm4 + paddd 48(%rdx), %xmm3 + paddd 64(%rdx), %xmm0 + paddd 80(%rdx), %xmm8 + paddd 96(%rdx), %xmm9 + paddd 112(%rdx), %xmm10 + + movdqa %xmm7, 0(%rsp) + movdqa %xmm5, 16(%rsp) + movdqa %xmm4, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm0, 64(%rsp) + movdqa %xmm8, 80(%rsp) + movdqa %xmm9, 96(%rsp) + movdqa %xmm10, 112(%rsp) + + pxor %xmm0, %xmm0 + movq $0x8000000000000100, %rax + movd %rax, %xmm1 + pshufd $0x55, %xmm1, %xmm2 + pshufd $0x00, %xmm1, %xmm1 + movdqa %xmm2, 128(%rsp) + movdqa %xmm0, 144(%rsp) + movdqa %xmm0, 160(%rsp) + movdqa %xmm0, 176(%rsp) + movdqa %xmm0, 192(%rsp) + movdqa %xmm0, 208(%rsp) + movdqa %xmm0, 224(%rsp) + movdqa %xmm1, 240(%rsp) + + leaq 256(%rsp), %rax + cmpq %rax, %rax + + vmovdqa -15*16(%rax), %xmm0 + vmovdqa -14*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd -16*16(%rax), %xmm8, %xmm3 + vpaddd sha256d_4preext2_17(%rip), %xmm4, %xmm7 + vmovdqa %xmm3, 0*16(%rax) + vmovdqa %xmm7, 1*16(%rax) + + sha256_avx_extend_doubleround 2 + sha256_avx_extend_doubleround 4 + + vmovdqa -9*16(%rax), %xmm0 + vpslld $14, %xmm0, %xmm2 + vpsrld $3, %xmm0, %xmm8 + vpsrld $7, %xmm0, %xmm1 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpsrld $11, %xmm1, %xmm1 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd sha256d_4preext2_23(%rip), %xmm0, %xmm4 + vpaddd -10*16(%rax), %xmm8, %xmm0 + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpaddd -1*16(%rax), %xmm0, %xmm0 + vpaddd 0*16(%rax), %xmm4, %xmm4 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + 
vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 6*16(%rax) + vmovdqa %xmm7, 7*16(%rax) + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd sha256d_4preext2_24(%rip), %xmm3, %xmm3 + vpaddd 1*16(%rax), %xmm3, %xmm3 + vpaddd 2*16(%rax), %xmm7, %xmm7 + vmovdqa %xmm3, 8*16(%rax) + vmovdqa %xmm7, 9*16(%rax) + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd 3*16(%rax), %xmm3, %xmm3 + vpaddd 4*16(%rax), %xmm7, %xmm7 + vmovdqa %xmm3, 10*16(%rax) + vmovdqa %xmm7, 11*16(%rax) + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd 5*16(%rax), %xmm3, %xmm3 + vpaddd 6*16(%rax), %xmm7, %xmm7 + vmovdqa %xmm3, 12*16(%rax) + vmovdqa %xmm7, 13*16(%rax) + + vmovdqa sha256d_4preext2_30(%rip), %xmm0 + vmovdqa 0*16(%rax), %xmm4 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm4, %xmm4 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm6, %xmm4, %xmm4 + vpsrld $11, %xmm5, %xmm5 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm6, %xmm4, %xmm4 + vpaddd -1*16(%rax), %xmm4, %xmm4 + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpaddd 7*16(%rax), %xmm0, %xmm0 + vpaddd 8*16(%rax), %xmm4, %xmm4 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 14*16(%rax) + vmovdqa %xmm7, 15*16(%rax) + + jmp sha256d_ms_4way_avx_extend_loop2 + +sha256d_ms_4way_avx_extend_coda2: + sha256_avx_extend_round 44 + + movdqa sha256_4h+0(%rip), %xmm7 + movdqa sha256_4h+16(%rip), %xmm5 + movdqa sha256_4h+32(%rip), %xmm4 + movdqa sha256_4h+48(%rip), %xmm3 + movdqa sha256_4h+64(%rip), %xmm0 + movdqa 
sha256_4h+80(%rip), %xmm8 + movdqa sha256_4h+96(%rip), %xmm9 + movdqa sha256_4h+112(%rip), %xmm10 + + movq %rsp, %rax + leaq sha256_4k(%rip), %rcx + jmp sha256d_ms_4way_avx_main_loop2 + +.macro sha256_avx_main_round_red i, r0, r1, r2, r3, r4 + vpaddd 16*\i(%rax), \r0, %xmm6 + vpaddd 16*\i(%rcx), %xmm6, %xmm6 + vpandn \r1, \r3, %xmm1 + vpand \r3, \r2, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + vpslld $7, \r3, %xmm1 + vpsrld $6, \r3, \r0 + vpsrld $5, \r0, %xmm2 + vpxor %xmm1, \r0, \r0 + vpxor %xmm2, \r0, \r0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, \r0, \r0 + vpxor %xmm2, \r0, \r0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, \r0, \r0 + vpaddd \r0, %xmm6, %xmm6 + vpaddd %xmm6, \r4, \r0 +.endm + +sha256d_ms_4way_avx_finish: + sha256_avx_main_round_red 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4 + sha256_avx_main_round_red 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5 + sha256_avx_main_round_red 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7 + sha256_avx_main_round_red 60, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3 + + paddd sha256_4h+112(%rip), %xmm10 + movdqa %xmm10, 112(%rdi) + + addq $1032, %rsp +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rsi + movdqa 0(%rsp), %xmm6 + movdqa 16(%rsp), %xmm7 + movdqa 32(%rsp), %xmm8 + movdqa 48(%rsp), %xmm9 + movdqa 64(%rsp), %xmm10 + addq $80, %rsp + popq %rdi +#endif + ret + + .p2align 6 +sha256d_ms_4way_xop: +#if defined(_WIN64) || defined(__CYGWIN__) + pushq %rdi + subq $80, %rsp + movdqa %xmm6, 0(%rsp) + movdqa %xmm7, 16(%rsp) + movdqa %xmm8, 32(%rsp) + movdqa %xmm9, 48(%rsp) + movdqa %xmm10, 64(%rsp) + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + subq $1032, %rsp + + leaq 256(%rsi), %rax + +sha256d_ms_4way_xop_extend_loop1: + vmovdqa 3*16(%rsi), %xmm0 + vmovdqa 2*16(%rax), %xmm3 + vmovdqa 3*16(%rax), %xmm7 + vmovdqa %xmm3, 2*16(%rsp) + vmovdqa %xmm7, 3*16(%rsp) + vpaddd %xmm0, %xmm7, %xmm7 + vprotd $25, %xmm0, %xmm1 + vprotd $14, %xmm0, %xmm2 + vpsrld $3, %xmm0, %xmm0 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm3, %xmm3 + vmovdqa %xmm3, 2*16(%rax) + vmovdqa %xmm7, 3*16(%rax) + + vmovdqa 4*16(%rax), %xmm0 + vmovdqa %xmm0, 4*16(%rsp) + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vmovdqa %xmm3, 4*16(%rax) + vmovdqa %xmm7, 5*16(%rax) + + vmovdqa 6*16(%rax), %xmm0 + vmovdqa 7*16(%rax), %xmm4 + vmovdqa %xmm0, 6*16(%rsp) + vmovdqa %xmm4, 7*16(%rsp) + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 6*16(%rax) + vmovdqa %xmm7, 7*16(%rax) + + vmovdqa 8*16(%rax), %xmm0 + vmovdqa 2*16(%rax), %xmm4 + vmovdqa %xmm0, 8*16(%rsp) + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 8*16(%rax) + vmovdqa %xmm7, 9*16(%rax) + + 
vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd 3*16(%rax), %xmm3, %xmm3 + vpaddd 4*16(%rax), %xmm7, %xmm7 + vmovdqa %xmm3, 10*16(%rax) + vmovdqa %xmm7, 11*16(%rax) + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd 5*16(%rax), %xmm3, %xmm3 + vpaddd 6*16(%rax), %xmm7, %xmm7 + vmovdqa %xmm3, 12*16(%rax) + vmovdqa %xmm7, 13*16(%rax) + + vmovdqa 14*16(%rax), %xmm0 + vmovdqa 15*16(%rax), %xmm4 + vmovdqa %xmm0, 14*16(%rsp) + vmovdqa %xmm4, 15*16(%rsp) + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpaddd 7*16(%rax), %xmm0, %xmm0 + vpaddd 8*16(%rax), %xmm4, %xmm4 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 14*16(%rax) + vmovdqa %xmm7, 15*16(%rax) + +sha256d_ms_4way_xop_extend_loop2: + sha256_xop_extend_doubleround 16 + sha256_xop_extend_doubleround 18 + sha256_xop_extend_doubleround 20 + sha256_xop_extend_doubleround 22 + sha256_xop_extend_doubleround 24 + sha256_xop_extend_doubleround 26 + sha256_xop_extend_doubleround 28 + sha256_xop_extend_doubleround 30 + sha256_xop_extend_doubleround 32 + sha256_xop_extend_doubleround 34 + sha256_xop_extend_doubleround 36 + sha256_xop_extend_doubleround 38 + sha256_xop_extend_doubleround 40 + sha256_xop_extend_doubleround 42 + jz sha256d_ms_4way_xop_extend_coda2 + sha256_xop_extend_doubleround 44 + sha256_xop_extend_doubleround 46 + + movdqa 0(%rcx), %xmm7 + movdqa 16(%rcx), %xmm8 + movdqa 32(%rcx), %xmm9 + movdqa 48(%rcx), %xmm10 + movdqa 64(%rcx), %xmm0 + movdqa 80(%rcx), %xmm5 + movdqa 96(%rcx), %xmm4 + movdqa 112(%rcx), %xmm3 + + movq %rsi, %rax + leaq sha256_4k(%rip), %rcx + jmp sha256d_ms_4way_xop_main_loop1 + +sha256d_ms_4way_xop_main_loop2: + sha256_xop_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 + sha256_xop_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 + sha256_xop_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 +sha256d_ms_4way_xop_main_loop1: + sha256_xop_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 + sha256_xop_main_quadround 4 + sha256_xop_main_quadround 8 + sha256_xop_main_quadround 12 + sha256_xop_main_quadround 16 + sha256_xop_main_quadround 20 + sha256_xop_main_quadround 24 + sha256_xop_main_quadround 28 + sha256_xop_main_quadround 32 + sha256_xop_main_quadround 36 + sha256_xop_main_quadround 40 + sha256_xop_main_quadround 44 + sha256_xop_main_quadround 48 + sha256_xop_main_quadround 52 + sha256_xop_main_round 56, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 + jz sha256d_ms_4way_xop_finish + sha256_xop_main_round 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 + sha256_xop_main_round 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 + sha256_xop_main_round 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 + sha256_xop_main_quadround 60 + + movdqa 2*16(%rsp), %xmm1 + movdqa 3*16(%rsp), 
%xmm2 + movdqa 4*16(%rsp), %xmm6 + movdqa %xmm1, 18*16(%rsi) + movdqa %xmm2, 19*16(%rsi) + movdqa %xmm6, 20*16(%rsi) + movdqa 6*16(%rsp), %xmm1 + movdqa 7*16(%rsp), %xmm2 + movdqa 8*16(%rsp), %xmm6 + movdqa %xmm1, 22*16(%rsi) + movdqa %xmm2, 23*16(%rsi) + movdqa %xmm6, 24*16(%rsi) + movdqa 14*16(%rsp), %xmm1 + movdqa 15*16(%rsp), %xmm2 + movdqa %xmm1, 30*16(%rsi) + movdqa %xmm2, 31*16(%rsi) + + paddd 0(%rdx), %xmm7 + paddd 16(%rdx), %xmm5 + paddd 32(%rdx), %xmm4 + paddd 48(%rdx), %xmm3 + paddd 64(%rdx), %xmm0 + paddd 80(%rdx), %xmm8 + paddd 96(%rdx), %xmm9 + paddd 112(%rdx), %xmm10 + + movdqa %xmm7, 0(%rsp) + movdqa %xmm5, 16(%rsp) + movdqa %xmm4, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm0, 64(%rsp) + movdqa %xmm8, 80(%rsp) + movdqa %xmm9, 96(%rsp) + movdqa %xmm10, 112(%rsp) + + pxor %xmm0, %xmm0 + movq $0x8000000000000100, %rax + movd %rax, %xmm1 + pshufd $0x55, %xmm1, %xmm2 + pshufd $0x00, %xmm1, %xmm1 + movdqa %xmm2, 128(%rsp) + movdqa %xmm0, 144(%rsp) + movdqa %xmm0, 160(%rsp) + movdqa %xmm0, 176(%rsp) + movdqa %xmm0, 192(%rsp) + movdqa %xmm0, 208(%rsp) + movdqa %xmm0, 224(%rsp) + movdqa %xmm1, 240(%rsp) + + leaq 256(%rsp), %rax + cmpq %rax, %rax + + vmovdqa -15*16(%rax), %xmm0 + vmovdqa -14*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd -16*16(%rax), %xmm8, %xmm3 + vpaddd sha256d_4preext2_17(%rip), %xmm4, %xmm7 + vmovdqa %xmm3, 0*16(%rax) + vmovdqa %xmm7, 1*16(%rax) + + sha256_xop_extend_doubleround 2 + sha256_xop_extend_doubleround 4 + + vmovdqa -9*16(%rax), %xmm0 + vprotd $25, %xmm0, %xmm1 + vprotd $14, %xmm0, %xmm2 + vpsrld $3, %xmm0, %xmm8 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd sha256d_4preext2_23(%rip), %xmm0, %xmm4 + vpaddd -10*16(%rax), %xmm8, %xmm0 + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpaddd -1*16(%rax), %xmm0, %xmm0 + vpaddd 0*16(%rax), %xmm4, %xmm4 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 6*16(%rax) + vmovdqa %xmm7, 7*16(%rax) + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd sha256d_4preext2_24(%rip), %xmm3, %xmm3 + vpaddd 1*16(%rax), %xmm3, %xmm3 + vpaddd 2*16(%rax), %xmm7, %xmm7 + vmovdqa %xmm3, 8*16(%rax) + vmovdqa %xmm7, 9*16(%rax) + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd 3*16(%rax), %xmm3, %xmm3 + vpaddd 4*16(%rax), %xmm7, %xmm7 + vmovdqa %xmm3, 10*16(%rax) + vmovdqa %xmm7, 11*16(%rax) + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, 
%xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd 5*16(%rax), %xmm3, %xmm3 + vpaddd 6*16(%rax), %xmm7, %xmm7 + vmovdqa %xmm3, 12*16(%rax) + vmovdqa %xmm7, 13*16(%rax) + + vmovdqa sha256d_4preext2_30(%rip), %xmm0 + vmovdqa 0*16(%rax), %xmm4 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm6, %xmm4, %xmm4 + vpaddd -1*16(%rax), %xmm4, %xmm4 + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpaddd 7*16(%rax), %xmm0, %xmm0 + vpaddd 8*16(%rax), %xmm4, %xmm4 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 14*16(%rax) + vmovdqa %xmm7, 15*16(%rax) + + jmp sha256d_ms_4way_xop_extend_loop2 + +sha256d_ms_4way_xop_extend_coda2: + sha256_xop_extend_round 44 + + movdqa sha256_4h+0(%rip), %xmm7 + movdqa sha256_4h+16(%rip), %xmm5 + movdqa sha256_4h+32(%rip), %xmm4 + movdqa sha256_4h+48(%rip), %xmm3 + movdqa sha256_4h+64(%rip), %xmm0 + movdqa sha256_4h+80(%rip), %xmm8 + movdqa sha256_4h+96(%rip), %xmm9 + movdqa sha256_4h+112(%rip), %xmm10 + + movq %rsp, %rax + leaq sha256_4k(%rip), %rcx + jmp sha256d_ms_4way_xop_main_loop2 + +.macro sha256_xop_main_round_red i, r0, r1, r2, r3, r4 + vpaddd 16*\i(%rax), \r0, %xmm6 + vpaddd 16*\i(%rcx), %xmm6, %xmm6 + vpandn \r1, \r3, %xmm1 + vpand \r3, \r2, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + vprotd $26, \r3, %xmm1 + vprotd $21, \r3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, \r3, \r0 + vpxor %xmm2, \r0, \r0 + vpaddd \r0, %xmm6, %xmm6 + vpaddd %xmm6, \r4, \r0 +.endm + +sha256d_ms_4way_xop_finish: + sha256_xop_main_round_red 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4 + sha256_xop_main_round_red 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5 + sha256_xop_main_round_red 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7 + sha256_xop_main_round_red 60, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3 + + paddd sha256_4h+112(%rip), %xmm10 + movdqa %xmm10, 112(%rdi) + + addq $1032, %rsp +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rsi + movdqa 0(%rsp), %xmm6 + movdqa 16(%rsp), %xmm7 + movdqa 32(%rsp), %xmm8 + movdqa 48(%rsp), %xmm9 + movdqa 64(%rsp), %xmm10 + addq $80, %rsp + popq %rdi +#endif + ret + + .text + .p2align 6 + .globl sha256_use_4way + .globl _sha256_use_4way +sha256_use_4way: +_sha256_use_4way: + pushq %rbx + pushq %rcx + pushq %rdx + + /* Check for VIA PadLock Hash Engine */ + movl $0xc0000000, %eax + cpuid + cmpl $0xc0000001, %eax + jb sha256_use_4way_no_phe + movl $0xc0000001, %eax + cpuid + andl $0x00000c00, %edx + cmpl $0x00000c00, %edx + jne sha256_use_4way_no_phe + leaq sha256_transform_phe(%rip), %rdx + movq %rdx, sha256_transform_addr(%rip) + xorl %eax, %eax + jmp sha256_use_4way_exit +sha256_use_4way_no_phe: + /* Check for AVX and OSXSAVE support */ + movl $1, %eax + cpuid + andl $0x18000000, %ecx + cmpl $0x18000000, %ecx + jne sha256_use_4way_base + /* Check for XMM and YMM state support */ + xorl %ecx, %ecx + xgetbv + andl $0x00000006, %eax + cmpl $0x00000006, %eax + jne sha256_use_4way_base + /* Check for XOP support */ + movl $0x80000001, %eax + cpuid + andl $0x00000800, %ecx + jz sha256_use_4way_avx + +sha256_use_4way_xop: + leaq sha256d_ms_4way_xop(%rip), %rcx + leaq sha256_transform_4way_core_xop(%rip), %rdx + jmp sha256_use_4way_done + +sha256_use_4way_avx: + leaq sha256d_ms_4way_avx(%rip), %rcx + leaq 
sha256_transform_4way_core_avx(%rip), %rdx + jmp sha256_use_4way_done + +sha256_use_4way_base: + leaq sha256d_ms_4way_sse2(%rip), %rcx + leaq sha256_transform_4way_core_sse2(%rip), %rdx + +sha256_use_4way_done: + movq %rcx, sha256d_ms_4way_addr(%rip) + movq %rdx, sha256_transform_4way_core_addr(%rip) + movl $1, %eax +sha256_use_4way_exit: + popq %rdx + popq %rcx + popq %rbx + ret + + .text + .p2align 6 + .globl sha256d_ms_8way + .globl _sha256d_ms_8way +sha256d_ms_8way: +_sha256d_ms_8way: +sha256d_ms_8way_avx2: +#if defined(_WIN64) || defined(__CYGWIN__) + pushq %rdi + subq $80, %rsp + vmovdqa %xmm6, 0(%rsp) + vmovdqa %xmm7, 16(%rsp) + vmovdqa %xmm8, 32(%rsp) + vmovdqa %xmm9, 48(%rsp) + vmovdqa %xmm10, 64(%rsp) + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + pushq %rbp + movq %rsp, %rbp + subq $64*32, %rsp + andq $-128, %rsp + + leaq 16*32(%rsi), %rax + +sha256d_ms_8way_avx2_extend_loop1: + vmovdqa 3*32(%rsi), %ymm0 + vmovdqa 2*32(%rax), %ymm3 + vmovdqa 3*32(%rax), %ymm7 + vmovdqa %ymm3, 2*32(%rsp) + vmovdqa %ymm7, 3*32(%rsp) + vpaddd %ymm0, %ymm7, %ymm7 + vpslld $14, %ymm0, %ymm2 + vpsrld $3, %ymm0, %ymm0 + vpsrld $4, %ymm0, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpsrld $11, %ymm1, %ymm1 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpaddd %ymm0, %ymm3, %ymm3 + vmovdqa %ymm3, 2*32(%rax) + vmovdqa %ymm7, 3*32(%rax) + + vmovdqa 4*32(%rax), %ymm0 + vmovdqa %ymm0, 4*32(%rsp) + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd %ymm0, %ymm3, %ymm3 + vmovdqa %ymm3, 4*32(%rax) + vmovdqa %ymm7, 5*32(%rax) + + vmovdqa 6*32(%rax), %ymm0 + vmovdqa 7*32(%rax), %ymm4 + vmovdqa %ymm0, 6*32(%rsp) + vmovdqa %ymm4, 7*32(%rsp) + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 6*32(%rax) + vmovdqa %ymm7, 7*32(%rax) + + vmovdqa 8*32(%rax), %ymm0 + vmovdqa 2*32(%rax), %ymm4 + vmovdqa %ymm0, 8*32(%rsp) + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 8*32(%rax) + vmovdqa %ymm7, 9*32(%rax) + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, 
%ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd 3*32(%rax), %ymm3, %ymm3 + vpaddd 4*32(%rax), %ymm7, %ymm7 + vmovdqa %ymm3, 10*32(%rax) + vmovdqa %ymm7, 11*32(%rax) + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd 5*32(%rax), %ymm3, %ymm3 + vpaddd 6*32(%rax), %ymm7, %ymm7 + vmovdqa %ymm3, 12*32(%rax) + vmovdqa %ymm7, 13*32(%rax) + + vmovdqa 14*32(%rax), %ymm0 + vmovdqa 15*32(%rax), %ymm4 + vmovdqa %ymm0, 14*32(%rsp) + vmovdqa %ymm4, 15*32(%rsp) + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpaddd 7*32(%rax), %ymm0, %ymm0 + vpaddd 8*32(%rax), %ymm4, %ymm4 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 14*32(%rax) + vmovdqa %ymm7, 15*32(%rax) + +sha256d_ms_8way_avx2_extend_loop2: + sha256_avx2_extend_doubleround 16 + sha256_avx2_extend_doubleround 18 + sha256_avx2_extend_doubleround 20 + sha256_avx2_extend_doubleround 22 + sha256_avx2_extend_doubleround 24 + sha256_avx2_extend_doubleround 26 + sha256_avx2_extend_doubleround 28 + sha256_avx2_extend_doubleround 30 + sha256_avx2_extend_doubleround 32 + sha256_avx2_extend_doubleround 34 + sha256_avx2_extend_doubleround 36 + sha256_avx2_extend_doubleround 38 + sha256_avx2_extend_doubleround 40 + sha256_avx2_extend_doubleround 42 + jz sha256d_ms_8way_avx2_extend_coda2 + sha256_avx2_extend_doubleround 44 + sha256_avx2_extend_doubleround 46 + + vmovdqa 0(%rcx), %ymm7 + vmovdqa 32(%rcx), %ymm8 + vmovdqa 64(%rcx), %ymm9 + vmovdqa 96(%rcx), %ymm10 + vmovdqa 128(%rcx), %ymm0 + vmovdqa 160(%rcx), %ymm5 + vmovdqa 192(%rcx), %ymm4 + vmovdqa 224(%rcx), %ymm3 + + movq %rsi, %rax + leaq sha256_8k(%rip), %rcx + jmp sha256d_ms_8way_avx2_main_loop1 + +sha256d_ms_8way_avx2_main_loop2: + sha256_avx2_main_round 0, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7 + sha256_avx2_main_round 1, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3 + sha256_avx2_main_round 2, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4 +sha256d_ms_8way_avx2_main_loop1: + sha256_avx2_main_round 3, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5 + sha256_avx2_main_quadround 4 + sha256_avx2_main_quadround 8 + sha256_avx2_main_quadround 12 + sha256_avx2_main_quadround 16 + sha256_avx2_main_quadround 20 + sha256_avx2_main_quadround 24 + sha256_avx2_main_quadround 28 + 
sha256_avx2_main_quadround 32 + sha256_avx2_main_quadround 36 + sha256_avx2_main_quadround 40 + sha256_avx2_main_quadround 44 + sha256_avx2_main_quadround 48 + sha256_avx2_main_quadround 52 + sha256_avx2_main_round 56, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7 + jz sha256d_ms_8way_avx2_finish + sha256_avx2_main_round 57, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3 + sha256_avx2_main_round 58, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4 + sha256_avx2_main_round 59, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5 + sha256_avx2_main_quadround 60 + + vmovdqa 2*32(%rsp), %ymm1 + vmovdqa 3*32(%rsp), %ymm2 + vmovdqa 4*32(%rsp), %ymm6 + vmovdqa %ymm1, 18*32(%rsi) + vmovdqa %ymm2, 19*32(%rsi) + vmovdqa %ymm6, 20*32(%rsi) + vmovdqa 6*32(%rsp), %ymm1 + vmovdqa 7*32(%rsp), %ymm2 + vmovdqa 8*32(%rsp), %ymm6 + vmovdqa %ymm1, 22*32(%rsi) + vmovdqa %ymm2, 23*32(%rsi) + vmovdqa %ymm6, 24*32(%rsi) + vmovdqa 14*32(%rsp), %ymm1 + vmovdqa 15*32(%rsp), %ymm2 + vmovdqa %ymm1, 30*32(%rsi) + vmovdqa %ymm2, 31*32(%rsi) + + vpaddd 0(%rdx), %ymm7, %ymm7 + vpaddd 32(%rdx), %ymm5, %ymm5 + vpaddd 64(%rdx), %ymm4, %ymm4 + vpaddd 96(%rdx), %ymm3, %ymm3 + vpaddd 128(%rdx), %ymm0, %ymm0 + vpaddd 160(%rdx), %ymm8, %ymm8 + vpaddd 192(%rdx), %ymm9, %ymm9 + vpaddd 224(%rdx), %ymm10, %ymm10 + + vmovdqa %ymm7, 0(%rsp) + vmovdqa %ymm5, 32(%rsp) + vmovdqa %ymm4, 64(%rsp) + vmovdqa %ymm3, 96(%rsp) + vmovdqa %ymm0, 128(%rsp) + vmovdqa %ymm8, 160(%rsp) + vmovdqa %ymm9, 192(%rsp) + vmovdqa %ymm10, 224(%rsp) + + vpxor %ymm0, %ymm0, %ymm0 + movq $0x8000000000000100, %rax + vmovd %rax, %xmm1 + vinserti128 $1, %xmm1, %ymm1, %ymm1 + vpshufd $0x55, %ymm1, %ymm2 + vpshufd $0x00, %ymm1, %ymm1 + vmovdqa %ymm2, 8*32(%rsp) + vmovdqa %ymm0, 9*32(%rsp) + vmovdqa %ymm0, 10*32(%rsp) + vmovdqa %ymm0, 11*32(%rsp) + vmovdqa %ymm0, 12*32(%rsp) + vmovdqa %ymm0, 13*32(%rsp) + vmovdqa %ymm0, 14*32(%rsp) + vmovdqa %ymm1, 15*32(%rsp) + + leaq 16*32(%rsp), %rax + cmpq %rax, %rax + + vmovdqa -15*32(%rax), %ymm0 + vmovdqa -14*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd -16*32(%rax), %ymm8, %ymm3 + vpaddd sha256d_8preext2_17(%rip), %ymm4, %ymm7 + vmovdqa %ymm3, 0*32(%rax) + vmovdqa %ymm7, 1*32(%rax) + + sha256_avx2_extend_doubleround 2 + sha256_avx2_extend_doubleround 4 + + vmovdqa -9*32(%rax), %ymm0 + vpslld $14, %ymm0, %ymm2 + vpsrld $3, %ymm0, %ymm8 + vpsrld $7, %ymm0, %ymm1 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpsrld $11, %ymm1, %ymm1 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpaddd sha256d_8preext2_23(%rip), %ymm0, %ymm4 + vpaddd -10*32(%rax), %ymm8, %ymm0 + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpaddd -1*32(%rax), %ymm0, %ymm0 + vpaddd 0*32(%rax), %ymm4, %ymm4 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, 
%ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 6*32(%rax) + vmovdqa %ymm7, 7*32(%rax) + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd sha256d_8preext2_24(%rip), %ymm3, %ymm3 + vpaddd 1*32(%rax), %ymm3, %ymm3 + vpaddd 2*32(%rax), %ymm7, %ymm7 + vmovdqa %ymm3, 8*32(%rax) + vmovdqa %ymm7, 9*32(%rax) + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd 3*32(%rax), %ymm3, %ymm3 + vpaddd 4*32(%rax), %ymm7, %ymm7 + vmovdqa %ymm3, 10*32(%rax) + vmovdqa %ymm7, 11*32(%rax) + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd 5*32(%rax), %ymm3, %ymm3 + vpaddd 6*32(%rax), %ymm7, %ymm7 + vmovdqa %ymm3, 12*32(%rax) + vmovdqa %ymm7, 13*32(%rax) + + vmovdqa sha256d_8preext2_30(%rip), %ymm0 + vmovdqa 0*32(%rax), %ymm4 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm4, %ymm4 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm6, %ymm4, %ymm4 + vpsrld $11, %ymm5, %ymm5 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm6, %ymm4, %ymm4 + vpaddd -1*32(%rax), %ymm4, %ymm4 + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpaddd 7*32(%rax), %ymm0, %ymm0 + vpaddd 8*32(%rax), %ymm4, %ymm4 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 14*32(%rax) + vmovdqa %ymm7, 15*32(%rax) + + jmp sha256d_ms_8way_avx2_extend_loop2 + +sha256d_ms_8way_avx2_extend_coda2: + sha256_avx2_extend_round 44 + + vmovdqa sha256_8h+0(%rip), %ymm7 + vmovdqa sha256_8h+32(%rip), %ymm5 + vmovdqa sha256_8h+64(%rip), %ymm4 + vmovdqa sha256_8h+96(%rip), %ymm3 + vmovdqa sha256_8h+128(%rip), %ymm0 + vmovdqa sha256_8h+160(%rip), %ymm8 + vmovdqa sha256_8h+192(%rip), %ymm9 + 
vmovdqa sha256_8h+224(%rip), %ymm10 + + movq %rsp, %rax + leaq sha256_8k(%rip), %rcx + jmp sha256d_ms_8way_avx2_main_loop2 + +.macro sha256_avx2_main_round_red i, r0, r1, r2, r3, r4 + vpaddd 32*\i(%rax), \r0, %ymm6 + vpaddd 32*\i(%rcx), %ymm6, %ymm6 + vpandn \r1, \r3, %ymm1 + vpand \r3, \r2, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + vpslld $7, \r3, %ymm1 + vpsrld $6, \r3, \r0 + vpsrld $5, \r0, %ymm2 + vpxor %ymm1, \r0, \r0 + vpxor %ymm2, \r0, \r0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, \r0, \r0 + vpxor %ymm2, \r0, \r0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, \r0, \r0 + vpaddd \r0, %ymm6, %ymm6 + vpaddd %ymm6, \r4, \r0 +.endm + +sha256d_ms_8way_avx2_finish: + sha256_avx2_main_round_red 57, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4 + sha256_avx2_main_round_red 58, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5 + sha256_avx2_main_round_red 59, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7 + sha256_avx2_main_round_red 60, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3 + + vpaddd sha256_8h+224(%rip), %ymm10, %ymm10 + vmovdqa %ymm10, 224(%rdi) + + movq %rbp, %rsp + popq %rbp +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rsi + vmovdqa 0(%rsp), %xmm6 + vmovdqa 16(%rsp), %xmm7 + vmovdqa 32(%rsp), %xmm8 + vmovdqa 48(%rsp), %xmm9 + vmovdqa 64(%rsp), %xmm10 + addq $80, %rsp + popq %rdi +#endif + ret + + + .text + .p2align 6 + .globl sha256_use_8way + .globl _sha256_use_8way +sha256_use_8way: +_sha256_use_8way: + + pushq %rbx + /* Check for AVX and OSXSAVE support */ + movl $1, %eax + cpuid + andl $0x18000000, %ecx + cmpl $0x18000000, %ecx + jne sha256_use_8way_no + /* Check for AVX2 support */ + movl $7, %eax + xorl %ecx, %ecx + cpuid + andl $0x00000020, %ebx + cmpl $0x00000020, %ebx + jne sha256_use_8way_no + /* Check for XMM and YMM state support */ + xorl %ecx, %ecx + xgetbv + andl $0x00000006, %eax + cmpl $0x00000006, %eax + jne sha256_use_8way_no + +sha256_use_8way_yes: + movl $1, %eax + jmp sha256_use_8way_done + +sha256_use_8way_no: + xorl %eax, %eax + +sha256_use_8way_done: + popq %rbx + ret + +#endif diff --git a/src/crypto/sha2/asm/sha256_avx1.asm b/src/crypto/sha2/asm/sha256_avx1.asm deleted file mode 100644 index 5917858..0000000 --- a/src/crypto/sha2/asm/sha256_avx1.asm +++ /dev/null @@ -1,766 +0,0 @@ -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright (c) 2012, Intel Corporation -; -; All rights reserved. -; -; Redistribution and use in source and binary forms, with or without -; modification, are permitted provided that the following conditions are -; met: -; -; * Redistributions of source code must retain the above copyright -; notice, this list of conditions and the following disclaimer. -; -; * Redistributions in binary form must reproduce the above copyright -; notice, this list of conditions and the following disclaimer in the -; documentation and/or other materials provided with the -; distribution. -; -; * Neither the name of the Intel Corporation nor the names of its -; contributors may be used to endorse or promote products derived from -; this software without specific prior written permission. -; -; -; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY -; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -; PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL CORPORATION OR -; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; -; Example YASM command lines: -; Windows: yasm -Xvc -f x64 -rnasm -pnasm -o sha256_avx1.obj -g cv8 sha256_avx1.asm -; Linux: yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_avx1.o sha256_avx1.asm -; -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; -; This code is described in an Intel White-Paper: -; "Fast SHA-256 Implementations on Intel Architecture Processors" -; -; To find it, surf to http://www.intel.com/p/en_US/embedded -; and search for that title. -; The paper is expected to be released roughly at the end of April, 2012 -; -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; This code schedules 1 blocks at a time, with 4 lanes per block -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -%define VMOVDQ vmovdqu ;; assume buffers not aligned - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros - -; addm [mem], reg -; Add reg to mem using reg-mem add and store -%macro addm 2 - add %2, %1 - mov %1, %2 -%endm - -%macro MY_ROR 2 - shld %1,%1,(32-(%2)) -%endm - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask -; Load xmm with mem and byte swap each dword -%macro COPY_XMM_AND_BSWAP 3 - VMOVDQ %1, %2 - vpshufb %1, %1, %3 -%endmacro - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -%define X0 xmm4 -%define X1 xmm5 -%define X2 xmm6 -%define X3 xmm7 - -%define XTMP0 xmm0 -%define XTMP1 xmm1 -%define XTMP2 xmm2 -%define XTMP3 xmm3 -%define XTMP4 xmm8 -%define XFER xmm9 -%define XTMP5 xmm11 - -%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA -%define SHUF_DC00 xmm12 ; shuffle xDxC -> DC00 -%define BYTE_FLIP_MASK xmm13 - -%ifdef LINUX -%define NUM_BLKS rdx ; 3rd arg -%define CTX rsi ; 2nd arg -%define INP rdi ; 1st arg - -%define SRND rdi ; clobbers INP -%define c ecx -%define d r8d -%define e edx -%else -%define NUM_BLKS r8 ; 3rd arg -%define CTX rdx ; 2nd arg -%define INP rcx ; 1st arg - -%define SRND rcx ; clobbers INP -%define c edi -%define d esi -%define e r8d - -%endif -%define TBL rbp -%define a eax -%define b ebx - -%define f r9d -%define g r10d -%define h r11d - -%define y0 r13d -%define y1 r14d -%define y2 r15d - - -_INP_END_SIZE equ 8 -_INP_SIZE equ 8 -_XFER_SIZE equ 8 -%ifdef LINUX -_XMM_SAVE_SIZE equ 0 -%else -_XMM_SAVE_SIZE equ 8*16 -%endif -; STACK_SIZE plus pushes must be an odd multiple of 8 -_ALIGN_SIZE equ 8 - -_INP_END equ 0 -_INP equ _INP_END + _INP_END_SIZE -_XFER equ _INP + _INP_SIZE -_XMM_SAVE equ _XFER + _XFER_SIZE + _ALIGN_SIZE -STACK_SIZE equ _XMM_SAVE + _XMM_SAVE_SIZE - -; rotate_Xs -; Rotate values of symbols X0...X3 -%macro rotate_Xs 0 -%xdefine X_ X0 -%xdefine X0 X1 -%xdefine X1 X2 -%xdefine X2 X3 -%xdefine X3 X_ -%endm - -; ROTATE_ARGS -; Rotate values of symbols a...h -%macro ROTATE_ARGS 0 -%xdefine TMP_ h -%xdefine h g -%xdefine g f -%xdefine f e -%xdefine e d -%xdefine d c -%xdefine c b -%xdefine b a -%xdefine a TMP_ -%endm - -%macro FOUR_ROUNDS_AND_SCHED 0 - ;; compute s0 four at a 
time and s1 two at a time - ;; compute W[-16] + W[-7] 4 at a time - ;vmovdqa XTMP0, X3 - mov y0, e ; y0 = e - MY_ROR y0, (25-11) ; y0 = e >> (25-11) - mov y1, a ; y1 = a - vpalignr XTMP0, X3, X2, 4 ; XTMP0 = W[-7] - MY_ROR y1, (22-13) ; y1 = a >> (22-13) - xor y0, e ; y0 = e ^ (e >> (25-11)) - mov y2, f ; y2 = f - MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) - ;vmovdqa XTMP1, X1 - xor y1, a ; y1 = a ^ (a >> (22-13) - xor y2, g ; y2 = f^g - vpaddd XTMP0, XTMP0, X0 ; XTMP0 = W[-7] + W[-16] - xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and y2, e ; y2 = (f^g)&e - MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) - ;; compute s0 - vpalignr XTMP1, X1, X0, 4 ; XTMP1 = W[-15] - xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - xor y2, g ; y2 = CH = ((f^g)&e)^g - - - MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add y2, y0 ; y2 = S1 + CH - add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH - - mov y0, a ; y0 = a - add h, y2 ; h = h + S1 + CH + k + w - mov y2, a ; y2 = a - - vpsrld XTMP2, XTMP1, 7 - - or y0, c ; y0 = a|c - add d, h ; d = d + h + S1 + CH + k + w - and y2, c ; y2 = a&c - - vpslld XTMP3, XTMP1, (32-7) - - and y0, b ; y0 = (a|c)&b - add h, y1 ; h = h + S1 + CH + k + w + S0 - - vpor XTMP3, XTMP3, XTMP2 ; XTMP1 = W[-15] MY_ROR 7 - - or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) - add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ - -ROTATE_ARGS - - mov y0, e ; y0 = e - mov y1, a ; y1 = a - - - MY_ROR y0, (25-11) ; y0 = e >> (25-11) - xor y0, e ; y0 = e ^ (e >> (25-11)) - mov y2, f ; y2 = f - MY_ROR y1, (22-13) ; y1 = a >> (22-13) - - vpsrld XTMP2, XTMP1,18 - - xor y1, a ; y1 = a ^ (a >> (22-13) - MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) - xor y2, g ; y2 = f^g - - vpsrld XTMP4, XTMP1, 3 ; XTMP4 = W[-15] >> 3 - - MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) - xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and y2, e ; y2 = (f^g)&e - MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - - vpslld XTMP1, XTMP1, (32-18) - - xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - xor y2, g ; y2 = CH = ((f^g)&e)^g - - vpxor XTMP3, XTMP3, XTMP1 - - add y2, y0 ; y2 = S1 + CH - add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH - MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - - vpxor XTMP3, XTMP3, XTMP2 ; XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 - - mov y0, a ; y0 = a - add h, y2 ; h = h + S1 + CH + k + w - mov y2, a ; y2 = a - - vpxor XTMP1, XTMP3, XTMP4 ; XTMP1 = s0 - - or y0, c ; y0 = a|c - add d, h ; d = d + h + S1 + CH + k + w - and y2, c ; y2 = a&c - ;; compute low s1 - vpshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA} - and y0, b ; y0 = (a|c)&b - add h, y1 ; h = h + S1 + CH + k + w + S0 - vpaddd XTMP0, XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0 - or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) - add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ - -ROTATE_ARGS - ;vmovdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA} - - mov y0, e ; y0 = e - mov y1, a ; y1 = a - MY_ROR y0, (25-11) ; y0 = e >> (25-11) - - ;vmovdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA} - - xor y0, e ; y0 = e ^ (e >> (25-11)) - MY_ROR y1, (22-13) ; y1 = a >> (22-13) - mov y2, f ; y2 = f - xor y1, a ; y1 = a ^ (a >> (22-13) - MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) - - vpsrld XTMP4, XTMP2, 10 ; XTMP4 = W[-2] >> 10 {BBAA} - - xor y2, g ; y2 = f^g - - vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] MY_ROR 19 {xBxA} - - xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and y2, e ; y2 = (f^g)&e - - vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] 
MY_ROR 17 {xBxA} - - MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) - xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - xor y2, g ; y2 = CH = ((f^g)&e)^g - MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - vpxor XTMP2, XTMP2, XTMP3 - add y2, y0 ; y2 = S1 + CH - MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH - vpxor XTMP4, XTMP4, XTMP2 ; XTMP4 = s1 {xBxA} - mov y0, a ; y0 = a - add h, y2 ; h = h + S1 + CH + k + w - mov y2, a ; y2 = a - vpshufb XTMP4, XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA} - or y0, c ; y0 = a|c - add d, h ; d = d + h + S1 + CH + k + w - and y2, c ; y2 = a&c - vpaddd XTMP0, XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]} - and y0, b ; y0 = (a|c)&b - add h, y1 ; h = h + S1 + CH + k + w + S0 - ;; compute high s1 - vpshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC} - or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) - add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ - -ROTATE_ARGS - ;vmovdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC} - mov y0, e ; y0 = e - MY_ROR y0, (25-11) ; y0 = e >> (25-11) - mov y1, a ; y1 = a - ;vmovdqa XTMP5, XTMP2 ; XTMP5 = W[-2] {DDCC} - MY_ROR y1, (22-13) ; y1 = a >> (22-13) - xor y0, e ; y0 = e ^ (e >> (25-11)) - mov y2, f ; y2 = f - MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) - - vpsrld XTMP5, XTMP2, 10 ; XTMP5 = W[-2] >> 10 {DDCC} - - xor y1, a ; y1 = a ^ (a >> (22-13) - xor y2, g ; y2 = f^g - - vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] MY_ROR 19 {xDxC} - - xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and y2, e ; y2 = (f^g)&e - MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) - - vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] MY_ROR 17 {xDxC} - - xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - xor y2, g ; y2 = CH = ((f^g)&e)^g - - vpxor XTMP2, XTMP2, XTMP3 - - MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add y2, y0 ; y2 = S1 + CH - add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH - vpxor XTMP5, XTMP5, XTMP2 ; XTMP5 = s1 {xDxC} - mov y0, a ; y0 = a - add h, y2 ; h = h + S1 + CH + k + w - mov y2, a ; y2 = a - vpshufb XTMP5, XTMP5, SHUF_DC00 ; XTMP5 = s1 {DC00} - or y0, c ; y0 = a|c - add d, h ; d = d + h + S1 + CH + k + w - and y2, c ; y2 = a&c - vpaddd X0, XTMP5, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]} - and y0, b ; y0 = (a|c)&b - add h, y1 ; h = h + S1 + CH + k + w + S0 - or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) - add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ - -ROTATE_ARGS -rotate_Xs -%endm - -;; input is [rsp + _XFER + %1 * 4] -%macro DO_ROUND 1 - mov y0, e ; y0 = e - MY_ROR y0, (25-11) ; y0 = e >> (25-11) - mov y1, a ; y1 = a - xor y0, e ; y0 = e ^ (e >> (25-11)) - MY_ROR y1, (22-13) ; y1 = a >> (22-13) - mov y2, f ; y2 = f - xor y1, a ; y1 = a ^ (a >> (22-13) - MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) - xor y2, g ; y2 = f^g - xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) - and y2, e ; y2 = (f^g)&e - xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - xor y2, g ; y2 = CH = ((f^g)&e)^g - add y2, y0 ; y2 = S1 + CH - MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH - mov y0, a ; y0 = a - add h, y2 ; h = h + S1 + CH + k + w - mov y2, a ; y2 = a - or y0, c ; y0 = a|c - add d, h ; d = d + h + S1 + CH + k + w - and y2, c ; y2 = a&c - and y0, b ; y0 = (a|c)&b - add h, y1 ; h = h + S1 + CH + k + w + S0 - or y0, y2 ; y0 = MAJ = 
(a|c)&b)|(a&c) - add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ - ROTATE_ARGS -%endm - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; void sha256_avx_swap(void *input_data, UINT32 digest[8], UINT64 num_blks) -;; arg 1 : pointer to input data -;; arg 2 : pointer to digest -;; arg 3 : Num blocks -section .text -global sha256_avx_swap -align 32 -sha256_avx_swap: - push rbx -%ifndef LINUX - push rsi - push rdi -%endif - push rbp - push r13 - push r14 - push r15 - - sub rsp,STACK_SIZE -%ifndef LINUX - vmovdqa [rsp + _XMM_SAVE + 0*16],xmm6 - vmovdqa [rsp + _XMM_SAVE + 1*16],xmm7 - vmovdqa [rsp + _XMM_SAVE + 2*16],xmm8 - vmovdqa [rsp + _XMM_SAVE + 3*16],xmm9 - vmovdqa [rsp + _XMM_SAVE + 4*16],xmm10 - vmovdqa [rsp + _XMM_SAVE + 5*16],xmm11 - vmovdqa [rsp + _XMM_SAVE + 6*16],xmm12 - vmovdqa [rsp + _XMM_SAVE + 7*16],xmm13 -%endif - - shl NUM_BLKS, 6 ; convert to bytes - jz done_hash_swap - add NUM_BLKS, INP ; pointer to end of data - mov [rsp + _INP_END], NUM_BLKS - - ;; load initial digest - mov a,[4*0 + CTX] - mov b,[4*1 + CTX] - mov c,[4*2 + CTX] - mov d,[4*3 + CTX] - mov e,[4*4 + CTX] - mov f,[4*5 + CTX] - mov g,[4*6 + CTX] - mov h,[4*7 + CTX] - - vmovdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip] - vmovdqa SHUF_00BA, [_SHUF_00BA wrt rip] - vmovdqa SHUF_DC00, [_SHUF_DC00 wrt rip] - -loop0_swap: - lea TBL,[K256 wrt rip] - - ;; byte swap first 16 dwords - COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK - - mov [rsp + _INP], INP - - ;; schedule 48 input dwords, by doing 3 rounds of 16 each - mov SRND, 3 -align 16 -loop1_swap: - vpaddd XFER, X0, [TBL + 0*16] - vmovdqa [rsp + _XFER], XFER - FOUR_ROUNDS_AND_SCHED - - vpaddd XFER, X0, [TBL + 1*16] - vmovdqa [rsp + _XFER], XFER - FOUR_ROUNDS_AND_SCHED - - vpaddd XFER, X0, [TBL + 2*16] - vmovdqa [rsp + _XFER], XFER - FOUR_ROUNDS_AND_SCHED - - vpaddd XFER, X0, [TBL + 3*16] - vmovdqa [rsp + _XFER], XFER - add TBL, 4*16 - FOUR_ROUNDS_AND_SCHED - - sub SRND, 1 - jne loop1_swap - - mov SRND, 2 -loop2_swap: - vpaddd XFER, X0, [TBL + 0*16] - vmovdqa [rsp + _XFER], XFER - DO_ROUND 0 - DO_ROUND 1 - DO_ROUND 2 - DO_ROUND 3 - - vpaddd XFER, X1, [TBL + 1*16] - vmovdqa [rsp + _XFER], XFER - add TBL, 2*16 - DO_ROUND 0 - DO_ROUND 1 - DO_ROUND 2 - DO_ROUND 3 - - vmovdqa X0, X2 - vmovdqa X1, X3 - - sub SRND, 1 - jne loop2_swap - - addm [4*0 + CTX],a - addm [4*1 + CTX],b - addm [4*2 + CTX],c - addm [4*3 + CTX],d - addm [4*4 + CTX],e - addm [4*5 + CTX],f - addm [4*6 + CTX],g - addm [4*7 + CTX],h - - mov INP, [rsp + _INP] - add INP, 64 - cmp INP, [rsp + _INP_END] - jne loop0_swap - -done_hash_swap: - mov a, [4*0 + CTX] - bswap a - mov [4*0 + CTX],a - - mov b, [4*1 + CTX] - bswap b - mov [4*1 + CTX], b - - mov c, [4*2 + CTX] - bswap c - mov [4*2 + CTX], c - - mov d, [4*3 + CTX] - bswap d - mov [4*3 + CTX], d - - mov e, [4*4 + CTX] - bswap e - mov [4*4 + CTX], e - - mov f, [4*5 + CTX] - bswap f - mov [4*5 + CTX], f - - mov g, [4*6 + CTX] - bswap g - mov [4*6 + CTX], g - - mov h, [4*7 + CTX] - bswap h - mov [4*7 + CTX], h - -%ifndef LINUX - vmovdqa xmm6,[rsp + _XMM_SAVE + 0*16] - vmovdqa xmm7,[rsp + _XMM_SAVE + 1*16] - vmovdqa xmm8,[rsp + _XMM_SAVE + 2*16] - vmovdqa xmm9,[rsp + _XMM_SAVE + 3*16] - vmovdqa xmm10,[rsp + _XMM_SAVE + 4*16] - vmovdqa xmm11,[rsp + _XMM_SAVE + 5*16] - vmovdqa xmm12,[rsp + 
_XMM_SAVE + 6*16] - vmovdqa xmm13,[rsp + _XMM_SAVE + 7*16] -%endif - - - add rsp, STACK_SIZE - - pop r15 - pop r14 - pop r13 - pop rbp -%ifndef LINUX - pop rdi - pop rsi -%endif - pop rbx - - ret - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; void sha256_avx(void *input_data, UINT32 digest[8], UINT64 num_blks) -;; arg 1 : pointer to input data -;; arg 2 : pointer to digest -;; arg 3 : Num blocks -section .text -global sha256_avx -align 32 -sha256_avx: - push rbx -%ifndef LINUX - push rsi - push rdi -%endif - push rbp - push r13 - push r14 - push r15 - - sub rsp,STACK_SIZE -%ifndef LINUX - vmovdqa [rsp + _XMM_SAVE + 0*16],xmm6 - vmovdqa [rsp + _XMM_SAVE + 1*16],xmm7 - vmovdqa [rsp + _XMM_SAVE + 2*16],xmm8 - vmovdqa [rsp + _XMM_SAVE + 3*16],xmm9 - vmovdqa [rsp + _XMM_SAVE + 4*16],xmm10 - vmovdqa [rsp + _XMM_SAVE + 5*16],xmm11 - vmovdqa [rsp + _XMM_SAVE + 6*16],xmm12 - vmovdqa [rsp + _XMM_SAVE + 7*16],xmm13 -%endif - - shl NUM_BLKS, 6 ; convert to bytes - jz done_hash - add NUM_BLKS, INP ; pointer to end of data - mov [rsp + _INP_END], NUM_BLKS - - ;; load initial digest - mov a,[4*0 + CTX] - mov b,[4*1 + CTX] - mov c,[4*2 + CTX] - mov d,[4*3 + CTX] - mov e,[4*4 + CTX] - mov f,[4*5 + CTX] - mov g,[4*6 + CTX] - mov h,[4*7 + CTX] - - vmovdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip] - vmovdqa SHUF_00BA, [_SHUF_00BA wrt rip] - vmovdqa SHUF_DC00, [_SHUF_DC00 wrt rip] - -loop0: - lea TBL,[K256 wrt rip] - - ;; byte swap first 16 dwords - COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK - - mov [rsp + _INP], INP - - ;; schedule 48 input dwords, by doing 3 rounds of 16 each - mov SRND, 3 -align 16 -loop1: - vpaddd XFER, X0, [TBL + 0*16] - vmovdqa [rsp + _XFER], XFER - FOUR_ROUNDS_AND_SCHED - - vpaddd XFER, X0, [TBL + 1*16] - vmovdqa [rsp + _XFER], XFER - FOUR_ROUNDS_AND_SCHED - - vpaddd XFER, X0, [TBL + 2*16] - vmovdqa [rsp + _XFER], XFER - FOUR_ROUNDS_AND_SCHED - - vpaddd XFER, X0, [TBL + 3*16] - vmovdqa [rsp + _XFER], XFER - add TBL, 4*16 - FOUR_ROUNDS_AND_SCHED - - sub SRND, 1 - jne loop1 - - mov SRND, 2 -loop2: - vpaddd XFER, X0, [TBL + 0*16] - vmovdqa [rsp + _XFER], XFER - DO_ROUND 0 - DO_ROUND 1 - DO_ROUND 2 - DO_ROUND 3 - - vpaddd XFER, X1, [TBL + 1*16] - vmovdqa [rsp + _XFER], XFER - add TBL, 2*16 - DO_ROUND 0 - DO_ROUND 1 - DO_ROUND 2 - DO_ROUND 3 - - vmovdqa X0, X2 - vmovdqa X1, X3 - - sub SRND, 1 - jne loop2 - - addm [4*0 + CTX],a - addm [4*1 + CTX],b - addm [4*2 + CTX],c - addm [4*3 + CTX],d - addm [4*4 + CTX],e - addm [4*5 + CTX],f - addm [4*6 + CTX],g - addm [4*7 + CTX],h - - mov INP, [rsp + _INP] - add INP, 64 - cmp INP, [rsp + _INP_END] - jne loop0 - -done_hash: -%ifndef LINUX - vmovdqa xmm6,[rsp + _XMM_SAVE + 0*16] - vmovdqa xmm7,[rsp + _XMM_SAVE + 1*16] - vmovdqa xmm8,[rsp + _XMM_SAVE + 2*16] - vmovdqa xmm9,[rsp + _XMM_SAVE + 3*16] - vmovdqa xmm10,[rsp + _XMM_SAVE + 4*16] - vmovdqa xmm11,[rsp + _XMM_SAVE + 5*16] - vmovdqa xmm12,[rsp + _XMM_SAVE + 6*16] - vmovdqa xmm13,[rsp + _XMM_SAVE + 7*16] -%endif - - - add rsp, STACK_SIZE - - pop r15 - pop r14 - pop r13 - pop rbp -%ifndef LINUX - pop rdi - pop rsi -%endif - pop rbx - - ret - -section .data -align 64 -K256: - dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - 
dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - -PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203 - -; shuffle xBxA -> 00BA -_SHUF_00BA: ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100 - -; shuffle xDxC -> DC00 -_SHUF_DC00: ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF diff --git a/src/crypto/sha2/asm/sha256_sse4.asm b/src/crypto/sha2/asm/sha256_sse4.asm deleted file mode 100644 index 9a37cf3..0000000 --- a/src/crypto/sha2/asm/sha256_sse4.asm +++ /dev/null @@ -1,726 +0,0 @@ -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright (c) 2012, Intel Corporation -; -; All rights reserved. -; -; Redistribution and use in source and binary forms, with or without -; modification, are permitted provided that the following conditions are -; met: -; -; * Redistributions of source code must retain the above copyright -; notice, this list of conditions and the following disclaimer. -; -; * Redistributions in binary form must reproduce the above copyright -; notice, this list of conditions and the following disclaimer in the -; documentation and/or other materials provided with the -; distribution. -; -; * Neither the name of the Intel Corporation nor the names of its -; contributors may be used to endorse or promote products derived from -; this software without specific prior written permission. -; -; -; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY -; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR -; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; -; Example YASM command lines: -; Windows: yasm -Xvc -f x64 -rnasm -pnasm -o sha256_sse4.obj -g cv8 sha256_sse4.asm -; Linux: yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_sse4.o sha256_sse4.asm -; -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; -; This code is described in an Intel White-Paper: -; "Fast SHA-256 Implementations on Intel Architecture Processors" -; -; To find it, surf to http://www.intel.com/p/en_US/embedded -; and search for that title. 
-; The paper is expected to be released roughly at the end of April, 2012 -; -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; This code schedules 1 blocks at a time, with 4 lanes per block -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -%define MOVDQ movdqu ;; assume buffers not aligned - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros - -; addm [mem], reg -; Add reg to mem using reg-mem add and store -%macro addm 2 - add %2, %1 - mov %1, %2 -%endm - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask -; Load xmm with mem and byte swap each dword -%macro COPY_XMM_AND_BSWAP 3 - MOVDQ %1, %2 - pshufb %1, %3 -%endmacro - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -%define X0 xmm4 -%define X1 xmm5 -%define X2 xmm6 -%define X3 xmm7 - -%define XTMP0 xmm0 -%define XTMP1 xmm1 -%define XTMP2 xmm2 -%define XTMP3 xmm3 -%define XTMP4 xmm8 -%define XFER xmm9 - -%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA -%define SHUF_DC00 xmm11 ; shuffle xDxC -> DC00 -%define BYTE_FLIP_MASK xmm12 - -%ifdef LINUX -%define NUM_BLKS rdx ; 3rd arg -%define CTX rsi ; 2nd arg -%define INP rdi ; 1st arg - -%define SRND rdi ; clobbers INP -%define c ecx -%define d r8d -%define e edx -%else -%define NUM_BLKS r8 ; 3rd arg -%define CTX rdx ; 2nd arg -%define INP rcx ; 1st arg - -%define SRND rcx ; clobbers INP -%define c edi -%define d esi -%define e r8d - -%endif -%define TBL rbp -%define a eax -%define b ebx - -%define f r9d -%define g r10d -%define h r11d - -%define y0 r13d -%define y1 r14d -%define y2 r15d - - - -_INP_END_SIZE equ 8 -_INP_SIZE equ 8 -_XFER_SIZE equ 8 -%ifdef LINUX -_XMM_SAVE_SIZE equ 0 -%else -_XMM_SAVE_SIZE equ 7*16 -%endif -; STACK_SIZE plus pushes must be an odd multiple of 8 -_ALIGN_SIZE equ 8 - -_INP_END equ 0 -_INP equ _INP_END + _INP_END_SIZE -_XFER equ _INP + _INP_SIZE -_XMM_SAVE equ _XFER + _XFER_SIZE + _ALIGN_SIZE -STACK_SIZE equ _XMM_SAVE + _XMM_SAVE_SIZE - -; rotate_Xs -; Rotate values of symbols X0...X3 -%macro rotate_Xs 0 -%xdefine X_ X0 -%xdefine X0 X1 -%xdefine X1 X2 -%xdefine X2 X3 -%xdefine X3 X_ -%endm - -; ROTATE_ARGS -; Rotate values of symbols a...h -%macro ROTATE_ARGS 0 -%xdefine TMP_ h -%xdefine h g -%xdefine g f -%xdefine f e -%xdefine e d -%xdefine d c -%xdefine c b -%xdefine b a -%xdefine a TMP_ -%endm - -%macro FOUR_ROUNDS_AND_SCHED 0 - ;; compute s0 four at a time and s1 two at a time - ;; compute W[-16] + W[-7] 4 at a time - movdqa XTMP0, X3 - mov y0, e ; y0 = e - ror y0, (25-11) ; y0 = e >> (25-11) - mov y1, a ; y1 = a - palignr XTMP0, X2, 4 ; XTMP0 = W[-7] - ror y1, (22-13) ; y1 = a >> (22-13) - xor y0, e ; y0 = e ^ (e >> (25-11)) - mov y2, f ; y2 = f - ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) - movdqa XTMP1, X1 - xor y1, a ; y1 = a ^ (a >> (22-13) - xor y2, g ; y2 = f^g - paddd XTMP0, X0 ; XTMP0 = W[-7] + W[-16] - xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and y2, e ; y2 = (f^g)&e - ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) - ;; compute s0 - palignr XTMP1, X0, 4 ; XTMP1 = W[-15] - xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - xor y2, g ; y2 = CH = ((f^g)&e)^g - movdqa XTMP2, XTMP1 ; XTMP2 = W[-15] - ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add y2, y0 ; y2 = S1 + CH - add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH - movdqa XTMP3, XTMP1 ; XTMP3 = W[-15] - mov y0, a ; y0 = a - add h, y2 ; h = h + S1 + CH + k + w - mov y2, a ; y2 = a - pslld XTMP1, (32-7) - or y0, c ; y0 = a|c - add d, h ; d = 
d + h + S1 + CH + k + w - and y2, c ; y2 = a&c - psrld XTMP2, 7 - and y0, b ; y0 = (a|c)&b - add h, y1 ; h = h + S1 + CH + k + w + S0 - por XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 - or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) - add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ - -ROTATE_ARGS - movdqa XTMP2, XTMP3 ; XTMP2 = W[-15] - mov y0, e ; y0 = e - mov y1, a ; y1 = a - movdqa XTMP4, XTMP3 ; XTMP4 = W[-15] - ror y0, (25-11) ; y0 = e >> (25-11) - xor y0, e ; y0 = e ^ (e >> (25-11)) - mov y2, f ; y2 = f - ror y1, (22-13) ; y1 = a >> (22-13) - pslld XTMP3, (32-18) - xor y1, a ; y1 = a ^ (a >> (22-13) - ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) - xor y2, g ; y2 = f^g - psrld XTMP2, 18 - ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) - xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and y2, e ; y2 = (f^g)&e - ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - pxor XTMP1, XTMP3 - xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - xor y2, g ; y2 = CH = ((f^g)&e)^g - psrld XTMP4, 3 ; XTMP4 = W[-15] >> 3 - add y2, y0 ; y2 = S1 + CH - add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH - ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - pxor XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 - mov y0, a ; y0 = a - add h, y2 ; h = h + S1 + CH + k + w - mov y2, a ; y2 = a - pxor XTMP1, XTMP4 ; XTMP1 = s0 - or y0, c ; y0 = a|c - add d, h ; d = d + h + S1 + CH + k + w - and y2, c ; y2 = a&c - ;; compute low s1 - pshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA} - and y0, b ; y0 = (a|c)&b - add h, y1 ; h = h + S1 + CH + k + w + S0 - paddd XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0 - or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) - add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ - -ROTATE_ARGS - movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA} - mov y0, e ; y0 = e - mov y1, a ; y1 = a - ror y0, (25-11) ; y0 = e >> (25-11) - movdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA} - xor y0, e ; y0 = e ^ (e >> (25-11)) - ror y1, (22-13) ; y1 = a >> (22-13) - mov y2, f ; y2 = f - xor y1, a ; y1 = a ^ (a >> (22-13) - ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) - psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xBxA} - xor y2, g ; y2 = f^g - psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xBxA} - xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and y2, e ; y2 = (f^g)&e - psrld XTMP4, 10 ; XTMP4 = W[-2] >> 10 {BBAA} - ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) - xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - xor y2, g ; y2 = CH = ((f^g)&e)^g - ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - pxor XTMP2, XTMP3 - add y2, y0 ; y2 = S1 + CH - ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH - pxor XTMP4, XTMP2 ; XTMP4 = s1 {xBxA} - mov y0, a ; y0 = a - add h, y2 ; h = h + S1 + CH + k + w - mov y2, a ; y2 = a - pshufb XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA} - or y0, c ; y0 = a|c - add d, h ; d = d + h + S1 + CH + k + w - and y2, c ; y2 = a&c - paddd XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]} - and y0, b ; y0 = (a|c)&b - add h, y1 ; h = h + S1 + CH + k + w + S0 - ;; compute high s1 - pshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC} - or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) - add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ - -ROTATE_ARGS - movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC} - mov y0, e ; y0 = e - ror y0, (25-11) ; y0 = e >> (25-11) - mov y1, a ; y1 = a - movdqa X0, XTMP2 ; X0 = W[-2] {DDCC} - ror y1, (22-13) ; y1 = a >> (22-13) - xor y0, e ; y0 = e ^ (e >> (25-11)) - mov y2, f ; y2 = f - ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) - psrlq XTMP2, 17 ; 
XTMP2 = W[-2] ror 17 {xDxC} - xor y1, a ; y1 = a ^ (a >> (22-13) - xor y2, g ; y2 = f^g - psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xDxC} - xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and y2, e ; y2 = (f^g)&e - ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) - psrld X0, 10 ; X0 = W[-2] >> 10 {DDCC} - xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - xor y2, g ; y2 = CH = ((f^g)&e)^g - pxor XTMP2, XTMP3 - ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add y2, y0 ; y2 = S1 + CH - add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH - pxor X0, XTMP2 ; X0 = s1 {xDxC} - mov y0, a ; y0 = a - add h, y2 ; h = h + S1 + CH + k + w - mov y2, a ; y2 = a - pshufb X0, SHUF_DC00 ; X0 = s1 {DC00} - or y0, c ; y0 = a|c - add d, h ; d = d + h + S1 + CH + k + w - and y2, c ; y2 = a&c - paddd X0, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]} - and y0, b ; y0 = (a|c)&b - add h, y1 ; h = h + S1 + CH + k + w + S0 - or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) - add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ - -ROTATE_ARGS -rotate_Xs -%endm - -;; input is [rsp + _XFER + %1 * 4] -%macro DO_ROUND 1 - mov y0, e ; y0 = e - ror y0, (25-11) ; y0 = e >> (25-11) - mov y1, a ; y1 = a - xor y0, e ; y0 = e ^ (e >> (25-11)) - ror y1, (22-13) ; y1 = a >> (22-13) - mov y2, f ; y2 = f - xor y1, a ; y1 = a ^ (a >> (22-13) - ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) - xor y2, g ; y2 = f^g - xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) - and y2, e ; y2 = (f^g)&e - xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - xor y2, g ; y2 = CH = ((f^g)&e)^g - add y2, y0 ; y2 = S1 + CH - ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH - mov y0, a ; y0 = a - add h, y2 ; h = h + S1 + CH + k + w - mov y2, a ; y2 = a - or y0, c ; y0 = a|c - add d, h ; d = d + h + S1 + CH + k + w - and y2, c ; y2 = a&c - and y0, b ; y0 = (a|c)&b - add h, y1 ; h = h + S1 + CH + k + w + S0 - or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) - add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ - ROTATE_ARGS -%endm - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks) -;; arg 1 : pointer to input data -;; arg 2 : pointer to digest -;; arg 3 : Num blocks -section .text -global sha256_sse4 -align 32 -sha256_sse4: - push rbx -%ifndef LINUX - push rsi - push rdi -%endif - push rbp - push r13 - push r14 - push r15 - - sub rsp,STACK_SIZE -%ifndef LINUX - movdqa [rsp + _XMM_SAVE + 0*16],xmm6 - movdqa [rsp + _XMM_SAVE + 1*16],xmm7 - movdqa [rsp + _XMM_SAVE + 2*16],xmm8 - movdqa [rsp + _XMM_SAVE + 3*16],xmm9 - movdqa [rsp + _XMM_SAVE + 4*16],xmm10 - movdqa [rsp + _XMM_SAVE + 5*16],xmm11 - movdqa [rsp + _XMM_SAVE + 6*16],xmm12 -%endif - - shl NUM_BLKS, 6 ; convert to bytes - jz done_hash - add NUM_BLKS, INP ; pointer to end of data - mov [rsp + _INP_END], NUM_BLKS - - ;; load initial digest - mov a,[4*0 + CTX] - mov b,[4*1 + CTX] - mov c,[4*2 + CTX] - mov d,[4*3 + CTX] - mov e,[4*4 + CTX] - mov f,[4*5 + CTX] - mov g,[4*6 + CTX] - mov h,[4*7 + CTX] - - movdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip] - movdqa SHUF_00BA, [_SHUF_00BA wrt rip] - movdqa SHUF_DC00, [_SHUF_DC00 wrt rip] - -loop0: - lea TBL,[K256 wrt rip] - - ;; byte swap first 16 dwords - COPY_XMM_AND_BSWAP X0, [INP + 0*16], 
BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK - - mov [rsp + _INP], INP - - ;; schedule 48 input dwords, by doing 3 rounds of 16 each - mov SRND, 3 -align 16 -loop1: - movdqa XFER, [TBL + 0*16] - paddd XFER, X0 - movdqa [rsp + _XFER], XFER - FOUR_ROUNDS_AND_SCHED - - movdqa XFER, [TBL + 1*16] - paddd XFER, X0 - movdqa [rsp + _XFER], XFER - FOUR_ROUNDS_AND_SCHED - - movdqa XFER, [TBL + 2*16] - paddd XFER, X0 - movdqa [rsp + _XFER], XFER - FOUR_ROUNDS_AND_SCHED - - movdqa XFER, [TBL + 3*16] - paddd XFER, X0 - movdqa [rsp + _XFER], XFER - add TBL, 4*16 - FOUR_ROUNDS_AND_SCHED - - sub SRND, 1 - jne loop1 - - mov SRND, 2 -loop2: - paddd X0, [TBL + 0*16] - movdqa [rsp + _XFER], X0 - DO_ROUND 0 - DO_ROUND 1 - DO_ROUND 2 - DO_ROUND 3 - paddd X1, [TBL + 1*16] - movdqa [rsp + _XFER], X1 - add TBL, 2*16 - DO_ROUND 0 - DO_ROUND 1 - DO_ROUND 2 - DO_ROUND 3 - - movdqa X0, X2 - movdqa X1, X3 - - sub SRND, 1 - jne loop2 - - addm [4*0 + CTX],a - addm [4*1 + CTX],b - addm [4*2 + CTX],c - addm [4*3 + CTX],d - addm [4*4 + CTX],e - addm [4*5 + CTX],f - addm [4*6 + CTX],g - addm [4*7 + CTX],h - - mov INP, [rsp + _INP] - add INP, 64 - cmp INP, [rsp + _INP_END] - jne loop0 - -done_hash: -%ifndef LINUX - movdqa xmm6,[rsp + _XMM_SAVE + 0*16] - movdqa xmm7,[rsp + _XMM_SAVE + 1*16] - movdqa xmm8,[rsp + _XMM_SAVE + 2*16] - movdqa xmm9,[rsp + _XMM_SAVE + 3*16] - movdqa xmm10,[rsp + _XMM_SAVE + 4*16] - movdqa xmm11,[rsp + _XMM_SAVE + 5*16] - movdqa xmm12,[rsp + _XMM_SAVE + 6*16] -%endif - - add rsp, STACK_SIZE - - pop r15 - pop r14 - pop r13 - pop rbp -%ifndef LINUX - pop rdi - pop rsi -%endif - pop rbx - - ret - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; void sha256_sse4_swap(void *input_data, UINT32 digest[8], UINT64 num_blks) -;; arg 1 : pointer to input data -;; arg 2 : pointer to digest -;; arg 3 : Num blocks -section .text -global sha256_sse4_swap -align 32 -sha256_sse4_swap: - push rbx -%ifndef LINUX - push rsi - push rdi -%endif - push rbp - push r13 - push r14 - push r15 - - sub rsp,STACK_SIZE -%ifndef LINUX - movdqa [rsp + _XMM_SAVE + 0*16],xmm6 - movdqa [rsp + _XMM_SAVE + 1*16],xmm7 - movdqa [rsp + _XMM_SAVE + 2*16],xmm8 - movdqa [rsp + _XMM_SAVE + 3*16],xmm9 - movdqa [rsp + _XMM_SAVE + 4*16],xmm10 - movdqa [rsp + _XMM_SAVE + 5*16],xmm11 - movdqa [rsp + _XMM_SAVE + 6*16],xmm12 -%endif - - shl NUM_BLKS, 6 ; convert to bytes - jz done_hash_swap - add NUM_BLKS, INP ; pointer to end of data - mov [rsp + _INP_END], NUM_BLKS - - ;; load initial digest - mov a,[4*0 + CTX] - mov b,[4*1 + CTX] - mov c,[4*2 + CTX] - mov d,[4*3 + CTX] - mov e,[4*4 + CTX] - mov f,[4*5 + CTX] - mov g,[4*6 + CTX] - mov h,[4*7 + CTX] - - movdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip] - movdqa SHUF_00BA, [_SHUF_00BA wrt rip] - movdqa SHUF_DC00, [_SHUF_DC00 wrt rip] - -loop0_swap: - lea TBL,[K256 wrt rip] - - ;; byte swap first 16 dwords - COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK - - mov [rsp + _INP], INP - - ;; schedule 48 input dwords, by doing 3 rounds of 16 each - mov SRND, 3 -align 16 -loop1_swap: - movdqa XFER, [TBL + 0*16] - paddd XFER, X0 - movdqa [rsp + _XFER], XFER - FOUR_ROUNDS_AND_SCHED - - movdqa XFER, [TBL + 1*16] 
- paddd XFER, X0 - movdqa [rsp + _XFER], XFER - FOUR_ROUNDS_AND_SCHED - - movdqa XFER, [TBL + 2*16] - paddd XFER, X0 - movdqa [rsp + _XFER], XFER - FOUR_ROUNDS_AND_SCHED - - movdqa XFER, [TBL + 3*16] - paddd XFER, X0 - movdqa [rsp + _XFER], XFER - add TBL, 4*16 - FOUR_ROUNDS_AND_SCHED - - sub SRND, 1 - jne loop1_swap - - mov SRND, 2 -loop2_swap: - paddd X0, [TBL + 0*16] - movdqa [rsp + _XFER], X0 - DO_ROUND 0 - DO_ROUND 1 - DO_ROUND 2 - DO_ROUND 3 - paddd X1, [TBL + 1*16] - movdqa [rsp + _XFER], X1 - add TBL, 2*16 - DO_ROUND 0 - DO_ROUND 1 - DO_ROUND 2 - DO_ROUND 3 - - movdqa X0, X2 - movdqa X1, X3 - - sub SRND, 1 - jne loop2_swap - - addm [4*0 + CTX],a - addm [4*1 + CTX],b - addm [4*2 + CTX],c - addm [4*3 + CTX],d - addm [4*4 + CTX],e - addm [4*5 + CTX],f - addm [4*6 + CTX],g - addm [4*7 + CTX],h - - mov INP, [rsp + _INP] - add INP, 64 - cmp INP, [rsp + _INP_END] - jne loop0_swap - -done_hash_swap: - mov a, [4*0 + CTX] - bswap a - mov [4*0 + CTX],a - - mov b, [4*1 + CTX] - bswap b - mov [4*1 + CTX], b - - mov c, [4*2 + CTX] - bswap c - mov [4*2 + CTX], c - - mov d, [4*3 + CTX] - bswap d - mov [4*3 + CTX], d - - mov e, [4*4 + CTX] - bswap e - mov [4*4 + CTX], e - - mov f, [4*5 + CTX] - bswap f - mov [4*5 + CTX], f - - mov g, [4*6 + CTX] - bswap g - mov [4*6 + CTX], g - - mov h, [4*7 + CTX] - bswap h - mov [4*7 + CTX], h - - -%ifndef LINUX - movdqa xmm6,[rsp + _XMM_SAVE + 0*16] - movdqa xmm7,[rsp + _XMM_SAVE + 1*16] - movdqa xmm8,[rsp + _XMM_SAVE + 2*16] - movdqa xmm9,[rsp + _XMM_SAVE + 3*16] - movdqa xmm10,[rsp + _XMM_SAVE + 4*16] - movdqa xmm11,[rsp + _XMM_SAVE + 5*16] - movdqa xmm12,[rsp + _XMM_SAVE + 6*16] -%endif - - add rsp, STACK_SIZE - - pop r15 - pop r14 - pop r13 - pop rbp -%ifndef LINUX - pop rdi - pop rsi -%endif - pop rbx - - ret - -section .data -align 64 -K256: - dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - -PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203 - -; shuffle xBxA -> 00BA -_SHUF_00BA: ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100 - -; shuffle xDxC -> DC00 -_SHUF_DC00: ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF diff --git a/src/kernel.cpp b/src/kernel.cpp index c52a482..52c17af 100644 --- a/src/kernel.cpp +++ b/src/kernel.cpp @@ -424,25 +424,42 @@ bool CheckStakeKernelHash(uint32_t nBits, const CBlock& blockFrom, uint32_t nTxP } -#ifdef USE_YASM - -// SHA256 initial state -static const uint32_t init[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; - -// 8000000000000000000000000000000000000000000000000000000000000000000000e0 -static const uint32_t block1_suffix[9] = { 0x00000080, 0, 0, 0, 0, 0, 0, 0, 0xe0000000 }; +#ifdef USE_ASM + +// kernel padding +static const uint32_t block1_suffix[9] = { 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0xe0000000 }; +static const uint32_t block1_suffix_4way[4 
* 9] = { + 0x00000080, 0x00000080, 0x00000080, 0x00000080, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0xe0000000, 0xe0000000, 0xe0000000, 0xe0000000 +}; -// 8000000000000000000000000000000000000000000000000000000000000100 -static const uint32_t block2_suffix[8] = { 0x00000080, 0, 0, 0, 0, 0, 0, 0x00010000 }; +// hash padding +static const uint32_t block2_suffix[8] = { 0x80000000, 0, 0, 0, 0, 0, 0, 0x00010000 }; +static const uint32_t block2_suffix_4way[4 * 8] = { + 0x00000080, 0x00000080, 0x00000080, 0x00000080, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0x00010000, 0x00010000, 0x00010000, 0x00010000 +}; -// TODO: cpuid detection of supported instruction sets +extern "C" int sha256_use_4way(); -extern "C" void sha256_avx(void *input_data, uint32_t digest[8], uint64_t num_blks); -extern "C" void sha256_avx_swap(void *input_data, uint32_t digest[8], uint64_t num_blks); +extern "C" void sha256_init(uint32_t *state); +extern "C" void sha256_transform(uint32_t *state, const uint32_t *block, int swap); -// Not used yet -extern "C" void sha256_sse4(void *input_data, uint32_t digest[8], uint64_t num_blks); -extern "C" void sha256_sse4_swap(void *input_data, uint32_t digest[8], uint64_t num_blks); +extern "C" void sha256_init_4way(uint32_t *state); +extern "C" void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap); class ScanMidstateWorker { @@ -455,10 +472,88 @@ public: solutions = vector >(); } - void Do() + void Do_4way() { SetThreadPriority(THREAD_PRIORITY_LOWEST); + // Compute maximum possible target to filter out majority of obviously insufficient hashes + CBigNum bnTargetPerCoinDay; + bnTargetPerCoinDay.SetCompact(nBits); + uint256 nMaxTarget = (bnTargetPerCoinDay * bnValueIn * nStakeMaxAge / COIN / nOneDay).getuint256(); + + uint32_t state1[4 * 8] __attribute__((aligned(16))); + uint32_t state2[4 * 8] __attribute__((aligned(16))); + uint32_t blocks1[4 * 16] __attribute__((aligned(16))); + uint32_t blocks2[4 * 16] __attribute__((aligned(16))); + + vector vRow = vector(4); + uint32_t *pnKernel = (uint32_t *) kernel; + + for(int i = 0; i < 7; i++) + { + uint32_t nVal = pnKernel[i]; + fill(vRow.begin(), vRow.end(), nVal); + + for (int j = 0; j < 4; j++) + { + memcpy(&blocks1[i*4], &vRow[0], 16); + } + } + + memcpy(&blocks1[28], &block1_suffix_4way[0], 36*4); // sha256 padding + memcpy(&blocks2[32], &block2_suffix_4way[0], 32*4); + + // Search forward in time from the given timestamp + // Stopping search in case of shutting down + for (uint32_t nTimeTx=nIntervalBegin, nMaxTarget32 = nMaxTarget.Get32(7); nTimeTx= CBigNum(nHashProofOfStake)) + solutions.push_back(std::pair(nHashProofOfStake, nTime)); + } + } + } + } + + void Do_generic() + { + SetThreadPriority(THREAD_PRIORITY_LOWEST); + + // Init new sha256 context and update it + // with first 24 bytes of kernel + SHA256_CTX workerCtx; + SHA256_Init(&workerCtx); + SHA256_Update(&workerCtx, kernel, 8 + 16); + SHA256_CTX ctx = workerCtx; + // Sha256 result buffer uint32_t hashProofOfStake[8]; @@ -469,34 +564,25 @@ public: uint256 nMaxTarget = (bnTargetPerCoinDay * bnValueIn * nStakeMaxAge / COIN / nOneDay).getuint256(), *pnHashProofOfStake = (uint256 *)&hashProofOfStake; - uint8_t data_block[64]; - uint8_t data_block2[64]; - - // Copy static part of kernel - memcpy(&data_block[0], kernel, 24); - - memcpy(&data_block[28], &block1_suffix[0], 9 * sizeof(uint32_t)); - memcpy(&data_block2[32], &block2_suffix[0], 8 * 
sizeof(uint32_t)); - // Search forward in time from the given timestamp // Stopping search in case of shutting down for (uint32_t nTimeTx=nIntervalBegin, nMaxTarget32 = nMaxTarget.Get32(7); nTimeTx nMaxTarget32) + if (hashProofOfStake[7] > nMaxTarget32) continue; - // Swap byte order - for(int i = 0; i < 8; i++) - hashProofOfStake[i] = __builtin_bswap32(hashProofOfStake[i]); - CBigNum bnCoinDayWeight = bnValueIn * GetWeight((int64_t)nInputTxTime, (int64_t)nTimeTx) / COIN / nOneDay; CBigNum bnTargetProofOfStake = bnCoinDayWeight * bnTargetPerCoinDay; @@ -505,6 +591,13 @@ public: } } + void Do() + { + if (sha256_use_4way() != 0) + Do_4way(); + Do_generic(); + } + vector >& GetSolutions() { return solutions; diff --git a/src/makefile.bsd b/src/makefile.bsd index 8be6835..ea44263 100644 --- a/src/makefile.bsd +++ b/src/makefile.bsd @@ -156,6 +156,7 @@ endif ifeq (${USE_ASM}, 1) # Assembler implementation OBJS += crypto/scrypt/asm/obj/scrypt-arm.o crypto/scrypt/asm/obj/scrypt-x86.o crypto/scrypt/asm/obj/scrypt-x86_64.o crypto/scrypt/asm/obj/asm-wrapper.o +OBJS += crypto/sha2/asm/obj/sha2-arm.o crypto/sha2/asm/obj/sha2-x86.o crypto/sha2/asm/obj/sha2-x86_64.o crypto/scrypt/asm/obj/scrypt-x86.o: crypto/scrypt/asm/scrypt-x86.S $(CXX) -c $(xCXXFLAGS) -MMD -o $@ $< @@ -168,6 +169,17 @@ crypto/scrypt/asm/obj/scrypt-arm.o: crypto/scrypt/asm/scrypt-arm.S crypto/scrypt/asm/obj/asm-wrapper.o: crypto/scrypt/asm/asm-wrapper.cpp $(CXX) -c $(xCXXFLAGS) -MMD -o $@ $< + +crypto/sha2/asm/obj/sha2-x86.o: crypto/sha2/asm/sha2-x86.S + $(CXX) -c $(xCXXFLAGS) -MMD -o $@ $< + +crypto/sha2/asm/obj/sha2-x86_64.o: crypto/sha2/asm/sha2-x86_64.S + $(CXX) -c $(xCXXFLAGS) -MMD -o $@ $< + +crypto/sha2/asm/obj/sha2-arm.o: crypto/sha2/asm/sha2-arm.S + $(CXX) -c $(xCXXFLAGS) -MMD -o $@ $< + +DEFS += -DUSE_ASM else ifeq (${USE_SSE2}, 1) # Intrinsic implementation diff --git a/src/makefile.linux-mingw b/src/makefile.linux-mingw index 3c65af9..519e6db 100644 --- a/src/makefile.linux-mingw +++ b/src/makefile.linux-mingw @@ -131,6 +131,7 @@ endif ifeq (${USE_ASM}, 1) # Assembler implementation OBJS += crypto/scrypt/asm/obj/scrypt-arm.o crypto/scrypt/asm/obj/scrypt-x86.o crypto/scrypt/asm/obj/scrypt-x86_64.o crypto/scrypt/asm/obj/asm-wrapper.o +OBJS += crypto/sha2/asm/obj/sha2-arm.o crypto/sha2/asm/obj/sha2-x86.o crypto/sha2/asm/obj/sha2-x86_64.o crypto/scrypt/asm/obj/scrypt-x86.o: crypto/scrypt/asm/scrypt-x86.S $(CXX) -c $(CFLAGS) -MMD -o $@ $< @@ -143,6 +144,17 @@ crypto/scrypt/asm/obj/scrypt-arm.o: crypto/scrypt/asm/scrypt-arm.S crypto/scrypt/asm/obj/asm-wrapper.o: crypto/scrypt/asm/asm-wrapper.cpp $(CXX) -c $(CFLAGS) -MMD -o $@ $< + +crypto/sha2/asm/obj/sha2-x86.o: crypto/sha2/asm/sha2-x86.S + $(CXX) -c $(CFLAGS) -MMD -o $@ $< + +crypto/sha2/asm/obj/sha2-x86_64.o: crypto/sha2/asm/sha2-x86_64.S + $(CXX) -c $(CFLAGS) -MMD -o $@ $< + +crypto/sha2/asm/obj/sha2-arm.o: crypto/sha2/asm/sha2-arm.S + $(CXX) -c $(CFLAGS) -MMD -o $@ $< + +DEFS += -DUSE_ASM else ifeq (${USE_SSE2}, 1) # Intrinsic implementation diff --git a/src/makefile.mingw b/src/makefile.mingw index c09ed6b..1f0d38f 100644 --- a/src/makefile.mingw +++ b/src/makefile.mingw @@ -119,32 +119,35 @@ endif ifdef USE_ASM # Assembler implementation -OBJS += scrypt-asm/obj/scrypt-arm.o scrypt-asm/obj/scrypt-x86.o scrypt-asm/obj/scrypt-x86_64.o scrypt-asm/obj/asm-wrapper.o +OBJS += crypto/scrypt/asm/obj/scrypt-arm.o crypto/scrypt/asm/obj/scrypt-x86.o crypto/scrypt/asm/obj/scrypt-x86_64.o crypto/scrypt/asm/obj/asm-wrapper.o +OBJS += crypto/sha2/asm/obj/scrypt-arm.o 
crypto/sha2/asm/obj/scrypt-x86.o crypto/sha2/asm/obj/scrypt-x86_64.o -scrypt-asm/obj/scrypt-x86.o: scrypt-asm/scrypt-x86.S +crypto/scrypt/asm/obj/scrypt-x86.o: crypto/scrypt/asm/scrypt-x86.S $(CXX) -c $(CFLAGS) -MMD -o $@ $< -scrypt-asm/obj/scrypt-x86_64.o: scrypt-asm/scrypt-x86_64.S +crypto/scrypt/asm/obj/scrypt-x86_64.o: crypto/scrypt/asm/scrypt-x86_64.S $(CXX) -c $(CFLAGS) -MMD -o $@ $< -scrypt-asm/obj/scrypt-arm.o: scrypt-asm/scrypt-arm.S +crypto/scrypt/asm/obj/scrypt-arm.o: crypto/scrypt/asm/scrypt-arm.S $(CXX) -c $(CFLAGS) -MMD -o $@ $< -scrypt-asm/obj/asm-wrapper.o: scrypt-asm/asm-wrapper.cpp +crypto/scrypt/asm/obj/asm-wrapper.o: crypto/scrypt/asm/asm-wrapper.cpp $(CXX) -c $(CFLAGS) -MMD -o $@ $< + +DEFS += -DUSE_ASM else ifdef USE_SSE2 # Intrinsic implementation DEFS += -DUSE_SSE2 OBJS += scrypt-intrin/obj/scrypt-sse2.o -scrypt-intrin/obj/scrypt-sse2.o: scrypt-intrin/scrypt-sse2.cpp +crypto/scrypt/intrin/obj/scrypt-sse2.o: crypto/scrypt/intrin/scrypt-sse2.cpp $(CXX) -c $(CFLAGS) -MMD -o $@ $< else # Generic implementation OBJS += obj/scrypt-generic.o -obj/scrypt-generic.o: scrypt-generic.cpp +crypto/scrypt/obj/scrypt-generic.o: crypto/scrypt/generic/scrypt-generic.cpp $(CXX) -c $(CFLAGS) -MMD -o $@ $< endif endif diff --git a/src/makefile.osx b/src/makefile.osx index cd13d12..121c497 100644 --- a/src/makefile.osx +++ b/src/makefile.osx @@ -132,6 +132,7 @@ endif ifeq (${USE_ASM}, 1) # Assembler implementation OBJS += crypto/scrypt/asm/obj/scrypt-arm.o crypto/scrypt/asm/obj/scrypt-x86.o crypto/scrypt/asm/obj/scrypt-x86_64.o crypto/scrypt/asm/obj/asm-wrapper.o +OBJS += crypto/sha2/asm/obj/sha2-arm.o crypto/sha2/asm/obj/sha2-x86.o crypto/sha2/asm/obj/sha2-x86_64.o crypto/scrypt/asm/obj/scrypt-x86.o: crypto/scrypt/asm/scrypt-x86.S $(CXX) -c $(CFLAGS) -MMD -o $@ $< @@ -144,6 +145,17 @@ crypto/scrypt/asm/obj/scrypt-arm.o: crypto/scrypt/asm/scrypt-arm.S crypto/scrypt/asm/obj/asm-wrapper.o: crypto/scrypt/asm/asm-wrapper.cpp $(CXX) -c $(CFLAGS) -MMD -o $@ $< + +crypto/sha2/asm/obj/sha2-x86.o: crypto/sha2/asm/sha2-x86.S + $(CXX) -c $(CFLAGS) -MMD -o $@ $< + +crypto/sha2/asm/obj/sha2-x86_64.o: crypto/sha2/asm/sha2-x86_64.S + $(CXX) -c $(CFLAGS) -MMD -o $@ $< + +crypto/sha2/asm/obj/sha2-arm.o: crypto/sha2/asm/sha2-arm.S + $(CXX) -c $(CFLAGS) -MMD -o $@ $< + +DEFS += -DUSE_ASM else ifeq (${USE_SSE2}, 1) # Intrinsic implementation diff --git a/src/makefile.unix b/src/makefile.unix index bc06fc4..65eaca1 100644 --- a/src/makefile.unix +++ b/src/makefile.unix @@ -162,6 +162,7 @@ endif ifeq (${USE_ASM}, 1) # Assembler implementation OBJS += crypto/scrypt/asm/obj/scrypt-arm.o crypto/scrypt/asm/obj/scrypt-x86.o crypto/scrypt/asm/obj/scrypt-x86_64.o crypto/scrypt/asm/obj/asm-wrapper.o +OBJS += crypto/sha2/asm/obj/sha2-arm.o crypto/sha2/asm/obj/sha2-x86.o crypto/sha2/asm/obj/sha2-x86_64.o crypto/scrypt/asm/obj/scrypt-x86.o: crypto/scrypt/asm/scrypt-x86.S $(CXX) -c $(xCXXFLAGS) -MMD -o $@ $< @@ -174,6 +175,17 @@ crypto/scrypt/asm/obj/scrypt-arm.o: crypto/scrypt/asm/scrypt-arm.S crypto/scrypt/asm/obj/asm-wrapper.o: crypto/scrypt/asm/asm-wrapper.cpp $(CXX) -c $(xCXXFLAGS) -MMD -o $@ $< + +crypto/sha2/asm/obj/sha2-x86.o: crypto/sha2/asm/sha2-x86.S + $(CXX) -c $(xCXXFLAGS) -MMD -o $@ $< + +crypto/sha2/asm/obj/sha2-x86_64.o: crypto/sha2/asm/sha2-x86_64.S + $(CXX) -c $(xCXXFLAGS) -MMD -o $@ $< + +crypto/sha2/asm/obj/sha2-arm.o: crypto/sha2/asm/sha2-arm.S + $(CXX) -c $(xCXXFLAGS) -MMD -o $@ $< + +DEFS += -DUSE_ASM else ifeq (${USE_SSE2}, 1) # Intrinsic implementation -- 1.7.1
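
For reference, the capability test implemented by sha256_use_8way above amounts to three checks: AVX and OSXSAVE reported by CPUID leaf 1, AVX2 reported by CPUID leaf 7 (sub-leaf 0), and XMM/YMM state enabled in XCR0 via XGETBV. The C sketch below mirrors those checks; the helper names cpuid_ex and cpu_has_avx2 and the GCC-style inline assembly are illustrative assumptions and are not part of this patch, which only exports sha256_use_8way() (and sha256_use_4way(), which kernel.cpp calls).

#include <stdint.h>

/* Illustrative helper (not in the patch): raw CPUID with an explicit sub-leaf. */
static void cpuid_ex(uint32_t leaf, uint32_t subleaf, uint32_t regs[4])
{
    __asm__ __volatile__("cpuid"
                         : "=a"(regs[0]), "=b"(regs[1]), "=c"(regs[2]), "=d"(regs[3])
                         : "a"(leaf), "c"(subleaf));
}

/* Returns non-zero when the 8-way AVX2 path may be used, mirroring sha256_use_8way. */
static int cpu_has_avx2(void)
{
    uint32_t regs[4], xcr0_lo, xcr0_hi;

    /* CPUID.1:ECX bits 27 (OSXSAVE) and 28 (AVX); the 0x18000000 mask used above. */
    cpuid_ex(1, 0, regs);
    if ((regs[2] & 0x18000000u) != 0x18000000u)
        return 0;

    /* CPUID.(EAX=7,ECX=0):EBX bit 5 (AVX2); the 0x00000020 mask used above. */
    cpuid_ex(7, 0, regs);
    if ((regs[1] & 0x00000020u) == 0)
        return 0;

    /* XGETBV(XCR0): bits 1 and 2 set, i.e. the OS saves both XMM and YMM state. */
    __asm__ __volatile__("xgetbv" : "=a"(xcr0_lo), "=d"(xcr0_hi) : "c"(0));
    (void)xcr0_hi;
    return (xcr0_lo & 0x00000006u) == 0x00000006u;
}

On compilers without GCC-style inline assembly the same information can be obtained through the vendor's CPUID/XGETBV intrinsics; the patch itself relies solely on the assembler routine above.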