From 9984fe54b7ee8185928b687edfb2e15cd465f509 Mon Sep 17 00:00:00 2001 From: user Date: Tue, 21 Dec 2021 07:00:24 +0300 Subject: [PATCH] Only allow 64 bit builds by default; Remove assembler scrypt implementations; Import sse2neon translation layer to support neon intrinsics; Enable intrinsics for 64 bit CPUs by default. --- .gitmodules | 3 + CMakeLists.txt | 34 +- src/CMakeLists.txt | 32 +- src/additional/sse2neon | 1 + src/crypto/scrypt/asm/asm-wrapper.cpp | 22 - src/crypto/scrypt/asm/obj/.gitignore | 2 - src/crypto/scrypt/asm/scrypt-arm.S | 393 ------------- src/crypto/scrypt/asm/scrypt-x86.S | 819 ---------------------------- src/crypto/scrypt/asm/scrypt-x86_64.S | 758 ------------------------- src/crypto/scrypt/intrin/scrypt-intrin.cpp | 151 +++++ src/crypto/scrypt/intrin/scrypt-sse2.cpp | 146 ----- 11 files changed, 186 insertions(+), 2175 deletions(-) create mode 160000 src/additional/sse2neon delete mode 100644 src/crypto/scrypt/asm/asm-wrapper.cpp delete mode 100644 src/crypto/scrypt/asm/obj/.gitignore delete mode 100644 src/crypto/scrypt/asm/scrypt-arm.S delete mode 100644 src/crypto/scrypt/asm/scrypt-x86.S delete mode 100644 src/crypto/scrypt/asm/scrypt-x86_64.S create mode 100644 src/crypto/scrypt/intrin/scrypt-intrin.cpp delete mode 100644 src/crypto/scrypt/intrin/scrypt-sse2.cpp diff --git a/.gitmodules b/.gitmodules index 6452f7c..f402066 100644 --- a/.gitmodules +++ b/.gitmodules @@ -7,3 +7,6 @@ [submodule "src/additional/libqrencode"] path = src/additional/libqrencode url = https://github.com/fukuchi/libqrencode +[submodule "src/additional/sse2neon"] + path = src/additional/sse2neon + url = https://github.com/DLTcollab/sse2neon diff --git a/CMakeLists.txt b/CMakeLists.txt index c012ea2..e0bb860 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,12 +4,20 @@ cmake_minimum_required(VERSION 3.4.1) ## mkdir build && cd build ## ## cmake -DBerkeleyDB_ROOT:STRING=/opt/homebrew/Cellar/berkeley-db@4/4.8.30 .. -## cmake -DUSE_ASM=1 .. -## cmake -DUSE_SSE2 .. -## cmake -DBerkeleyDB_INC:STRING=/usr/include -DBerkeleyDB_LIBS:STRING=/usr/lib/aarch64-linux-gnu -DUSE_SSE2 -DUSE_LEVELDB .. +## cmake -DBerkeleyDB_INC:STRING=/usr/include -DBerkeleyDB_LIBS:STRING=/usr/lib/aarch64-linux-gnu -DUSE_LEVELDB=1 .. 
## -project(novacoin-qt VERSION 0.5.9 LANGUAGES C CXX ASM) +project(novacoin-qt VERSION 0.5.9 LANGUAGES C CXX) + +# Enforce sizeof(size_t) == 8 by default +if (NOT ALLOW_32BIT AND NOT CMAKE_SIZEOF_VOID_P MATCHES "8") + message(FATAL_ERROR "Only 64-bit processors (x86_64, AArch64) are supported") +endif () + +# Force generic scrypt on 32 bit platforms +if (NOT CMAKE_SIZEOF_VOID_P MATCHES "8") + set(USE_GENERIC_SCRYPT True) +endif() if (APPLE) enable_language(OBJCXX) @@ -213,19 +221,9 @@ else() list(APPEND ALL_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/txdb-bdb.cpp) endif() -if (USE_ASM) - # Assembler implementation - set(asm_sources - ${CMAKE_CURRENT_SOURCE_DIR}/src/crypto/scrypt/asm/scrypt-arm.S - ${CMAKE_CURRENT_SOURCE_DIR}/src/crypto/scrypt/asm/scrypt-x86.S - ${CMAKE_CURRENT_SOURCE_DIR}/src/crypto/scrypt/asm/scrypt-x86_64.S - ) - - list(APPEND ALL_SOURCES ${generic_sources} ${asm_sources}) - list(APPEND ALL_DEFINITIONS USE_ASM) -elseif (USE_SSE2) - list( APPEND ALL_SOURCES ${generic_sources} ${CMAKE_CURRENT_SOURCE_DIR}/src/crypto/scrypt/intrin/scrypt-sse2.cpp ) - list(APPEND ALL_DEFINITIONS USE_SSE2) +if (NOT USE_GENERIC_SCRYPT) + list( APPEND ALL_SOURCES ${generic_sources} ${CMAKE_CURRENT_SOURCE_DIR}/src/crypto/scrypt/intrin/scrypt-intrin.cpp ) + list(APPEND ALL_DEFINITIONS USE_INTRIN) else() list( APPEND ALL_SOURCES ${generic_sources} ${CMAKE_CURRENT_SOURCE_DIR}/src/crypto/scrypt/generic/scrypt-generic.cpp ) endif() @@ -238,7 +236,7 @@ execute_process ( list(APPEND ALL_DEFINITIONS HAVE_BUILD_INFO) add_executable(novacoin-qt ${ALL_SOURCES}) -target_include_directories(novacoin-qt PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_SOURCE_DIR}/src/qt ${CMAKE_CURRENT_SOURCE_DIR}/src/json ${BerkeleyDB_INC} ${CMAKE_CURRENT_SOURCE_DIR}/src/additional/leveldb/helpers ${CMAKE_CURRENT_SOURCE_DIR}/src/additional/libqrencode ${Boost_INCLUDE_DIRS}) +target_include_directories(novacoin-qt PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_SOURCE_DIR}/src/qt ${CMAKE_CURRENT_SOURCE_DIR}/src/json ${BerkeleyDB_INC} ${CMAKE_CURRENT_SOURCE_DIR}/src/additional/leveldb/helpers ${CMAKE_CURRENT_SOURCE_DIR}/src/additional/sse2neon ${CMAKE_CURRENT_SOURCE_DIR}/src/additional/libqrencode ${Boost_INCLUDE_DIRS}) target_link_libraries(novacoin-qt ${ALL_LIBRARIES}) target_compile_features(novacoin-qt PUBLIC cxx_std_17) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 0e78bf5..4306290 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -4,12 +4,20 @@ cmake_minimum_required(VERSION 3.4.1) ## mkdir build && cd build ## ## cmake -DBerkeleyDB_ROOT:STRING=/opt/homebrew/Cellar/berkeley-db@4/4.8.30 .. -## cmake -DUSE_ASM=1 .. -## cmake -DUSE_SSE2 .. -## cmake -DBerkeleyDB_INC:STRING=/usr/include -DBerkeleyDB_LIBS:STRING=/usr/lib/aarch64-linux-gnu -DUSE_SSE2 -DUSE_LEVELDB .. +## cmake -DBerkeleyDB_INC:STRING=/usr/include -DBerkeleyDB_LIBS:STRING=/usr/lib/aarch64-linux-gnu -DUSE_LEVELDB=1 .. 
## -project(novacoind VERSION 0.5.9 LANGUAGES C CXX ASM) +project(novacoind VERSION 0.5.9 LANGUAGES C CXX) + +# Enforce sizeof(size_t) == 8 by default +if (NOT ALLOW_32BIT AND NOT CMAKE_SIZEOF_VOID_P MATCHES "8") + message(FATAL_ERROR "Only 64-bit processors (x86_64, AArch64) are supported") +endif () + +# Force generic scrypt on 32 bit platforms +if (NOT CMAKE_SIZEOF_VOID_P MATCHES "8") + set(USE_GENERIC_SCRYPT True) +endif() find_program(CCACHE_FOUND ccache) if(CCACHE_FOUND) @@ -126,19 +134,9 @@ else() list(APPEND ALL_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/txdb-bdb.cpp) endif() -if (USE_ASM) - # Assembler implementation - set(asm_sources - ${CMAKE_CURRENT_SOURCE_DIR}/crypto/scrypt/asm/scrypt-arm.S - ${CMAKE_CURRENT_SOURCE_DIR}/crypto/scrypt/asm/scrypt-x86.S - ${CMAKE_CURRENT_SOURCE_DIR}/crypto/scrypt/asm/scrypt-x86_64.S - ) - - list(APPEND ALL_SOURCES ${generic_sources} ${asm_sources}) - list(APPEND ALL_DEFINITIONS USE_ASM) -elseif (USE_SSE2) - list( APPEND ALL_SOURCES ${generic_sources} ${CMAKE_CURRENT_SOURCE_DIR}/crypto/scrypt/intrin/scrypt-sse2.cpp ) - list(APPEND ALL_DEFINITIONS USE_SSE2) +if (NOT USE_GENERIC_SCRYPT) + list( APPEND ALL_SOURCES ${generic_sources} ${CMAKE_CURRENT_SOURCE_DIR}/crypto/scrypt/intrin/scrypt-intrin.cpp ) + list(APPEND ALL_DEFINITIONS USE_INTRIN) else() list( APPEND ALL_SOURCES ${generic_sources} ${CMAKE_CURRENT_SOURCE_DIR}/crypto/scrypt/generic/scrypt-generic.cpp ) endif() diff --git a/src/additional/sse2neon b/src/additional/sse2neon new file mode 160000 index 0000000..95997e2 --- /dev/null +++ b/src/additional/sse2neon @@ -0,0 +1 @@ +Subproject commit 95997e26a34bfddcab6a5d1d3395fb701fedd354 diff --git a/src/crypto/scrypt/asm/asm-wrapper.cpp b/src/crypto/scrypt/asm/asm-wrapper.cpp deleted file mode 100644 index 05914ed..0000000 --- a/src/crypto/scrypt/asm/asm-wrapper.cpp +++ /dev/null @@ -1,22 +0,0 @@ -#include "scrypt.h" - -extern "C" void scrypt_core(uint32_t *X, uint32_t *V); - -/* cpu and memory intensive function to transform a 80 byte buffer into a 32 byte output - scratchpad size needs to be at least 63 + (128 * r * p) + (256 * r + 64) + (128 * r * N) bytes - r = 1, p = 1, N = 1024 - */ -uint256 scrypt_blockhash(const uint8_t* input) -{ - uint8_t scratchpad[SCRYPT_BUFFER_SIZE]; - uint32_t X[32]; - uint256 result = 0; - - uint32_t *V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - PKCS5_PBKDF2_HMAC((const char*)input, 80, input, 80, 1, EVP_sha256(), 128, (unsigned char *)X); - scrypt_core(X, V); - PKCS5_PBKDF2_HMAC((const char*)input, 80, (const unsigned char*)X, 128, 1, EVP_sha256(), 32, (unsigned char*)&result); - - return result; -} diff --git a/src/crypto/scrypt/asm/obj/.gitignore b/src/crypto/scrypt/asm/obj/.gitignore deleted file mode 100644 index d6b7ef3..0000000 --- a/src/crypto/scrypt/asm/obj/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!.gitignore diff --git a/src/crypto/scrypt/asm/scrypt-arm.S b/src/crypto/scrypt/asm/scrypt-arm.S deleted file mode 100644 index 65b9c7f..0000000 --- a/src/crypto/scrypt/asm/scrypt-arm.S +++ /dev/null @@ -1,393 +0,0 @@ -/* - * Copyright 2012 pooler@litecoinpool.org - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. See COPYING for more details. 
- */ - -#if defined(__arm__) && defined(__APCS_32__) - -.macro salsa8_core_doubleround_body - ldr r8, [sp, #8*4] - add r11, r11, r10 - ldr lr, [sp, #13*4] - add r12, r12, r3 - eor r2, r2, r11, ror #23 - add r11, r4, r0 - eor r7, r7, r12, ror #23 - add r12, r9, r5 - str r9, [sp, #9*4] - eor r8, r8, r11, ror #23 - str r10, [sp, #14*4] - eor lr, lr, r12, ror #23 - - ldr r11, [sp, #11*4] - add r9, lr, r9 - ldr r12, [sp, #12*4] - add r10, r2, r10 - eor r1, r1, r9, ror #19 - add r9, r7, r3 - eor r6, r6, r10, ror #19 - add r10, r8, r4 - str r8, [sp, #8*4] - eor r11, r11, r9, ror #19 - str lr, [sp, #13*4] - eor r12, r12, r10, ror #19 - - ldr r9, [sp, #10*4] - add r8, r12, r8 - ldr r10, [sp, #15*4] - add lr, r1, lr - eor r0, r0, r8, ror #14 - add r8, r6, r2 - eor r5, r5, lr, ror #14 - add lr, r11, r7 - eor r9, r9, r8, ror #14 - ldr r8, [sp, #9*4] - eor r10, r10, lr, ror #14 - ldr lr, [sp, #14*4] - - - add r8, r9, r8 - str r9, [sp, #10*4] - add lr, r10, lr - str r10, [sp, #15*4] - eor r11, r11, r8, ror #25 - add r8, r0, r3 - eor r12, r12, lr, ror #25 - add lr, r5, r4 - eor r1, r1, r8, ror #25 - ldr r8, [sp, #8*4] - eor r6, r6, lr, ror #25 - - add r9, r11, r9 - ldr lr, [sp, #13*4] - add r10, r12, r10 - eor r8, r8, r9, ror #23 - add r9, r1, r0 - eor lr, lr, r10, ror #23 - add r10, r6, r5 - str r11, [sp, #11*4] - eor r2, r2, r9, ror #23 - str r12, [sp, #12*4] - eor r7, r7, r10, ror #23 - - ldr r9, [sp, #9*4] - add r11, r8, r11 - ldr r10, [sp, #14*4] - add r12, lr, r12 - eor r9, r9, r11, ror #19 - add r11, r2, r1 - eor r10, r10, r12, ror #19 - add r12, r7, r6 - str r8, [sp, #8*4] - eor r3, r3, r11, ror #19 - str lr, [sp, #13*4] - eor r4, r4, r12, ror #19 -.endm - -.macro salsa8_core - ldmia sp, {r0-r7} - - ldr r12, [sp, #15*4] - ldr r8, [sp, #11*4] - ldr lr, [sp, #12*4] - - ldr r9, [sp, #9*4] - add r8, r8, r12 - ldr r11, [sp, #10*4] - add lr, lr, r0 - eor r3, r3, r8, ror #25 - add r8, r5, r1 - ldr r10, [sp, #14*4] - eor r4, r4, lr, ror #25 - add lr, r11, r6 - eor r9, r9, r8, ror #25 - eor r10, r10, lr, ror #25 - - salsa8_core_doubleround_body - - ldr r11, [sp, #10*4] - add r8, r9, r8 - ldr r12, [sp, #15*4] - add lr, r10, lr - eor r11, r11, r8, ror #14 - str r9, [sp, #9*4] - eor r12, r12, lr, ror #14 - add r8, r3, r2 - add lr, r4, r7 - str r10, [sp, #14*4] - eor r0, r0, r8, ror #14 - ldr r8, [sp, #11*4] - eor r5, r5, lr, ror #14 - ldr lr, [sp, #12*4] - - add r8, r8, r12 - str r11, [sp, #10*4] - add lr, lr, r0 - str r12, [sp, #15*4] - eor r3, r3, r8, ror #25 - add r8, r5, r1 - eor r4, r4, lr, ror #25 - add lr, r11, r6 - eor r9, r9, r8, ror #25 - eor r10, r10, lr, ror #25 - - salsa8_core_doubleround_body - - ldr r11, [sp, #10*4] - add r8, r9, r8 - ldr r12, [sp, #15*4] - add lr, r10, lr - eor r11, r11, r8, ror #14 - str r9, [sp, #9*4] - eor r12, r12, lr, ror #14 - add r8, r3, r2 - add lr, r4, r7 - str r10, [sp, #14*4] - eor r0, r0, r8, ror #14 - ldr r8, [sp, #11*4] - eor r5, r5, lr, ror #14 - ldr lr, [sp, #12*4] - - add r8, r8, r12 - str r11, [sp, #10*4] - add lr, lr, r0 - str r12, [sp, #15*4] - eor r3, r3, r8, ror #25 - add r8, r5, r1 - eor r4, r4, lr, ror #25 - add lr, r11, r6 - eor r9, r9, r8, ror #25 - eor r10, r10, lr, ror #25 - - salsa8_core_doubleround_body - - ldr r11, [sp, #10*4] - add r8, r9, r8 - ldr r12, [sp, #15*4] - add lr, r10, lr - eor r11, r11, r8, ror #14 - str r9, [sp, #9*4] - eor r12, r12, lr, ror #14 - add r8, r3, r2 - add lr, r4, r7 - str r10, [sp, #14*4] - eor r0, r0, r8, ror #14 - ldr r8, [sp, #11*4] - eor r5, r5, lr, ror #14 - ldr lr, [sp, #12*4] - - add r8, r8, r12 - str r11, [sp, 
#10*4] - add lr, lr, r0 - str r12, [sp, #15*4] - eor r3, r3, r8, ror #25 - add r8, r5, r1 - eor r4, r4, lr, ror #25 - add lr, r11, r6 - eor r9, r9, r8, ror #25 - eor r10, r10, lr, ror #25 - - salsa8_core_doubleround_body - - ldr r11, [sp, #10*4] - add r8, r9, r8 - ldr r12, [sp, #15*4] - add lr, r10, lr - str r9, [sp, #9*4] - eor r11, r11, r8, ror #14 - eor r12, r12, lr, ror #14 - add r8, r3, r2 - str r10, [sp, #14*4] - add lr, r4, r7 - str r11, [sp, #10*4] - eor r0, r0, r8, ror #14 - str r12, [sp, #15*4] - eor r5, r5, lr, ror #14 - - stmia sp, {r0-r7} -.endm - - -.macro scrypt_core_macro1a_x4 - ldmia r0, {r4-r7} - ldmia lr!, {r8-r11} - stmia r1!, {r4-r7} - stmia r3!, {r8-r11} - eor r4, r4, r8 - eor r5, r5, r9 - eor r6, r6, r10 - eor r7, r7, r11 - stmia r0!, {r4-r7} - stmia r12!, {r4-r7} -.endm - -.macro scrypt_core_macro1b_x4 - ldmia r3!, {r8-r11} - ldmia r2, {r4-r7} - eor r8, r8, r4 - eor r9, r9, r5 - eor r10, r10, r6 - eor r11, r11, r7 - ldmia r0, {r4-r7} - stmia r2!, {r8-r11} - eor r4, r4, r8 - eor r5, r5, r9 - eor r6, r6, r10 - eor r7, r7, r11 - ldmia r1!, {r8-r11} - eor r4, r4, r8 - eor r5, r5, r9 - eor r6, r6, r10 - eor r7, r7, r11 - stmia r0!, {r4-r7} - stmia r12!, {r4-r7} -.endm - -.macro scrypt_core_macro2_x4 - ldmia r12, {r4-r7} - ldmia r0, {r8-r11} - add r4, r4, r8 - add r5, r5, r9 - add r6, r6, r10 - add r7, r7, r11 - stmia r0!, {r4-r7} - ldmia r2, {r8-r11} - eor r4, r4, r8 - eor r5, r5, r9 - eor r6, r6, r10 - eor r7, r7, r11 - stmia r2!, {r4-r7} - stmia r12!, {r4-r7} -.endm - -.macro scrypt_core_macro3_x4 - ldmia r1!, {r4-r7} - ldmia r0, {r8-r11} - add r4, r4, r8 - add r5, r5, r9 - add r6, r6, r10 - add r7, r7, r11 - stmia r0!, {r4-r7} -.endm - -.macro scrypt_core_macro3_x6 - ldmia r1!, {r2-r7} - ldmia r0, {r8-r12, lr} - add r2, r2, r8 - add r3, r3, r9 - add r4, r4, r10 - add r5, r5, r11 - add r6, r6, r12 - add r7, r7, lr - stmia r0!, {r2-r7} -.endm - - - .text - .code 32 - .align 2 - .globl scrypt_core - .globl _scrypt_core -#ifdef __ELF__ - .type scrypt_core, %function -#endif -scrypt_core: -_scrypt_core: - stmfd sp!, {r4-r11, lr} - sub sp, sp, #20*4 - - str r0, [sp, #16*4] - add r12, r1, #1024*32*4 - str r12, [sp, #18*4] -scrypt_core_loop1: - add lr, r0, #16*4 - add r3, r1, #16*4 - mov r12, sp - scrypt_core_macro1a_x4 - scrypt_core_macro1a_x4 - scrypt_core_macro1a_x4 - scrypt_core_macro1a_x4 - str r1, [sp, #17*4] - - salsa8_core - - ldr r0, [sp, #16*4] - mov r12, sp - add r2, r0, #16*4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 - - salsa8_core - - ldr r0, [sp, #16*4] - mov r1, sp - add r0, r0, #16*4 - scrypt_core_macro3_x6 - scrypt_core_macro3_x6 - ldr r3, [sp, #17*4] - ldr r12, [sp, #18*4] - scrypt_core_macro3_x4 - - add r1, r3, #16*4 - sub r0, r0, #32*4 - cmp r1, r12 - bne scrypt_core_loop1 - - sub r1, r1, #1024*32*4 - str r1, [sp, #17*4] - mov r12, #1024 -scrypt_core_loop2: - str r12, [sp, #18*4] - - ldr r4, [r0, #16*4] - mov r4, r4, lsl #32-10 - add r1, r1, r4, lsr #32-10-7 - - add r2, r0, #16*4 - add r3, r1, #16*4 - mov r12, sp - scrypt_core_macro1b_x4 - scrypt_core_macro1b_x4 - scrypt_core_macro1b_x4 - scrypt_core_macro1b_x4 - - salsa8_core - - ldr r0, [sp, #16*4] - mov r12, sp - add r2, r0, #16*4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 - - salsa8_core - - ldr r0, [sp, #16*4] - mov r1, sp - add r0, r0, #16*4 - scrypt_core_macro3_x6 - scrypt_core_macro3_x6 - scrypt_core_macro3_x4 - - ldr r12, [sp, #18*4] - sub r0, r0, #32*4 - ldr r1, [sp, #17*4] - subs r12, r12, 
#1 - bne scrypt_core_loop2 - - add sp, sp, #20*4 -#ifdef __thumb__ - ldmfd sp!, {r4-r11, lr} - bx lr -#else - ldmfd sp!, {r4-r11, pc} -#endif - -#endif diff --git a/src/crypto/scrypt/asm/scrypt-x86.S b/src/crypto/scrypt/asm/scrypt-x86.S deleted file mode 100644 index bfca2ed..0000000 --- a/src/crypto/scrypt/asm/scrypt-x86.S +++ /dev/null @@ -1,819 +0,0 @@ -/* - * Copyright 2011-2012 pooler@litecoinpool.org - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#if defined(__linux__) && defined(__ELF__) - .section .note.GNU-stack,"",%progbits -#endif - -#if defined(__i386__) - -.macro scrypt_shuffle src, so, dest, do - movl \so+60(\src), %eax - movl \so+44(\src), %ebx - movl \so+28(\src), %ecx - movl \so+12(\src), %edx - movl %eax, \do+12(\dest) - movl %ebx, \do+28(\dest) - movl %ecx, \do+44(\dest) - movl %edx, \do+60(\dest) - movl \so+40(\src), %eax - movl \so+8(\src), %ebx - movl \so+48(\src), %ecx - movl \so+16(\src), %edx - movl %eax, \do+8(\dest) - movl %ebx, \do+40(\dest) - movl %ecx, \do+16(\dest) - movl %edx, \do+48(\dest) - movl \so+20(\src), %eax - movl \so+4(\src), %ebx - movl \so+52(\src), %ecx - movl \so+36(\src), %edx - movl %eax, \do+4(\dest) - movl %ebx, \do+20(\dest) - movl %ecx, \do+36(\dest) - movl %edx, \do+52(\dest) - movl \so+0(\src), %eax - movl \so+24(\src), %ebx - movl \so+32(\src), %ecx - movl \so+56(\src), %edx - movl %eax, \do+0(\dest) - movl %ebx, \do+24(\dest) - movl %ecx, \do+32(\dest) - movl %edx, \do+56(\dest) -.endm - -.macro salsa8_core_gen_quadround - movl 52(%esp), %ecx - movl 4(%esp), %edx - movl 20(%esp), %ebx - movl 8(%esp), %esi - leal (%ecx, %edx), %edi - roll $7, %edi - xorl %edi, %ebx - movl %ebx, 4(%esp) - movl 36(%esp), %edi - leal (%edx, %ebx), %ebp - roll $9, %ebp - xorl %ebp, %edi - movl 24(%esp), %ebp - movl %edi, 8(%esp) - addl %edi, %ebx - roll $13, %ebx - xorl %ebx, %ecx - movl 40(%esp), %ebx - movl %ecx, 20(%esp) - addl %edi, %ecx - roll $18, %ecx - leal (%esi, %ebp), %edi - roll $7, %edi - xorl %edi, %ebx - movl %ebx, 24(%esp) - movl 56(%esp), %edi - xorl %ecx, %edx - leal (%ebp, %ebx), %ecx - roll $9, %ecx - xorl %ecx, %edi - movl %edi, 36(%esp) - movl 28(%esp), %ecx - movl %edx, 28(%esp) - movl 44(%esp), %edx - addl %edi, %ebx - roll $13, %ebx - xorl %ebx, %esi - movl 60(%esp), %ebx 
- movl %esi, 40(%esp) - addl %edi, %esi - roll $18, %esi - leal (%ecx, %edx), %edi - roll $7, %edi - xorl %edi, %ebx - movl %ebx, 44(%esp) - movl 12(%esp), %edi - xorl %esi, %ebp - leal (%edx, %ebx), %esi - roll $9, %esi - xorl %esi, %edi - movl %edi, 12(%esp) - movl 48(%esp), %esi - movl %ebp, 48(%esp) - movl 64(%esp), %ebp - addl %edi, %ebx - roll $13, %ebx - xorl %ebx, %ecx - movl 16(%esp), %ebx - movl %ecx, 16(%esp) - addl %edi, %ecx - roll $18, %ecx - leal (%esi, %ebp), %edi - roll $7, %edi - xorl %edi, %ebx - movl 32(%esp), %edi - xorl %ecx, %edx - leal (%ebp, %ebx), %ecx - roll $9, %ecx - xorl %ecx, %edi - movl %edi, 32(%esp) - movl %ebx, %ecx - movl %edx, 52(%esp) - movl 28(%esp), %edx - addl %edi, %ebx - roll $13, %ebx - xorl %ebx, %esi - movl 40(%esp), %ebx - movl %esi, 28(%esp) - addl %edi, %esi - roll $18, %esi - leal (%ecx, %edx), %edi - roll $7, %edi - xorl %edi, %ebx - movl %ebx, 40(%esp) - movl 12(%esp), %edi - xorl %esi, %ebp - leal (%edx, %ebx), %esi - roll $9, %esi - xorl %esi, %edi - movl %edi, 12(%esp) - movl 4(%esp), %esi - movl %ebp, 4(%esp) - movl 48(%esp), %ebp - addl %edi, %ebx - roll $13, %ebx - xorl %ebx, %ecx - movl 16(%esp), %ebx - movl %ecx, 16(%esp) - addl %edi, %ecx - roll $18, %ecx - leal (%esi, %ebp), %edi - roll $7, %edi - xorl %edi, %ebx - movl %ebx, 48(%esp) - movl 32(%esp), %edi - xorl %ecx, %edx - leal (%ebp, %ebx), %ecx - roll $9, %ecx - xorl %ecx, %edi - movl %edi, 32(%esp) - movl 24(%esp), %ecx - movl %edx, 24(%esp) - movl 52(%esp), %edx - addl %edi, %ebx - roll $13, %ebx - xorl %ebx, %esi - movl 28(%esp), %ebx - movl %esi, 28(%esp) - addl %edi, %esi - roll $18, %esi - leal (%ecx, %edx), %edi - roll $7, %edi - xorl %edi, %ebx - movl %ebx, 52(%esp) - movl 8(%esp), %edi - xorl %esi, %ebp - leal (%edx, %ebx), %esi - roll $9, %esi - xorl %esi, %edi - movl %edi, 8(%esp) - movl 44(%esp), %esi - movl %ebp, 44(%esp) - movl 4(%esp), %ebp - addl %edi, %ebx - roll $13, %ebx - xorl %ebx, %ecx - movl 20(%esp), %ebx - movl %ecx, 4(%esp) - addl %edi, %ecx - roll $18, %ecx - leal (%esi, %ebp), %edi - roll $7, %edi - xorl %edi, %ebx - movl 36(%esp), %edi - xorl %ecx, %edx - leal (%ebp, %ebx), %ecx - roll $9, %ecx - xorl %ecx, %edi - movl %edi, 20(%esp) - movl %ebx, %ecx - movl %edx, 36(%esp) - movl 24(%esp), %edx - addl %edi, %ebx - roll $13, %ebx - xorl %ebx, %esi - movl 28(%esp), %ebx - movl %esi, 24(%esp) - addl %edi, %esi - roll $18, %esi - leal (%ecx, %edx), %edi - roll $7, %edi - xorl %edi, %ebx - movl %ebx, 28(%esp) - xorl %esi, %ebp - movl 8(%esp), %esi - leal (%edx, %ebx), %edi - roll $9, %edi - xorl %edi, %esi - movl 40(%esp), %edi - movl %ebp, 8(%esp) - movl 44(%esp), %ebp - movl %esi, 40(%esp) - addl %esi, %ebx - roll $13, %ebx - xorl %ebx, %ecx - movl 4(%esp), %ebx - movl %ecx, 44(%esp) - addl %esi, %ecx - roll $18, %ecx - leal (%edi, %ebp), %esi - roll $7, %esi - xorl %esi, %ebx - movl %ebx, 4(%esp) - movl 20(%esp), %esi - xorl %ecx, %edx - leal (%ebp, %ebx), %ecx - roll $9, %ecx - xorl %ecx, %esi - movl %esi, 56(%esp) - movl 48(%esp), %ecx - movl %edx, 20(%esp) - movl 36(%esp), %edx - addl %esi, %ebx - roll $13, %ebx - xorl %ebx, %edi - movl 24(%esp), %ebx - movl %edi, 24(%esp) - addl %esi, %edi - roll $18, %edi - leal (%ecx, %edx), %esi - roll $7, %esi - xorl %esi, %ebx - movl %ebx, 60(%esp) - movl 12(%esp), %esi - xorl %edi, %ebp - leal (%edx, %ebx), %edi - roll $9, %edi - xorl %edi, %esi - movl %esi, 12(%esp) - movl 52(%esp), %edi - movl %ebp, 36(%esp) - movl 8(%esp), %ebp - addl %esi, %ebx - roll $13, %ebx - xorl %ebx, %ecx - movl 16(%esp), %ebx 
- movl %ecx, 16(%esp) - addl %esi, %ecx - roll $18, %ecx - leal (%edi, %ebp), %esi - roll $7, %esi - xorl %esi, %ebx - movl 32(%esp), %esi - xorl %ecx, %edx - leal (%ebp, %ebx), %ecx - roll $9, %ecx - xorl %ecx, %esi - movl %esi, 32(%esp) - movl %ebx, %ecx - movl %edx, 48(%esp) - movl 20(%esp), %edx - addl %esi, %ebx - roll $13, %ebx - xorl %ebx, %edi - movl 24(%esp), %ebx - movl %edi, 20(%esp) - addl %esi, %edi - roll $18, %edi - leal (%ecx, %edx), %esi - roll $7, %esi - xorl %esi, %ebx - movl %ebx, 8(%esp) - movl 12(%esp), %esi - xorl %edi, %ebp - leal (%edx, %ebx), %edi - roll $9, %edi - xorl %edi, %esi - movl %esi, 12(%esp) - movl 28(%esp), %edi - movl %ebp, 52(%esp) - movl 36(%esp), %ebp - addl %esi, %ebx - roll $13, %ebx - xorl %ebx, %ecx - movl 16(%esp), %ebx - movl %ecx, 16(%esp) - addl %esi, %ecx - roll $18, %ecx - leal (%edi, %ebp), %esi - roll $7, %esi - xorl %esi, %ebx - movl %ebx, 28(%esp) - movl 32(%esp), %esi - xorl %ecx, %edx - leal (%ebp, %ebx), %ecx - roll $9, %ecx - xorl %ecx, %esi - movl %esi, 32(%esp) - movl 4(%esp), %ecx - movl %edx, 4(%esp) - movl 48(%esp), %edx - addl %esi, %ebx - roll $13, %ebx - xorl %ebx, %edi - movl 20(%esp), %ebx - movl %edi, 20(%esp) - addl %esi, %edi - roll $18, %edi - leal (%ecx, %edx), %esi - roll $7, %esi - xorl %esi, %ebx - movl %ebx, 48(%esp) - movl 40(%esp), %esi - xorl %edi, %ebp - leal (%edx, %ebx), %edi - roll $9, %edi - xorl %edi, %esi - movl %esi, 36(%esp) - movl 60(%esp), %edi - movl %ebp, 24(%esp) - movl 52(%esp), %ebp - addl %esi, %ebx - roll $13, %ebx - xorl %ebx, %ecx - movl 44(%esp), %ebx - movl %ecx, 40(%esp) - addl %esi, %ecx - roll $18, %ecx - leal (%edi, %ebp), %esi - roll $7, %esi - xorl %esi, %ebx - movl %ebx, 52(%esp) - movl 56(%esp), %esi - xorl %ecx, %edx - leal (%ebp, %ebx), %ecx - roll $9, %ecx - xorl %ecx, %esi - movl %esi, 56(%esp) - addl %esi, %ebx - movl %edx, 44(%esp) - roll $13, %ebx - xorl %ebx, %edi - movl %edi, 60(%esp) - addl %esi, %edi - roll $18, %edi - xorl %edi, %ebp - movl %ebp, 64(%esp) -.endm - - .text - .p2align 5 -salsa8_core_gen: - salsa8_core_gen_quadround - salsa8_core_gen_quadround - ret - - - .text - .p2align 5 - .globl scrypt_core - .globl _scrypt_core -scrypt_core: -_scrypt_core: - pushl %ebx - pushl %ebp - pushl %edi - pushl %esi - - /* Check for SSE2 availability */ - movl $1, %eax - cpuid - andl $0x04000000, %edx - jnz scrypt_core_sse2 - -scrypt_core_gen: - movl 20(%esp), %edi - movl 24(%esp), %esi - subl $72, %esp - -.macro scrypt_core_macro1a p, q - movl \p(%edi), %eax - movl \q(%edi), %edx - movl %eax, \p(%esi) - movl %edx, \q(%esi) - xorl %edx, %eax - movl %eax, \p(%edi) - movl %eax, \p(%esp) -.endm - -.macro scrypt_core_macro1b p, q - movl \p(%edi), %eax - xorl \p(%esi, %edx), %eax - movl \q(%edi), %ebx - xorl \q(%esi, %edx), %ebx - movl %ebx, \q(%edi) - xorl %ebx, %eax - movl %eax, \p(%edi) - movl %eax, \p(%esp) -.endm - -.macro scrypt_core_macro2 p, q - movl \p(%esp), %eax - addl \p(%edi), %eax - movl %eax, \p(%edi) - xorl \q(%edi), %eax - movl %eax, \q(%edi) - movl %eax, \p(%esp) -.endm - -.macro scrypt_core_macro3 p, q - movl \p(%esp), %eax - addl \q(%edi), %eax - movl %eax, \q(%edi) -.endm - - leal 131072(%esi), %ecx -scrypt_core_gen_loop1: - movl %esi, 64(%esp) - movl %ecx, 68(%esp) - - scrypt_core_macro1a 0, 64 - scrypt_core_macro1a 4, 68 - scrypt_core_macro1a 8, 72 - scrypt_core_macro1a 12, 76 - scrypt_core_macro1a 16, 80 - scrypt_core_macro1a 20, 84 - scrypt_core_macro1a 24, 88 - scrypt_core_macro1a 28, 92 - scrypt_core_macro1a 32, 96 - scrypt_core_macro1a 36, 100 - 
scrypt_core_macro1a 40, 104 - scrypt_core_macro1a 44, 108 - scrypt_core_macro1a 48, 112 - scrypt_core_macro1a 52, 116 - scrypt_core_macro1a 56, 120 - scrypt_core_macro1a 60, 124 - - call salsa8_core_gen - - movl 92(%esp), %edi - scrypt_core_macro2 0, 64 - scrypt_core_macro2 4, 68 - scrypt_core_macro2 8, 72 - scrypt_core_macro2 12, 76 - scrypt_core_macro2 16, 80 - scrypt_core_macro2 20, 84 - scrypt_core_macro2 24, 88 - scrypt_core_macro2 28, 92 - scrypt_core_macro2 32, 96 - scrypt_core_macro2 36, 100 - scrypt_core_macro2 40, 104 - scrypt_core_macro2 44, 108 - scrypt_core_macro2 48, 112 - scrypt_core_macro2 52, 116 - scrypt_core_macro2 56, 120 - scrypt_core_macro2 60, 124 - - call salsa8_core_gen - - movl 92(%esp), %edi - scrypt_core_macro3 0, 64 - scrypt_core_macro3 4, 68 - scrypt_core_macro3 8, 72 - scrypt_core_macro3 12, 76 - scrypt_core_macro3 16, 80 - scrypt_core_macro3 20, 84 - scrypt_core_macro3 24, 88 - scrypt_core_macro3 28, 92 - scrypt_core_macro3 32, 96 - scrypt_core_macro3 36, 100 - scrypt_core_macro3 40, 104 - scrypt_core_macro3 44, 108 - scrypt_core_macro3 48, 112 - scrypt_core_macro3 52, 116 - scrypt_core_macro3 56, 120 - scrypt_core_macro3 60, 124 - - movl 64(%esp), %esi - movl 68(%esp), %ecx - addl $128, %esi - cmpl %ecx, %esi - jne scrypt_core_gen_loop1 - - movl 96(%esp), %esi - movl $1024, %ecx -scrypt_core_gen_loop2: - movl %ecx, 68(%esp) - - movl 64(%edi), %edx - andl $1023, %edx - shll $7, %edx - - scrypt_core_macro1b 0, 64 - scrypt_core_macro1b 4, 68 - scrypt_core_macro1b 8, 72 - scrypt_core_macro1b 12, 76 - scrypt_core_macro1b 16, 80 - scrypt_core_macro1b 20, 84 - scrypt_core_macro1b 24, 88 - scrypt_core_macro1b 28, 92 - scrypt_core_macro1b 32, 96 - scrypt_core_macro1b 36, 100 - scrypt_core_macro1b 40, 104 - scrypt_core_macro1b 44, 108 - scrypt_core_macro1b 48, 112 - scrypt_core_macro1b 52, 116 - scrypt_core_macro1b 56, 120 - scrypt_core_macro1b 60, 124 - - call salsa8_core_gen - - movl 92(%esp), %edi - scrypt_core_macro2 0, 64 - scrypt_core_macro2 4, 68 - scrypt_core_macro2 8, 72 - scrypt_core_macro2 12, 76 - scrypt_core_macro2 16, 80 - scrypt_core_macro2 20, 84 - scrypt_core_macro2 24, 88 - scrypt_core_macro2 28, 92 - scrypt_core_macro2 32, 96 - scrypt_core_macro2 36, 100 - scrypt_core_macro2 40, 104 - scrypt_core_macro2 44, 108 - scrypt_core_macro2 48, 112 - scrypt_core_macro2 52, 116 - scrypt_core_macro2 56, 120 - scrypt_core_macro2 60, 124 - - call salsa8_core_gen - - movl 92(%esp), %edi - movl 96(%esp), %esi - scrypt_core_macro3 0, 64 - scrypt_core_macro3 4, 68 - scrypt_core_macro3 8, 72 - scrypt_core_macro3 12, 76 - scrypt_core_macro3 16, 80 - scrypt_core_macro3 20, 84 - scrypt_core_macro3 24, 88 - scrypt_core_macro3 28, 92 - scrypt_core_macro3 32, 96 - scrypt_core_macro3 36, 100 - scrypt_core_macro3 40, 104 - scrypt_core_macro3 44, 108 - scrypt_core_macro3 48, 112 - scrypt_core_macro3 52, 116 - scrypt_core_macro3 56, 120 - scrypt_core_macro3 60, 124 - - movl 68(%esp), %ecx - subl $1, %ecx - ja scrypt_core_gen_loop2 - - addl $72, %esp - popl %esi - popl %edi - popl %ebp - popl %ebx - ret - - -.macro salsa8_core_sse2_doubleround - movdqa %xmm1, %xmm4 - paddd %xmm0, %xmm4 - movdqa %xmm4, %xmm5 - pslld $7, %xmm4 - psrld $25, %xmm5 - pxor %xmm4, %xmm3 - movdqa %xmm0, %xmm4 - pxor %xmm5, %xmm3 - - paddd %xmm3, %xmm4 - movdqa %xmm4, %xmm5 - pslld $9, %xmm4 - psrld $23, %xmm5 - pxor %xmm4, %xmm2 - movdqa %xmm3, %xmm4 - pxor %xmm5, %xmm2 - pshufd $0x93, %xmm3, %xmm3 - - paddd %xmm2, %xmm4 - movdqa %xmm4, %xmm5 - pslld $13, %xmm4 - psrld $19, %xmm5 - pxor %xmm4, %xmm1 - 
movdqa %xmm2, %xmm4 - pxor %xmm5, %xmm1 - pshufd $0x4e, %xmm2, %xmm2 - - paddd %xmm1, %xmm4 - movdqa %xmm4, %xmm5 - pslld $18, %xmm4 - psrld $14, %xmm5 - pxor %xmm4, %xmm0 - movdqa %xmm3, %xmm4 - pxor %xmm5, %xmm0 - pshufd $0x39, %xmm1, %xmm1 - - paddd %xmm0, %xmm4 - movdqa %xmm4, %xmm5 - pslld $7, %xmm4 - psrld $25, %xmm5 - pxor %xmm4, %xmm1 - movdqa %xmm0, %xmm4 - pxor %xmm5, %xmm1 - - paddd %xmm1, %xmm4 - movdqa %xmm4, %xmm5 - pslld $9, %xmm4 - psrld $23, %xmm5 - pxor %xmm4, %xmm2 - movdqa %xmm1, %xmm4 - pxor %xmm5, %xmm2 - pshufd $0x93, %xmm1, %xmm1 - - paddd %xmm2, %xmm4 - movdqa %xmm4, %xmm5 - pslld $13, %xmm4 - psrld $19, %xmm5 - pxor %xmm4, %xmm3 - movdqa %xmm2, %xmm4 - pxor %xmm5, %xmm3 - pshufd $0x4e, %xmm2, %xmm2 - - paddd %xmm3, %xmm4 - movdqa %xmm4, %xmm5 - pslld $18, %xmm4 - psrld $14, %xmm5 - pxor %xmm4, %xmm0 - pshufd $0x39, %xmm3, %xmm3 - pxor %xmm5, %xmm0 -.endm - -.macro salsa8_core_sse2 - salsa8_core_sse2_doubleround - salsa8_core_sse2_doubleround - salsa8_core_sse2_doubleround - salsa8_core_sse2_doubleround -.endm - - .p2align 5 -scrypt_core_sse2: - movl 20(%esp), %edi - movl 24(%esp), %esi - movl %esp, %ebp - subl $128, %esp - andl $-16, %esp - - scrypt_shuffle %edi, 0, %esp, 0 - scrypt_shuffle %edi, 64, %esp, 64 - - movdqa 96(%esp), %xmm6 - movdqa 112(%esp), %xmm7 - - movl %esi, %edx - leal 131072(%esi), %ecx -scrypt_core_sse2_loop1: - movdqa 0(%esp), %xmm0 - movdqa 16(%esp), %xmm1 - movdqa 32(%esp), %xmm2 - movdqa 48(%esp), %xmm3 - movdqa 64(%esp), %xmm4 - movdqa 80(%esp), %xmm5 - pxor %xmm4, %xmm0 - pxor %xmm5, %xmm1 - movdqa %xmm0, 0(%edx) - movdqa %xmm1, 16(%edx) - pxor %xmm6, %xmm2 - pxor %xmm7, %xmm3 - movdqa %xmm2, 32(%edx) - movdqa %xmm3, 48(%edx) - movdqa %xmm4, 64(%edx) - movdqa %xmm5, 80(%edx) - movdqa %xmm6, 96(%edx) - movdqa %xmm7, 112(%edx) - - salsa8_core_sse2 - paddd 0(%edx), %xmm0 - paddd 16(%edx), %xmm1 - paddd 32(%edx), %xmm2 - paddd 48(%edx), %xmm3 - movdqa %xmm0, 0(%esp) - movdqa %xmm1, 16(%esp) - movdqa %xmm2, 32(%esp) - movdqa %xmm3, 48(%esp) - - pxor 64(%esp), %xmm0 - pxor 80(%esp), %xmm1 - pxor %xmm6, %xmm2 - pxor %xmm7, %xmm3 - movdqa %xmm0, 64(%esp) - movdqa %xmm1, 80(%esp) - movdqa %xmm2, %xmm6 - movdqa %xmm3, %xmm7 - salsa8_core_sse2 - paddd 64(%esp), %xmm0 - paddd 80(%esp), %xmm1 - paddd %xmm2, %xmm6 - paddd %xmm3, %xmm7 - movdqa %xmm0, 64(%esp) - movdqa %xmm1, 80(%esp) - - addl $128, %edx - cmpl %ecx, %edx - jne scrypt_core_sse2_loop1 - - movdqa 64(%esp), %xmm4 - movdqa 80(%esp), %xmm5 - - movl $1024, %ecx -scrypt_core_sse2_loop2: - movd %xmm4, %edx - movdqa 0(%esp), %xmm0 - movdqa 16(%esp), %xmm1 - movdqa 32(%esp), %xmm2 - movdqa 48(%esp), %xmm3 - andl $1023, %edx - shll $7, %edx - pxor 0(%esi, %edx), %xmm0 - pxor 16(%esi, %edx), %xmm1 - pxor 32(%esi, %edx), %xmm2 - pxor 48(%esi, %edx), %xmm3 - - pxor %xmm4, %xmm0 - pxor %xmm5, %xmm1 - movdqa %xmm0, 0(%esp) - movdqa %xmm1, 16(%esp) - pxor %xmm6, %xmm2 - pxor %xmm7, %xmm3 - movdqa %xmm2, 32(%esp) - movdqa %xmm3, 48(%esp) - salsa8_core_sse2 - paddd 0(%esp), %xmm0 - paddd 16(%esp), %xmm1 - paddd 32(%esp), %xmm2 - paddd 48(%esp), %xmm3 - movdqa %xmm0, 0(%esp) - movdqa %xmm1, 16(%esp) - movdqa %xmm2, 32(%esp) - movdqa %xmm3, 48(%esp) - - pxor 64(%esi, %edx), %xmm0 - pxor 80(%esi, %edx), %xmm1 - pxor 96(%esi, %edx), %xmm2 - pxor 112(%esi, %edx), %xmm3 - pxor 64(%esp), %xmm0 - pxor 80(%esp), %xmm1 - pxor %xmm6, %xmm2 - pxor %xmm7, %xmm3 - movdqa %xmm0, 64(%esp) - movdqa %xmm1, 80(%esp) - movdqa %xmm2, %xmm6 - movdqa %xmm3, %xmm7 - salsa8_core_sse2 - paddd 64(%esp), %xmm0 - paddd 80(%esp), 
%xmm1 - paddd %xmm2, %xmm6 - paddd %xmm3, %xmm7 - movdqa %xmm0, %xmm4 - movdqa %xmm1, %xmm5 - movdqa %xmm0, 64(%esp) - movdqa %xmm1, 80(%esp) - - subl $1, %ecx - ja scrypt_core_sse2_loop2 - - movdqa %xmm6, 96(%esp) - movdqa %xmm7, 112(%esp) - - scrypt_shuffle %esp, 0, %edi, 0 - scrypt_shuffle %esp, 64, %edi, 64 - - movl %ebp, %esp - popl %esi - popl %edi - popl %ebp - popl %ebx - ret - -#endif diff --git a/src/crypto/scrypt/asm/scrypt-x86_64.S b/src/crypto/scrypt/asm/scrypt-x86_64.S deleted file mode 100644 index 36054f1..0000000 --- a/src/crypto/scrypt/asm/scrypt-x86_64.S +++ /dev/null @@ -1,758 +0,0 @@ -/* - * Copyright 2011-2012 pooler@litecoinpool.org - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- */ - -#if defined(__linux__) && defined(__ELF__) - .section .note.GNU-stack,"",%progbits -#endif - -#if defined(__x86_64__) - -.macro scrypt_shuffle src, so, dest, do - movl \so+60(\src), %r8d - movl \so+44(\src), %r9d - movl \so+28(\src), %r10d - movl \so+12(\src), %r11d - movl %r8d, \do+12(\dest) - movl %r9d, \do+28(\dest) - movl %r10d, \do+44(\dest) - movl %r11d, \do+60(\dest) - movl \so+40(\src), %r8d - movl \so+8(\src), %r9d - movl \so+48(\src), %r10d - movl \so+16(\src), %r11d - movl %r8d, \do+8(\dest) - movl %r9d, \do+40(\dest) - movl %r10d, \do+16(\dest) - movl %r11d, \do+48(\dest) - movl \so+20(\src), %r8d - movl \so+4(\src), %r9d - movl \so+52(\src), %r10d - movl \so+36(\src), %r11d - movl %r8d, \do+4(\dest) - movl %r9d, \do+20(\dest) - movl %r10d, \do+36(\dest) - movl %r11d, \do+52(\dest) - movl \so+0(\src), %r8d - movl \so+24(\src), %r9d - movl \so+32(\src), %r10d - movl \so+56(\src), %r11d - movl %r8d, \do+0(\dest) - movl %r9d, \do+24(\dest) - movl %r10d, \do+32(\dest) - movl %r11d, \do+56(\dest) -.endm - - -.macro salsa8_core_gen_doubleround - movq 72(%rsp), %r15 - - leaq (%r14, %rdx), %rbp - roll $7, %ebp - xorl %ebp, %r9d - leaq (%rdi, %r15), %rbp - roll $7, %ebp - xorl %ebp, %r10d - leaq (%rdx, %r9), %rbp - roll $9, %ebp - xorl %ebp, %r11d - leaq (%r15, %r10), %rbp - roll $9, %ebp - xorl %ebp, %r13d - - leaq (%r9, %r11), %rbp - roll $13, %ebp - xorl %ebp, %r14d - leaq (%r10, %r13), %rbp - roll $13, %ebp - xorl %ebp, %edi - leaq (%r11, %r14), %rbp - roll $18, %ebp - xorl %ebp, %edx - leaq (%r13, %rdi), %rbp - roll $18, %ebp - xorl %ebp, %r15d - - movq 48(%rsp), %rbp - movq %r15, 72(%rsp) - - leaq (%rax, %rbp), %r15 - roll $7, %r15d - xorl %r15d, %ebx - leaq (%rbp, %rbx), %r15 - roll $9, %r15d - xorl %r15d, %ecx - leaq (%rbx, %rcx), %r15 - roll $13, %r15d - xorl %r15d, %eax - leaq (%rcx, %rax), %r15 - roll $18, %r15d - xorl %r15d, %ebp - - movq 88(%rsp), %r15 - movq %rbp, 48(%rsp) - - leaq (%r12, %r15), %rbp - roll $7, %ebp - xorl %ebp, %esi - leaq (%r15, %rsi), %rbp - roll $9, %ebp - xorl %ebp, %r8d - leaq (%rsi, %r8), %rbp - roll $13, %ebp - xorl %ebp, %r12d - leaq (%r8, %r12), %rbp - roll $18, %ebp - xorl %ebp, %r15d - - movq %r15, 88(%rsp) - movq 72(%rsp), %r15 - - leaq (%rsi, %rdx), %rbp - roll $7, %ebp - xorl %ebp, %edi - leaq (%r9, %r15), %rbp - roll $7, %ebp - xorl %ebp, %eax - leaq (%rdx, %rdi), %rbp - roll $9, %ebp - xorl %ebp, %ecx - leaq (%r15, %rax), %rbp - roll $9, %ebp - xorl %ebp, %r8d - - leaq (%rdi, %rcx), %rbp - roll $13, %ebp - xorl %ebp, %esi - leaq (%rax, %r8), %rbp - roll $13, %ebp - xorl %ebp, %r9d - leaq (%rcx, %rsi), %rbp - roll $18, %ebp - xorl %ebp, %edx - leaq (%r8, %r9), %rbp - roll $18, %ebp - xorl %ebp, %r15d - - movq 48(%rsp), %rbp - movq %r15, 72(%rsp) - - leaq (%r10, %rbp), %r15 - roll $7, %r15d - xorl %r15d, %r12d - leaq (%rbp, %r12), %r15 - roll $9, %r15d - xorl %r15d, %r11d - leaq (%r12, %r11), %r15 - roll $13, %r15d - xorl %r15d, %r10d - leaq (%r11, %r10), %r15 - roll $18, %r15d - xorl %r15d, %ebp - - movq 88(%rsp), %r15 - movq %rbp, 48(%rsp) - - leaq (%rbx, %r15), %rbp - roll $7, %ebp - xorl %ebp, %r14d - leaq (%r15, %r14), %rbp - roll $9, %ebp - xorl %ebp, %r13d - leaq (%r14, %r13), %rbp - roll $13, %ebp - xorl %ebp, %ebx - leaq (%r13, %rbx), %rbp - roll $18, %ebp - xorl %ebp, %r15d - - movq %r15, 88(%rsp) -.endm - - .text - .p2align 6 -salsa8_core_gen: - /* 0: %rdx, %rdi, %rcx, %rsi */ - movq 8(%rsp), %rdi - movq %rdi, %rdx - shrq $32, %rdi - movq 16(%rsp), %rsi - movq %rsi, %rcx - shrq $32, %rsi - /* 1: %r9, 72(%rsp), %rax, %r8 */ 
- movq 24(%rsp), %r8 - movq %r8, %r9 - shrq $32, %r8 - movq %r8, 72(%rsp) - movq 32(%rsp), %r8 - movq %r8, %rax - shrq $32, %r8 - /* 2: %r11, %r10, 48(%rsp), %r12 */ - movq 40(%rsp), %r10 - movq %r10, %r11 - shrq $32, %r10 - movq 48(%rsp), %r12 - /* movq %r12, %r13 */ - /* movq %r13, 48(%rsp) */ - shrq $32, %r12 - /* 3: %r14, %r13, %rbx, 88(%rsp) */ - movq 56(%rsp), %r13 - movq %r13, %r14 - shrq $32, %r13 - movq 64(%rsp), %r15 - movq %r15, %rbx - shrq $32, %r15 - movq %r15, 88(%rsp) - - salsa8_core_gen_doubleround - salsa8_core_gen_doubleround - salsa8_core_gen_doubleround - salsa8_core_gen_doubleround - - shlq $32, %rdi - xorq %rdi, %rdx - movq %rdx, 24(%rsp) - - shlq $32, %rsi - xorq %rsi, %rcx - movq %rcx, 32(%rsp) - - movl 72(%rsp), %edi - shlq $32, %rdi - xorq %rdi, %r9 - movq %r9, 40(%rsp) - - movl 48(%rsp), %ebp - shlq $32, %r8 - xorq %r8, %rax - movq %rax, 48(%rsp) - - shlq $32, %r10 - xorq %r10, %r11 - movq %r11, 56(%rsp) - - shlq $32, %r12 - xorq %r12, %rbp - movq %rbp, 64(%rsp) - - shlq $32, %r13 - xorq %r13, %r14 - movq %r14, 72(%rsp) - - movdqa 24(%rsp), %xmm0 - - shlq $32, %r15 - xorq %r15, %rbx - movq %rbx, 80(%rsp) - - movdqa 40(%rsp), %xmm1 - movdqa 56(%rsp), %xmm2 - movdqa 72(%rsp), %xmm3 - - ret - - - .text - .p2align 6 - .globl scrypt_core - .globl _scrypt_core -scrypt_core: -_scrypt_core: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 -#if defined(WIN64) - subq $176, %rsp - movdqa %xmm6, 8(%rsp) - movdqa %xmm7, 24(%rsp) - movdqa %xmm8, 40(%rsp) - movdqa %xmm9, 56(%rsp) - movdqa %xmm10, 72(%rsp) - movdqa %xmm11, 88(%rsp) - movdqa %xmm12, 104(%rsp) - movdqa %xmm13, 120(%rsp) - movdqa %xmm14, 136(%rsp) - movdqa %xmm15, 152(%rsp) - pushq %rdi - pushq %rsi - movq %rcx, %rdi - movq %rdx, %rsi -#endif - -.macro scrypt_core_cleanup -#if defined(WIN64) - popq %rsi - popq %rdi - movdqa 8(%rsp), %xmm6 - movdqa 24(%rsp), %xmm7 - movdqa 40(%rsp), %xmm8 - movdqa 56(%rsp), %xmm9 - movdqa 72(%rsp), %xmm10 - movdqa 88(%rsp), %xmm11 - movdqa 104(%rsp), %xmm12 - movdqa 120(%rsp), %xmm13 - movdqa 136(%rsp), %xmm14 - movdqa 152(%rsp), %xmm15 - addq $176, %rsp -#endif - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx -.endm - - /* GenuineIntel processors have fast SIMD */ - xorl %eax, %eax - cpuid - cmpl $0x6c65746e, %ecx - jne scrypt_core_gen - cmpl $0x49656e69, %edx - jne scrypt_core_gen - cmpl $0x756e6547, %ebx - je scrypt_core_xmm - - .p2align 6 -scrypt_core_gen: - subq $136, %rsp - movdqa 0(%rdi), %xmm8 - movdqa 16(%rdi), %xmm9 - movdqa 32(%rdi), %xmm10 - movdqa 48(%rdi), %xmm11 - movdqa 64(%rdi), %xmm12 - movdqa 80(%rdi), %xmm13 - movdqa 96(%rdi), %xmm14 - movdqa 112(%rdi), %xmm15 - - leaq 131072(%rsi), %rcx - movq %rdi, 104(%rsp) - movq %rsi, 112(%rsp) - movq %rcx, 120(%rsp) -scrypt_core_gen_loop1: - movdqa %xmm8, 0(%rsi) - movdqa %xmm9, 16(%rsi) - movdqa %xmm10, 32(%rsi) - movdqa %xmm11, 48(%rsi) - movdqa %xmm12, 64(%rsi) - movdqa %xmm13, 80(%rsi) - movdqa %xmm14, 96(%rsi) - movdqa %xmm15, 112(%rsi) - - pxor %xmm12, %xmm8 - pxor %xmm13, %xmm9 - pxor %xmm14, %xmm10 - pxor %xmm15, %xmm11 - movdqa %xmm8, 0(%rsp) - movdqa %xmm9, 16(%rsp) - movdqa %xmm10, 32(%rsp) - movdqa %xmm11, 48(%rsp) - movq %rsi, 128(%rsp) - call salsa8_core_gen - paddd %xmm0, %xmm8 - paddd %xmm1, %xmm9 - paddd %xmm2, %xmm10 - paddd %xmm3, %xmm11 - - pxor %xmm8, %xmm12 - pxor %xmm9, %xmm13 - pxor %xmm10, %xmm14 - pxor %xmm11, %xmm15 - movdqa %xmm12, 0(%rsp) - movdqa %xmm13, 16(%rsp) - movdqa %xmm14, 32(%rsp) - movdqa %xmm15, 48(%rsp) - call salsa8_core_gen - movq 
128(%rsp), %rsi - paddd %xmm0, %xmm12 - paddd %xmm1, %xmm13 - paddd %xmm2, %xmm14 - paddd %xmm3, %xmm15 - - addq $128, %rsi - movq 120(%rsp), %rcx - cmpq %rcx, %rsi - jne scrypt_core_gen_loop1 - - movq $1024, %rcx - movd %xmm12, %edx -scrypt_core_gen_loop2: - movq 112(%rsp), %rsi - andl $1023, %edx - shll $7, %edx - addq %rsi, %rdx - movdqa 0(%rdx), %xmm0 - movdqa 16(%rdx), %xmm1 - movdqa 32(%rdx), %xmm2 - movdqa 48(%rdx), %xmm3 - movdqa 64(%rdx), %xmm4 - movdqa 80(%rdx), %xmm5 - movdqa 96(%rdx), %xmm6 - movdqa 112(%rdx), %xmm7 - pxor %xmm0, %xmm8 - pxor %xmm1, %xmm9 - pxor %xmm2, %xmm10 - pxor %xmm3, %xmm11 - pxor %xmm4, %xmm12 - pxor %xmm5, %xmm13 - pxor %xmm6, %xmm14 - pxor %xmm7, %xmm15 - - pxor %xmm12, %xmm8 - pxor %xmm13, %xmm9 - pxor %xmm14, %xmm10 - pxor %xmm15, %xmm11 - movdqa %xmm8, 0(%rsp) - movdqa %xmm9, 16(%rsp) - movdqa %xmm10, 32(%rsp) - movdqa %xmm11, 48(%rsp) - movq %rcx, 128(%rsp) - call salsa8_core_gen - paddd %xmm0, %xmm8 - paddd %xmm1, %xmm9 - paddd %xmm2, %xmm10 - paddd %xmm3, %xmm11 - - pxor %xmm8, %xmm12 - pxor %xmm9, %xmm13 - pxor %xmm10, %xmm14 - pxor %xmm11, %xmm15 - movdqa %xmm12, 0(%rsp) - movdqa %xmm13, 16(%rsp) - movdqa %xmm14, 32(%rsp) - movdqa %xmm15, 48(%rsp) - call salsa8_core_gen - movq 128(%rsp), %rcx - addl 0(%rsp), %edx - paddd %xmm0, %xmm12 - paddd %xmm1, %xmm13 - paddd %xmm2, %xmm14 - paddd %xmm3, %xmm15 - - subq $1, %rcx - ja scrypt_core_gen_loop2 - - movq 104(%rsp), %rdi - movdqa %xmm8, 0(%rdi) - movdqa %xmm9, 16(%rdi) - movdqa %xmm10, 32(%rdi) - movdqa %xmm11, 48(%rdi) - movdqa %xmm12, 64(%rdi) - movdqa %xmm13, 80(%rdi) - movdqa %xmm14, 96(%rdi) - movdqa %xmm15, 112(%rdi) - - addq $136, %rsp - scrypt_core_cleanup - ret - - -.macro salsa8_core_xmm_doubleround - movdqa %xmm1, %xmm4 - paddd %xmm0, %xmm4 - movdqa %xmm4, %xmm5 - pslld $7, %xmm4 - psrld $25, %xmm5 - pxor %xmm4, %xmm3 - movdqa %xmm0, %xmm4 - pxor %xmm5, %xmm3 - - paddd %xmm3, %xmm4 - movdqa %xmm4, %xmm5 - pslld $9, %xmm4 - psrld $23, %xmm5 - pxor %xmm4, %xmm2 - movdqa %xmm3, %xmm4 - pxor %xmm5, %xmm2 - pshufd $0x93, %xmm3, %xmm3 - - paddd %xmm2, %xmm4 - movdqa %xmm4, %xmm5 - pslld $13, %xmm4 - psrld $19, %xmm5 - pxor %xmm4, %xmm1 - movdqa %xmm2, %xmm4 - pxor %xmm5, %xmm1 - pshufd $0x4e, %xmm2, %xmm2 - - paddd %xmm1, %xmm4 - movdqa %xmm4, %xmm5 - pslld $18, %xmm4 - psrld $14, %xmm5 - pxor %xmm4, %xmm0 - movdqa %xmm3, %xmm4 - pxor %xmm5, %xmm0 - pshufd $0x39, %xmm1, %xmm1 - - paddd %xmm0, %xmm4 - movdqa %xmm4, %xmm5 - pslld $7, %xmm4 - psrld $25, %xmm5 - pxor %xmm4, %xmm1 - movdqa %xmm0, %xmm4 - pxor %xmm5, %xmm1 - - paddd %xmm1, %xmm4 - movdqa %xmm4, %xmm5 - pslld $9, %xmm4 - psrld $23, %xmm5 - pxor %xmm4, %xmm2 - movdqa %xmm1, %xmm4 - pxor %xmm5, %xmm2 - pshufd $0x93, %xmm1, %xmm1 - - paddd %xmm2, %xmm4 - movdqa %xmm4, %xmm5 - pslld $13, %xmm4 - psrld $19, %xmm5 - pxor %xmm4, %xmm3 - movdqa %xmm2, %xmm4 - pxor %xmm5, %xmm3 - pshufd $0x4e, %xmm2, %xmm2 - - paddd %xmm3, %xmm4 - movdqa %xmm4, %xmm5 - pslld $18, %xmm4 - psrld $14, %xmm5 - pxor %xmm4, %xmm0 - pshufd $0x39, %xmm3, %xmm3 - pxor %xmm5, %xmm0 -.endm - -.macro salsa8_core_xmm - salsa8_core_xmm_doubleround - salsa8_core_xmm_doubleround - salsa8_core_xmm_doubleround - salsa8_core_xmm_doubleround -.endm - - .p2align 6 -scrypt_core_xmm: - pcmpeqw %xmm1, %xmm1 - psrlq $32, %xmm1 - - movdqa 0(%rdi), %xmm8 - movdqa 16(%rdi), %xmm11 - movdqa 32(%rdi), %xmm10 - movdqa 48(%rdi), %xmm9 - movdqa %xmm8, %xmm0 - pxor %xmm11, %xmm8 - pand %xmm1, %xmm8 - pxor %xmm11, %xmm8 - pxor %xmm10, %xmm11 - pand %xmm1, %xmm11 - pxor %xmm10, %xmm11 - pxor 
%xmm9, %xmm10 - pand %xmm1, %xmm10 - pxor %xmm9, %xmm10 - pxor %xmm0, %xmm9 - pand %xmm1, %xmm9 - pxor %xmm0, %xmm9 - movdqa %xmm8, %xmm0 - pshufd $0x4e, %xmm10, %xmm10 - punpcklqdq %xmm10, %xmm8 - punpckhqdq %xmm0, %xmm10 - movdqa %xmm11, %xmm0 - pshufd $0x4e, %xmm9, %xmm9 - punpcklqdq %xmm9, %xmm11 - punpckhqdq %xmm0, %xmm9 - - movdqa 64(%rdi), %xmm12 - movdqa 80(%rdi), %xmm15 - movdqa 96(%rdi), %xmm14 - movdqa 112(%rdi), %xmm13 - movdqa %xmm12, %xmm0 - pxor %xmm15, %xmm12 - pand %xmm1, %xmm12 - pxor %xmm15, %xmm12 - pxor %xmm14, %xmm15 - pand %xmm1, %xmm15 - pxor %xmm14, %xmm15 - pxor %xmm13, %xmm14 - pand %xmm1, %xmm14 - pxor %xmm13, %xmm14 - pxor %xmm0, %xmm13 - pand %xmm1, %xmm13 - pxor %xmm0, %xmm13 - movdqa %xmm12, %xmm0 - pshufd $0x4e, %xmm14, %xmm14 - punpcklqdq %xmm14, %xmm12 - punpckhqdq %xmm0, %xmm14 - movdqa %xmm15, %xmm0 - pshufd $0x4e, %xmm13, %xmm13 - punpcklqdq %xmm13, %xmm15 - punpckhqdq %xmm0, %xmm13 - - movq %rsi, %rdx - leaq 131072(%rsi), %rcx -scrypt_core_xmm_loop1: - pxor %xmm12, %xmm8 - pxor %xmm13, %xmm9 - pxor %xmm14, %xmm10 - pxor %xmm15, %xmm11 - movdqa %xmm8, 0(%rdx) - movdqa %xmm9, 16(%rdx) - movdqa %xmm10, 32(%rdx) - movdqa %xmm11, 48(%rdx) - movdqa %xmm12, 64(%rdx) - movdqa %xmm13, 80(%rdx) - movdqa %xmm14, 96(%rdx) - movdqa %xmm15, 112(%rdx) - - movdqa %xmm8, %xmm0 - movdqa %xmm9, %xmm1 - movdqa %xmm10, %xmm2 - movdqa %xmm11, %xmm3 - salsa8_core_xmm - paddd %xmm0, %xmm8 - paddd %xmm1, %xmm9 - paddd %xmm2, %xmm10 - paddd %xmm3, %xmm11 - - pxor %xmm8, %xmm12 - pxor %xmm9, %xmm13 - pxor %xmm10, %xmm14 - pxor %xmm11, %xmm15 - movdqa %xmm12, %xmm0 - movdqa %xmm13, %xmm1 - movdqa %xmm14, %xmm2 - movdqa %xmm15, %xmm3 - salsa8_core_xmm - paddd %xmm0, %xmm12 - paddd %xmm1, %xmm13 - paddd %xmm2, %xmm14 - paddd %xmm3, %xmm15 - - addq $128, %rdx - cmpq %rcx, %rdx - jne scrypt_core_xmm_loop1 - - movq $1024, %rcx -scrypt_core_xmm_loop2: - movd %xmm12, %edx - andl $1023, %edx - shll $7, %edx - pxor 0(%rsi, %rdx), %xmm8 - pxor 16(%rsi, %rdx), %xmm9 - pxor 32(%rsi, %rdx), %xmm10 - pxor 48(%rsi, %rdx), %xmm11 - - pxor %xmm12, %xmm8 - pxor %xmm13, %xmm9 - pxor %xmm14, %xmm10 - pxor %xmm15, %xmm11 - movdqa %xmm8, %xmm0 - movdqa %xmm9, %xmm1 - movdqa %xmm10, %xmm2 - movdqa %xmm11, %xmm3 - salsa8_core_xmm - paddd %xmm0, %xmm8 - paddd %xmm1, %xmm9 - paddd %xmm2, %xmm10 - paddd %xmm3, %xmm11 - - pxor 64(%rsi, %rdx), %xmm12 - pxor 80(%rsi, %rdx), %xmm13 - pxor 96(%rsi, %rdx), %xmm14 - pxor 112(%rsi, %rdx), %xmm15 - pxor %xmm8, %xmm12 - pxor %xmm9, %xmm13 - pxor %xmm10, %xmm14 - pxor %xmm11, %xmm15 - movdqa %xmm12, %xmm0 - movdqa %xmm13, %xmm1 - movdqa %xmm14, %xmm2 - movdqa %xmm15, %xmm3 - salsa8_core_xmm - paddd %xmm0, %xmm12 - paddd %xmm1, %xmm13 - paddd %xmm2, %xmm14 - paddd %xmm3, %xmm15 - - subq $1, %rcx - ja scrypt_core_xmm_loop2 - - pcmpeqw %xmm1, %xmm1 - psrlq $32, %xmm1 - - movdqa %xmm8, %xmm0 - pxor %xmm9, %xmm8 - pand %xmm1, %xmm8 - pxor %xmm9, %xmm8 - pxor %xmm10, %xmm9 - pand %xmm1, %xmm9 - pxor %xmm10, %xmm9 - pxor %xmm11, %xmm10 - pand %xmm1, %xmm10 - pxor %xmm11, %xmm10 - pxor %xmm0, %xmm11 - pand %xmm1, %xmm11 - pxor %xmm0, %xmm11 - movdqa %xmm8, %xmm0 - pshufd $0x4e, %xmm10, %xmm10 - punpcklqdq %xmm10, %xmm8 - punpckhqdq %xmm0, %xmm10 - movdqa %xmm9, %xmm0 - pshufd $0x4e, %xmm11, %xmm11 - punpcklqdq %xmm11, %xmm9 - punpckhqdq %xmm0, %xmm11 - movdqa %xmm8, 0(%rdi) - movdqa %xmm11, 16(%rdi) - movdqa %xmm10, 32(%rdi) - movdqa %xmm9, 48(%rdi) - - movdqa %xmm12, %xmm0 - pxor %xmm13, %xmm12 - pand %xmm1, %xmm12 - pxor %xmm13, %xmm12 - pxor %xmm14, %xmm13 - pand %xmm1, 
%xmm13 - pxor %xmm14, %xmm13 - pxor %xmm15, %xmm14 - pand %xmm1, %xmm14 - pxor %xmm15, %xmm14 - pxor %xmm0, %xmm15 - pand %xmm1, %xmm15 - pxor %xmm0, %xmm15 - movdqa %xmm12, %xmm0 - pshufd $0x4e, %xmm14, %xmm14 - punpcklqdq %xmm14, %xmm12 - punpckhqdq %xmm0, %xmm14 - movdqa %xmm13, %xmm0 - pshufd $0x4e, %xmm15, %xmm15 - punpcklqdq %xmm15, %xmm13 - punpckhqdq %xmm0, %xmm15 - movdqa %xmm12, 64(%rdi) - movdqa %xmm15, 80(%rdi) - movdqa %xmm14, 96(%rdi) - movdqa %xmm13, 112(%rdi) - - scrypt_core_cleanup - ret - -#endif diff --git a/src/crypto/scrypt/intrin/scrypt-intrin.cpp b/src/crypto/scrypt/intrin/scrypt-intrin.cpp new file mode 100644 index 0000000..04613b9 --- /dev/null +++ b/src/crypto/scrypt/intrin/scrypt-intrin.cpp @@ -0,0 +1,151 @@ +/* + * Copyright 2009 Colin Percival, 2011 ArtForz, 2012-2013 pooler + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + */ + +#ifdef __ARM_NEON +#include +#else +#include +#endif + +#include "scrypt.h" + +static inline uint32_t le32dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) + + ((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24)); +} + +static inline void le32enc(void *pp, uint32_t x) +{ + uint8_t *p = (uint8_t *)pp; + p[0] = x & 0xff; + p[1] = (x >> 8) & 0xff; + p[2] = (x >> 16) & 0xff; + p[3] = (x >> 24) & 0xff; +} + +static inline void xor_salsa8_sse2(__m128i B[4], const __m128i Bx[4]) +{ + __m128i X0 = B[0] = _mm_xor_si128(B[0], Bx[0]); + __m128i X1 = B[1] = _mm_xor_si128(B[1], Bx[1]); + __m128i X2 = B[2] = _mm_xor_si128(B[2], Bx[2]); + __m128i X3 = B[3] = _mm_xor_si128(B[3], Bx[3]); + + for (uint32_t i = 0; i < 8; i += 2) { + /* Operate on "columns". 
*/ + __m128i T = _mm_add_epi32(X0, X3); + X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 7)); + X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 25)); + T = _mm_add_epi32(X1, X0); + X2 = _mm_xor_si128(X2, _mm_slli_epi32(T, 9)); + X2 = _mm_xor_si128(X2, _mm_srli_epi32(T, 23)); + T = _mm_add_epi32(X2, X1); + X3 = _mm_xor_si128(X3, _mm_slli_epi32(T, 13)); + X3 = _mm_xor_si128(X3, _mm_srli_epi32(T, 19)); + T = _mm_add_epi32(X3, X2); + X0 = _mm_xor_si128(X0, _mm_slli_epi32(T, 18)); + X0 = _mm_xor_si128(X0, _mm_srli_epi32(T, 14)); + + /* Rearrange data. */ + X1 = _mm_shuffle_epi32(X1, 0x93); + X2 = _mm_shuffle_epi32(X2, 0x4E); + X3 = _mm_shuffle_epi32(X3, 0x39); + + /* Operate on "rows". */ + T = _mm_add_epi32(X0, X1); + X3 = _mm_xor_si128(X3, _mm_slli_epi32(T, 7)); + X3 = _mm_xor_si128(X3, _mm_srli_epi32(T, 25)); + T = _mm_add_epi32(X3, X0); + X2 = _mm_xor_si128(X2, _mm_slli_epi32(T, 9)); + X2 = _mm_xor_si128(X2, _mm_srli_epi32(T, 23)); + T = _mm_add_epi32(X2, X3); + X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 13)); + X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 19)); + T = _mm_add_epi32(X1, X2); + X0 = _mm_xor_si128(X0, _mm_slli_epi32(T, 18)); + X0 = _mm_xor_si128(X0, _mm_srli_epi32(T, 14)); + + /* Rearrange data. */ + X1 = _mm_shuffle_epi32(X1, 0x39); + X2 = _mm_shuffle_epi32(X2, 0x4E); + X3 = _mm_shuffle_epi32(X3, 0x93); + } + + B[0] = _mm_add_epi32(B[0], X0); + B[1] = _mm_add_epi32(B[1], X1); + B[2] = _mm_add_epi32(B[2], X2); + B[3] = _mm_add_epi32(B[3], X3); +} + +uint256 scrypt_blockhash(const uint8_t* input) +{ + uint8_t scratchpad[SCRYPT_BUFFER_SIZE]; + __m128i *V = (__m128i *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); + + uint8_t B[128]; + void *const tmp = const_cast(input); + PKCS5_PBKDF2_HMAC(static_cast(tmp), 80, input, 80, 1, EVP_sha256(), 128, B); + + union { + __m128i i128[8]; + uint32_t u32[32]; + } X; + uint32_t i, k; + for (k = 0; k < 2; k++) { + for (i = 0; i < 16; i++) { + X.u32[k * 16 + i] = le32dec(&B[(k * 16 + (i * 5 % 16)) * 4]); + } + } + + for (i = 0; i < 1024; i++) { + for (k = 0; k < 8; k++) + V[i * 8 + k] = X.i128[k]; + xor_salsa8_sse2(&X.i128[0], &X.i128[4]); + xor_salsa8_sse2(&X.i128[4], &X.i128[0]); + } + for (i = 0; i < 1024; i++) { + uint32_t j = 8 * (X.u32[16] & 1023); + for (k = 0; k < 8; k++) + X.i128[k] = _mm_xor_si128(X.i128[k], V[j + k]); + xor_salsa8_sse2(&X.i128[0], &X.i128[4]); + xor_salsa8_sse2(&X.i128[4], &X.i128[0]); + } + + for (k = 0; k < 2; k++) { + for (i = 0; i < 16; i++) { + le32enc(&B[(k * 16 + (i * 5 % 16)) * 4], X.u32[k * 16 + i]); + } + } + + uint256 result = 0; + PKCS5_PBKDF2_HMAC(static_cast(tmp), 80, B, 128, 1, EVP_sha256(), 32, (unsigned char*)&result); + + return result; +} diff --git a/src/crypto/scrypt/intrin/scrypt-sse2.cpp b/src/crypto/scrypt/intrin/scrypt-sse2.cpp deleted file mode 100644 index bc828e3..0000000 --- a/src/crypto/scrypt/intrin/scrypt-sse2.cpp +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Copyright 2009 Colin Percival, 2011 ArtForz, 2012-2013 pooler - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * This file was originally written by Colin Percival as part of the Tarsnap - * online backup system. - */ - -#include -#include "scrypt.h" - -static inline uint32_t le32dec(const void *pp) -{ - const uint8_t *p = (uint8_t const *)pp; - return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) + - ((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24)); -} - -static inline void le32enc(void *pp, uint32_t x) -{ - uint8_t *p = (uint8_t *)pp; - p[0] = x & 0xff; - p[1] = (x >> 8) & 0xff; - p[2] = (x >> 16) & 0xff; - p[3] = (x >> 24) & 0xff; -} - -static inline void xor_salsa8_sse2(__m128i B[4], const __m128i Bx[4]) -{ - __m128i X0 = B[0] = _mm_xor_si128(B[0], Bx[0]); - __m128i X1 = B[1] = _mm_xor_si128(B[1], Bx[1]); - __m128i X2 = B[2] = _mm_xor_si128(B[2], Bx[2]); - __m128i X3 = B[3] = _mm_xor_si128(B[3], Bx[3]); - - for (uint32_t i = 0; i < 8; i += 2) { - /* Operate on "columns". */ - __m128i T = _mm_add_epi32(X0, X3); - X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 7)); - X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 25)); - T = _mm_add_epi32(X1, X0); - X2 = _mm_xor_si128(X2, _mm_slli_epi32(T, 9)); - X2 = _mm_xor_si128(X2, _mm_srli_epi32(T, 23)); - T = _mm_add_epi32(X2, X1); - X3 = _mm_xor_si128(X3, _mm_slli_epi32(T, 13)); - X3 = _mm_xor_si128(X3, _mm_srli_epi32(T, 19)); - T = _mm_add_epi32(X3, X2); - X0 = _mm_xor_si128(X0, _mm_slli_epi32(T, 18)); - X0 = _mm_xor_si128(X0, _mm_srli_epi32(T, 14)); - - /* Rearrange data. */ - X1 = _mm_shuffle_epi32(X1, 0x93); - X2 = _mm_shuffle_epi32(X2, 0x4E); - X3 = _mm_shuffle_epi32(X3, 0x39); - - /* Operate on "rows". */ - T = _mm_add_epi32(X0, X1); - X3 = _mm_xor_si128(X3, _mm_slli_epi32(T, 7)); - X3 = _mm_xor_si128(X3, _mm_srli_epi32(T, 25)); - T = _mm_add_epi32(X3, X0); - X2 = _mm_xor_si128(X2, _mm_slli_epi32(T, 9)); - X2 = _mm_xor_si128(X2, _mm_srli_epi32(T, 23)); - T = _mm_add_epi32(X2, X3); - X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 13)); - X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 19)); - T = _mm_add_epi32(X1, X2); - X0 = _mm_xor_si128(X0, _mm_slli_epi32(T, 18)); - X0 = _mm_xor_si128(X0, _mm_srli_epi32(T, 14)); - - /* Rearrange data. 
*/ - X1 = _mm_shuffle_epi32(X1, 0x39); - X2 = _mm_shuffle_epi32(X2, 0x4E); - X3 = _mm_shuffle_epi32(X3, 0x93); - } - - B[0] = _mm_add_epi32(B[0], X0); - B[1] = _mm_add_epi32(B[1], X1); - B[2] = _mm_add_epi32(B[2], X2); - B[3] = _mm_add_epi32(B[3], X3); -} - -uint256 scrypt_blockhash(const uint8_t* input) -{ - uint8_t scratchpad[SCRYPT_BUFFER_SIZE]; - __m128i *V = (__m128i *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - uint8_t B[128]; - void *const tmp = const_cast(input); - PKCS5_PBKDF2_HMAC(static_cast(tmp), 80, input, 80, 1, EVP_sha256(), 128, B); - - union { - __m128i i128[8]; - uint32_t u32[32]; - } X; - uint32_t i, k; - for (k = 0; k < 2; k++) { - for (i = 0; i < 16; i++) { - X.u32[k * 16 + i] = le32dec(&B[(k * 16 + (i * 5 % 16)) * 4]); - } - } - - for (i = 0; i < 1024; i++) { - for (k = 0; k < 8; k++) - V[i * 8 + k] = X.i128[k]; - xor_salsa8_sse2(&X.i128[0], &X.i128[4]); - xor_salsa8_sse2(&X.i128[4], &X.i128[0]); - } - for (i = 0; i < 1024; i++) { - uint32_t j = 8 * (X.u32[16] & 1023); - for (k = 0; k < 8; k++) - X.i128[k] = _mm_xor_si128(X.i128[k], V[j + k]); - xor_salsa8_sse2(&X.i128[0], &X.i128[4]); - xor_salsa8_sse2(&X.i128[4], &X.i128[0]); - } - - for (k = 0; k < 2; k++) { - for (i = 0; i < 16; i++) { - le32enc(&B[(k * 16 + (i * 5 % 16)) * 4], X.u32[k * 16 + i]); - } - } - - uint256 result = 0; - PKCS5_PBKDF2_HMAC(static_cast(tmp), 80, B, 128, 1, EVP_sha256(), 32, (unsigned char*)&result); - - return result; -} -- 1.7.1
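Configuration sketch (illustrative only, written in the style of the ## usage comments in CMakeLists.txt; ALLOW_32BIT and USE_GENERIC_SCRYPT are the cache variables introduced by this patch, while the submodule command and directory layout are assumptions about a typical checkout):

## Fetch the sse2neon submodule registered above before configuring:
## git submodule update --init src/additional/sse2neon
##
## Default on a 64-bit host (x86_64 or AArch64): the intrinsic implementation
## in src/crypto/scrypt/intrin/scrypt-intrin.cpp is compiled, with sse2neon
## translating the SSE2 intrinsics to NEON on AArch64.
## mkdir build && cd build
## cmake ..
##
## Opt out of the intrinsic path and build the portable implementation:
## cmake -DUSE_GENERIC_SCRYPT=1 ..
##
## Permit a 32-bit build (generic scrypt is then selected automatically):
## cmake -DALLOW_32BIT=1 ..

The same switches apply to both novacoin-qt and novacoind, since the top-level and src/ CMakeLists.txt receive the identical 64-bit check and scrypt source selection in this patch.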