From 589ad785953e590eeea2fc2914eeb330fe1c6071 Mon Sep 17 00:00:00 2001 From: alex Date: Fri, 3 Jan 2014 03:59:01 +0400 Subject: [PATCH] Compatibility improvements * Add a generic SALSA implementation and disable optimized implementations by default; * Replace assembler .macro directives with preprocessor macros in the ARM SALSA implementation; * Miscellaneous definition fixes to correct mingw-w64 compatibility issues. --- src/compat.h | 4 +- src/makefile.linux-mingw | 4 +- src/scrypt-arm.S | 878 ++++++++++++++++++++++------------------------ src/scrypt-x86_64.S | 5 +- src/scrypt.cpp | 105 +++++- src/util.h | 3 +- 6 files changed, 529 insertions(+), 470 deletions(-) diff --git a/src/compat.h b/src/compat.h index 79ebb93..d09ec4f 100644 --- a/src/compat.h +++ b/src/compat.h @@ -23,9 +23,11 @@ #include #include #include -#endif typedef u_int SOCKET; +#endif + + #ifdef WIN32 #define MSG_NOSIGNAL 0 #define MSG_DONTWAIT 0 diff --git a/src/makefile.linux-mingw b/src/makefile.linux-mingw index 014c87f..5be8548 100644 --- a/src/makefile.linux-mingw +++ b/src/makefile.linux-mingw @@ -18,13 +18,13 @@ USE_LEVELDB:=1 INCLUDEPATHS= \ -I"$(CURDIR)" \ -I"$(CURDIR)"/obj \ - -I"$(DEPSDIR)/boost_1_50_0" \ + -I"$(DEPSDIR)/boost_1_55_0" \ -I"$(DEPSDIR)/db-4.8.30.NC/build_unix" \ -I"$(DEPSDIR)/openssl-1.0.1c/include" \ -I"$(DEPSDIR)" LIBPATHS= \ - -L"$(DEPSDIR)/boost_1_50_0/stage/lib" \ + -L"$(DEPSDIR)/boost_1_55_0/stage/lib" \ -L"$(DEPSDIR)/db-4.8.30.NC/build_unix" \ -L"$(DEPSDIR)/openssl-1.0.1c" diff --git a/src/scrypt-arm.S b/src/scrypt-arm.S index 14f2a7e..12d94b0 100644 --- a/src/scrypt-arm.S +++ b/src/scrypt-arm.S @@ -7,12 +7,13 @@ * any later version. See COPYING for more details. 
*/ + +#if defined(__arm__) && defined(__APCS_32__) + #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif -#if defined(__arm__) && defined(__APCS_32__) - #if defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || \ defined(__ARM_ARCH_5TEJ__) || defined(__ARM_ARCH_6__) || \ defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || \ @@ -29,437 +30,406 @@ #ifdef __ARM_ARCH_5E_OR_6__ -.macro scrypt_shuffle - add lr, r0, #9*4 - ldmia r0, {r2-r7} - ldmia lr, {r2, r8-r12, lr} - str r3, [r0, #5*4] - str r5, [r0, #15*4] - str r6, [r0, #12*4] - str r7, [r0, #1*4] - ldr r5, [r0, #7*4] - str r2, [r0, #13*4] - str r8, [r0, #2*4] - strd r4, [r0, #10*4] - str r9, [r0, #7*4] - str r10, [r0, #4*4] - str r11, [r0, #9*4] - str lr, [r0, #3*4] - - add r2, r0, #64+0*4 - add lr, r0, #64+9*4 - ldmia r2, {r2-r7} - ldmia lr, {r2, r8-r12, lr} - str r3, [r0, #64+5*4] - str r5, [r0, #64+15*4] - str r6, [r0, #64+12*4] - str r7, [r0, #64+1*4] - ldr r5, [r0, #64+7*4] - str r2, [r0, #64+13*4] - str r8, [r0, #64+2*4] - strd r4, [r0, #64+10*4] - str r9, [r0, #64+7*4] - str r10, [r0, #64+4*4] - str r11, [r0, #64+9*4] - str lr, [r0, #64+3*4] -.endm - -.macro salsa8_core_doubleround_body - add r6, r2, r6 - add r7, r3, r7 - eor r10, r10, r6, ror #25 - add r6, r0, r4 - eor r11, r11, r7, ror #25 - add r7, r1, r5 - strd r10, [sp, #14*4] - eor r12, r12, r6, ror #25 - eor lr, lr, r7, ror #25 - - ldrd r6, [sp, #10*4] - add r2, r10, r2 - add r3, r11, r3 - eor r6, r6, r2, ror #23 - add r2, r12, r0 - eor r7, r7, r3, ror #23 - add r3, lr, r1 - strd r6, [sp, #10*4] - eor r8, r8, r2, ror #23 - eor r9, r9, r3, ror #23 - - ldrd r2, [sp, #6*4] - add r10, r6, r10 - add r11, r7, r11 - eor r2, r2, r10, ror #19 - add r10, r8, r12 - eor r3, r3, r11, ror #19 - add r11, r9, lr - eor r4, r4, r10, ror #19 - eor r5, r5, r11, ror #19 - - ldrd r10, [sp, #2*4] - add r6, r2, r6 - add r7, r3, r7 - eor r10, r10, r6, ror #14 - add r6, r4, r8 - eor r11, r11, r7, ror #14 - add r7, r5, r9 - eor r0, r0, 
r6, ror #14 - eor r1, r1, r7, ror #14 - - - ldrd r6, [sp, #14*4] - strd r2, [sp, #6*4] - strd r10, [sp, #2*4] - add r6, r11, r6 - add r7, r0, r7 - eor r4, r4, r6, ror #25 - add r6, r1, r12 - eor r5, r5, r7, ror #25 - add r7, r10, lr - eor r2, r2, r6, ror #25 - eor r3, r3, r7, ror #25 - strd r2, [sp, #6*4] - - add r10, r3, r10 - ldrd r6, [sp, #10*4] - add r11, r4, r11 - eor r8, r8, r10, ror #23 - add r10, r5, r0 - eor r9, r9, r11, ror #23 - add r11, r2, r1 - eor r6, r6, r10, ror #23 - eor r7, r7, r11, ror #23 - strd r6, [sp, #10*4] - - add r2, r7, r2 - ldrd r10, [sp, #14*4] - add r3, r8, r3 - eor r12, r12, r2, ror #19 - add r2, r9, r4 - eor lr, lr, r3, ror #19 - add r3, r6, r5 - eor r10, r10, r2, ror #19 - eor r11, r11, r3, ror #19 - - ldrd r2, [sp, #2*4] - add r6, r11, r6 - add r7, r12, r7 - eor r0, r0, r6, ror #14 - add r6, lr, r8 - eor r1, r1, r7, ror #14 - add r7, r10, r9 - eor r2, r2, r6, ror #14 - eor r3, r3, r7, ror #14 -.endm - -.macro salsa8_core - ldmia sp, {r0-r12, lr} - - ldrd r10, [sp, #14*4] - salsa8_core_doubleround_body - ldrd r6, [sp, #6*4] - strd r2, [sp, #2*4] - strd r10, [sp, #14*4] - salsa8_core_doubleround_body - ldrd r6, [sp, #6*4] - strd r2, [sp, #2*4] - strd r10, [sp, #14*4] - salsa8_core_doubleround_body - ldrd r6, [sp, #6*4] - strd r2, [sp, #2*4] - strd r10, [sp, #14*4] - salsa8_core_doubleround_body - - stmia sp, {r0-r5} - strd r8, [sp, #8*4] - str r12, [sp, #12*4] - str lr, [sp, #13*4] - strd r10, [sp, #14*4] -.endm +#define scrypt_shuffle() \ + add lr, r0, #9*4; \ + ldmia r0, {r2-r7}; \ + ldmia lr, {r2, r8-r12, lr}; \ + str r3, [r0, #5*4]; \ + str r5, [r0, #15*4]; \ + str r6, [r0, #12*4]; \ + str r7, [r0, #1*4]; \ + ldr r5, [r0, #7*4]; \ + str r2, [r0, #13*4]; \ + str r8, [r0, #2*4]; \ + strd r4, [r0, #10*4]; \ + str r9, [r0, #7*4]; \ + str r10, [r0, #4*4]; \ + str r11, [r0, #9*4]; \ + str lr, [r0, #3*4]; \ + add r2, r0, #64+0*4; \ + add lr, r0, #64+9*4; \ + ldmia r2, {r2-r7}; \ + ldmia lr, {r2, r8-r12, lr}; \ + str r3, [r0, #64+5*4]; \ 
+ str r5, [r0, #64+15*4]; \ + str r6, [r0, #64+12*4]; \ + str r7, [r0, #64+1*4]; \ + ldr r5, [r0, #64+7*4]; \ + str r2, [r0, #64+13*4]; \ + str r8, [r0, #64+2*4]; \ + strd r4, [r0, #64+10*4]; \ + str r9, [r0, #64+7*4]; \ + str r10, [r0, #64+4*4]; \ + str r11, [r0, #64+9*4]; \ + str lr, [r0, #64+3*4]; \ + + +#define salsa8_core_doubleround_body() \ + add r6, r2, r6; \ + add r7, r3, r7; \ + eor r10, r10, r6, ror #25; \ + add r6, r0, r4; \ + eor r11, r11, r7, ror #25; \ + add r7, r1, r5; \ + strd r10, [sp, #14*4]; \ + eor r12, r12, r6, ror #25; \ + eor lr, lr, r7, ror #25; \ + ldrd r6, [sp, #10*4]; \ + add r2, r10, r2; \ + add r3, r11, r3; \ + eor r6, r6, r2, ror #23; \ + add r2, r12, r0; \ + eor r7, r7, r3, ror #23; \ + add r3, lr, r1; \ + strd r6, [sp, #10*4]; \ + eor r8, r8, r2, ror #23; \ + eor r9, r9, r3, ror #23; \ + ldrd r2, [sp, #6*4]; \ + add r10, r6, r10; \ + add r11, r7, r11; \ + eor r2, r2, r10, ror #19; \ + add r10, r8, r12; \ + eor r3, r3, r11, ror #19; \ + add r11, r9, lr; \ + eor r4, r4, r10, ror #19; \ + eor r5, r5, r11, ror #19; \ + ldrd r10, [sp, #2*4]; \ + add r6, r2, r6; \ + add r7, r3, r7; \ + eor r10, r10, r6, ror #14; \ + add r6, r4, r8; \ + eor r11, r11, r7, ror #14; \ + add r7, r5, r9; \ + eor r0, r0, r6, ror #14; \ + eor r1, r1, r7, ror #14; \ + ldrd r6, [sp, #14*4]; \ + strd r2, [sp, #6*4]; \ + strd r10, [sp, #2*4]; \ + add r6, r11, r6; \ + add r7, r0, r7; \ + eor r4, r4, r6, ror #25; \ + add r6, r1, r12; \ + eor r5, r5, r7, ror #25; \ + add r7, r10, lr; \ + eor r2, r2, r6, ror #25; \ + eor r3, r3, r7, ror #25; \ + strd r2, [sp, #6*4]; \ + add r10, r3, r10; \ + ldrd r6, [sp, #10*4]; \ + add r11, r4, r11; \ + eor r8, r8, r10, ror #23; \ + add r10, r5, r0; \ + eor r9, r9, r11, ror #23; \ + add r11, r2, r1; \ + eor r6, r6, r10, ror #23; \ + eor r7, r7, r11, ror #23; \ + strd r6, [sp, #10*4]; \ + add r2, r7, r2; \ + ldrd r10, [sp, #14*4]; \ + add r3, r8, r3; \ + eor r12, r12, r2, ror #19; \ + add r2, r9, r4; \ + eor lr, lr, r3, ror #19; \ + add 
r3, r6, r5; \ + eor r10, r10, r2, ror #19; \ + eor r11, r11, r3, ror #19; \ + ldrd r2, [sp, #2*4]; \ + add r6, r11, r6; \ + add r7, r12, r7; \ + eor r0, r0, r6, ror #14; \ + add r6, lr, r8; \ + eor r1, r1, r7, ror #14; \ + add r7, r10, r9; \ + eor r2, r2, r6, ror #14; \ + eor r3, r3, r7, ror #14; \ + + +#define salsa8_core() \ + ldmia sp, {r0-r12, lr}; \ + ldrd r10, [sp, #14*4]; \ + salsa8_core_doubleround_body(); \ + ldrd r6, [sp, #6*4]; \ + strd r2, [sp, #2*4]; \ + strd r10, [sp, #14*4]; \ + salsa8_core_doubleround_body(); \ + ldrd r6, [sp, #6*4]; \ + strd r2, [sp, #2*4]; \ + strd r10, [sp, #14*4]; \ + salsa8_core_doubleround_body(); \ + ldrd r6, [sp, #6*4]; \ + strd r2, [sp, #2*4]; \ + strd r10, [sp, #14*4]; \ + salsa8_core_doubleround_body(); \ + stmia sp, {r0-r5}; \ + strd r8, [sp, #8*4]; \ + str r12, [sp, #12*4]; \ + str lr, [sp, #13*4]; \ + strd r10, [sp, #14*4]; \ + #else -.macro scrypt_shuffle -.endm - -.macro salsa8_core_doubleround_body - ldr r8, [sp, #8*4] - add r11, r11, r10 - ldr lr, [sp, #13*4] - add r12, r12, r3 - eor r2, r2, r11, ror #23 - add r11, r4, r0 - eor r7, r7, r12, ror #23 - add r12, r9, r5 - str r9, [sp, #9*4] - eor r8, r8, r11, ror #23 - str r10, [sp, #14*4] - eor lr, lr, r12, ror #23 - - ldr r11, [sp, #11*4] - add r9, lr, r9 - ldr r12, [sp, #12*4] - add r10, r2, r10 - eor r1, r1, r9, ror #19 - add r9, r7, r3 - eor r6, r6, r10, ror #19 - add r10, r8, r4 - str r8, [sp, #8*4] - eor r11, r11, r9, ror #19 - str lr, [sp, #13*4] - eor r12, r12, r10, ror #19 - - ldr r9, [sp, #10*4] - add r8, r12, r8 - ldr r10, [sp, #15*4] - add lr, r1, lr - eor r0, r0, r8, ror #14 - add r8, r6, r2 - eor r5, r5, lr, ror #14 - add lr, r11, r7 - eor r9, r9, r8, ror #14 - ldr r8, [sp, #9*4] - eor r10, r10, lr, ror #14 - ldr lr, [sp, #14*4] - - - add r8, r9, r8 - str r9, [sp, #10*4] - add lr, r10, lr - str r10, [sp, #15*4] - eor r11, r11, r8, ror #25 - add r8, r0, r3 - eor r12, r12, lr, ror #25 - add lr, r5, r4 - eor r1, r1, r8, ror #25 - ldr r8, [sp, #8*4] - eor 
r6, r6, lr, ror #25 - - add r9, r11, r9 - ldr lr, [sp, #13*4] - add r10, r12, r10 - eor r8, r8, r9, ror #23 - add r9, r1, r0 - eor lr, lr, r10, ror #23 - add r10, r6, r5 - str r11, [sp, #11*4] - eor r2, r2, r9, ror #23 - str r12, [sp, #12*4] - eor r7, r7, r10, ror #23 - - ldr r9, [sp, #9*4] - add r11, r8, r11 - ldr r10, [sp, #14*4] - add r12, lr, r12 - eor r9, r9, r11, ror #19 - add r11, r2, r1 - eor r10, r10, r12, ror #19 - add r12, r7, r6 - str r8, [sp, #8*4] - eor r3, r3, r11, ror #19 - str lr, [sp, #13*4] - eor r4, r4, r12, ror #19 -.endm - -.macro salsa8_core - ldmia sp, {r0-r7} - - ldr r12, [sp, #15*4] - ldr r8, [sp, #11*4] - ldr lr, [sp, #12*4] - - ldr r9, [sp, #9*4] - add r8, r8, r12 - ldr r11, [sp, #10*4] - add lr, lr, r0 - eor r3, r3, r8, ror #25 - add r8, r5, r1 - ldr r10, [sp, #14*4] - eor r4, r4, lr, ror #25 - add lr, r11, r6 - eor r9, r9, r8, ror #25 - eor r10, r10, lr, ror #25 - - salsa8_core_doubleround_body - - ldr r11, [sp, #10*4] - add r8, r9, r8 - ldr r12, [sp, #15*4] - add lr, r10, lr - eor r11, r11, r8, ror #14 - add r8, r3, r2 - eor r12, r12, lr, ror #14 - add lr, r4, r7 - eor r0, r0, r8, ror #14 - ldr r8, [sp, #11*4] - eor r5, r5, lr, ror #14 - ldr lr, [sp, #12*4] - - add r8, r8, r12 - str r11, [sp, #10*4] - add lr, lr, r0 - str r12, [sp, #15*4] - eor r3, r3, r8, ror #25 - add r8, r5, r1 - eor r4, r4, lr, ror #25 - add lr, r11, r6 - str r9, [sp, #9*4] - eor r9, r9, r8, ror #25 - str r10, [sp, #14*4] - eor r10, r10, lr, ror #25 - - salsa8_core_doubleround_body - - ldr r11, [sp, #10*4] - add r8, r9, r8 - ldr r12, [sp, #15*4] - add lr, r10, lr - eor r11, r11, r8, ror #14 - add r8, r3, r2 - eor r12, r12, lr, ror #14 - add lr, r4, r7 - eor r0, r0, r8, ror #14 - ldr r8, [sp, #11*4] - eor r5, r5, lr, ror #14 - ldr lr, [sp, #12*4] - - add r8, r8, r12 - str r11, [sp, #10*4] - add lr, lr, r0 - str r12, [sp, #15*4] - eor r3, r3, r8, ror #25 - add r8, r5, r1 - eor r4, r4, lr, ror #25 - add lr, r11, r6 - str r9, [sp, #9*4] - eor r9, r9, r8, ror #25 - str 
r10, [sp, #14*4] - eor r10, r10, lr, ror #25 - - salsa8_core_doubleround_body - - ldr r11, [sp, #10*4] - add r8, r9, r8 - ldr r12, [sp, #15*4] - add lr, r10, lr - eor r11, r11, r8, ror #14 - add r8, r3, r2 - eor r12, r12, lr, ror #14 - add lr, r4, r7 - eor r0, r0, r8, ror #14 - ldr r8, [sp, #11*4] - eor r5, r5, lr, ror #14 - ldr lr, [sp, #12*4] - - add r8, r8, r12 - str r11, [sp, #10*4] - add lr, lr, r0 - str r12, [sp, #15*4] - eor r3, r3, r8, ror #25 - add r8, r5, r1 - eor r4, r4, lr, ror #25 - add lr, r11, r6 - str r9, [sp, #9*4] - eor r9, r9, r8, ror #25 - str r10, [sp, #14*4] - eor r10, r10, lr, ror #25 - - salsa8_core_doubleround_body - - ldr r11, [sp, #10*4] - add r8, r9, r8 - ldr r12, [sp, #15*4] - add lr, r10, lr - str r9, [sp, #9*4] - eor r11, r11, r8, ror #14 - eor r12, r12, lr, ror #14 - add r8, r3, r2 - str r10, [sp, #14*4] - add lr, r4, r7 - str r11, [sp, #10*4] - eor r0, r0, r8, ror #14 - str r12, [sp, #15*4] - eor r5, r5, lr, ror #14 - - stmia sp, {r0-r7} -.endm +#define scrypt_shuffle() \ + + +#define salsa8_core_doubleround_body() \ + ldr r8, [sp, #8*4]; \ + add r11, r11, r10; \ + ldr lr, [sp, #13*4]; \ + add r12, r12, r3; \ + eor r2, r2, r11, ror #23; \ + add r11, r4, r0; \ + eor r7, r7, r12, ror #23; \ + add r12, r9, r5; \ + str r9, [sp, #9*4]; \ + eor r8, r8, r11, ror #23; \ + str r10, [sp, #14*4]; \ + eor lr, lr, r12, ror #23; \ + ldr r11, [sp, #11*4]; \ + add r9, lr, r9; \ + ldr r12, [sp, #12*4]; \ + add r10, r2, r10; \ + eor r1, r1, r9, ror #19; \ + add r9, r7, r3; \ + eor r6, r6, r10, ror #19; \ + add r10, r8, r4; \ + str r8, [sp, #8*4]; \ + eor r11, r11, r9, ror #19; \ + str lr, [sp, #13*4]; \ + eor r12, r12, r10, ror #19; \ + ldr r9, [sp, #10*4]; \ + add r8, r12, r8; \ + ldr r10, [sp, #15*4]; \ + add lr, r1, lr; \ + eor r0, r0, r8, ror #14; \ + add r8, r6, r2; \ + eor r5, r5, lr, ror #14; \ + add lr, r11, r7; \ + eor r9, r9, r8, ror #14; \ + ldr r8, [sp, #9*4]; \ + eor r10, r10, lr, ror #14; \ + ldr lr, [sp, #14*4]; \ + add r8, r9, r8; \ + 
str r9, [sp, #10*4]; \ + add lr, r10, lr; \ + str r10, [sp, #15*4]; \ + eor r11, r11, r8, ror #25; \ + add r8, r0, r3; \ + eor r12, r12, lr, ror #25; \ + add lr, r5, r4; \ + eor r1, r1, r8, ror #25; \ + ldr r8, [sp, #8*4]; \ + eor r6, r6, lr, ror #25; \ + add r9, r11, r9; \ + ldr lr, [sp, #13*4]; \ + add r10, r12, r10; \ + eor r8, r8, r9, ror #23; \ + add r9, r1, r0; \ + eor lr, lr, r10, ror #23; \ + add r10, r6, r5; \ + str r11, [sp, #11*4]; \ + eor r2, r2, r9, ror #23; \ + str r12, [sp, #12*4]; \ + eor r7, r7, r10, ror #23; \ + ldr r9, [sp, #9*4]; \ + add r11, r8, r11; \ + ldr r10, [sp, #14*4]; \ + add r12, lr, r12; \ + eor r9, r9, r11, ror #19; \ + add r11, r2, r1; \ + eor r10, r10, r12, ror #19; \ + add r12, r7, r6; \ + str r8, [sp, #8*4]; \ + eor r3, r3, r11, ror #19; \ + str lr, [sp, #13*4]; \ + eor r4, r4, r12, ror #19; \ + + +#define salsa8_core() \ + ldmia sp, {r0-r7}; \ + ldr r12, [sp, #15*4]; \ + ldr r8, [sp, #11*4]; \ + ldr lr, [sp, #12*4]; \ + ldr r9, [sp, #9*4]; \ + add r8, r8, r12; \ + ldr r11, [sp, #10*4]; \ + add lr, lr, r0; \ + eor r3, r3, r8, ror #25; \ + add r8, r5, r1; \ + ldr r10, [sp, #14*4]; \ + eor r4, r4, lr, ror #25; \ + add lr, r11, r6; \ + eor r9, r9, r8, ror #25; \ + eor r10, r10, lr, ror #25; \ + salsa8_core_doubleround_body(); \ + ldr r11, [sp, #10*4]; \ + add r8, r9, r8; \ + ldr r12, [sp, #15*4]; \ + add lr, r10, lr; \ + eor r11, r11, r8, ror #14; \ + add r8, r3, r2; \ + eor r12, r12, lr, ror #14; \ + add lr, r4, r7; \ + eor r0, r0, r8, ror #14; \ + ldr r8, [sp, #11*4]; \ + eor r5, r5, lr, ror #14; \ + ldr lr, [sp, #12*4]; \ + add r8, r8, r12; \ + str r11, [sp, #10*4]; \ + add lr, lr, r0; \ + str r12, [sp, #15*4]; \ + eor r3, r3, r8, ror #25; \ + add r8, r5, r1; \ + eor r4, r4, lr, ror #25; \ + add lr, r11, r6; \ + str r9, [sp, #9*4]; \ + eor r9, r9, r8, ror #25; \ + str r10, [sp, #14*4]; \ + eor r10, r10, lr, ror #25; \ + salsa8_core_doubleround_body(); \ + ldr r11, [sp, #10*4]; \ + add r8, r9, r8; \ + ldr r12, [sp, #15*4]; \ + add 
lr, r10, lr; \ + eor r11, r11, r8, ror #14; \ + add r8, r3, r2; \ + eor r12, r12, lr, ror #14; \ + add lr, r4, r7; \ + eor r0, r0, r8, ror #14; \ + ldr r8, [sp, #11*4]; \ + eor r5, r5, lr, ror #14; \ + ldr lr, [sp, #12*4]; \ + add r8, r8, r12; \ + str r11, [sp, #10*4]; \ + add lr, lr, r0; \ + str r12, [sp, #15*4]; \ + eor r3, r3, r8, ror #25; \ + add r8, r5, r1; \ + eor r4, r4, lr, ror #25; \ + add lr, r11, r6; \ + str r9, [sp, #9*4]; \ + eor r9, r9, r8, ror #25; \ + str r10, [sp, #14*4]; \ + eor r10, r10, lr, ror #25; \ + salsa8_core_doubleround_body(); \ + ldr r11, [sp, #10*4]; \ + add r8, r9, r8; \ + ldr r12, [sp, #15*4]; \ + add lr, r10, lr; \ + eor r11, r11, r8, ror #14; \ + add r8, r3, r2; \ + eor r12, r12, lr, ror #14; \ + add lr, r4, r7; \ + eor r0, r0, r8, ror #14; \ + ldr r8, [sp, #11*4]; \ + eor r5, r5, lr, ror #14; \ + ldr lr, [sp, #12*4]; \ + add r8, r8, r12; \ + str r11, [sp, #10*4]; \ + add lr, lr, r0; \ + str r12, [sp, #15*4]; \ + eor r3, r3, r8, ror #25; \ + add r8, r5, r1; \ + eor r4, r4, lr, ror #25; \ + add lr, r11, r6; \ + str r9, [sp, #9*4]; \ + eor r9, r9, r8, ror #25; \ + str r10, [sp, #14*4]; \ + eor r10, r10, lr, ror #25; \ + salsa8_core_doubleround_body(); \ + ldr r11, [sp, #10*4]; \ + add r8, r9, r8; \ + ldr r12, [sp, #15*4]; \ + add lr, r10, lr; \ + str r9, [sp, #9*4]; \ + eor r11, r11, r8, ror #14; \ + eor r12, r12, lr, ror #14; \ + add r8, r3, r2; \ + str r10, [sp, #14*4]; \ + add lr, r4, r7; \ + str r11, [sp, #10*4]; \ + eor r0, r0, r8, ror #14; \ + str r12, [sp, #15*4]; \ + eor r5, r5, lr, ror #14; \ + stmia sp, {r0-r7}; \ + #endif -.macro scrypt_core_macro1a_x4 - ldmia r0, {r4-r7} - ldmia lr!, {r8-r11} - stmia r1!, {r4-r7} - stmia r3!, {r8-r11} - eor r4, r4, r8 - eor r5, r5, r9 - eor r6, r6, r10 - eor r7, r7, r11 - stmia r0!, {r4-r7} - stmia r12!, {r4-r7} -.endm - -.macro scrypt_core_macro1b_x4 - ldmia r3!, {r8-r11} - ldmia r2, {r4-r7} - eor r8, r8, r4 - eor r9, r9, r5 - eor r10, r10, r6 - eor r11, r11, r7 - ldmia r0, {r4-r7} - 
stmia r2!, {r8-r11} - eor r4, r4, r8 - eor r5, r5, r9 - eor r6, r6, r10 - eor r7, r7, r11 - ldmia r1!, {r8-r11} - eor r4, r4, r8 - eor r5, r5, r9 - eor r6, r6, r10 - eor r7, r7, r11 - stmia r0!, {r4-r7} - stmia r12!, {r4-r7} -.endm - -.macro scrypt_core_macro2_x4 - ldmia r12, {r4-r7} - ldmia r0, {r8-r11} - add r4, r4, r8 - add r5, r5, r9 - add r6, r6, r10 - add r7, r7, r11 - stmia r0!, {r4-r7} - ldmia r2, {r8-r11} - eor r4, r4, r8 - eor r5, r5, r9 - eor r6, r6, r10 - eor r7, r7, r11 - stmia r2!, {r4-r7} - stmia r12!, {r4-r7} -.endm - -.macro scrypt_core_macro3_x4 - ldmia r1!, {r4-r7} - ldmia r0, {r8-r11} - add r4, r4, r8 - add r5, r5, r9 - add r6, r6, r10 - add r7, r7, r11 - stmia r0!, {r4-r7} -.endm - -.macro scrypt_core_macro3_x6 - ldmia r1!, {r2-r7} - ldmia r0, {r8-r12, lr} - add r2, r2, r8 - add r3, r3, r9 - add r4, r4, r10 - add r5, r5, r11 - add r6, r6, r12 - add r7, r7, lr - stmia r0!, {r2-r7} -.endm +#define scrypt_core_macro1a_x4() \ + ldmia r0, {r4-r7}; \ + ldmia lr!, {r8-r11}; \ + stmia r1!, {r4-r7}; \ + stmia r3!, {r8-r11}; \ + eor r4, r4, r8; \ + eor r5, r5, r9; \ + eor r6, r6, r10; \ + eor r7, r7, r11; \ + stmia r0!, {r4-r7}; \ + stmia r12!, {r4-r7}; \ + + +#define scrypt_core_macro1b_x4() \ + ldmia r3!, {r8-r11}; \ + ldmia r2, {r4-r7}; \ + eor r8, r8, r4; \ + eor r9, r9, r5; \ + eor r10, r10, r6; \ + eor r11, r11, r7; \ + ldmia r0, {r4-r7}; \ + stmia r2!, {r8-r11}; \ + eor r4, r4, r8; \ + eor r5, r5, r9; \ + eor r6, r6, r10; \ + eor r7, r7, r11; \ + ldmia r1!, {r8-r11}; \ + eor r4, r4, r8; \ + eor r5, r5, r9; \ + eor r6, r6, r10; \ + eor r7, r7, r11; \ + stmia r0!, {r4-r7}; \ + stmia r12!, {r4-r7}; \ + + +#define scrypt_core_macro2_x4() \ + ldmia r12, {r4-r7}; \ + ldmia r0, {r8-r11}; \ + add r4, r4, r8; \ + add r5, r5, r9; \ + add r6, r6, r10; \ + add r7, r7, r11; \ + stmia r0!, {r4-r7}; \ + ldmia r2, {r8-r11}; \ + eor r4, r4, r8; \ + eor r5, r5, r9; \ + eor r6, r6, r10; \ + eor r7, r7, r11; \ + stmia r2!, {r4-r7}; \ + stmia r12!, {r4-r7}; \ + + 
+#define scrypt_core_macro3_x4() \ + ldmia r1!, {r4-r7}; \ + ldmia r0, {r8-r11}; \ + add r4, r4, r8; \ + add r5, r5, r9; \ + add r6, r6, r10; \ + add r7, r7, r11; \ + stmia r0!, {r4-r7}; \ + + +#define scrypt_core_macro3_x6() \ + ldmia r1!, {r2-r7}; \ + ldmia r0, {r8-r12, lr}; \ + add r2, r2, r8; \ + add r3, r3, r9; \ + add r4, r4, r10; \ + add r5, r5, r11; \ + add r6, r6, r12; \ + add r7, r7, lr; \ + stmia r0!, {r2-r7}; \ + .text @@ -478,7 +448,7 @@ _scrypt_core: bic sp, sp, #63 str r12, [sp, #20*4] - scrypt_shuffle + scrypt_shuffle() str r0, [sp, #16*4] add r12, r1, #1024*32*4 @@ -487,32 +457,32 @@ scrypt_core_loop1: add lr, r0, #16*4 add r3, r1, #16*4 mov r12, sp - scrypt_core_macro1a_x4 - scrypt_core_macro1a_x4 - scrypt_core_macro1a_x4 - scrypt_core_macro1a_x4 + scrypt_core_macro1a_x4() + scrypt_core_macro1a_x4() + scrypt_core_macro1a_x4() + scrypt_core_macro1a_x4() str r1, [sp, #17*4] - salsa8_core + salsa8_core() ldr r0, [sp, #16*4] mov r12, sp add r2, r0, #16*4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 + scrypt_core_macro2_x4() + scrypt_core_macro2_x4() + scrypt_core_macro2_x4() + scrypt_core_macro2_x4() - salsa8_core + salsa8_core() ldr r0, [sp, #16*4] mov r1, sp add r0, r0, #16*4 - scrypt_core_macro3_x6 - scrypt_core_macro3_x6 + scrypt_core_macro3_x6() + scrypt_core_macro3_x6() ldr r3, [sp, #17*4] ldr r12, [sp, #18*4] - scrypt_core_macro3_x4 + scrypt_core_macro3_x4() add r1, r3, #16*4 sub r0, r0, #32*4 @@ -534,28 +504,28 @@ scrypt_core_loop2: pld [r1, #24*4] pld [r1, #8*4] #endif - scrypt_core_macro1b_x4 - scrypt_core_macro1b_x4 - scrypt_core_macro1b_x4 - scrypt_core_macro1b_x4 + scrypt_core_macro1b_x4() + scrypt_core_macro1b_x4() + scrypt_core_macro1b_x4() + scrypt_core_macro1b_x4() - salsa8_core + salsa8_core() ldr r0, [sp, #16*4] mov r12, sp add r2, r0, #16*4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 + scrypt_core_macro2_x4() + 
scrypt_core_macro2_x4() + scrypt_core_macro2_x4() + scrypt_core_macro2_x4() - salsa8_core + salsa8_core() ldr r0, [sp, #16*4] mov r1, sp ldr r3, [sp, #17*4] add r0, r0, #16*4 - scrypt_core_macro3_x4 + scrypt_core_macro3_x4() mov r4, r4, lsl #32-10 add r3, r3, r4, lsr #32-10-7 str r3, [sp, #19*4] @@ -563,8 +533,8 @@ scrypt_core_loop2: pld [r3, #16*4] pld [r3] #endif - scrypt_core_macro3_x6 - scrypt_core_macro3_x6 + scrypt_core_macro3_x6() + scrypt_core_macro3_x6() ldr r12, [sp, #18*4] sub r0, r0, #32*4 @@ -572,7 +542,7 @@ scrypt_core_loop2: subs r12, r12, #1 bne scrypt_core_loop2 - scrypt_shuffle + scrypt_shuffle() ldr sp, [sp, #20*4] #ifdef __thumb__ diff --git a/src/scrypt-x86_64.S b/src/scrypt-x86_64.S index 6ae2f00..9d894f2 100644 --- a/src/scrypt-x86_64.S +++ b/src/scrypt-x86_64.S @@ -22,12 +22,13 @@ # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. + +#if defined(__x86_64__) + #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif -#if defined(__x86_64__) - #define scrypt_shuffle(src, so, dest, do) \ movl so+60(src), %r8d; \ movl so+44(src), %r9d; \ diff --git a/src/scrypt.cpp b/src/scrypt.cpp index fe56d3c..06ef53c 100644 --- a/src/scrypt.cpp +++ b/src/scrypt.cpp @@ -38,10 +38,97 @@ #define SCRYPT_BUFFER_SIZE (131072 + 63) -#if defined (__x86_64__) || defined (__i386__) || defined(__arm__) -extern "C" void scrypt_core(uint32_t *X, uint32_t *V); +#if defined (OPTIMIZED_SALSA) && ( defined (__x86_64__) || defined (__i386__) || defined(__arm__) ) +extern "C" void scrypt_core(unsigned int *X, unsigned int *V); #else -// TODO: Add cross-platform scrypt_core implementation +// Generic scrypt_core implementation + +static inline void xor_salsa8(unsigned int B[16], const unsigned int Bx[16]) +{ + unsigned int x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15; + int i; + + x00 = (B[0] ^= Bx[0]); + x01 = (B[1] ^= Bx[1]); + x02 = (B[2] ^= Bx[2]); + x03 = (B[3] ^= Bx[3]); + x04 = (B[4] 
^= Bx[4]); + x05 = (B[5] ^= Bx[5]); + x06 = (B[6] ^= Bx[6]); + x07 = (B[7] ^= Bx[7]); + x08 = (B[8] ^= Bx[8]); + x09 = (B[9] ^= Bx[9]); + x10 = (B[10] ^= Bx[10]); + x11 = (B[11] ^= Bx[11]); + x12 = (B[12] ^= Bx[12]); + x13 = (B[13] ^= Bx[13]); + x14 = (B[14] ^= Bx[14]); + x15 = (B[15] ^= Bx[15]); + for (i = 0; i < 8; i += 2) { +#define R(a, b) (((a) << (b)) | ((a) >> (32 - (b)))) + /* Operate on columns. */ + x04 ^= R(x00+x12, 7); x09 ^= R(x05+x01, 7); + x14 ^= R(x10+x06, 7); x03 ^= R(x15+x11, 7); + + x08 ^= R(x04+x00, 9); x13 ^= R(x09+x05, 9); + x02 ^= R(x14+x10, 9); x07 ^= R(x03+x15, 9); + + x12 ^= R(x08+x04,13); x01 ^= R(x13+x09,13); + x06 ^= R(x02+x14,13); x11 ^= R(x07+x03,13); + + x00 ^= R(x12+x08,18); x05 ^= R(x01+x13,18); + x10 ^= R(x06+x02,18); x15 ^= R(x11+x07,18); + + /* Operate on rows. */ + x01 ^= R(x00+x03, 7); x06 ^= R(x05+x04, 7); + x11 ^= R(x10+x09, 7); x12 ^= R(x15+x14, 7); + + x02 ^= R(x01+x00, 9); x07 ^= R(x06+x05, 9); + x08 ^= R(x11+x10, 9); x13 ^= R(x12+x15, 9); + + x03 ^= R(x02+x01,13); x04 ^= R(x07+x06,13); + x09 ^= R(x08+x11,13); x14 ^= R(x13+x12,13); + + x00 ^= R(x03+x02,18); x05 ^= R(x04+x07,18); + x10 ^= R(x09+x08,18); x15 ^= R(x14+x13,18); +#undef R + } + B[0] += x00; + B[1] += x01; + B[2] += x02; + B[3] += x03; + B[4] += x04; + B[5] += x05; + B[6] += x06; + B[7] += x07; + B[8] += x08; + B[9] += x09; + B[10] += x10; + B[11] += x11; + B[12] += x12; + B[13] += x13; + B[14] += x14; + B[15] += x15; +} + +static inline void scrypt_core(unsigned int *X, unsigned int *V) +{ + unsigned int i, j, k; + + for (i = 0; i < 1024; i++) { + memcpy(&V[i * 32], X, 128); + xor_salsa8(&X[0], &X[16]); + xor_salsa8(&X[16], &X[0]); + } + for (i = 0; i < 1024; i++) { + j = 32 * (X[16] & 1023); + for (k = 0; k < 32; k++) + X[k] ^= V[j + k]; + xor_salsa8(&X[0], &X[16]); + xor_salsa8(&X[16], &X[0]); + } +} + #endif /* cpu and memory intensive function to transform a 80 byte buffer into a 32 byte output @@ -51,10 +138,10 @@ extern "C" void scrypt_core(uint32_t *X, 
uint32_t *V); uint256 scrypt_nosalt(const void* input, size_t inputlen, void *scratchpad) { - uint32_t *V; - uint32_t X[32]; + unsigned int *V; + unsigned int X[32]; uint256 result = 0; - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); + V = (unsigned int *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); PBKDF2_SHA256((const uint8_t*)input, inputlen, (const uint8_t*)input, inputlen, 1, (uint8_t *)X, 128); scrypt_core(X, V); @@ -65,10 +152,10 @@ uint256 scrypt_nosalt(const void* input, size_t inputlen, void *scratchpad) uint256 scrypt(const void* data, size_t datalen, const void* salt, size_t saltlen, void *scratchpad) { - uint32_t *V; - uint32_t X[32]; + unsigned int *V; + unsigned int X[32]; uint256 result = 0; - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); + V = (unsigned int *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); PBKDF2_SHA256((const uint8_t*)data, datalen, (const uint8_t*)salt, saltlen, 1, (uint8_t *)X, 128); scrypt_core(X, V); diff --git a/src/util.h b/src/util.h index d4d0ea2..1d182be 100644 --- a/src/util.h +++ b/src/util.h @@ -11,9 +11,8 @@ #include #include #include -#else -typedef int pid_t; /* define for Windows compatibility */ #endif + #include #include #include -- 1.7.1