From: alex Date: Fri, 13 Sep 2013 14:09:16 +0000 (+0400) Subject: Add ARM scrypt implementation X-Git-Tag: v0.4.4.5~11 X-Git-Url: https://git.novaco.in/?p=novacoin.git;a=commitdiff_plain;h=5f6bdb8d78fea44ac4ddd165a601342a4437eb84 Add ARM scrypt implementation --- diff --git a/novacoin-qt.pro b/novacoin-qt.pro index 8c322ed..9513cd3 100644 --- a/novacoin-qt.pro +++ b/novacoin-qt.pro @@ -301,6 +301,7 @@ SOURCES += src/qt/bitcoin.cpp src/qt/bitcoingui.cpp \ src/qt/rpcconsole.cpp \ src/noui.cpp \ src/kernel.cpp \ + src/scrypt-arm.S \ src/scrypt-x86.S \ src/scrypt-x86_64.S \ src/scrypt.cpp \ diff --git a/src/makefile.bsd b/src/makefile.bsd index c003999..b5f14ce 100644 --- a/src/makefile.bsd +++ b/src/makefile.bsd @@ -134,6 +134,7 @@ OBJS= \ obj/kernel.o \ obj/pbkdf2.o \ obj/scrypt.o \ + obj/scrypt-arm.o \ obj/scrypt-x86.o \ obj/scrypt-x86_64.o \ obj/zerocoin/Accumulator.o \ diff --git a/src/makefile.unix b/src/makefile.unix index af350f9..ddf5d6a 100644 --- a/src/makefile.unix +++ b/src/makefile.unix @@ -140,6 +140,7 @@ OBJS= \ obj/kernel.o \ obj/pbkdf2.o \ obj/scrypt.o \ + obj/scrypt-arm.o \ obj/scrypt-x86.o \ obj/scrypt-x86_64.o \ obj/zerocoin/Accumulator.o \ diff --git a/src/scrypt-arm.S b/src/scrypt-arm.S new file mode 100644 index 0000000..06097a6 --- /dev/null +++ b/src/scrypt-arm.S @@ -0,0 +1,581 @@ +/* + * Copyright 2012 pooler@litecoinpool.org + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. See COPYING for more details. + */ + +#if defined(__arm__) && defined(__APCS_32__) + +#if defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || \ + defined(__ARM_ARCH_5TEJ__) || defined(__ARM_ARCH_6__) || \ + defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || \ + defined(__ARM_ARCH_6M__) || defined(__ARM_ARCH_6T2__) || \ + defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) +#define __ARM_ARCH_5E_OR_6__ +#endif + +#if defined(__ARM_ARCH_5E_OR_6__) || defined(__ARM_ARCH_7__) || \ + defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || \ + defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7EM__) +#define __ARM_ARCH_5E_OR_6_OR_7__ +#endif + +#ifdef __ARM_ARCH_5E_OR_6__ + +.macro scrypt_shuffle + add lr, r0, #9*4 + ldmia r0, {r2-r7} + ldmia lr, {r2, r8-r12, lr} + str r3, [r0, #5*4] + str r5, [r0, #15*4] + str r6, [r0, #12*4] + str r7, [r0, #1*4] + ldr r5, [r0, #7*4] + str r2, [r0, #13*4] + str r8, [r0, #2*4] + strd r4, [r0, #10*4] + str r9, [r0, #7*4] + str r10, [r0, #4*4] + str r11, [r0, #9*4] + str lr, [r0, #3*4] + + add r2, r0, #64+0*4 + add lr, r0, #64+9*4 + ldmia r2, {r2-r7} + ldmia lr, {r2, r8-r12, lr} + str r3, [r0, #64+5*4] + str r5, [r0, #64+15*4] + str r6, [r0, #64+12*4] + str r7, [r0, #64+1*4] + ldr r5, [r0, #64+7*4] + str r2, [r0, #64+13*4] + str r8, [r0, #64+2*4] + strd r4, [r0, #64+10*4] + str r9, [r0, #64+7*4] + str r10, [r0, #64+4*4] + str r11, [r0, #64+9*4] + str lr, [r0, #64+3*4] +.endm + +.macro salsa8_core_doubleround_body + add r6, r2, r6 + add r7, r3, r7 + eor r10, r10, r6, ror #25 + add r6, r0, r4 + eor r11, r11, r7, ror #25 + add r7, r1, r5 + strd r10, [sp, #14*4] + eor r12, r12, r6, ror #25 + eor lr, lr, r7, ror #25 + + ldrd r6, [sp, #10*4] + add r2, r10, r2 + add r3, r11, r3 + eor r6, r6, r2, ror #23 + add r2, r12, r0 + eor r7, r7, r3, ror #23 + add r3, lr, r1 + strd r6, [sp, #10*4] + eor r8, r8, r2, ror #23 + eor r9, r9, r3, ror #23 + + ldrd r2, [sp, #6*4] + add r10, r6, r10 + add r11, r7, r11 + eor r2, r2, r10, ror #19 + add r10, r8, r12 + eor r3, r3, r11, ror #19 + add r11, r9, lr + eor r4, r4, r10, ror #19 + eor r5, r5, r11, ror #19 + + ldrd r10, [sp, #2*4] + add r6, r2, r6 + add r7, r3, r7 + eor r10, r10, r6, ror #14 + add r6, r4, r8 + eor r11, r11, r7, ror #14 + add r7, r5, r9 + eor r0, r0, r6, ror #14 + eor r1, r1, r7, ror #14 + + + ldrd r6, [sp, #14*4] + strd r2, [sp, #6*4] + strd r10, [sp, #2*4] + add r6, r11, r6 + add r7, r0, r7 + eor r4, r4, r6, ror #25 + add r6, r1, r12 + eor r5, r5, r7, ror #25 + add r7, r10, lr + eor r2, r2, r6, ror #25 + eor r3, r3, r7, ror #25 + strd r2, [sp, #6*4] + + add r10, r3, r10 + ldrd r6, [sp, #10*4] + add r11, r4, r11 + eor r8, r8, r10, ror #23 + add r10, r5, r0 + eor r9, r9, r11, ror #23 + add r11, r2, r1 + eor r6, r6, r10, ror #23 + eor r7, r7, r11, ror #23 + strd r6, [sp, #10*4] + + add r2, r7, r2 + ldrd r10, [sp, #14*4] + add r3, r8, r3 + eor r12, r12, r2, ror #19 + add r2, r9, r4 + eor lr, lr, r3, ror #19 + add r3, r6, r5 + eor r10, r10, r2, ror #19 + eor r11, r11, r3, ror #19 + + ldrd r2, [sp, #2*4] + add r6, r11, r6 + add r7, r12, r7 + eor r0, r0, r6, ror #14 + add r6, lr, r8 + eor r1, r1, r7, ror #14 + add r7, r10, r9 + eor r2, r2, r6, ror #14 + eor r3, r3, r7, ror #14 +.endm + +.macro salsa8_core + ldmia sp, {r0-r12, lr} + + ldrd r10, [sp, #14*4] + salsa8_core_doubleround_body + ldrd r6, [sp, #6*4] + strd r2, [sp, #2*4] + strd r10, [sp, #14*4] + salsa8_core_doubleround_body + ldrd r6, [sp, #6*4] + strd r2, [sp, #2*4] + strd r10, [sp, #14*4] + salsa8_core_doubleround_body + ldrd r6, [sp, #6*4] + strd r2, [sp, #2*4] + strd r10, [sp, #14*4] + salsa8_core_doubleround_body + + stmia sp, {r0-r5} + strd r8, [sp, #8*4] + str r12, [sp, #12*4] + str lr, [sp, #13*4] + strd r10, [sp, #14*4] +.endm + +#else + +.macro scrypt_shuffle +.endm + +.macro salsa8_core_doubleround_body + ldr r8, [sp, #8*4] + add r11, r11, r10 + ldr lr, [sp, #13*4] + add r12, r12, r3 + eor r2, r2, r11, ror #23 + add r11, r4, r0 + eor r7, r7, r12, ror #23 + add r12, r9, r5 + str r9, [sp, #9*4] + eor r8, r8, r11, ror #23 + str r10, [sp, #14*4] + eor lr, lr, r12, ror #23 + + ldr r11, [sp, #11*4] + add r9, lr, r9 + ldr r12, [sp, #12*4] + add r10, r2, r10 + eor r1, r1, r9, ror #19 + add r9, r7, r3 + eor r6, r6, r10, ror #19 + add r10, r8, r4 + str r8, [sp, #8*4] + eor r11, r11, r9, ror #19 + str lr, [sp, #13*4] + eor r12, r12, r10, ror #19 + + ldr r9, [sp, #10*4] + add r8, r12, r8 + ldr r10, [sp, #15*4] + add lr, r1, lr + eor r0, r0, r8, ror #14 + add r8, r6, r2 + eor r5, r5, lr, ror #14 + add lr, r11, r7 + eor r9, r9, r8, ror #14 + ldr r8, [sp, #9*4] + eor r10, r10, lr, ror #14 + ldr lr, [sp, #14*4] + + + add r8, r9, r8 + str r9, [sp, #10*4] + add lr, r10, lr + str r10, [sp, #15*4] + eor r11, r11, r8, ror #25 + add r8, r0, r3 + eor r12, r12, lr, ror #25 + add lr, r5, r4 + eor r1, r1, r8, ror #25 + ldr r8, [sp, #8*4] + eor r6, r6, lr, ror #25 + + add r9, r11, r9 + ldr lr, [sp, #13*4] + add r10, r12, r10 + eor r8, r8, r9, ror #23 + add r9, r1, r0 + eor lr, lr, r10, ror #23 + add r10, r6, r5 + str r11, [sp, #11*4] + eor r2, r2, r9, ror #23 + str r12, [sp, #12*4] + eor r7, r7, r10, ror #23 + + ldr r9, [sp, #9*4] + add r11, r8, r11 + ldr r10, [sp, #14*4] + add r12, lr, r12 + eor r9, r9, r11, ror #19 + add r11, r2, r1 + eor r10, r10, r12, ror #19 + add r12, r7, r6 + str r8, [sp, #8*4] + eor r3, r3, r11, ror #19 + str lr, [sp, #13*4] + eor r4, r4, r12, ror #19 +.endm + +.macro salsa8_core + ldmia sp, {r0-r7} + + ldr r12, [sp, #15*4] + ldr r8, [sp, #11*4] + ldr lr, [sp, #12*4] + + ldr r9, [sp, #9*4] + add r8, r8, r12 + ldr r11, [sp, #10*4] + add lr, lr, r0 + eor r3, r3, r8, ror #25 + add r8, r5, r1 + ldr r10, [sp, #14*4] + eor r4, r4, lr, ror #25 + add lr, r11, r6 + eor r9, r9, r8, ror #25 + eor r10, r10, lr, ror #25 + + salsa8_core_doubleround_body + + ldr r11, [sp, #10*4] + add r8, r9, r8 + ldr r12, [sp, #15*4] + add lr, r10, lr + eor r11, r11, r8, ror #14 + add r8, r3, r2 + eor r12, r12, lr, ror #14 + add lr, r4, r7 + eor r0, r0, r8, ror #14 + ldr r8, [sp, #11*4] + eor r5, r5, lr, ror #14 + ldr lr, [sp, #12*4] + + add r8, r8, r12 + str r11, [sp, #10*4] + add lr, lr, r0 + str r12, [sp, #15*4] + eor r3, r3, r8, ror #25 + add r8, r5, r1 + eor r4, r4, lr, ror #25 + add lr, r11, r6 + str r9, [sp, #9*4] + eor r9, r9, r8, ror #25 + str r10, [sp, #14*4] + eor r10, r10, lr, ror #25 + + salsa8_core_doubleround_body + + ldr r11, [sp, #10*4] + add r8, r9, r8 + ldr r12, [sp, #15*4] + add lr, r10, lr + eor r11, r11, r8, ror #14 + add r8, r3, r2 + eor r12, r12, lr, ror #14 + add lr, r4, r7 + eor r0, r0, r8, ror #14 + ldr r8, [sp, #11*4] + eor r5, r5, lr, ror #14 + ldr lr, [sp, #12*4] + + add r8, r8, r12 + str r11, [sp, #10*4] + add lr, lr, r0 + str r12, [sp, #15*4] + eor r3, r3, r8, ror #25 + add r8, r5, r1 + eor r4, r4, lr, ror #25 + add lr, r11, r6 + str r9, [sp, #9*4] + eor r9, r9, r8, ror #25 + str r10, [sp, #14*4] + eor r10, r10, lr, ror #25 + + salsa8_core_doubleround_body + + ldr r11, [sp, #10*4] + add r8, r9, r8 + ldr r12, [sp, #15*4] + add lr, r10, lr + eor r11, r11, r8, ror #14 + add r8, r3, r2 + eor r12, r12, lr, ror #14 + add lr, r4, r7 + eor r0, r0, r8, ror #14 + ldr r8, [sp, #11*4] + eor r5, r5, lr, ror #14 + ldr lr, [sp, #12*4] + + add r8, r8, r12 + str r11, [sp, #10*4] + add lr, lr, r0 + str r12, [sp, #15*4] + eor r3, r3, r8, ror #25 + add r8, r5, r1 + eor r4, r4, lr, ror #25 + add lr, r11, r6 + str r9, [sp, #9*4] + eor r9, r9, r8, ror #25 + str r10, [sp, #14*4] + eor r10, r10, lr, ror #25 + + salsa8_core_doubleround_body + + ldr r11, [sp, #10*4] + add r8, r9, r8 + ldr r12, [sp, #15*4] + add lr, r10, lr + str r9, [sp, #9*4] + eor r11, r11, r8, ror #14 + eor r12, r12, lr, ror #14 + add r8, r3, r2 + str r10, [sp, #14*4] + add lr, r4, r7 + str r11, [sp, #10*4] + eor r0, r0, r8, ror #14 + str r12, [sp, #15*4] + eor r5, r5, lr, ror #14 + + stmia sp, {r0-r7} +.endm + +#endif + + +.macro scrypt_core_macro1a_x4 + ldmia r0, {r4-r7} + ldmia lr!, {r8-r11} + stmia r1!, {r4-r7} + stmia r3!, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + stmia r0!, {r4-r7} + stmia r12!, {r4-r7} +.endm + +.macro scrypt_core_macro1b_x4 + ldmia r3!, {r8-r11} + ldmia r2, {r4-r7} + eor r8, r8, r4 + eor r9, r9, r5 + eor r10, r10, r6 + eor r11, r11, r7 + ldmia r0, {r4-r7} + stmia r2!, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + ldmia r1!, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + stmia r0!, {r4-r7} + stmia r12!, {r4-r7} +.endm + +.macro scrypt_core_macro2_x4 + ldmia r12, {r4-r7} + ldmia r0, {r8-r11} + add r4, r4, r8 + add r5, r5, r9 + add r6, r6, r10 + add r7, r7, r11 + stmia r0!, {r4-r7} + ldmia r2, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + stmia r2!, {r4-r7} + stmia r12!, {r4-r7} +.endm + +.macro scrypt_core_macro3_x4 + ldmia r1!, {r4-r7} + ldmia r0, {r8-r11} + add r4, r4, r8 + add r5, r5, r9 + add r6, r6, r10 + add r7, r7, r11 + stmia r0!, {r4-r7} +.endm + +.macro scrypt_core_macro3_x6 + ldmia r1!, {r2-r7} + ldmia r0, {r8-r12, lr} + add r2, r2, r8 + add r3, r3, r9 + add r4, r4, r10 + add r5, r5, r11 + add r6, r6, r12 + add r7, r7, lr + stmia r0!, {r2-r7} +.endm + + + .text + .code 32 + .align 2 + .globl scrypt_core + .globl _scrypt_core +#ifdef __ELF__ + .type scrypt_core, %function +#endif +scrypt_core: +_scrypt_core: + stmfd sp!, {r4-r11, lr} + mov r12, sp + sub sp, sp, #21*4 + bic sp, sp, #63 + str r12, [sp, #20*4] + + scrypt_shuffle + + str r0, [sp, #16*4] + add r12, r1, #1024*32*4 + str r12, [sp, #18*4] +scrypt_core_loop1: + add lr, r0, #16*4 + add r3, r1, #16*4 + mov r12, sp + scrypt_core_macro1a_x4 + scrypt_core_macro1a_x4 + scrypt_core_macro1a_x4 + scrypt_core_macro1a_x4 + str r1, [sp, #17*4] + + salsa8_core + + ldr r0, [sp, #16*4] + mov r12, sp + add r2, r0, #16*4 + scrypt_core_macro2_x4 + scrypt_core_macro2_x4 + scrypt_core_macro2_x4 + scrypt_core_macro2_x4 + + salsa8_core + + ldr r0, [sp, #16*4] + mov r1, sp + add r0, r0, #16*4 + scrypt_core_macro3_x6 + scrypt_core_macro3_x6 + ldr r3, [sp, #17*4] + ldr r12, [sp, #18*4] + scrypt_core_macro3_x4 + + add r1, r3, #16*4 + sub r0, r0, #32*4 + cmp r1, r12 + bne scrypt_core_loop1 + + ldr r4, [r0, #16*4] + sub r1, r1, #1024*32*4 + str r1, [sp, #17*4] + mov r4, r4, lsl #32-10 + mov r12, #1024 + add r1, r1, r4, lsr #32-10-7 +scrypt_core_loop2: + add r2, r0, #16*4 + add r3, r1, #16*4 + str r12, [sp, #18*4] + mov r12, sp +#ifdef __ARM_ARCH_5E_OR_6_OR_7__ + pld [r1, #24*4] + pld [r1, #8*4] +#endif + scrypt_core_macro1b_x4 + scrypt_core_macro1b_x4 + scrypt_core_macro1b_x4 + scrypt_core_macro1b_x4 + + salsa8_core + + ldr r0, [sp, #16*4] + mov r12, sp + add r2, r0, #16*4 + scrypt_core_macro2_x4 + scrypt_core_macro2_x4 + scrypt_core_macro2_x4 + scrypt_core_macro2_x4 + + salsa8_core + + ldr r0, [sp, #16*4] + mov r1, sp + ldr r3, [sp, #17*4] + add r0, r0, #16*4 + scrypt_core_macro3_x4 + mov r4, r4, lsl #32-10 + add r3, r3, r4, lsr #32-10-7 + str r3, [sp, #19*4] +#ifdef __ARM_ARCH_5E_OR_6_OR_7__ + pld [r3, #16*4] + pld [r3] +#endif + scrypt_core_macro3_x6 + scrypt_core_macro3_x6 + + ldr r12, [sp, #18*4] + sub r0, r0, #32*4 + ldr r1, [sp, #19*4] + subs r12, r12, #1 + bne scrypt_core_loop2 + + scrypt_shuffle + + ldr sp, [sp, #20*4] +#ifdef __thumb__ + ldmfd sp!, {r4-r11, lr} + bx lr +#else + ldmfd sp!, {r4-r11, pc} +#endif + +#endif