From: CryptoManiac Date: Sun, 11 Oct 2015 16:57:44 +0000 (+0300) Subject: Add SSSE3 imnplementation ofg block copy function, gives us ~30% kernel scanning... X-Git-Tag: nvc-v0.5.5~45 X-Git-Url: https://git.novaco.in/?p=novacoin.git;a=commitdiff_plain;h=fb24309e8e08fb4ce6d97b2e17dccfee06689836 Add SSSE3 imnplementation ofg block copy function, gives us ~30% kernel scanning performance benefit in comparison with generic implementation. Also fix mingw makefiles. --- diff --git a/novacoin-qt.pro b/novacoin-qt.pro index 22f6dbc..5d39b6a 100644 --- a/novacoin-qt.pro +++ b/novacoin-qt.pro @@ -129,15 +129,24 @@ contains(USE_LEVELDB, 1) { contains(USE_ASM, 1) { message(Using assembler scrypt & sha256 implementations) DEFINES += USE_ASM + QMAKE_CFLAGS += -msse2 + QMAKE_CXXFLAGS += -msse2 + + contains(USE_SSSE3, 1) { + DEFINES += USE_SSSE3 + QMAKE_CFLAGS += -mssse3 + QMAKE_CXXFLAGS += -mssse3 + } + SOURCES += src/crypto/scrypt/asm/scrypt-arm.S src/crypto/scrypt/asm/scrypt-x86.S src/crypto/scrypt/asm/scrypt-x86_64.S src/crypto/scrypt/asm/asm-wrapper.cpp - SOURCES += src/crypto/sha2/asm/sha2-arm.S src/crypto/sha2/asm/sha2-x86.S src/crypto/sha2/asm/sha2-x86_64.S + SOURCES += src/crypto/sha2/asm/sha2-arm.S src/crypto/sha2/asm/sha2-x86.S src/crypto/sha2/asm/sha2-x86_64.S src/crypto/sha2/asm/copy_swap.c } else { # use: qmake "USE_SSE2=1" contains(USE_SSE2, 1) { message(Using SSE2 intrinsic scrypt implementation & generic sha256 implementation) SOURCES += src/crypto/scrypt/intrin/scrypt-sse2.cpp DEFINES += USE_SSE2 - QMAKE_CXXFLAGS += -msse2 + QMAKE_CXXFLAGS += -msse2 QMAKE_CFLAGS += -msse2 } else { message(Using generic scrypt & sha256 implementations) diff --git a/src/crypto/sha2/asm/copy_swap.c b/src/crypto/sha2/asm/copy_swap.c new file mode 100644 index 0000000..f430292 --- /dev/null +++ b/src/crypto/sha2/asm/copy_swap.c @@ -0,0 +1,57 @@ +// Copyright (c) 2015 The Novacoin developers +// Distributed under the MIT/X11 software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. + +#include +#include + +void copy_swap_hashes(uint32_t *blocks, uint32_t *state) +{ + blocks[0] = __builtin_bswap32(state[0]); + blocks[1] = __builtin_bswap32(state[1]); + blocks[2] = __builtin_bswap32(state[2]); + blocks[3] = __builtin_bswap32(state[3]); + blocks[4] = __builtin_bswap32(state[4]); + blocks[5] = __builtin_bswap32(state[5]); + blocks[6] = __builtin_bswap32(state[6]); + blocks[7] = __builtin_bswap32(state[7]); + blocks[8] = __builtin_bswap32(state[8]); + blocks[9] = __builtin_bswap32(state[9]); + blocks[10] = __builtin_bswap32(state[10]); + blocks[11] = __builtin_bswap32(state[11]); + blocks[12] = __builtin_bswap32(state[12]); + blocks[13] = __builtin_bswap32(state[13]); + blocks[14] = __builtin_bswap32(state[14]); + blocks[15] = __builtin_bswap32(state[15]); + blocks[16] = __builtin_bswap32(state[16]); + blocks[17] = __builtin_bswap32(state[17]); + blocks[18] = __builtin_bswap32(state[18]); + blocks[19] = __builtin_bswap32(state[19]); + blocks[20] = __builtin_bswap32(state[20]); + blocks[21] = __builtin_bswap32(state[21]); + blocks[22] = __builtin_bswap32(state[22]); + blocks[23] = __builtin_bswap32(state[23]); + blocks[24] = __builtin_bswap32(state[24]); + blocks[25] = __builtin_bswap32(state[25]); + blocks[26] = __builtin_bswap32(state[26]); + blocks[27] = __builtin_bswap32(state[27]); + blocks[28] = __builtin_bswap32(state[28]); + blocks[29] = __builtin_bswap32(state[29]); + blocks[30] = __builtin_bswap32(state[30]); + blocks[31] = __builtin_bswap32(state[31]); +} + +#ifdef USE_SSSE3 +void copy_swap_hashes_ssse3(uint32_t *blocks, uint32_t *state) +{ + __m128i mask = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3); + _mm_storeu_si128((__m128i *)&blocks[0], _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&state[0]), mask)); + _mm_storeu_si128((__m128i *)&blocks[4], _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&state[4]), mask)); + _mm_storeu_si128((__m128i *)&blocks[8], _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&state[8]), mask)); + _mm_storeu_si128((__m128i *)&blocks[12], _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&state[12]), mask)); + _mm_storeu_si128((__m128i *)&blocks[16], _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&state[16]), mask)); + _mm_storeu_si128((__m128i *)&blocks[20], _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&state[20]), mask)); + _mm_storeu_si128((__m128i *)&blocks[24], _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&state[24]), mask)); + _mm_storeu_si128((__m128i *)&blocks[28], _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&state[28]), mask)); +} +#endif diff --git a/src/crypto/sha2/asm/sha2-x86.S b/src/crypto/sha2/asm/sha2-x86.S index 65f2062..044630f 100644 --- a/src/crypto/sha2/asm/sha2-x86.S +++ b/src/crypto/sha2/asm/sha2-x86.S @@ -1188,4 +1188,28 @@ sha256_use_4way_sse2: popl %ebx ret +#if defined(USE_SSSE3) + .text + .p2align 5 + .globl sha256_use_ssse3 + .globl _sha256_use_ssse3 +sha256_use_ssse3: +_sha256_use_ssse3: + pushl %ebx + + movl $1, %eax + cpuid + andl $0x00000200, %ecx + jnz sha256_use_ssse3 + xorl %eax, %eax + popl %ebx + ret + +sha256_use_ssse3: + movl $1, %eax + popl %ebx + ret +#endif + + #endif diff --git a/src/crypto/sha2/asm/sha2-x86_64.S b/src/crypto/sha2/asm/sha2-x86_64.S index 9f3974b..6554ef0 100644 --- a/src/crypto/sha2/asm/sha2-x86_64.S +++ b/src/crypto/sha2/asm/sha2-x86_64.S @@ -3574,6 +3574,33 @@ sha256_use_4way_exit: popq %rbx ret +#if defined(USE_SSSE3) + .text + .p2align 6 + .globl sha256_use_ssse3 + .globl _sha256_use_ssse3 +sha256_use_ssse3: +_sha256_use_ssse3: + pushq %rbx + pushq %rcx + pushq %rdx + cpuid + andl $0x00000200, %ecx + jz sha256_use_ssse3_done + xorl %eax, %eax + popq %rdx + popq %rcx + popq %rbx + ret + +sha256_use_ssse3_done: + movl $1, %eax + popq %rdx + popq %rcx + popq %rbx + ret +#endif + .text .p2align 6 .globl sha256d_ms_8way diff --git a/src/kernel.cpp b/src/kernel.cpp index 2873c41..e3ed6d1 100644 --- a/src/kernel.cpp +++ b/src/kernel.cpp @@ -1,4 +1,7 @@ // Copyright (c) 2012-2013 The PPCoin developers +// Copyright (c) 2013-2015 The Novacoin developers +// Distributed under the MIT/X11 software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. // Distributed under the MIT/X11 software license, see the accompanying // file COPYING or http://www.opensource.org/licenses/mit-license.php. @@ -454,12 +457,20 @@ static const uint32_t block2_suffix_4way[4 * 8] = { }; extern "C" int sha256_use_4way(); - extern "C" void sha256_init(uint32_t *state); extern "C" void sha256_transform(uint32_t *state, const uint32_t *block, int swap); - extern "C" void sha256_init_4way(uint32_t *state); extern "C" void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap); +extern "C" void copy_swap_hashes(uint32_t *blocks, uint32_t *state); // Generic block copy function + +#ifdef USE_SSSE3 +extern "C" int sha256_use_ssse3(); +extern "C" void copy_swap_hashes_ssse3(uint32_t *blocks, uint32_t *state); // SSSE3 optimized block copy function + +void (*copy_swap)(uint32_t *, uint32_t *) = (sha256_use_ssse3() != 0) ? ©_swap_hashes_ssse3 : copy_swap_hashes; +#else +void (*copy_swap)(uint32_t *, uint32_t *) = ©_swap_hashes; +#endif bool fUse4Way = sha256_use_4way() != 0; @@ -476,6 +487,8 @@ public: void Do_4way() { + cout << sha256_use_ssse3() << endl; + SetThreadPriority(THREAD_PRIORITY_LOWEST); // Compute maximum possible target to filter out majority of obviously insufficient hashes @@ -518,10 +531,7 @@ public: blocks1[27] = nTimeTx++; sha256_transform_4way(&state1[0], &blocks1[0], 1); // first hashing - - for(int i=0; i<32; i++) - blocks2[i] = __builtin_bswap32(state1[i]); - + copy_swap(&blocks2[0], &state1[0]); sha256_transform_4way(&state2[0], &blocks2[0], 1); // second hashing for(int nResult = 0; nResult < 4; nResult++) diff --git a/src/makefile.bsd b/src/makefile.bsd index ea44263..98906c4 100644 --- a/src/makefile.bsd +++ b/src/makefile.bsd @@ -93,6 +93,12 @@ ifeq (${USE_O3}, 1) xOPT_LEVEL=-O3 endif +ifeq (${USE_SSSE3}, 1) +# Intrinsic implementation of block copy +DEFS += -DUSE_SSSE3 +xOPT_LEVEL += -mssse3 +endif + # CXXFLAGS can be specified on the make command line, so we use xCXXFLAGS that only # adds some defaults in front. Unfortunately, CXXFLAGS=... $(CXXFLAGS) does not work. xCXXFLAGS=$(xOPT_LEVEL) -msse2 -pthread -Wall -Wextra -Wno-ignored-qualifiers -Wformat -Wformat-security -Wno-unused-parameter \ @@ -154,9 +160,12 @@ OBJS += obj/txdb-bdb.o endif ifeq (${USE_ASM}, 1) + +DEFS += -DUSE_ASM + # Assembler implementation OBJS += crypto/scrypt/asm/obj/scrypt-arm.o crypto/scrypt/asm/obj/scrypt-x86.o crypto/scrypt/asm/obj/scrypt-x86_64.o crypto/scrypt/asm/obj/asm-wrapper.o -OBJS += crypto/sha2/asm/obj/sha2-arm.o crypto/sha2/asm/obj/sha2-x86.o crypto/sha2/asm/obj/sha2-x86_64.o +OBJS += crypto/sha2/asm/obj/sha2-arm.o crypto/sha2/asm/obj/sha2-x86.o crypto/sha2/asm/obj/sha2-x86_64.o crypto/sha2/asm/obj/copy_swap.o crypto/scrypt/asm/obj/scrypt-x86.o: crypto/scrypt/asm/scrypt-x86.S $(CXX) -c $(xCXXFLAGS) -MMD -o $@ $< @@ -179,7 +188,9 @@ crypto/sha2/asm/obj/sha2-x86_64.o: crypto/sha2/asm/sha2-x86_64.S crypto/sha2/asm/obj/sha2-arm.o: crypto/sha2/asm/sha2-arm.S $(CXX) -c $(xCXXFLAGS) -MMD -o $@ $< -DEFS += -DUSE_ASM +crypto/sha2/asm/obj/copy_swap.o: crypto/sha2/asm/copy_swap.c + $(CC) -c $(xCXXFLAGS) -MMD -o $@ $< + else ifeq (${USE_SSE2}, 1) # Intrinsic implementation diff --git a/src/makefile.linux-mingw b/src/makefile.linux-mingw index 519e6db..284670a 100644 --- a/src/makefile.linux-mingw +++ b/src/makefile.linux-mingw @@ -56,6 +56,13 @@ ifeq (${USE_O3}, 1) endif DEFS=-D_MT -DWIN32 -D_WINDOWS -DBOOST_THREAD_USE_LIB -DBOOST_SPIRIT_THREADSAFE -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS + +ifeq (${USE_SSSE3}, 1) +# Intrinsic implementation of block copy +DEFS += -DUSE_SSSE3 +xOPT_LEVEL += -mssse3 +endif + DEBUGFLAGS=-g CFLAGS=$(xOPT_LEVEL) -msse2 -w -Wall -Wextra -Wno-ignored-qualifiers -Wformat -Wformat-security -Wno-unused-parameter $(DEBUGFLAGS) $(DEFS) $(INCLUDEPATHS) LDFLAGS=-Wl,--dynamicbase -Wl,--nxcompat -static-libgcc -static-libstdc++ @@ -131,7 +138,7 @@ endif ifeq (${USE_ASM}, 1) # Assembler implementation OBJS += crypto/scrypt/asm/obj/scrypt-arm.o crypto/scrypt/asm/obj/scrypt-x86.o crypto/scrypt/asm/obj/scrypt-x86_64.o crypto/scrypt/asm/obj/asm-wrapper.o -OBJS += crypto/sha2/asm/obj/sha2-arm.o crypto/sha2/asm/obj/sha2-x86.o crypto/sha2/asm/obj/sha2-x86_64.o +OBJS += crypto/sha2/asm/obj/sha2-arm.o crypto/sha2/asm/obj/sha2-x86.o crypto/sha2/asm/obj/sha2-x86_64.o crypto/sha2/asm/obj/copy_swap.o crypto/scrypt/asm/obj/scrypt-x86.o: crypto/scrypt/asm/scrypt-x86.S $(CXX) -c $(CFLAGS) -MMD -o $@ $< @@ -154,6 +161,9 @@ crypto/sha2/asm/obj/sha2-x86_64.o: crypto/sha2/asm/sha2-x86_64.S crypto/sha2/asm/obj/sha2-arm.o: crypto/sha2/asm/sha2-arm.S $(CXX) -c $(CFLAGS) -MMD -o $@ $< +crypto/sha2/asm/obj/copy_swap.o: crypto/sha2/asm/copy_swap.c + $(CC) -c $(CFLAGS) -MMD -o $@ $< + DEFS += -DUSE_ASM else ifeq (${USE_SSE2}, 1) diff --git a/src/makefile.mingw b/src/makefile.mingw index 1f0d38f..3fa7244 100644 --- a/src/makefile.mingw +++ b/src/makefile.mingw @@ -43,6 +43,12 @@ ifeq (${USE_O3}, 1) xOPT_LEVEL=-O3 endif +ifdef USE_SSSE3 +# Intrinsic implementation of block copy +DEFS += -DUSE_SSSE3 +xOPT_LEVEL+=-mssse3 +endif + DEFS=-DWIN32 -D_WINDOWS -DBOOST_THREAD_USE_LIB -DBOOST_SPIRIT_THREADSAFE -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS DEBUGFLAGS=-g CFLAGS=-mthreads $(xOPT_LEVEL) -msse2 -w -Wall -Wextra -Wno-ignored-qualifiers -Wformat -Wformat-security -Wno-unused-parameter $(DEBUGFLAGS) $(DEFS) $(INCLUDEPATHS) @@ -120,7 +126,7 @@ endif ifdef USE_ASM # Assembler implementation OBJS += crypto/scrypt/asm/obj/scrypt-arm.o crypto/scrypt/asm/obj/scrypt-x86.o crypto/scrypt/asm/obj/scrypt-x86_64.o crypto/scrypt/asm/obj/asm-wrapper.o -OBJS += crypto/sha2/asm/obj/scrypt-arm.o crypto/sha2/asm/obj/scrypt-x86.o crypto/sha2/asm/obj/scrypt-x86_64.o +OBJS += crypto/sha2/asm/obj/sha2-arm.o crypto/sha2/asm/obj/sha2-x86.o crypto/sha2/asm/obj/sha2-x86_64.o crypto/sha2/asm/obj/copy_swap.o crypto/scrypt/asm/obj/scrypt-x86.o: crypto/scrypt/asm/scrypt-x86.S $(CXX) -c $(CFLAGS) -MMD -o $@ $< @@ -134,7 +140,11 @@ crypto/scrypt/asm/obj/scrypt-arm.o: crypto/scrypt/asm/scrypt-arm.S crypto/scrypt/asm/obj/asm-wrapper.o: crypto/scrypt/asm/asm-wrapper.cpp $(CXX) -c $(CFLAGS) -MMD -o $@ $< +crypto/sha2/asm/obj/copy_swap.o: crypto/sha2/asm/copy_swap.c + $(CC) -c $(CFLAGS) -MMD -o $@ $< + DEFS += -DUSE_ASM + else ifdef USE_SSE2 # Intrinsic implementation diff --git a/src/makefile.osx b/src/makefile.osx index 121c497..2048961 100644 --- a/src/makefile.osx +++ b/src/makefile.osx @@ -59,6 +59,12 @@ else CFLAGS = -g -msse2 endif +ifeq (${USE_SSSE3}, 1) +# Intrinsic implementation of block copy +DEFS += -DUSE_SSSE3 +CFLAGS += -mssse3 +endif + # ppc doesn't work because we don't support big-endian CFLAGS += -Wall -Wextra -Wformat -Wno-ignored-qualifiers -Wformat-security -Wno-unused-parameter \ $(DEBUGFLAGS) $(DEFS) $(INCLUDEPATHS) @@ -132,7 +138,7 @@ endif ifeq (${USE_ASM}, 1) # Assembler implementation OBJS += crypto/scrypt/asm/obj/scrypt-arm.o crypto/scrypt/asm/obj/scrypt-x86.o crypto/scrypt/asm/obj/scrypt-x86_64.o crypto/scrypt/asm/obj/asm-wrapper.o -OBJS += crypto/sha2/asm/obj/sha2-arm.o crypto/sha2/asm/obj/sha2-x86.o crypto/sha2/asm/obj/sha2-x86_64.o +OBJS += crypto/sha2/asm/obj/sha2-arm.o crypto/sha2/asm/obj/sha2-x86.o crypto/sha2/asm/obj/sha2-x86_64.o crypto/sha2/asm/obj/copy_swap.o crypto/scrypt/asm/obj/scrypt-x86.o: crypto/scrypt/asm/scrypt-x86.S $(CXX) -c $(CFLAGS) -MMD -o $@ $< @@ -155,7 +161,11 @@ crypto/sha/asm/obj/sha-arm.o: crypto/sha2/asm/sha2-x86_64.S crypto/sha/asm/obj/sha-arm.o: crypto/sha2/asm/sha2-arm.S $(CXX) -c $(CFLAGS) -MMD -o $@ $< +crypto/sha2/asm/obj/copy_swap.o: crypto/sha2/asm/copy_swap.c + $(CC) -c $(CFLAGS) -MMD -o $@ $< + DEFS += -DUSE_ASM + else ifeq (${USE_SSE2}, 1) # Intrinsic implementation diff --git a/src/makefile.unix b/src/makefile.unix index 65eaca1..c8fc18e 100644 --- a/src/makefile.unix +++ b/src/makefile.unix @@ -99,6 +99,12 @@ ifeq (${USE_O3}, 1) xOPT_LEVEL=-O3 endif +ifeq (${USE_SSSE3}, 1) +# Intrinsic implementation of block copy +DEFS += -DUSE_SSSE3 +xOPT_LEVEL += -mssse3 +endif + # CXXFLAGS can be specified on the make command line, so we use xCXXFLAGS that only # adds some defaults in front. Unfortunately, CXXFLAGS=... $(CXXFLAGS) does not work. xCXXFLAGS=$(xOPT_LEVEL) $(EXT_OPTIONS) -pthread -Wall -Wextra -Wno-ignored-qualifiers -Wformat -Wformat-security -Wno-unused-parameter \ @@ -162,7 +168,7 @@ endif ifeq (${USE_ASM}, 1) # Assembler implementation OBJS += crypto/scrypt/asm/obj/scrypt-arm.o crypto/scrypt/asm/obj/scrypt-x86.o crypto/scrypt/asm/obj/scrypt-x86_64.o crypto/scrypt/asm/obj/asm-wrapper.o -OBJS += crypto/sha2/asm/obj/sha2-arm.o crypto/sha2/asm/obj/sha2-x86.o crypto/sha2/asm/obj/sha2-x86_64.o +OBJS += crypto/sha2/asm/obj/sha2-arm.o crypto/sha2/asm/obj/sha2-x86.o crypto/sha2/asm/obj/sha2-x86_64.o crypto/sha2/asm/obj/copy_swap.o crypto/scrypt/asm/obj/scrypt-x86.o: crypto/scrypt/asm/scrypt-x86.S $(CXX) -c $(xCXXFLAGS) -MMD -o $@ $< @@ -185,7 +191,11 @@ crypto/sha2/asm/obj/sha2-x86_64.o: crypto/sha2/asm/sha2-x86_64.S crypto/sha2/asm/obj/sha2-arm.o: crypto/sha2/asm/sha2-x86.S $(CXX) -c $(xCXXFLAGS) -MMD -o $@ $< +crypto/sha2/asm/obj/copy_swap.o: crypto/sha2/asm/copy_swap.c + $(CC) -c $(xCXXFLAGS) -MMD -o $@ $< + DEFS += -DUSE_ASM + else ifeq (${USE_SSE2}, 1) # Intrinsic implementation