From: CryptoManiac Date: Wed, 14 Oct 2015 22:07:39 +0000 (-0700) Subject: RPC scaninput: Add 8way implementation, but disable it for now. X-Git-Tag: nvc-v0.5.5~24 X-Git-Url: https://git.novaco.in/?p=novacoin.git;a=commitdiff_plain;h=f1cc3f2b7b56686389a1deeaf0c7868a019d840e RPC scaninput: Add 8way implementation, but disable it for now. --- diff --git a/novacoin-qt.pro b/novacoin-qt.pro index 25f6bc5..369a734 100644 --- a/novacoin-qt.pro +++ b/novacoin-qt.pro @@ -9,6 +9,10 @@ CONFIG += no_include_pwd CONFIG += thread CONFIG += static +linux-g++: QMAKE_TARGET.arch = $$QMAKE_HOST.arch +linux-g++-32: QMAKE_TARGET.arch = x86 +linux-g++-64: QMAKE_TARGET.arch = x86_64 + # for boost 1.37, add -mt to the boost libraries # use: qmake BOOST_LIB_SUFFIX=-mt # for boost thread win32 with _win32 sufix @@ -33,6 +37,7 @@ OBJECTS_DIR = build MOC_DIR = build UI_DIR = build + # use: qmake "RELEASE=1" contains(RELEASE, 1) { !windows:!macx { @@ -106,19 +111,27 @@ contains(USE_LEVELDB, 1) { SOURCES += src/txdb-bdb.cpp } + # use: qmake "USE_ASM=1" contains(USE_ASM, 1) { message(Using assembler scrypt & sha256 implementations) DEFINES += USE_ASM - QMAKE_CFLAGS += -msse2 - QMAKE_CXXFLAGS += -msse2 - contains(USE_SSSE3, 1) { - DEFINES += USE_SSSE3 - QMAKE_CFLAGS += -mssse3 + contains(QMAKE_TARGET.arch, x86) { + message("x86 platform, setting -msse2 & -mssse3 flags") + + QMAKE_CXXFLAGS += -msse2 -mssse3 + QMAKE_CFLAGS += -msse2 -mssse3 + } + + contains(QMAKE_TARGET.arch, x86_64) { + message("x86_64 platform, setting -mssse3 flag") + QMAKE_CXXFLAGS += -mssse3 + QMAKE_CFLAGS += -mssse3 } + SOURCES += src/crypto/scrypt/asm/scrypt-arm.S src/crypto/scrypt/asm/scrypt-x86.S src/crypto/scrypt/asm/scrypt-x86_64.S src/crypto/scrypt/asm/asm-wrapper.cpp SOURCES += src/crypto/sha2/asm/sha2-arm.S src/crypto/sha2/asm/sha2-x86.S src/crypto/sha2/asm/sha2-x86_64.S } else { @@ -153,12 +166,6 @@ contains(USE_O3, 1) { QMAKE_CFLAGS += -O3 } -*-g++-32 { - message("32 platform, adding -msse2 flag") - - QMAKE_CXXFLAGS += -msse2 - QMAKE_CFLAGS += -msse2 -} QMAKE_CXXFLAGS_WARN_ON = -fdiagnostics-show-option -Wall -Wextra -Wno-ignored-qualifiers -Wformat -Wformat-security -Wno-unused-parameter -Wstack-protector diff --git a/src/crypto/sha2/asm/sha2-x86.S b/src/crypto/sha2/asm/sha2-x86.S index ac64569..f0cd5f4 100644 --- a/src/crypto/sha2/asm/sha2-x86.S +++ b/src/crypto/sha2/asm/sha2-x86.S @@ -602,7 +602,6 @@ sha256_use_4way_sse2: popl %ebx ret -#if defined(USE_SSSE3) .text .p2align 5 .globl sha256_use_ssse3 @@ -623,7 +622,5 @@ sha256_use_ssse3_done: movl $1, %eax popl %ebx ret -#endif - #endif diff --git a/src/crypto/sha2/asm/sha2-x86_64.S b/src/crypto/sha2/asm/sha2-x86_64.S index f9cd45f..eabb50f 100644 --- a/src/crypto/sha2/asm/sha2-x86_64.S +++ b/src/crypto/sha2/asm/sha2-x86_64.S @@ -1967,7 +1967,6 @@ sha256_use_4way_exit: popq %rbx ret -#if defined(USE_SSSE3) .text .p2align 6 .globl sha256_use_ssse3 @@ -1992,7 +1991,6 @@ sha256_use_ssse3_done: popq %rcx popq %rbx ret -#endif .macro sha256_avx2_main_round_red i, r0, r1, r2, r3, r4 vpaddd 32*\i(%rax), \r0, %ymm6 diff --git a/src/kernel.cpp b/src/kernel.cpp index 7c2b5c7..978d4bb 100644 --- a/src/kernel.cpp +++ b/src/kernel.cpp @@ -429,8 +429,14 @@ bool CheckStakeKernelHash(uint32_t nBits, const CBlock& blockFrom, uint32_t nTxP #ifdef USE_ASM +#ifndef __i386__ // kernel padding static const uint32_t block1_suffix[9] = { 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0x000000e0 }; +// hash padding +static const uint32_t block2_suffix[8] = { 0x80000000, 0, 0, 0, 0, 0, 0, 0x00000100 }; +#endif + +// 4-way kernel padding static const uint32_t block1_suffix_4way[4 * 9] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0, 0, 0, 0, @@ -443,8 +449,7 @@ static const uint32_t block1_suffix_4way[4 * 9] = { 0x000000e0, 0x000000e0, 0x000000e0, 0x000000e0 }; -// hash padding -static const uint32_t block2_suffix[8] = { 0x80000000, 0, 0, 0, 0, 0, 0, 0x00000100 }; +// 4-way hash padding static const uint32_t block2_suffix_4way[4 * 8] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0, 0, 0, 0, @@ -456,24 +461,53 @@ static const uint32_t block2_suffix_4way[4 * 8] = { 0x00000100, 0x00000100, 0x00000100, 0x00000100 }; +#ifdef __x86_64__ +// 8-way kernel padding +static const uint32_t block1_suffix_8way[8 * 9] = { + 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0x000000e0, 0x000000e0, 0x000000e0, 0x000000e0, 0x000000e0, 0x000000e0, 0x000000e0, 0x000000e0 +}; + +// 8-way hash padding +static const uint32_t block2_suffix_8way[8 * 8] = { + 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0x000000e0, 0x000000e0, 0x000000e0, 0x000000e0, 0x000000e0, 0x000000e0, 0x000000e0, 0x000000e0 +}; +#endif + // Sha256 initial state static const uint32_t sha256_initial[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; -extern "C" int sha256_use_4way(); #ifndef __i386__ extern "C" void sha256_transform(uint32_t *state, const uint32_t *block, int swap); #endif -extern "C" void sha256_init_4way(uint32_t *state); -extern "C" void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap); - -#ifdef USE_SSSE3 +#if defined(__i386__) || defined(__x86_64__) #include - extern "C" int sha256_use_ssse3(); bool fUseSSSE3 = sha256_use_ssse3() != 0; -inline void copyrow_swap32(uint32_t *to, uint32_t *from) +inline void copyrow8_swap32(uint32_t *to, uint32_t *from) +{ + __m128i mask = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3); + _mm_storeu_si128((__m128i *)&to[0], _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&from[0]), mask)); + _mm_storeu_si128((__m128i *)&to[4], _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&from[4]), mask)); +} + +inline void copyrow4_swap32(uint32_t *to, uint32_t *from) { if (!fUseSSSE3) { @@ -487,15 +521,28 @@ inline void copyrow_swap32(uint32_t *to, uint32_t *from) } } #else -inline void copyrow_swap32(uint32_t *to, uint32_t *from) +inline void copyrow4_swap32(uint32_t *to, uint32_t *from) { for (int i = 0; i < 4; i++) to[i] = __builtin_bswap32(from[i]); } #endif +extern "C" int sha256_use_4way(); +extern "C" void sha256_init_4way(uint32_t *state); +extern "C" void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap); + bool fUse4Way = sha256_use_4way() != 0; +#ifdef __x86_64__ +extern "C" int sha256_use_8way(); +extern "C" void sha256_init_8way(uint32_t *state); +extern "C" void sha256_transform_8way(uint32_t *state, const uint32_t *block, int swap); + +bool fUse8Way = sha256_use_8way() != 0; +#endif + + class ScanMidstateWorker { public: @@ -507,6 +554,78 @@ public: solutions = vector >(); } +#ifdef __x86_64__ + void Do_8way() + { + SetThreadPriority(THREAD_PRIORITY_LOWEST); + + // Compute maximum possible target to filter out majority of obviously insufficient hashes + CBigNum bnTargetPerCoinDay; + bnTargetPerCoinDay.SetCompact(nBits); + uint256 nMaxTarget = (bnTargetPerCoinDay * bnValueIn * nStakeMaxAge / COIN / nOneDay).getuint256(); + + uint32_t blocks1[8 * 16] __attribute__((aligned(16))); + uint32_t blocks2[8 * 16] __attribute__((aligned(16))); + uint32_t candidates[8 * 8] __attribute__((aligned(16))); + + vector vRow = vector(8); + uint32_t *pnKernel = (uint32_t *) kernel; + + for(int i = 0; i < 7; i++) + { + fill(vRow.begin(), vRow.end(), pnKernel[i]); + copyrow8_swap32(&blocks1[i*8], &vRow[0]); + } + + memcpy(&blocks1[56], &block1_suffix_8way[0], 36*8); // sha256 padding + memcpy(&blocks2[64], &block2_suffix_8way[0], 32*8); + + uint32_t nHashes[8]; + uint32_t nTimeStamps[8]; + + // Search forward in time from the given timestamp + // Stopping search in case of shutting down + for (uint32_t nTimeTx=nIntervalBegin, nMaxTarget32 = nMaxTarget.Get32(7); nTimeTx= CBigNum(nHashProofOfStake)) + solutions.push_back(std::pair(nHashProofOfStake, nTimeStamps[nResult])); + } + } + } + } +#endif + void Do_4way() { SetThreadPriority(THREAD_PRIORITY_LOWEST); @@ -526,14 +645,14 @@ public: for(int i = 0; i < 7; i++) { fill(vRow.begin(), vRow.end(), pnKernel[i]); - copyrow_swap32(&blocks1[i*4], &vRow[0]); + copyrow4_swap32(&blocks1[i*4], &vRow[0]); } memcpy(&blocks1[28], &block1_suffix_4way[0], 36*4); // sha256 padding memcpy(&blocks2[32], &block2_suffix_4way[0], 32*4); - uint32_t nTimeStamps[4] = {0, 0, 0, 0}; - uint32_t nHashes[4] = {0, 0, 0, 0}; + uint32_t nHashes[4]; + uint32_t nTimeStamps[4]; // Search forward in time from the given timestamp // Stopping search in case of shutting down @@ -547,11 +666,10 @@ public: nTimeStamps[2] = nTimeTx+2; nTimeStamps[3] = nTimeTx+3; - copyrow_swap32(&blocks1[24], &nTimeStamps[0]); // Kernel timestamps - + copyrow4_swap32(&blocks1[24], &nTimeStamps[0]); // Kernel timestamps sha256_transform_4way(&blocks2[0], &blocks1[0], 0); // first hashing sha256_transform_4way(&candidates[0], &blocks2[0], 0); // second hashing - copyrow_swap32(&nHashes[0], &candidates[28]); + copyrow4_swap32(&nHashes[0], &candidates[28]); for(int nResult = 0; nResult < 4; nResult++) { @@ -629,7 +747,7 @@ public: memcpy(&block2[8], &block2_suffix[0], 32); uint32_t *pnKernel = (uint32_t *) kernel; - copyrow_swap32(&block1[0], pnKernel); + copyrow4_swap32(&block1[0], pnKernel); block1[4] = __builtin_bswap32(pnKernel[4]); block1[5] = __builtin_bswap32(pnKernel[5]); @@ -669,10 +787,20 @@ public: void Do() { +#ifdef __x86_64__ + if (false && fUse8Way) // disable for now + { + Do_8way(); + return; + } +#endif if (fUse4Way) + { Do_4way(); - else - Do_oneway(); + return; + } + + Do_oneway(); } vector >& GetSolutions() diff --git a/src/makefile.bsd b/src/makefile.bsd index 873e753..88f8b2b 100644 --- a/src/makefile.bsd +++ b/src/makefile.bsd @@ -4,6 +4,7 @@ USE_LEVELDB:=0 USE_IPV6:=1 +ARCH:=$(uname -m) LINK:=$(CXX) @@ -79,20 +80,22 @@ LIBS+= \ DEBUGFLAGS=-g +ifeq (${ARCH}, i386) + EXT_OPTIONS=-msse2 -mssse3 +endif + +ifeq (${ARCH}, amd64) + EXT_OPTIONS=-mssse3 +endif + xOPT_LEVEL=-O2 ifeq (${USE_O3}, 1) xOPT_LEVEL=-O3 endif -ifeq (${USE_SSSE3}, 1) -# Intrinsic implementation of block copy -DEFS += -DUSE_SSSE3 -xOPT_LEVEL += -mssse3 -endif - # CXXFLAGS can be specified on the make command line, so we use xCXXFLAGS that only # adds some defaults in front. Unfortunately, CXXFLAGS=... $(CXXFLAGS) does not work. -xCXXFLAGS=$(xOPT_LEVEL) -msse2 -pthread -Wall -Wextra -Wno-ignored-qualifiers -Wformat -Wformat-security -Wno-unused-parameter \ +xCXXFLAGS=$(xOPT_LEVEL) $(EXT_OPTIONS) -pthread -Wall -Wextra -Wno-ignored-qualifiers -Wformat -Wformat-security -Wno-unused-parameter \ $(DEBUGFLAGS) $(DEFS) $(HARDENING) $(CXXFLAGS) # LDFLAGS can be specified on the make command line, so we use xLDFLAGS that only diff --git a/src/makefile.linux-mingw b/src/makefile.linux-mingw index 1f89ee9..fc1b93e 100644 --- a/src/makefile.linux-mingw +++ b/src/makefile.linux-mingw @@ -52,14 +52,8 @@ endif DEFS=-D_MT -DWIN32 -D_WINDOWS -DBOOST_THREAD_USE_LIB -DBOOST_SPIRIT_THREADSAFE -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -ifeq (${USE_SSSE3}, 1) -# Intrinsic implementation of block copy -DEFS += -DUSE_SSSE3 -xOPT_LEVEL += -mssse3 -endif - DEBUGFLAGS=-g -CFLAGS=$(xOPT_LEVEL) -msse2 -w -Wall -Wextra -Wno-ignored-qualifiers -Wformat -Wformat-security -Wno-unused-parameter $(DEBUGFLAGS) $(DEFS) $(INCLUDEPATHS) +CFLAGS=$(xOPT_LEVEL) -msse2 -mssse3 -w -Wall -Wextra -Wno-ignored-qualifiers -Wformat -Wformat-security -Wno-unused-parameter $(DEBUGFLAGS) $(DEFS) $(INCLUDEPATHS) LDFLAGS=-Wl,--dynamicbase -Wl,--nxcompat -static-libgcc -static-libstdc++ ifneq (${USE_IPV6}, -) diff --git a/src/makefile.mingw b/src/makefile.mingw index 7152101..d7552c2 100644 --- a/src/makefile.mingw +++ b/src/makefile.mingw @@ -41,15 +41,9 @@ ifeq (${USE_O3}, 1) xOPT_LEVEL=-O3 endif -ifdef USE_SSSE3 -# Intrinsic implementation of block copy -DEFS += -DUSE_SSSE3 -xOPT_LEVEL+=-mssse3 -endif - DEFS=-DWIN32 -D_WINDOWS -DBOOST_THREAD_USE_LIB -DBOOST_SPIRIT_THREADSAFE -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS DEBUGFLAGS=-g -CFLAGS=-mthreads $(xOPT_LEVEL) -msse2 -w -Wall -Wextra -Wno-ignored-qualifiers -Wformat -Wformat-security -Wno-unused-parameter $(DEBUGFLAGS) $(DEFS) $(INCLUDEPATHS) +CFLAGS=-mthreads $(xOPT_LEVEL) -msse2 -mssse3 -w -Wall -Wextra -Wno-ignored-qualifiers -Wformat -Wformat-security -Wno-unused-parameter $(DEBUGFLAGS) $(DEFS) $(INCLUDEPATHS) LDFLAGS=-Wl,--dynamicbase -Wl,--nxcompat -Wl,--large-address-aware -static ifneq (${USE_IPV6}, -) diff --git a/src/makefile.osx b/src/makefile.osx index 70ded54..0903d6e 100644 --- a/src/makefile.osx +++ b/src/makefile.osx @@ -53,15 +53,9 @@ ifdef RELEASE # Compile for maximum compatibility and smallest size. # This requires that dependencies are compiled # the same way. -CFLAGS = -O2 -msse2 +CFLAGS = -O2 -msse2 -mssse3 else -CFLAGS = -g -msse2 -endif - -ifeq (${USE_SSSE3}, 1) -# Intrinsic implementation of block copy -DEFS += -DUSE_SSSE3 -CFLAGS += -mssse3 +CFLAGS = -g -msse2 -mssse3 endif # ppc doesn't work because we don't support big-endian @@ -124,8 +118,8 @@ endif ifeq (${USE_ASM}, 1) # Assembler implementation -OBJS += crypto/scrypt/asm/obj/scrypt-arm.o crypto/scrypt/asm/obj/scrypt-x86.o crypto/scrypt/asm/obj/scrypt-x86_64.o crypto/scrypt/asm/obj/asm-wrapper.o -OBJS += crypto/sha2/asm/obj/sha2-arm.o crypto/sha2/asm/obj/sha2-x86.o crypto/sha2/asm/obj/sha2-x86_64.o +OBJS += crypto/scrypt/asm/obj/scrypt-x86.o crypto/scrypt/asm/obj/scrypt-x86_64.o crypto/scrypt/asm/obj/asm-wrapper.o +OBJS += crypto/sha2/asm/obj/sha2-x86.o crypto/sha2/asm/obj/sha2-x86_64.o crypto/scrypt/asm/obj/scrypt-x86.o: crypto/scrypt/asm/scrypt-x86.S $(CXX) -c $(CFLAGS) -MMD -o $@ $< @@ -133,9 +127,6 @@ crypto/scrypt/asm/obj/scrypt-x86.o: crypto/scrypt/asm/scrypt-x86.S crypto/scrypt/asm/obj/scrypt-x86_64.o: crypto/scrypt/asm/scrypt-x86_64.S $(CXX) -c $(CFLAGS) -MMD -o $@ $< -crypto/scrypt/asm/obj/scrypt-arm.o: crypto/scrypt/asm/scrypt-arm.S - $(CXX) -c $(CFLAGS) -MMD -o $@ $< - crypto/scrypt/asm/obj/asm-wrapper.o: crypto/scrypt/asm/asm-wrapper.cpp $(CXX) -c $(CFLAGS) -MMD -o $@ $< @@ -145,9 +136,6 @@ crypto/sha2/asm/obj/sha2-x86.o: crypto/sha2/asm/sha2-x86.S crypto/sha2/asm/obj/sha2-x86_64.o: crypto/sha2/asm/sha2-x86_64.S $(CXX) -c $(CFLAGS) -MMD -o $@ $< -crypto/sha2/asm/obj/sha2-arm.o: crypto/sha2/asm/sha2-arm.S - $(CXX) -c $(CFLAGS) -MMD -o $@ $< - DEFS += -DUSE_ASM else diff --git a/src/makefile.unix b/src/makefile.unix index bea34d8..191c4c9 100644 --- a/src/makefile.unix +++ b/src/makefile.unix @@ -82,7 +82,11 @@ DEBUGFLAGS=-g ifeq (${ARCH}, i686) - EXT_OPTIONS=-msse2 + EXT_OPTIONS=-msse2 -mssse3 +endif + +ifeq (${ARCH}, x86_64) + EXT_OPTIONS=-mssse3 endif xOPT_LEVEL=-O2 @@ -90,12 +94,6 @@ ifeq (${USE_O3}, 1) xOPT_LEVEL=-O3 endif -ifeq (${USE_SSSE3}, 1) -# Intrinsic implementation of block copy -DEFS += -DUSE_SSSE3 -xOPT_LEVEL += -mssse3 -endif - # CXXFLAGS can be specified on the make command line, so we use xCXXFLAGS that only # adds some defaults in front. Unfortunately, CXXFLAGS=... $(CXXFLAGS) does not work. xCXXFLAGS=$(xOPT_LEVEL) $(EXT_OPTIONS) -pthread -Wall -Wextra -Wno-ignored-qualifiers -Wformat -Wformat-security -Wno-unused-parameter \