Add SSSE3 imnplementation ofg block copy function, gives us ~30% kernel scanning...
authorCryptoManiac <balthazar@yandex.ru>
Sun, 11 Oct 2015 16:57:44 +0000 (19:57 +0300)
committerCryptoManiac <balthazar@yandex.ru>
Sun, 11 Oct 2015 16:57:44 +0000 (19:57 +0300)
Also fix mingw makefiles.

novacoin-qt.pro
src/crypto/sha2/asm/copy_swap.c [new file with mode: 0644]
src/crypto/sha2/asm/sha2-x86.S
src/crypto/sha2/asm/sha2-x86_64.S
src/kernel.cpp
src/makefile.bsd
src/makefile.linux-mingw
src/makefile.mingw
src/makefile.osx
src/makefile.unix

index 22f6dbc..5d39b6a 100644 (file)
@@ -129,15 +129,24 @@ contains(USE_LEVELDB, 1) {
 contains(USE_ASM, 1) {
     message(Using assembler scrypt & sha256 implementations)
     DEFINES += USE_ASM
+    QMAKE_CFLAGS += -msse2
+    QMAKE_CXXFLAGS += -msse2
+
+    contains(USE_SSSE3, 1) {
+        DEFINES += USE_SSSE3
+        QMAKE_CFLAGS += -mssse3
+        QMAKE_CXXFLAGS += -mssse3
+    }
+
     SOURCES += src/crypto/scrypt/asm/scrypt-arm.S src/crypto/scrypt/asm/scrypt-x86.S src/crypto/scrypt/asm/scrypt-x86_64.S src/crypto/scrypt/asm/asm-wrapper.cpp
-    SOURCES += src/crypto/sha2/asm/sha2-arm.S src/crypto/sha2/asm/sha2-x86.S src/crypto/sha2/asm/sha2-x86_64.S
+    SOURCES += src/crypto/sha2/asm/sha2-arm.S src/crypto/sha2/asm/sha2-x86.S src/crypto/sha2/asm/sha2-x86_64.S src/crypto/sha2/asm/copy_swap.c
 } else {
     # use: qmake "USE_SSE2=1"
     contains(USE_SSE2, 1) {
         message(Using SSE2 intrinsic scrypt implementation & generic sha256 implementation)
         SOURCES += src/crypto/scrypt/intrin/scrypt-sse2.cpp
         DEFINES += USE_SSE2
-        QMAKE_CXXFLAGS += -msse2
+        QMAKE_CXXFLAGS += -msse2 
         QMAKE_CFLAGS += -msse2
     } else {
         message(Using generic scrypt & sha256 implementations)
diff --git a/src/crypto/sha2/asm/copy_swap.c b/src/crypto/sha2/asm/copy_swap.c
new file mode 100644 (file)
index 0000000..f430292
--- /dev/null
@@ -0,0 +1,57 @@
+// Copyright (c) 2015 The Novacoin developers
+// Distributed under the MIT/X11 software license, see the accompanying
+// file COPYING or http://www.opensource.org/licenses/mit-license.php.
+
+#include <stdint.h>
+#include <immintrin.h>
+
+void copy_swap_hashes(uint32_t *blocks, uint32_t *state)
+{
+    blocks[0] = __builtin_bswap32(state[0]);
+    blocks[1] = __builtin_bswap32(state[1]);
+    blocks[2] = __builtin_bswap32(state[2]);
+    blocks[3] = __builtin_bswap32(state[3]);
+    blocks[4] = __builtin_bswap32(state[4]);
+    blocks[5] = __builtin_bswap32(state[5]);
+    blocks[6] = __builtin_bswap32(state[6]);
+    blocks[7] = __builtin_bswap32(state[7]);
+    blocks[8] = __builtin_bswap32(state[8]);
+    blocks[9] = __builtin_bswap32(state[9]);
+    blocks[10] = __builtin_bswap32(state[10]);
+    blocks[11] = __builtin_bswap32(state[11]);
+    blocks[12] = __builtin_bswap32(state[12]);
+    blocks[13] = __builtin_bswap32(state[13]);
+    blocks[14] = __builtin_bswap32(state[14]);
+    blocks[15] = __builtin_bswap32(state[15]);
+    blocks[16] = __builtin_bswap32(state[16]);
+    blocks[17] = __builtin_bswap32(state[17]);
+    blocks[18] = __builtin_bswap32(state[18]);
+    blocks[19] = __builtin_bswap32(state[19]);
+    blocks[20] = __builtin_bswap32(state[20]);
+    blocks[21] = __builtin_bswap32(state[21]);
+    blocks[22] = __builtin_bswap32(state[22]);
+    blocks[23] = __builtin_bswap32(state[23]);
+    blocks[24] = __builtin_bswap32(state[24]);
+    blocks[25] = __builtin_bswap32(state[25]);
+    blocks[26] = __builtin_bswap32(state[26]);
+    blocks[27] = __builtin_bswap32(state[27]);
+    blocks[28] = __builtin_bswap32(state[28]);
+    blocks[29] = __builtin_bswap32(state[29]);
+    blocks[30] = __builtin_bswap32(state[30]);
+    blocks[31] = __builtin_bswap32(state[31]);
+}
+
+#ifdef USE_SSSE3
+void copy_swap_hashes_ssse3(uint32_t *blocks, uint32_t *state) 
+{
+    __m128i mask = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);
+    _mm_storeu_si128((__m128i *)&blocks[0], _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&state[0]), mask));
+    _mm_storeu_si128((__m128i *)&blocks[4], _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&state[4]), mask));
+    _mm_storeu_si128((__m128i *)&blocks[8], _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&state[8]), mask));
+    _mm_storeu_si128((__m128i *)&blocks[12], _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&state[12]), mask));
+    _mm_storeu_si128((__m128i *)&blocks[16], _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&state[16]), mask));
+    _mm_storeu_si128((__m128i *)&blocks[20], _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&state[20]), mask));
+    _mm_storeu_si128((__m128i *)&blocks[24], _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&state[24]), mask));
+    _mm_storeu_si128((__m128i *)&blocks[28], _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&state[28]), mask));
+}
+#endif
index 65f2062..044630f 100644 (file)
@@ -1188,4 +1188,28 @@ sha256_use_4way_sse2:
     popl    %ebx
     ret
 
+#if defined(USE_SSSE3)
+    .text
+    .p2align 5
+    .globl sha256_use_ssse3
+    .globl _sha256_use_ssse3
+sha256_use_ssse3:
+_sha256_use_ssse3:
+    pushl    %ebx
+    
+    movl    $1, %eax
+    cpuid
+    andl    $0x00000200, %ecx
+    jnz sha256_use_ssse3
+    xorl    %eax, %eax
+    popl    %ebx
+    ret
+    
+sha256_use_ssse3:
+    movl    $1, %eax
+    popl    %ebx
+    ret
+#endif
+
+
 #endif
index 9f3974b..6554ef0 100644 (file)
@@ -3574,6 +3574,33 @@ sha256_use_4way_exit:
     popq    %rbx
     ret
 
+#if defined(USE_SSSE3)
+    .text
+    .p2align 6
+    .globl sha256_use_ssse3
+    .globl _sha256_use_ssse3
+sha256_use_ssse3:
+_sha256_use_ssse3:
+    pushq    %rbx
+    pushq    %rcx
+    pushq    %rdx
+    cpuid
+    andl    $0x00000200, %ecx
+    jz sha256_use_ssse3_done
+    xorl    %eax, %eax
+    popq    %rdx
+    popq    %rcx
+    popq    %rbx
+    ret
+    
+sha256_use_ssse3_done:
+    movl    $1, %eax
+    popq    %rdx
+    popq    %rcx
+    popq    %rbx
+    ret
+#endif
+
     .text
     .p2align 6
     .globl sha256d_ms_8way
index 2873c41..e3ed6d1 100644 (file)
@@ -1,4 +1,7 @@
 // Copyright (c) 2012-2013 The PPCoin developers
+// Copyright (c) 2013-2015 The Novacoin developers
+// Distributed under the MIT/X11 software license, see the accompanying
+// file COPYING or http://www.opensource.org/licenses/mit-license.php.
 // Distributed under the MIT/X11 software license, see the accompanying
 // file COPYING or http://www.opensource.org/licenses/mit-license.php.
 
@@ -454,12 +457,20 @@ static const uint32_t block2_suffix_4way[4 * 8] = {
 };
 
 extern "C" int sha256_use_4way();
-
 extern "C" void sha256_init(uint32_t *state);
 extern "C" void sha256_transform(uint32_t *state, const uint32_t *block, int swap);
-
 extern "C" void sha256_init_4way(uint32_t *state);
 extern "C" void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap);
+extern "C" void copy_swap_hashes(uint32_t *blocks, uint32_t *state); // Generic block copy function
+
+#ifdef USE_SSSE3
+extern "C" int sha256_use_ssse3();
+extern "C" void copy_swap_hashes_ssse3(uint32_t *blocks, uint32_t *state); // SSSE3 optimized block copy function
+
+void (*copy_swap)(uint32_t *, uint32_t *) = (sha256_use_ssse3() != 0) ? &copy_swap_hashes_ssse3 : copy_swap_hashes;
+#else
+void (*copy_swap)(uint32_t *, uint32_t *) = &copy_swap_hashes;
+#endif
 
 bool fUse4Way = sha256_use_4way() != 0;
 
@@ -476,6 +487,8 @@ public:
 
     void Do_4way()
     {
+        cout << sha256_use_ssse3() << endl;
+
         SetThreadPriority(THREAD_PRIORITY_LOWEST);
 
         // Compute maximum possible target to filter out majority of obviously insufficient hashes
@@ -518,10 +531,7 @@ public:
             blocks1[27] = nTimeTx++;
 
             sha256_transform_4way(&state1[0], &blocks1[0], 1); // first hashing
-
-            for(int i=0; i<32; i++)
-                blocks2[i] = __builtin_bswap32(state1[i]);
-
+            copy_swap(&blocks2[0], &state1[0]);
             sha256_transform_4way(&state2[0], &blocks2[0], 1); // second hashing
 
             for(int nResult = 0; nResult < 4; nResult++)
index ea44263..98906c4 100644 (file)
@@ -93,6 +93,12 @@ ifeq (${USE_O3}, 1)
     xOPT_LEVEL=-O3
 endif
 
+ifeq  (${USE_SSSE3}, 1)
+# Intrinsic implementation of block copy
+DEFS += -DUSE_SSSE3
+xOPT_LEVEL += -mssse3
+endif
+
 # CXXFLAGS can be specified on the make command line, so we use xCXXFLAGS that only
 # adds some defaults in front. Unfortunately, CXXFLAGS=... $(CXXFLAGS) does not work.
 xCXXFLAGS=$(xOPT_LEVEL) -msse2 -pthread -Wall -Wextra -Wno-ignored-qualifiers -Wformat -Wformat-security -Wno-unused-parameter \
@@ -154,9 +160,12 @@ OBJS += obj/txdb-bdb.o
 endif
 
 ifeq (${USE_ASM}, 1)
+
+DEFS += -DUSE_ASM
+
 # Assembler implementation
 OBJS += crypto/scrypt/asm/obj/scrypt-arm.o crypto/scrypt/asm/obj/scrypt-x86.o crypto/scrypt/asm/obj/scrypt-x86_64.o crypto/scrypt/asm/obj/asm-wrapper.o
-OBJS += crypto/sha2/asm/obj/sha2-arm.o crypto/sha2/asm/obj/sha2-x86.o crypto/sha2/asm/obj/sha2-x86_64.o
+OBJS += crypto/sha2/asm/obj/sha2-arm.o crypto/sha2/asm/obj/sha2-x86.o crypto/sha2/asm/obj/sha2-x86_64.o crypto/sha2/asm/obj/copy_swap.o
 
 crypto/scrypt/asm/obj/scrypt-x86.o: crypto/scrypt/asm/scrypt-x86.S
        $(CXX) -c $(xCXXFLAGS) -MMD -o $@ $<
@@ -179,7 +188,9 @@ crypto/sha2/asm/obj/sha2-x86_64.o: crypto/sha2/asm/sha2-x86_64.S
 crypto/sha2/asm/obj/sha2-arm.o: crypto/sha2/asm/sha2-arm.S
        $(CXX) -c $(xCXXFLAGS) -MMD -o $@ $<
 
-DEFS += -DUSE_ASM
+crypto/sha2/asm/obj/copy_swap.o: crypto/sha2/asm/copy_swap.c
+       $(CC)  -c $(xCXXFLAGS) -MMD -o $@ $<
+
 else
 ifeq  (${USE_SSE2}, 1)
 # Intrinsic implementation
index 519e6db..284670a 100644 (file)
@@ -56,6 +56,13 @@ ifeq (${USE_O3}, 1)
 endif
 
 DEFS=-D_MT -DWIN32 -D_WINDOWS -DBOOST_THREAD_USE_LIB -DBOOST_SPIRIT_THREADSAFE -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS
+
+ifeq  (${USE_SSSE3}, 1)
+# Intrinsic implementation of block copy
+DEFS += -DUSE_SSSE3
+xOPT_LEVEL += -mssse3
+endif
+
 DEBUGFLAGS=-g
 CFLAGS=$(xOPT_LEVEL) -msse2 -w -Wall -Wextra -Wno-ignored-qualifiers -Wformat -Wformat-security -Wno-unused-parameter $(DEBUGFLAGS) $(DEFS) $(INCLUDEPATHS)
 LDFLAGS=-Wl,--dynamicbase -Wl,--nxcompat -static-libgcc -static-libstdc++
@@ -131,7 +138,7 @@ endif
 ifeq (${USE_ASM}, 1)
 # Assembler implementation
 OBJS += crypto/scrypt/asm/obj/scrypt-arm.o crypto/scrypt/asm/obj/scrypt-x86.o crypto/scrypt/asm/obj/scrypt-x86_64.o crypto/scrypt/asm/obj/asm-wrapper.o
-OBJS += crypto/sha2/asm/obj/sha2-arm.o crypto/sha2/asm/obj/sha2-x86.o crypto/sha2/asm/obj/sha2-x86_64.o
+OBJS += crypto/sha2/asm/obj/sha2-arm.o crypto/sha2/asm/obj/sha2-x86.o crypto/sha2/asm/obj/sha2-x86_64.o crypto/sha2/asm/obj/copy_swap.o
 
 crypto/scrypt/asm/obj/scrypt-x86.o: crypto/scrypt/asm/scrypt-x86.S
        $(CXX) -c $(CFLAGS) -MMD -o $@ $<
@@ -154,6 +161,9 @@ crypto/sha2/asm/obj/sha2-x86_64.o: crypto/sha2/asm/sha2-x86_64.S
 crypto/sha2/asm/obj/sha2-arm.o: crypto/sha2/asm/sha2-arm.S
        $(CXX) -c $(CFLAGS) -MMD -o $@ $<
 
+crypto/sha2/asm/obj/copy_swap.o: crypto/sha2/asm/copy_swap.c
+       $(CC)  -c $(CFLAGS) -MMD -o $@ $<
+
 DEFS += -DUSE_ASM
 else
 ifeq  (${USE_SSE2}, 1)
index 1f0d38f..3fa7244 100644 (file)
@@ -43,6 +43,12 @@ ifeq (${USE_O3}, 1)
     xOPT_LEVEL=-O3
 endif
 
+ifdef  USE_SSSE3
+# Intrinsic implementation of block copy
+DEFS += -DUSE_SSSE3
+xOPT_LEVEL+=-mssse3
+endif
+
 DEFS=-DWIN32 -D_WINDOWS -DBOOST_THREAD_USE_LIB -DBOOST_SPIRIT_THREADSAFE -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS
 DEBUGFLAGS=-g
 CFLAGS=-mthreads $(xOPT_LEVEL) -msse2 -w -Wall -Wextra -Wno-ignored-qualifiers -Wformat -Wformat-security -Wno-unused-parameter $(DEBUGFLAGS) $(DEFS) $(INCLUDEPATHS)
@@ -120,7 +126,7 @@ endif
 ifdef USE_ASM
 # Assembler implementation
 OBJS += crypto/scrypt/asm/obj/scrypt-arm.o crypto/scrypt/asm/obj/scrypt-x86.o crypto/scrypt/asm/obj/scrypt-x86_64.o crypto/scrypt/asm/obj/asm-wrapper.o
-OBJS += crypto/sha2/asm/obj/scrypt-arm.o crypto/sha2/asm/obj/scrypt-x86.o crypto/sha2/asm/obj/scrypt-x86_64.o
+OBJS += crypto/sha2/asm/obj/sha2-arm.o crypto/sha2/asm/obj/sha2-x86.o crypto/sha2/asm/obj/sha2-x86_64.o crypto/sha2/asm/obj/copy_swap.o
 
 crypto/scrypt/asm/obj/scrypt-x86.o: crypto/scrypt/asm/scrypt-x86.S
        $(CXX) -c $(CFLAGS) -MMD -o $@ $<
@@ -134,7 +140,11 @@ crypto/scrypt/asm/obj/scrypt-arm.o: crypto/scrypt/asm/scrypt-arm.S
 crypto/scrypt/asm/obj/asm-wrapper.o: crypto/scrypt/asm/asm-wrapper.cpp
        $(CXX) -c $(CFLAGS) -MMD -o $@ $<
 
+crypto/sha2/asm/obj/copy_swap.o: crypto/sha2/asm/copy_swap.c
+       $(CC)  -c $(CFLAGS) -MMD -o $@ $<
+
 DEFS += -DUSE_ASM
+
 else
 ifdef USE_SSE2
 # Intrinsic implementation
index 121c497..2048961 100644 (file)
@@ -59,6 +59,12 @@ else
 CFLAGS = -g -msse2
 endif
 
+ifeq  (${USE_SSSE3}, 1)
+# Intrinsic implementation of block copy
+DEFS += -DUSE_SSSE3
+CFLAGS += -mssse3
+endif
+
 # ppc doesn't work because we don't support big-endian
 CFLAGS += -Wall -Wextra -Wformat -Wno-ignored-qualifiers -Wformat-security -Wno-unused-parameter \
     $(DEBUGFLAGS) $(DEFS) $(INCLUDEPATHS)
@@ -132,7 +138,7 @@ endif
 ifeq (${USE_ASM}, 1)
 # Assembler implementation
 OBJS += crypto/scrypt/asm/obj/scrypt-arm.o crypto/scrypt/asm/obj/scrypt-x86.o crypto/scrypt/asm/obj/scrypt-x86_64.o crypto/scrypt/asm/obj/asm-wrapper.o
-OBJS += crypto/sha2/asm/obj/sha2-arm.o crypto/sha2/asm/obj/sha2-x86.o crypto/sha2/asm/obj/sha2-x86_64.o
+OBJS += crypto/sha2/asm/obj/sha2-arm.o crypto/sha2/asm/obj/sha2-x86.o crypto/sha2/asm/obj/sha2-x86_64.o crypto/sha2/asm/obj/copy_swap.o
 
 crypto/scrypt/asm/obj/scrypt-x86.o: crypto/scrypt/asm/scrypt-x86.S
        $(CXX) -c $(CFLAGS) -MMD -o $@ $<
@@ -155,7 +161,11 @@ crypto/sha/asm/obj/sha-arm.o: crypto/sha2/asm/sha2-x86_64.S
 crypto/sha/asm/obj/sha-arm.o: crypto/sha2/asm/sha2-arm.S
        $(CXX) -c $(CFLAGS) -MMD -o $@ $<
 
+crypto/sha2/asm/obj/copy_swap.o: crypto/sha2/asm/copy_swap.c
+       $(CC)  -c $(CFLAGS) -MMD -o $@ $<
+
 DEFS += -DUSE_ASM
+
 else
 ifeq  (${USE_SSE2}, 1)
 # Intrinsic implementation
index 65eaca1..c8fc18e 100644 (file)
@@ -99,6 +99,12 @@ ifeq (${USE_O3}, 1)
     xOPT_LEVEL=-O3
 endif
 
+ifeq  (${USE_SSSE3}, 1)
+# Intrinsic implementation of block copy
+DEFS += -DUSE_SSSE3
+xOPT_LEVEL += -mssse3
+endif
+
 # CXXFLAGS can be specified on the make command line, so we use xCXXFLAGS that only
 # adds some defaults in front. Unfortunately, CXXFLAGS=... $(CXXFLAGS) does not work.
 xCXXFLAGS=$(xOPT_LEVEL) $(EXT_OPTIONS) -pthread -Wall -Wextra -Wno-ignored-qualifiers -Wformat -Wformat-security -Wno-unused-parameter \
@@ -162,7 +168,7 @@ endif
 ifeq (${USE_ASM}, 1)
 # Assembler implementation
 OBJS += crypto/scrypt/asm/obj/scrypt-arm.o crypto/scrypt/asm/obj/scrypt-x86.o crypto/scrypt/asm/obj/scrypt-x86_64.o crypto/scrypt/asm/obj/asm-wrapper.o
-OBJS += crypto/sha2/asm/obj/sha2-arm.o crypto/sha2/asm/obj/sha2-x86.o crypto/sha2/asm/obj/sha2-x86_64.o
+OBJS += crypto/sha2/asm/obj/sha2-arm.o crypto/sha2/asm/obj/sha2-x86.o crypto/sha2/asm/obj/sha2-x86_64.o crypto/sha2/asm/obj/copy_swap.o
 
 crypto/scrypt/asm/obj/scrypt-x86.o: crypto/scrypt/asm/scrypt-x86.S
        $(CXX) -c $(xCXXFLAGS) -MMD -o $@ $<
@@ -185,7 +191,11 @@ crypto/sha2/asm/obj/sha2-x86_64.o: crypto/sha2/asm/sha2-x86_64.S
 crypto/sha2/asm/obj/sha2-arm.o: crypto/sha2/asm/sha2-x86.S
        $(CXX) -c $(xCXXFLAGS) -MMD -o $@ $<
 
+crypto/sha2/asm/obj/copy_swap.o: crypto/sha2/asm/copy_swap.c
+       $(CC)  -c $(xCXXFLAGS) -MMD -o $@ $<
+
 DEFS += -DUSE_ASM
+
 else
 ifeq  (${USE_SSE2}, 1)
 # Intrinsic implementation