Also fix mingw makefiles.
contains(USE_ASM, 1) {
message(Using assembler scrypt & sha256 implementations)
DEFINES += USE_ASM
+ QMAKE_CFLAGS += -msse2
+ QMAKE_CXXFLAGS += -msse2
+
+ contains(USE_SSSE3, 1) {
+ DEFINES += USE_SSSE3
+ QMAKE_CFLAGS += -mssse3
+ QMAKE_CXXFLAGS += -mssse3
+ }
+
SOURCES += src/crypto/scrypt/asm/scrypt-arm.S src/crypto/scrypt/asm/scrypt-x86.S src/crypto/scrypt/asm/scrypt-x86_64.S src/crypto/scrypt/asm/asm-wrapper.cpp
- SOURCES += src/crypto/sha2/asm/sha2-arm.S src/crypto/sha2/asm/sha2-x86.S src/crypto/sha2/asm/sha2-x86_64.S
+ SOURCES += src/crypto/sha2/asm/sha2-arm.S src/crypto/sha2/asm/sha2-x86.S src/crypto/sha2/asm/sha2-x86_64.S src/crypto/sha2/asm/copy_swap.c
} else {
# use: qmake "USE_SSE2=1"
contains(USE_SSE2, 1) {
message(Using SSE2 intrinsic scrypt implementation & generic sha256 implementation)
SOURCES += src/crypto/scrypt/intrin/scrypt-sse2.cpp
DEFINES += USE_SSE2
- QMAKE_CXXFLAGS += -msse2
+ QMAKE_CXXFLAGS += -msse2
QMAKE_CFLAGS += -msse2
} else {
message(Using generic scrypt & sha256 implementations)
--- /dev/null
+// Copyright (c) 2015 The Novacoin developers
+// Distributed under the MIT/X11 software license, see the accompanying
+// file COPYING or http://www.opensource.org/licenses/mit-license.php.
+
+#include <stdint.h>
+#include <immintrin.h>
+
+void copy_swap_hashes(uint32_t *blocks, uint32_t *state)
+{
+ blocks[0] = __builtin_bswap32(state[0]);
+ blocks[1] = __builtin_bswap32(state[1]);
+ blocks[2] = __builtin_bswap32(state[2]);
+ blocks[3] = __builtin_bswap32(state[3]);
+ blocks[4] = __builtin_bswap32(state[4]);
+ blocks[5] = __builtin_bswap32(state[5]);
+ blocks[6] = __builtin_bswap32(state[6]);
+ blocks[7] = __builtin_bswap32(state[7]);
+ blocks[8] = __builtin_bswap32(state[8]);
+ blocks[9] = __builtin_bswap32(state[9]);
+ blocks[10] = __builtin_bswap32(state[10]);
+ blocks[11] = __builtin_bswap32(state[11]);
+ blocks[12] = __builtin_bswap32(state[12]);
+ blocks[13] = __builtin_bswap32(state[13]);
+ blocks[14] = __builtin_bswap32(state[14]);
+ blocks[15] = __builtin_bswap32(state[15]);
+ blocks[16] = __builtin_bswap32(state[16]);
+ blocks[17] = __builtin_bswap32(state[17]);
+ blocks[18] = __builtin_bswap32(state[18]);
+ blocks[19] = __builtin_bswap32(state[19]);
+ blocks[20] = __builtin_bswap32(state[20]);
+ blocks[21] = __builtin_bswap32(state[21]);
+ blocks[22] = __builtin_bswap32(state[22]);
+ blocks[23] = __builtin_bswap32(state[23]);
+ blocks[24] = __builtin_bswap32(state[24]);
+ blocks[25] = __builtin_bswap32(state[25]);
+ blocks[26] = __builtin_bswap32(state[26]);
+ blocks[27] = __builtin_bswap32(state[27]);
+ blocks[28] = __builtin_bswap32(state[28]);
+ blocks[29] = __builtin_bswap32(state[29]);
+ blocks[30] = __builtin_bswap32(state[30]);
+ blocks[31] = __builtin_bswap32(state[31]);
+}
+
+#ifdef USE_SSSE3
+void copy_swap_hashes_ssse3(uint32_t *blocks, uint32_t *state)
+{
+ __m128i mask = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);
+ _mm_storeu_si128((__m128i *)&blocks[0], _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&state[0]), mask));
+ _mm_storeu_si128((__m128i *)&blocks[4], _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&state[4]), mask));
+ _mm_storeu_si128((__m128i *)&blocks[8], _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&state[8]), mask));
+ _mm_storeu_si128((__m128i *)&blocks[12], _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&state[12]), mask));
+ _mm_storeu_si128((__m128i *)&blocks[16], _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&state[16]), mask));
+ _mm_storeu_si128((__m128i *)&blocks[20], _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&state[20]), mask));
+ _mm_storeu_si128((__m128i *)&blocks[24], _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&state[24]), mask));
+ _mm_storeu_si128((__m128i *)&blocks[28], _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&state[28]), mask));
+}
+#endif
popl %ebx
ret
+#if defined(USE_SSSE3)
+ .text
+ .p2align 5
+ .globl sha256_use_ssse3
+ .globl _sha256_use_ssse3
+sha256_use_ssse3:
+_sha256_use_ssse3:
+ pushl %ebx
+
+ movl $1, %eax
+ cpuid
+ andl $0x00000200, %ecx
+ jnz sha256_use_ssse3
+ xorl %eax, %eax
+ popl %ebx
+ ret
+
+sha256_use_ssse3:
+ movl $1, %eax
+ popl %ebx
+ ret
+#endif
+
+
#endif
popq %rbx
ret
+#if defined(USE_SSSE3)
+ .text
+ .p2align 6
+ .globl sha256_use_ssse3
+ .globl _sha256_use_ssse3
+sha256_use_ssse3:
+_sha256_use_ssse3:
+ pushq %rbx
+ pushq %rcx
+ pushq %rdx
+ cpuid
+ andl $0x00000200, %ecx
+ jz sha256_use_ssse3_done
+ xorl %eax, %eax
+ popq %rdx
+ popq %rcx
+ popq %rbx
+ ret
+
+sha256_use_ssse3_done:
+ movl $1, %eax
+ popq %rdx
+ popq %rcx
+ popq %rbx
+ ret
+#endif
+
.text
.p2align 6
.globl sha256d_ms_8way
// Copyright (c) 2012-2013 The PPCoin developers
+// Copyright (c) 2013-2015 The Novacoin developers
+// Distributed under the MIT/X11 software license, see the accompanying
+// file COPYING or http://www.opensource.org/licenses/mit-license.php.
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
};
extern "C" int sha256_use_4way();
-
extern "C" void sha256_init(uint32_t *state);
extern "C" void sha256_transform(uint32_t *state, const uint32_t *block, int swap);
-
extern "C" void sha256_init_4way(uint32_t *state);
extern "C" void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap);
+extern "C" void copy_swap_hashes(uint32_t *blocks, uint32_t *state); // Generic block copy function
+
+#ifdef USE_SSSE3
+extern "C" int sha256_use_ssse3();
+extern "C" void copy_swap_hashes_ssse3(uint32_t *blocks, uint32_t *state); // SSSE3 optimized block copy function
+
+void (*copy_swap)(uint32_t *, uint32_t *) = (sha256_use_ssse3() != 0) ? ©_swap_hashes_ssse3 : copy_swap_hashes;
+#else
+void (*copy_swap)(uint32_t *, uint32_t *) = ©_swap_hashes;
+#endif
bool fUse4Way = sha256_use_4way() != 0;
void Do_4way()
{
+ cout << sha256_use_ssse3() << endl;
+
SetThreadPriority(THREAD_PRIORITY_LOWEST);
// Compute maximum possible target to filter out majority of obviously insufficient hashes
blocks1[27] = nTimeTx++;
sha256_transform_4way(&state1[0], &blocks1[0], 1); // first hashing
-
- for(int i=0; i<32; i++)
- blocks2[i] = __builtin_bswap32(state1[i]);
-
+ copy_swap(&blocks2[0], &state1[0]);
sha256_transform_4way(&state2[0], &blocks2[0], 1); // second hashing
for(int nResult = 0; nResult < 4; nResult++)
xOPT_LEVEL=-O3
endif
+ifeq (${USE_SSSE3}, 1)
+# Intrinsic implementation of block copy
+DEFS += -DUSE_SSSE3
+xOPT_LEVEL += -mssse3
+endif
+
# CXXFLAGS can be specified on the make command line, so we use xCXXFLAGS that only
# adds some defaults in front. Unfortunately, CXXFLAGS=... $(CXXFLAGS) does not work.
xCXXFLAGS=$(xOPT_LEVEL) -msse2 -pthread -Wall -Wextra -Wno-ignored-qualifiers -Wformat -Wformat-security -Wno-unused-parameter \
endif
ifeq (${USE_ASM}, 1)
+
+DEFS += -DUSE_ASM
+
# Assembler implementation
OBJS += crypto/scrypt/asm/obj/scrypt-arm.o crypto/scrypt/asm/obj/scrypt-x86.o crypto/scrypt/asm/obj/scrypt-x86_64.o crypto/scrypt/asm/obj/asm-wrapper.o
-OBJS += crypto/sha2/asm/obj/sha2-arm.o crypto/sha2/asm/obj/sha2-x86.o crypto/sha2/asm/obj/sha2-x86_64.o
+OBJS += crypto/sha2/asm/obj/sha2-arm.o crypto/sha2/asm/obj/sha2-x86.o crypto/sha2/asm/obj/sha2-x86_64.o crypto/sha2/asm/obj/copy_swap.o
crypto/scrypt/asm/obj/scrypt-x86.o: crypto/scrypt/asm/scrypt-x86.S
$(CXX) -c $(xCXXFLAGS) -MMD -o $@ $<
crypto/sha2/asm/obj/sha2-arm.o: crypto/sha2/asm/sha2-arm.S
$(CXX) -c $(xCXXFLAGS) -MMD -o $@ $<
-DEFS += -DUSE_ASM
+crypto/sha2/asm/obj/copy_swap.o: crypto/sha2/asm/copy_swap.c
+ $(CC) -c $(xCXXFLAGS) -MMD -o $@ $<
+
else
ifeq (${USE_SSE2}, 1)
# Intrinsic implementation
endif
DEFS=-D_MT -DWIN32 -D_WINDOWS -DBOOST_THREAD_USE_LIB -DBOOST_SPIRIT_THREADSAFE -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS
+
+ifeq (${USE_SSSE3}, 1)
+# Intrinsic implementation of block copy
+DEFS += -DUSE_SSSE3
+xOPT_LEVEL += -mssse3
+endif
+
DEBUGFLAGS=-g
CFLAGS=$(xOPT_LEVEL) -msse2 -w -Wall -Wextra -Wno-ignored-qualifiers -Wformat -Wformat-security -Wno-unused-parameter $(DEBUGFLAGS) $(DEFS) $(INCLUDEPATHS)
LDFLAGS=-Wl,--dynamicbase -Wl,--nxcompat -static-libgcc -static-libstdc++
ifeq (${USE_ASM}, 1)
# Assembler implementation
OBJS += crypto/scrypt/asm/obj/scrypt-arm.o crypto/scrypt/asm/obj/scrypt-x86.o crypto/scrypt/asm/obj/scrypt-x86_64.o crypto/scrypt/asm/obj/asm-wrapper.o
-OBJS += crypto/sha2/asm/obj/sha2-arm.o crypto/sha2/asm/obj/sha2-x86.o crypto/sha2/asm/obj/sha2-x86_64.o
+OBJS += crypto/sha2/asm/obj/sha2-arm.o crypto/sha2/asm/obj/sha2-x86.o crypto/sha2/asm/obj/sha2-x86_64.o crypto/sha2/asm/obj/copy_swap.o
crypto/scrypt/asm/obj/scrypt-x86.o: crypto/scrypt/asm/scrypt-x86.S
$(CXX) -c $(CFLAGS) -MMD -o $@ $<
crypto/sha2/asm/obj/sha2-arm.o: crypto/sha2/asm/sha2-arm.S
$(CXX) -c $(CFLAGS) -MMD -o $@ $<
+crypto/sha2/asm/obj/copy_swap.o: crypto/sha2/asm/copy_swap.c
+ $(CC) -c $(CFLAGS) -MMD -o $@ $<
+
DEFS += -DUSE_ASM
else
ifeq (${USE_SSE2}, 1)
xOPT_LEVEL=-O3
endif
+ifdef USE_SSSE3
+# Intrinsic implementation of block copy
+DEFS += -DUSE_SSSE3
+xOPT_LEVEL+=-mssse3
+endif
+
DEFS=-DWIN32 -D_WINDOWS -DBOOST_THREAD_USE_LIB -DBOOST_SPIRIT_THREADSAFE -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS
DEBUGFLAGS=-g
CFLAGS=-mthreads $(xOPT_LEVEL) -msse2 -w -Wall -Wextra -Wno-ignored-qualifiers -Wformat -Wformat-security -Wno-unused-parameter $(DEBUGFLAGS) $(DEFS) $(INCLUDEPATHS)
ifdef USE_ASM
# Assembler implementation
OBJS += crypto/scrypt/asm/obj/scrypt-arm.o crypto/scrypt/asm/obj/scrypt-x86.o crypto/scrypt/asm/obj/scrypt-x86_64.o crypto/scrypt/asm/obj/asm-wrapper.o
-OBJS += crypto/sha2/asm/obj/scrypt-arm.o crypto/sha2/asm/obj/scrypt-x86.o crypto/sha2/asm/obj/scrypt-x86_64.o
+OBJS += crypto/sha2/asm/obj/sha2-arm.o crypto/sha2/asm/obj/sha2-x86.o crypto/sha2/asm/obj/sha2-x86_64.o crypto/sha2/asm/obj/copy_swap.o
crypto/scrypt/asm/obj/scrypt-x86.o: crypto/scrypt/asm/scrypt-x86.S
$(CXX) -c $(CFLAGS) -MMD -o $@ $<
crypto/scrypt/asm/obj/asm-wrapper.o: crypto/scrypt/asm/asm-wrapper.cpp
$(CXX) -c $(CFLAGS) -MMD -o $@ $<
+crypto/sha2/asm/obj/copy_swap.o: crypto/sha2/asm/copy_swap.c
+ $(CC) -c $(CFLAGS) -MMD -o $@ $<
+
DEFS += -DUSE_ASM
+
else
ifdef USE_SSE2
# Intrinsic implementation
CFLAGS = -g -msse2
endif
+ifeq (${USE_SSSE3}, 1)
+# Intrinsic implementation of block copy
+DEFS += -DUSE_SSSE3
+CFLAGS += -mssse3
+endif
+
# ppc doesn't work because we don't support big-endian
CFLAGS += -Wall -Wextra -Wformat -Wno-ignored-qualifiers -Wformat-security -Wno-unused-parameter \
$(DEBUGFLAGS) $(DEFS) $(INCLUDEPATHS)
ifeq (${USE_ASM}, 1)
# Assembler implementation
OBJS += crypto/scrypt/asm/obj/scrypt-arm.o crypto/scrypt/asm/obj/scrypt-x86.o crypto/scrypt/asm/obj/scrypt-x86_64.o crypto/scrypt/asm/obj/asm-wrapper.o
-OBJS += crypto/sha2/asm/obj/sha2-arm.o crypto/sha2/asm/obj/sha2-x86.o crypto/sha2/asm/obj/sha2-x86_64.o
+OBJS += crypto/sha2/asm/obj/sha2-arm.o crypto/sha2/asm/obj/sha2-x86.o crypto/sha2/asm/obj/sha2-x86_64.o crypto/sha2/asm/obj/copy_swap.o
crypto/scrypt/asm/obj/scrypt-x86.o: crypto/scrypt/asm/scrypt-x86.S
$(CXX) -c $(CFLAGS) -MMD -o $@ $<
crypto/sha/asm/obj/sha-arm.o: crypto/sha2/asm/sha2-arm.S
$(CXX) -c $(CFLAGS) -MMD -o $@ $<
+crypto/sha2/asm/obj/copy_swap.o: crypto/sha2/asm/copy_swap.c
+ $(CC) -c $(CFLAGS) -MMD -o $@ $<
+
DEFS += -DUSE_ASM
+
else
ifeq (${USE_SSE2}, 1)
# Intrinsic implementation
xOPT_LEVEL=-O3
endif
+ifeq (${USE_SSSE3}, 1)
+# Intrinsic implementation of block copy
+DEFS += -DUSE_SSSE3
+xOPT_LEVEL += -mssse3
+endif
+
# CXXFLAGS can be specified on the make command line, so we use xCXXFLAGS that only
# adds some defaults in front. Unfortunately, CXXFLAGS=... $(CXXFLAGS) does not work.
xCXXFLAGS=$(xOPT_LEVEL) $(EXT_OPTIONS) -pthread -Wall -Wextra -Wno-ignored-qualifiers -Wformat -Wformat-security -Wno-unused-parameter \
ifeq (${USE_ASM}, 1)
# Assembler implementation
OBJS += crypto/scrypt/asm/obj/scrypt-arm.o crypto/scrypt/asm/obj/scrypt-x86.o crypto/scrypt/asm/obj/scrypt-x86_64.o crypto/scrypt/asm/obj/asm-wrapper.o
-OBJS += crypto/sha2/asm/obj/sha2-arm.o crypto/sha2/asm/obj/sha2-x86.o crypto/sha2/asm/obj/sha2-x86_64.o
+OBJS += crypto/sha2/asm/obj/sha2-arm.o crypto/sha2/asm/obj/sha2-x86.o crypto/sha2/asm/obj/sha2-x86_64.o crypto/sha2/asm/obj/copy_swap.o
crypto/scrypt/asm/obj/scrypt-x86.o: crypto/scrypt/asm/scrypt-x86.S
$(CXX) -c $(xCXXFLAGS) -MMD -o $@ $<
crypto/sha2/asm/obj/sha2-arm.o: crypto/sha2/asm/sha2-x86.S
$(CXX) -c $(xCXXFLAGS) -MMD -o $@ $<
+crypto/sha2/asm/obj/copy_swap.o: crypto/sha2/asm/copy_swap.c
+ $(CC) -c $(xCXXFLAGS) -MMD -o $@ $<
+
DEFS += -DUSE_ASM
+
else
ifeq (${USE_SSE2}, 1)
# Intrinsic implementation