From 7c1fbd9a25ef1857048a3aea21af3db9b8e38fd8 Mon Sep 17 00:00:00 2001
From: Alex
Date: Sat, 11 May 2013 00:03:23 +0400
Subject: [PATCH] fix

---
 README.md                         |    6 +-
 contrib/macdeploy/macdeployqtplus |   17 +-
 novacoin-qt.pro                   |    2 +-
 src/makefile.osx                  |    2 +-
 src/scrypt-x86.S                  | 1059 ++++++++++++++++----------
 src/scrypt-x86_64.S               | 1220 ++++++++++++++++++-------------
 6 files changed, 1131 insertions(+), 1175 deletions(-)

diff --git a/README.md b/README.md
index 73ea3b6..05c91be 100644
--- a/README.md
+++ b/README.md
@@ -3,9 +3,9 @@ NovaCoin official development tree
 
 NovaCoin - a hybrid scrypt PoW + PoS based cryptocurrency.
 
-* 10 minute PoW block targets
-* 10 minute PoS block targets
-* The PoW subsidy halves every x64 multiply of difficulty
+* 10 minute block spacing
+* The PoW subsidy halves every x64 multiply of PoW difficulty
+* The PoS interest halves every x64 multiply of PoS difficulty
 * Maximum PoW reward is 100 coins
 * ~ 2 billion total coins
 
diff --git a/contrib/macdeploy/macdeployqtplus b/contrib/macdeploy/macdeployqtplus
index e159f9b..7981eca 100755
--- a/contrib/macdeploy/macdeployqtplus
+++ b/contrib/macdeploy/macdeployqtplus
@@ -18,6 +18,7 @@
 #
 
 import subprocess, sys, re, os, shutil, stat, os.path
+from string import Template
 from time import sleep
 from argparse import ArgumentParser
 
@@ -169,7 +170,12 @@ class DeploymentInfo(object):
         elif os.path.exists(os.path.join(parentDir, "share", "qt4", "translations")):
             # MacPorts layout, e.g. "/opt/local/share/qt4"
             self.qtPath = os.path.join(parentDir, "share", "qt4")
-
+        elif os.path.exists(os.path.join(os.path.dirname(parentDir), "share", "qt4", "translations")):
+            # Newer Macports layout
+            self.qtPath = os.path.join(os.path.dirname(parentDir), "share", "qt4")
+        else:
+            self.qtPath = os.getenv("QTDIR", None)
+
         if self.qtPath is not None:
             pluginPath = os.path.join(self.qtPath, "plugins")
             if os.path.exists(pluginPath):
@@ -239,7 +245,12 @@ def runStrip(binaryPath, verbose):
     subprocess.check_call(["strip", "-x", binaryPath])
 
 def copyFramework(framework, path, verbose):
-    fromPath = framework.sourceFilePath
+    if framework.sourceFilePath.startswith("Qt"):
+        #standard place for Nokia Qt installer's frameworks
+        fromPath = "/Library/Frameworks/" + framework.sourceFilePath
+    else:
+        fromPath = framework.sourceFilePath
+
     toDir = os.path.join(path, framework.destinationDirectory)
     toPath = os.path.join(toDir, framework.binaryName)
 
@@ -342,7 +353,7 @@ def deployPlugins(appBundleInfo, deploymentInfo, strip, verbose):
         if pluginDirectory == "designer":
             # Skip designer plugins
             continue
-        elif pluginDirectory == "phonon":
+        elif pluginDirectory == "phonon" or pluginDirectory == "phonon_backend":
             # Deploy the phonon plugins only if phonon is in use
             if not deploymentInfo.usesFramework("phonon"):
                 continue
diff --git a/novacoin-qt.pro b/novacoin-qt.pro
index 7146671..53b06fa 100644
--- a/novacoin-qt.pro
+++ b/novacoin-qt.pro
@@ -23,7 +23,7 @@ UI_DIR = build
 # use: qmake "RELEASE=1"
 contains(RELEASE, 1) {
     # Mac: compile for maximum compatibility (10.5, 32-bit)
-    macx:QMAKE_CXXFLAGS += -mmacosx-version-min=10.5 -arch i386 -isysroot /Developer/SDKs/MacOSX10.5.sdk
+    macx:QMAKE_CXXFLAGS += -mmacosx-version-min=10.5 -arch x86_64 -isysroot /Developer/SDKs/MacOSX10.5.sdk
 
     !windows:!macx {
         # Linux: static link
diff --git a/src/makefile.osx b/src/makefile.osx
index c699be5..a76f277 100644
--- a/src/makefile.osx
+++ b/src/makefile.osx
@@ -60,7 +60,7 @@ ifdef RELEASE
 # Compile for maximum compatibility and smallest size.
# This requires that dependencies are compiled # the same way. -CFLAGS = -mmacosx-version-min=10.5 -arch i386 -O3 -msse2 +CFLAGS = -mmacosx-version-min=10.5 -arch x86_64 -O3 -msse2 else CFLAGS = -g -msse2 endif diff --git a/src/scrypt-x86.S b/src/scrypt-x86.S index 4c3c152..33b6cd1 100644 --- a/src/scrypt-x86.S +++ b/src/scrypt-x86.S @@ -24,331 +24,331 @@ #if defined(__i386__) -.macro gen_salsa8_core_quadround - movl 52(%esp), %ecx - movl 4(%esp), %edx - movl 20(%esp), %ebx - movl 8(%esp), %esi - leal (%ecx, %edx), %edi - roll $7, %edi - xorl %edi, %ebx - movl %ebx, 4(%esp) - movl 36(%esp), %edi - leal (%edx, %ebx), %ebp - roll $9, %ebp - xorl %ebp, %edi - movl 24(%esp), %ebp - movl %edi, 8(%esp) - addl %edi, %ebx - roll $13, %ebx - xorl %ebx, %ecx - movl 40(%esp), %ebx - movl %ecx, 20(%esp) - addl %edi, %ecx - roll $18, %ecx - leal (%esi, %ebp), %edi - roll $7, %edi - xorl %edi, %ebx - movl %ebx, 24(%esp) - movl 56(%esp), %edi - xorl %ecx, %edx - leal (%ebp, %ebx), %ecx - roll $9, %ecx - xorl %ecx, %edi - movl %edi, 36(%esp) - movl 28(%esp), %ecx - movl %edx, 28(%esp) - movl 44(%esp), %edx - addl %edi, %ebx - roll $13, %ebx - xorl %ebx, %esi - movl 60(%esp), %ebx - movl %esi, 40(%esp) - addl %edi, %esi - roll $18, %esi - leal (%ecx, %edx), %edi - roll $7, %edi - xorl %edi, %ebx - movl %ebx, 44(%esp) - movl 12(%esp), %edi - xorl %esi, %ebp - leal (%edx, %ebx), %esi - roll $9, %esi - xorl %esi, %edi - movl %edi, 12(%esp) - movl 48(%esp), %esi - movl %ebp, 48(%esp) - movl 64(%esp), %ebp - addl %edi, %ebx - roll $13, %ebx - xorl %ebx, %ecx - movl 16(%esp), %ebx - movl %ecx, 16(%esp) - addl %edi, %ecx - roll $18, %ecx - leal (%esi, %ebp), %edi - roll $7, %edi - xorl %edi, %ebx - movl 32(%esp), %edi - xorl %ecx, %edx - leal (%ebp, %ebx), %ecx - roll $9, %ecx - xorl %ecx, %edi - movl %edi, 32(%esp) - movl %ebx, %ecx - movl %edx, 52(%esp) - movl 28(%esp), %edx - addl %edi, %ebx - roll $13, %ebx - xorl %ebx, %esi - movl 40(%esp), %ebx - movl %esi, 28(%esp) - addl %edi, %esi - roll $18, %esi - leal (%ecx, %edx), %edi - roll $7, %edi - xorl %edi, %ebx - movl %ebx, 40(%esp) - movl 12(%esp), %edi - xorl %esi, %ebp - leal (%edx, %ebx), %esi - roll $9, %esi - xorl %esi, %edi - movl %edi, 12(%esp) - movl 4(%esp), %esi - movl %ebp, 4(%esp) - movl 48(%esp), %ebp - addl %edi, %ebx - roll $13, %ebx - xorl %ebx, %ecx - movl 16(%esp), %ebx - movl %ecx, 16(%esp) - addl %edi, %ecx - roll $18, %ecx - leal (%esi, %ebp), %edi - roll $7, %edi - xorl %edi, %ebx - movl %ebx, 48(%esp) - movl 32(%esp), %edi - xorl %ecx, %edx - leal (%ebp, %ebx), %ecx - roll $9, %ecx - xorl %ecx, %edi - movl %edi, 32(%esp) - movl 24(%esp), %ecx - movl %edx, 24(%esp) - movl 52(%esp), %edx - addl %edi, %ebx - roll $13, %ebx - xorl %ebx, %esi - movl 28(%esp), %ebx - movl %esi, 28(%esp) - addl %edi, %esi - roll $18, %esi - leal (%ecx, %edx), %edi - roll $7, %edi - xorl %edi, %ebx - movl %ebx, 52(%esp) - movl 8(%esp), %edi - xorl %esi, %ebp - leal (%edx, %ebx), %esi - roll $9, %esi - xorl %esi, %edi - movl %edi, 8(%esp) - movl 44(%esp), %esi - movl %ebp, 44(%esp) - movl 4(%esp), %ebp - addl %edi, %ebx - roll $13, %ebx - xorl %ebx, %ecx - movl 20(%esp), %ebx - movl %ecx, 4(%esp) - addl %edi, %ecx - roll $18, %ecx - leal (%esi, %ebp), %edi - roll $7, %edi - xorl %edi, %ebx - movl 36(%esp), %edi - xorl %ecx, %edx - leal (%ebp, %ebx), %ecx - roll $9, %ecx - xorl %ecx, %edi - movl %edi, 20(%esp) - movl %ebx, %ecx - movl %edx, 36(%esp) - movl 24(%esp), %edx - addl %edi, %ebx - roll $13, %ebx - xorl %ebx, %esi - movl 28(%esp), %ebx - movl %esi, 
24(%esp) - addl %edi, %esi - roll $18, %esi - leal (%ecx, %edx), %edi - roll $7, %edi - xorl %edi, %ebx - movl %ebx, 28(%esp) - xorl %esi, %ebp - movl 8(%esp), %esi - leal (%edx, %ebx), %edi - roll $9, %edi - xorl %edi, %esi - movl 40(%esp), %edi - movl %ebp, 8(%esp) - movl 44(%esp), %ebp - movl %esi, 40(%esp) - addl %esi, %ebx - roll $13, %ebx - xorl %ebx, %ecx - movl 4(%esp), %ebx - movl %ecx, 44(%esp) - addl %esi, %ecx - roll $18, %ecx - leal (%edi, %ebp), %esi - roll $7, %esi - xorl %esi, %ebx - movl %ebx, 4(%esp) - movl 20(%esp), %esi - xorl %ecx, %edx - leal (%ebp, %ebx), %ecx - roll $9, %ecx - xorl %ecx, %esi - movl %esi, 56(%esp) - movl 48(%esp), %ecx - movl %edx, 20(%esp) - movl 36(%esp), %edx - addl %esi, %ebx - roll $13, %ebx - xorl %ebx, %edi - movl 24(%esp), %ebx - movl %edi, 24(%esp) - addl %esi, %edi - roll $18, %edi - leal (%ecx, %edx), %esi - roll $7, %esi - xorl %esi, %ebx - movl %ebx, 60(%esp) - movl 12(%esp), %esi - xorl %edi, %ebp - leal (%edx, %ebx), %edi - roll $9, %edi - xorl %edi, %esi - movl %esi, 12(%esp) - movl 52(%esp), %edi - movl %ebp, 36(%esp) - movl 8(%esp), %ebp - addl %esi, %ebx - roll $13, %ebx - xorl %ebx, %ecx - movl 16(%esp), %ebx - movl %ecx, 16(%esp) - addl %esi, %ecx - roll $18, %ecx - leal (%edi, %ebp), %esi - roll $7, %esi - xorl %esi, %ebx - movl 32(%esp), %esi - xorl %ecx, %edx - leal (%ebp, %ebx), %ecx - roll $9, %ecx - xorl %ecx, %esi - movl %esi, 32(%esp) - movl %ebx, %ecx - movl %edx, 48(%esp) - movl 20(%esp), %edx - addl %esi, %ebx - roll $13, %ebx - xorl %ebx, %edi - movl 24(%esp), %ebx - movl %edi, 20(%esp) - addl %esi, %edi - roll $18, %edi - leal (%ecx, %edx), %esi - roll $7, %esi - xorl %esi, %ebx - movl %ebx, 8(%esp) - movl 12(%esp), %esi - xorl %edi, %ebp - leal (%edx, %ebx), %edi - roll $9, %edi - xorl %edi, %esi - movl %esi, 12(%esp) - movl 28(%esp), %edi - movl %ebp, 52(%esp) - movl 36(%esp), %ebp - addl %esi, %ebx - roll $13, %ebx - xorl %ebx, %ecx - movl 16(%esp), %ebx - movl %ecx, 16(%esp) - addl %esi, %ecx - roll $18, %ecx - leal (%edi, %ebp), %esi - roll $7, %esi - xorl %esi, %ebx - movl %ebx, 28(%esp) - movl 32(%esp), %esi - xorl %ecx, %edx - leal (%ebp, %ebx), %ecx - roll $9, %ecx - xorl %ecx, %esi - movl %esi, 32(%esp) - movl 4(%esp), %ecx - movl %edx, 4(%esp) - movl 48(%esp), %edx - addl %esi, %ebx - roll $13, %ebx - xorl %ebx, %edi - movl 20(%esp), %ebx - movl %edi, 20(%esp) - addl %esi, %edi - roll $18, %edi - leal (%ecx, %edx), %esi - roll $7, %esi - xorl %esi, %ebx - movl %ebx, 48(%esp) - movl 40(%esp), %esi - xorl %edi, %ebp - leal (%edx, %ebx), %edi - roll $9, %edi - xorl %edi, %esi - movl %esi, 36(%esp) - movl 60(%esp), %edi - movl %ebp, 24(%esp) - movl 52(%esp), %ebp - addl %esi, %ebx - roll $13, %ebx - xorl %ebx, %ecx - movl 44(%esp), %ebx - movl %ecx, 40(%esp) - addl %esi, %ecx - roll $18, %ecx - leal (%edi, %ebp), %esi - roll $7, %esi - xorl %esi, %ebx - movl %ebx, 52(%esp) - movl 56(%esp), %esi - xorl %ecx, %edx - leal (%ebp, %ebx), %ecx - roll $9, %ecx - xorl %ecx, %esi - movl %esi, 56(%esp) - addl %esi, %ebx - movl %edx, 44(%esp) - roll $13, %ebx - xorl %ebx, %edi - movl %edi, 60(%esp) - addl %esi, %edi - roll $18, %edi - xorl %edi, %ebp - movl %ebp, 64(%esp) -.endm +#define gen_salsa8_core_quadround() \ + movl 52(%esp), %ecx; \ + movl 4(%esp), %edx; \ + movl 20(%esp), %ebx; \ + movl 8(%esp), %esi; \ + leal (%ecx, %edx), %edi; \ + roll $7, %edi; \ + xorl %edi, %ebx; \ + movl %ebx, 4(%esp); \ + movl 36(%esp), %edi; \ + leal (%edx, %ebx), %ebp; \ + roll $9, %ebp; \ + xorl %ebp, %edi; \ + movl 24(%esp), %ebp; 
\ + movl %edi, 8(%esp); \ + addl %edi, %ebx; \ + roll $13, %ebx; \ + xorl %ebx, %ecx; \ + movl 40(%esp), %ebx; \ + movl %ecx, 20(%esp); \ + addl %edi, %ecx; \ + roll $18, %ecx; \ + leal (%esi, %ebp), %edi; \ + roll $7, %edi; \ + xorl %edi, %ebx; \ + movl %ebx, 24(%esp); \ + movl 56(%esp), %edi; \ + xorl %ecx, %edx; \ + leal (%ebp, %ebx), %ecx; \ + roll $9, %ecx; \ + xorl %ecx, %edi; \ + movl %edi, 36(%esp); \ + movl 28(%esp), %ecx; \ + movl %edx, 28(%esp); \ + movl 44(%esp), %edx; \ + addl %edi, %ebx; \ + roll $13, %ebx; \ + xorl %ebx, %esi; \ + movl 60(%esp), %ebx; \ + movl %esi, 40(%esp); \ + addl %edi, %esi; \ + roll $18, %esi; \ + leal (%ecx, %edx), %edi; \ + roll $7, %edi; \ + xorl %edi, %ebx; \ + movl %ebx, 44(%esp); \ + movl 12(%esp), %edi; \ + xorl %esi, %ebp; \ + leal (%edx, %ebx), %esi; \ + roll $9, %esi; \ + xorl %esi, %edi; \ + movl %edi, 12(%esp); \ + movl 48(%esp), %esi; \ + movl %ebp, 48(%esp); \ + movl 64(%esp), %ebp; \ + addl %edi, %ebx; \ + roll $13, %ebx; \ + xorl %ebx, %ecx; \ + movl 16(%esp), %ebx; \ + movl %ecx, 16(%esp); \ + addl %edi, %ecx; \ + roll $18, %ecx; \ + leal (%esi, %ebp), %edi; \ + roll $7, %edi; \ + xorl %edi, %ebx; \ + movl 32(%esp), %edi; \ + xorl %ecx, %edx; \ + leal (%ebp, %ebx), %ecx; \ + roll $9, %ecx; \ + xorl %ecx, %edi; \ + movl %edi, 32(%esp); \ + movl %ebx, %ecx; \ + movl %edx, 52(%esp); \ + movl 28(%esp), %edx; \ + addl %edi, %ebx; \ + roll $13, %ebx; \ + xorl %ebx, %esi; \ + movl 40(%esp), %ebx; \ + movl %esi, 28(%esp); \ + addl %edi, %esi; \ + roll $18, %esi; \ + leal (%ecx, %edx), %edi; \ + roll $7, %edi; \ + xorl %edi, %ebx; \ + movl %ebx, 40(%esp); \ + movl 12(%esp), %edi; \ + xorl %esi, %ebp; \ + leal (%edx, %ebx), %esi; \ + roll $9, %esi; \ + xorl %esi, %edi; \ + movl %edi, 12(%esp); \ + movl 4(%esp), %esi; \ + movl %ebp, 4(%esp); \ + movl 48(%esp), %ebp; \ + addl %edi, %ebx; \ + roll $13, %ebx; \ + xorl %ebx, %ecx; \ + movl 16(%esp), %ebx; \ + movl %ecx, 16(%esp); \ + addl %edi, %ecx; \ + roll $18, %ecx; \ + leal (%esi, %ebp), %edi; \ + roll $7, %edi; \ + xorl %edi, %ebx; \ + movl %ebx, 48(%esp); \ + movl 32(%esp), %edi; \ + xorl %ecx, %edx; \ + leal (%ebp, %ebx), %ecx; \ + roll $9, %ecx; \ + xorl %ecx, %edi; \ + movl %edi, 32(%esp); \ + movl 24(%esp), %ecx; \ + movl %edx, 24(%esp); \ + movl 52(%esp), %edx; \ + addl %edi, %ebx; \ + roll $13, %ebx; \ + xorl %ebx, %esi; \ + movl 28(%esp), %ebx; \ + movl %esi, 28(%esp); \ + addl %edi, %esi; \ + roll $18, %esi; \ + leal (%ecx, %edx), %edi; \ + roll $7, %edi; \ + xorl %edi, %ebx; \ + movl %ebx, 52(%esp); \ + movl 8(%esp), %edi; \ + xorl %esi, %ebp; \ + leal (%edx, %ebx), %esi; \ + roll $9, %esi; \ + xorl %esi, %edi; \ + movl %edi, 8(%esp); \ + movl 44(%esp), %esi; \ + movl %ebp, 44(%esp); \ + movl 4(%esp), %ebp; \ + addl %edi, %ebx; \ + roll $13, %ebx; \ + xorl %ebx, %ecx; \ + movl 20(%esp), %ebx; \ + movl %ecx, 4(%esp); \ + addl %edi, %ecx; \ + roll $18, %ecx; \ + leal (%esi, %ebp), %edi; \ + roll $7, %edi; \ + xorl %edi, %ebx; \ + movl 36(%esp), %edi; \ + xorl %ecx, %edx; \ + leal (%ebp, %ebx), %ecx; \ + roll $9, %ecx; \ + xorl %ecx, %edi; \ + movl %edi, 20(%esp); \ + movl %ebx, %ecx; \ + movl %edx, 36(%esp); \ + movl 24(%esp), %edx; \ + addl %edi, %ebx; \ + roll $13, %ebx; \ + xorl %ebx, %esi; \ + movl 28(%esp), %ebx; \ + movl %esi, 24(%esp); \ + addl %edi, %esi; \ + roll $18, %esi; \ + leal (%ecx, %edx), %edi; \ + roll $7, %edi; \ + xorl %edi, %ebx; \ + movl %ebx, 28(%esp); \ + xorl %esi, %ebp; \ + movl 8(%esp), %esi; \ + leal (%edx, %ebx), %edi; \ + roll $9, %edi; \ + xorl %edi, %esi; 
\ + movl 40(%esp), %edi; \ + movl %ebp, 8(%esp); \ + movl 44(%esp), %ebp; \ + movl %esi, 40(%esp); \ + addl %esi, %ebx; \ + roll $13, %ebx; \ + xorl %ebx, %ecx; \ + movl 4(%esp), %ebx; \ + movl %ecx, 44(%esp); \ + addl %esi, %ecx; \ + roll $18, %ecx; \ + leal (%edi, %ebp), %esi; \ + roll $7, %esi; \ + xorl %esi, %ebx; \ + movl %ebx, 4(%esp); \ + movl 20(%esp), %esi; \ + xorl %ecx, %edx; \ + leal (%ebp, %ebx), %ecx; \ + roll $9, %ecx; \ + xorl %ecx, %esi; \ + movl %esi, 56(%esp); \ + movl 48(%esp), %ecx; \ + movl %edx, 20(%esp); \ + movl 36(%esp), %edx; \ + addl %esi, %ebx; \ + roll $13, %ebx; \ + xorl %ebx, %edi; \ + movl 24(%esp), %ebx; \ + movl %edi, 24(%esp); \ + addl %esi, %edi; \ + roll $18, %edi; \ + leal (%ecx, %edx), %esi; \ + roll $7, %esi; \ + xorl %esi, %ebx; \ + movl %ebx, 60(%esp); \ + movl 12(%esp), %esi; \ + xorl %edi, %ebp; \ + leal (%edx, %ebx), %edi; \ + roll $9, %edi; \ + xorl %edi, %esi; \ + movl %esi, 12(%esp); \ + movl 52(%esp), %edi; \ + movl %ebp, 36(%esp); \ + movl 8(%esp), %ebp; \ + addl %esi, %ebx; \ + roll $13, %ebx; \ + xorl %ebx, %ecx; \ + movl 16(%esp), %ebx; \ + movl %ecx, 16(%esp); \ + addl %esi, %ecx; \ + roll $18, %ecx; \ + leal (%edi, %ebp), %esi; \ + roll $7, %esi; \ + xorl %esi, %ebx; \ + movl 32(%esp), %esi; \ + xorl %ecx, %edx; \ + leal (%ebp, %ebx), %ecx; \ + roll $9, %ecx; \ + xorl %ecx, %esi; \ + movl %esi, 32(%esp); \ + movl %ebx, %ecx; \ + movl %edx, 48(%esp); \ + movl 20(%esp), %edx; \ + addl %esi, %ebx; \ + roll $13, %ebx; \ + xorl %ebx, %edi; \ + movl 24(%esp), %ebx; \ + movl %edi, 20(%esp); \ + addl %esi, %edi; \ + roll $18, %edi; \ + leal (%ecx, %edx), %esi; \ + roll $7, %esi; \ + xorl %esi, %ebx; \ + movl %ebx, 8(%esp); \ + movl 12(%esp), %esi; \ + xorl %edi, %ebp; \ + leal (%edx, %ebx), %edi; \ + roll $9, %edi; \ + xorl %edi, %esi; \ + movl %esi, 12(%esp); \ + movl 28(%esp), %edi; \ + movl %ebp, 52(%esp); \ + movl 36(%esp), %ebp; \ + addl %esi, %ebx; \ + roll $13, %ebx; \ + xorl %ebx, %ecx; \ + movl 16(%esp), %ebx; \ + movl %ecx, 16(%esp); \ + addl %esi, %ecx; \ + roll $18, %ecx; \ + leal (%edi, %ebp), %esi; \ + roll $7, %esi; \ + xorl %esi, %ebx; \ + movl %ebx, 28(%esp); \ + movl 32(%esp), %esi; \ + xorl %ecx, %edx; \ + leal (%ebp, %ebx), %ecx; \ + roll $9, %ecx; \ + xorl %ecx, %esi; \ + movl %esi, 32(%esp); \ + movl 4(%esp), %ecx; \ + movl %edx, 4(%esp); \ + movl 48(%esp), %edx; \ + addl %esi, %ebx; \ + roll $13, %ebx; \ + xorl %ebx, %edi; \ + movl 20(%esp), %ebx; \ + movl %edi, 20(%esp); \ + addl %esi, %edi; \ + roll $18, %edi; \ + leal (%ecx, %edx), %esi; \ + roll $7, %esi; \ + xorl %esi, %ebx; \ + movl %ebx, 48(%esp); \ + movl 40(%esp), %esi; \ + xorl %edi, %ebp; \ + leal (%edx, %ebx), %edi; \ + roll $9, %edi; \ + xorl %edi, %esi; \ + movl %esi, 36(%esp); \ + movl 60(%esp), %edi; \ + movl %ebp, 24(%esp); \ + movl 52(%esp), %ebp; \ + addl %esi, %ebx; \ + roll $13, %ebx; \ + xorl %ebx, %ecx; \ + movl 44(%esp), %ebx; \ + movl %ecx, 40(%esp); \ + addl %esi, %ecx; \ + roll $18, %ecx; \ + leal (%edi, %ebp), %esi; \ + roll $7, %esi; \ + xorl %esi, %ebx; \ + movl %ebx, 52(%esp); \ + movl 56(%esp), %esi; \ + xorl %ecx, %edx; \ + leal (%ebp, %ebx), %ecx; \ + roll $9, %ecx; \ + xorl %ecx, %esi; \ + movl %esi, 56(%esp); \ + addl %esi, %ebx; \ + movl %edx, 44(%esp); \ + roll $13, %ebx; \ + xorl %ebx, %edi; \ + movl %edi, 60(%esp); \ + addl %esi, %edi; \ + roll $18, %edi; \ + xorl %edi, %ebp; \ + movl %ebp, 64(%esp); \ + .text .align 32 gen_salsa8_core: - gen_salsa8_core_quadround - gen_salsa8_core_quadround + gen_salsa8_core_quadround() + 
gen_salsa8_core_quadround() ret @@ -374,103 +374,103 @@ gen_scrypt_core: movl 24(%esp), %esi subl $72, %esp -.macro scrypt_core_macro1a p, q - movl \p(%edi), %eax - movl \q(%edi), %edx - movl %eax, \p(%esi) - movl %edx, \q(%esi) - xorl %edx, %eax - movl %eax, \p(%edi) - movl %eax, \p(%esp) -.endm - -.macro scrypt_core_macro1b p, q - movl \p(%edi), %eax - xorl \p(%esi, %edx), %eax - movl \q(%edi), %ebx - xorl \q(%esi, %edx), %ebx - movl %ebx, \q(%edi) - xorl %ebx, %eax - movl %eax, \p(%edi) - movl %eax, \p(%esp) -.endm - -.macro scrypt_core_macro2 p, q - movl \p(%esp), %eax - addl \p(%edi), %eax - movl %eax, \p(%edi) - xorl \q(%edi), %eax - movl %eax, \q(%edi) - movl %eax, \p(%esp) -.endm - -.macro scrypt_core_macro3 p, q - movl \p(%esp), %eax - addl \q(%edi), %eax - movl %eax, \q(%edi) -.endm +#define scrypt_core_macro1a(p, q) \ + movl p(%edi), %eax; \ + movl q(%edi), %edx; \ + movl %eax, p(%esi); \ + movl %edx, q(%esi); \ + xorl %edx, %eax; \ + movl %eax, p(%edi); \ + movl %eax, p(%esp); \ + + +#define scrypt_core_macro1b(p, q) \ + movl p(%edi), %eax; \ + xorl p(%esi, %edx), %eax; \ + movl q(%edi), %ebx; \ + xorl q(%esi, %edx), %ebx; \ + movl %ebx, q(%edi); \ + xorl %ebx, %eax; \ + movl %eax, p(%edi); \ + movl %eax, p(%esp); \ + + +#define scrypt_core_macro2(p, q) \ + movl p(%esp), %eax; \ + addl p(%edi), %eax; \ + movl %eax, p(%edi); \ + xorl q(%edi), %eax; \ + movl %eax, q(%edi); \ + movl %eax, p(%esp); \ + + +#define scrypt_core_macro3(p, q) \ + movl p(%esp), %eax; \ + addl q(%edi), %eax; \ + movl %eax, q(%edi); \ + leal 131072(%esi), %ecx gen_scrypt_core_loop1: movl %esi, 64(%esp) movl %ecx, 68(%esp) - scrypt_core_macro1a 0, 64 - scrypt_core_macro1a 4, 68 - scrypt_core_macro1a 8, 72 - scrypt_core_macro1a 12, 76 - scrypt_core_macro1a 16, 80 - scrypt_core_macro1a 20, 84 - scrypt_core_macro1a 24, 88 - scrypt_core_macro1a 28, 92 - scrypt_core_macro1a 32, 96 - scrypt_core_macro1a 36, 100 - scrypt_core_macro1a 40, 104 - scrypt_core_macro1a 44, 108 - scrypt_core_macro1a 48, 112 - scrypt_core_macro1a 52, 116 - scrypt_core_macro1a 56, 120 - scrypt_core_macro1a 60, 124 + scrypt_core_macro1a(0, 64) + scrypt_core_macro1a(4, 68) + scrypt_core_macro1a(8, 72) + scrypt_core_macro1a(12, 76) + scrypt_core_macro1a(16, 80) + scrypt_core_macro1a(20, 84) + scrypt_core_macro1a(24, 88) + scrypt_core_macro1a(28, 92) + scrypt_core_macro1a(32, 96) + scrypt_core_macro1a(36, 100) + scrypt_core_macro1a(40, 104) + scrypt_core_macro1a(44, 108) + scrypt_core_macro1a(48, 112) + scrypt_core_macro1a(52, 116) + scrypt_core_macro1a(56, 120) + scrypt_core_macro1a(60, 124) call gen_salsa8_core movl 92(%esp), %edi - scrypt_core_macro2 0, 64 - scrypt_core_macro2 4, 68 - scrypt_core_macro2 8, 72 - scrypt_core_macro2 12, 76 - scrypt_core_macro2 16, 80 - scrypt_core_macro2 20, 84 - scrypt_core_macro2 24, 88 - scrypt_core_macro2 28, 92 - scrypt_core_macro2 32, 96 - scrypt_core_macro2 36, 100 - scrypt_core_macro2 40, 104 - scrypt_core_macro2 44, 108 - scrypt_core_macro2 48, 112 - scrypt_core_macro2 52, 116 - scrypt_core_macro2 56, 120 - scrypt_core_macro2 60, 124 + scrypt_core_macro2(0, 64) + scrypt_core_macro2(4, 68) + scrypt_core_macro2(8, 72) + scrypt_core_macro2(12, 76) + scrypt_core_macro2(16, 80) + scrypt_core_macro2(20, 84) + scrypt_core_macro2(24, 88) + scrypt_core_macro2(28, 92) + scrypt_core_macro2(32, 96) + scrypt_core_macro2(36, 100) + scrypt_core_macro2(40, 104) + scrypt_core_macro2(44, 108) + scrypt_core_macro2(48, 112) + scrypt_core_macro2(52, 116) + scrypt_core_macro2(56, 120) + scrypt_core_macro2(60, 124) call 
gen_salsa8_core movl 92(%esp), %edi - scrypt_core_macro3 0, 64 - scrypt_core_macro3 4, 68 - scrypt_core_macro3 8, 72 - scrypt_core_macro3 12, 76 - scrypt_core_macro3 16, 80 - scrypt_core_macro3 20, 84 - scrypt_core_macro3 24, 88 - scrypt_core_macro3 28, 92 - scrypt_core_macro3 32, 96 - scrypt_core_macro3 36, 100 - scrypt_core_macro3 40, 104 - scrypt_core_macro3 44, 108 - scrypt_core_macro3 48, 112 - scrypt_core_macro3 52, 116 - scrypt_core_macro3 56, 120 - scrypt_core_macro3 60, 124 + scrypt_core_macro3(0, 64) + scrypt_core_macro3(4, 68) + scrypt_core_macro3(8, 72) + scrypt_core_macro3(12, 76) + scrypt_core_macro3(16, 80) + scrypt_core_macro3(20, 84) + scrypt_core_macro3(24, 88) + scrypt_core_macro3(28, 92) + scrypt_core_macro3(32, 96) + scrypt_core_macro3(36, 100) + scrypt_core_macro3(40, 104) + scrypt_core_macro3(44, 108) + scrypt_core_macro3(48, 112) + scrypt_core_macro3(52, 116) + scrypt_core_macro3(56, 120) + scrypt_core_macro3(60, 124) movl 64(%esp), %esi movl 68(%esp), %ecx @@ -487,63 +487,63 @@ gen_scrypt_core_loop2: andl $1023, %edx shll $7, %edx - scrypt_core_macro1b 0, 64 - scrypt_core_macro1b 4, 68 - scrypt_core_macro1b 8, 72 - scrypt_core_macro1b 12, 76 - scrypt_core_macro1b 16, 80 - scrypt_core_macro1b 20, 84 - scrypt_core_macro1b 24, 88 - scrypt_core_macro1b 28, 92 - scrypt_core_macro1b 32, 96 - scrypt_core_macro1b 36, 100 - scrypt_core_macro1b 40, 104 - scrypt_core_macro1b 44, 108 - scrypt_core_macro1b 48, 112 - scrypt_core_macro1b 52, 116 - scrypt_core_macro1b 56, 120 - scrypt_core_macro1b 60, 124 + scrypt_core_macro1b(0, 64) + scrypt_core_macro1b(4, 68) + scrypt_core_macro1b(8, 72) + scrypt_core_macro1b(12, 76) + scrypt_core_macro1b(16, 80) + scrypt_core_macro1b(20, 84) + scrypt_core_macro1b(24, 88) + scrypt_core_macro1b(28, 92) + scrypt_core_macro1b(32, 96) + scrypt_core_macro1b(36, 100) + scrypt_core_macro1b(40, 104) + scrypt_core_macro1b(44, 108) + scrypt_core_macro1b(48, 112) + scrypt_core_macro1b(52, 116) + scrypt_core_macro1b(56, 120) + scrypt_core_macro1b(60, 124) call gen_salsa8_core movl 92(%esp), %edi - scrypt_core_macro2 0, 64 - scrypt_core_macro2 4, 68 - scrypt_core_macro2 8, 72 - scrypt_core_macro2 12, 76 - scrypt_core_macro2 16, 80 - scrypt_core_macro2 20, 84 - scrypt_core_macro2 24, 88 - scrypt_core_macro2 28, 92 - scrypt_core_macro2 32, 96 - scrypt_core_macro2 36, 100 - scrypt_core_macro2 40, 104 - scrypt_core_macro2 44, 108 - scrypt_core_macro2 48, 112 - scrypt_core_macro2 52, 116 - scrypt_core_macro2 56, 120 - scrypt_core_macro2 60, 124 + scrypt_core_macro2(0, 64) + scrypt_core_macro2(4, 68) + scrypt_core_macro2(8, 72) + scrypt_core_macro2(12, 76) + scrypt_core_macro2(16, 80) + scrypt_core_macro2(20, 84) + scrypt_core_macro2(24, 88) + scrypt_core_macro2(28, 92) + scrypt_core_macro2(32, 96) + scrypt_core_macro2(36, 100) + scrypt_core_macro2(40, 104) + scrypt_core_macro2(44, 108) + scrypt_core_macro2(48, 112) + scrypt_core_macro2(52, 116) + scrypt_core_macro2(56, 120) + scrypt_core_macro2(60, 124) call gen_salsa8_core movl 92(%esp), %edi movl 96(%esp), %esi - scrypt_core_macro3 0, 64 - scrypt_core_macro3 4, 68 - scrypt_core_macro3 8, 72 - scrypt_core_macro3 12, 76 - scrypt_core_macro3 16, 80 - scrypt_core_macro3 20, 84 - scrypt_core_macro3 24, 88 - scrypt_core_macro3 28, 92 - scrypt_core_macro3 32, 96 - scrypt_core_macro3 36, 100 - scrypt_core_macro3 40, 104 - scrypt_core_macro3 44, 108 - scrypt_core_macro3 48, 112 - scrypt_core_macro3 52, 116 - scrypt_core_macro3 56, 120 - scrypt_core_macro3 60, 124 + scrypt_core_macro3(0, 64) + scrypt_core_macro3(4, 68) 
+ scrypt_core_macro3(8, 72) + scrypt_core_macro3(12, 76) + scrypt_core_macro3(16, 80) + scrypt_core_macro3(20, 84) + scrypt_core_macro3(24, 88) + scrypt_core_macro3(28, 92) + scrypt_core_macro3(32, 96) + scrypt_core_macro3(36, 100) + scrypt_core_macro3(40, 104) + scrypt_core_macro3(44, 108) + scrypt_core_macro3(48, 112) + scrypt_core_macro3(52, 116) + scrypt_core_macro3(56, 120) + scrypt_core_macro3(60, 124) movl 68(%esp), %ecx subl $1, %ecx @@ -557,84 +557,77 @@ gen_scrypt_core_loop2: ret -.macro xmm_salsa8_core_doubleround - movdqa %xmm1, %xmm4 - paddd %xmm0, %xmm4 - movdqa %xmm4, %xmm5 - pslld $7, %xmm4 - psrld $25, %xmm5 - pxor %xmm4, %xmm3 - pxor %xmm5, %xmm3 - movdqa %xmm0, %xmm4 - - paddd %xmm3, %xmm4 - movdqa %xmm4, %xmm5 - pslld $9, %xmm4 - psrld $23, %xmm5 - pxor %xmm4, %xmm2 - movdqa %xmm3, %xmm4 - pshufd $0x93, %xmm3, %xmm3 - pxor %xmm5, %xmm2 - - paddd %xmm2, %xmm4 - movdqa %xmm4, %xmm5 - pslld $13, %xmm4 - psrld $19, %xmm5 - pxor %xmm4, %xmm1 - movdqa %xmm2, %xmm4 - pshufd $0x4e, %xmm2, %xmm2 - pxor %xmm5, %xmm1 - - paddd %xmm1, %xmm4 - movdqa %xmm4, %xmm5 - pslld $18, %xmm4 - psrld $14, %xmm5 - pxor %xmm4, %xmm0 - pshufd $0x39, %xmm1, %xmm1 - pxor %xmm5, %xmm0 - movdqa %xmm3, %xmm4 - - paddd %xmm0, %xmm4 - movdqa %xmm4, %xmm5 - pslld $7, %xmm4 - psrld $25, %xmm5 - pxor %xmm4, %xmm1 - pxor %xmm5, %xmm1 - movdqa %xmm0, %xmm4 - - paddd %xmm1, %xmm4 - movdqa %xmm4, %xmm5 - pslld $9, %xmm4 - psrld $23, %xmm5 - pxor %xmm4, %xmm2 - movdqa %xmm1, %xmm4 - pshufd $0x93, %xmm1, %xmm1 - pxor %xmm5, %xmm2 - - paddd %xmm2, %xmm4 - movdqa %xmm4, %xmm5 - pslld $13, %xmm4 - psrld $19, %xmm5 - pxor %xmm4, %xmm3 - movdqa %xmm2, %xmm4 - pshufd $0x4e, %xmm2, %xmm2 - pxor %xmm5, %xmm3 - - paddd %xmm3, %xmm4 - movdqa %xmm4, %xmm5 - pslld $18, %xmm4 - psrld $14, %xmm5 - pxor %xmm4, %xmm0 - pshufd $0x39, %xmm3, %xmm3 - pxor %xmm5, %xmm0 -.endm +#define xmm_salsa8_core_doubleround() \ + movdqa %xmm1, %xmm4; \ + paddd %xmm0, %xmm4; \ + movdqa %xmm4, %xmm5; \ + pslld $7, %xmm4; \ + psrld $25, %xmm5; \ + pxor %xmm4, %xmm3; \ + pxor %xmm5, %xmm3; \ + movdqa %xmm0, %xmm4; \ + paddd %xmm3, %xmm4; \ + movdqa %xmm4, %xmm5; \ + pslld $9, %xmm4; \ + psrld $23, %xmm5; \ + pxor %xmm4, %xmm2; \ + movdqa %xmm3, %xmm4; \ + pshufd $0x93, %xmm3, %xmm3; \ + pxor %xmm5, %xmm2; \ + paddd %xmm2, %xmm4; \ + movdqa %xmm4, %xmm5; \ + pslld $13, %xmm4; \ + psrld $19, %xmm5; \ + pxor %xmm4, %xmm1; \ + movdqa %xmm2, %xmm4; \ + pshufd $0x4e, %xmm2, %xmm2; \ + pxor %xmm5, %xmm1; \ + paddd %xmm1, %xmm4; \ + movdqa %xmm4, %xmm5; \ + pslld $18, %xmm4; \ + psrld $14, %xmm5; \ + pxor %xmm4, %xmm0; \ + pshufd $0x39, %xmm1, %xmm1; \ + pxor %xmm5, %xmm0; \ + movdqa %xmm3, %xmm4; \ + paddd %xmm0, %xmm4; \ + movdqa %xmm4, %xmm5; \ + pslld $7, %xmm4; \ + psrld $25, %xmm5; \ + pxor %xmm4, %xmm1; \ + pxor %xmm5, %xmm1; \ + movdqa %xmm0, %xmm4; \ + paddd %xmm1, %xmm4; \ + movdqa %xmm4, %xmm5; \ + pslld $9, %xmm4; \ + psrld $23, %xmm5; \ + pxor %xmm4, %xmm2; \ + movdqa %xmm1, %xmm4; \ + pshufd $0x93, %xmm1, %xmm1; \ + pxor %xmm5, %xmm2; \ + paddd %xmm2, %xmm4; \ + movdqa %xmm4, %xmm5; \ + pslld $13, %xmm4; \ + psrld $19, %xmm5; \ + pxor %xmm4, %xmm3; \ + movdqa %xmm2, %xmm4; \ + pshufd $0x4e, %xmm2, %xmm2; \ + pxor %xmm5, %xmm3; \ + paddd %xmm3, %xmm4; \ + movdqa %xmm4, %xmm5; \ + pslld $18, %xmm4; \ + psrld $14, %xmm5; \ + pxor %xmm4, %xmm0; \ + pshufd $0x39, %xmm3, %xmm3; \ + pxor %xmm5, %xmm0; \ + + +#define xmm_salsa8_core() \ + xmm_salsa8_core_doubleround(); \ + xmm_salsa8_core_doubleround(); \ + xmm_salsa8_core_doubleround(); \ + 
xmm_salsa8_core_doubleround(); \ -.macro xmm_salsa8_core - xmm_salsa8_core_doubleround - xmm_salsa8_core_doubleround - xmm_salsa8_core_doubleround - xmm_salsa8_core_doubleround -.endm .align 32 xmm_scrypt_core: @@ -740,7 +733,7 @@ xmm_scrypt_core_loop1: movdqa %xmm1, 16(%esp) movdqa %xmm2, 32(%esp) movdqa %xmm3, 48(%esp) - xmm_salsa8_core + xmm_salsa8_core() paddd 0(%esp), %xmm0 paddd 16(%esp), %xmm1 paddd 32(%esp), %xmm2 @@ -758,7 +751,7 @@ xmm_scrypt_core_loop1: movdqa %xmm1, 80(%esp) movdqa %xmm2, 96(%esp) movdqa %xmm3, 112(%esp) - xmm_salsa8_core + xmm_salsa8_core() paddd 64(%esp), %xmm0 paddd 80(%esp), %xmm1 paddd 96(%esp), %xmm2 @@ -806,7 +799,7 @@ xmm_scrypt_core_loop2: movdqa %xmm1, 16(%esp) movdqa %xmm2, 32(%esp) movdqa %xmm3, 48(%esp) - xmm_salsa8_core + xmm_salsa8_core() paddd 0(%esp), %xmm0 paddd 16(%esp), %xmm1 paddd 32(%esp), %xmm2 @@ -824,7 +817,7 @@ xmm_scrypt_core_loop2: movdqa %xmm1, 80(%esp) movdqa %xmm2, 96(%esp) movdqa %xmm3, 112(%esp) - xmm_salsa8_core + xmm_salsa8_core() paddd 64(%esp), %xmm0 paddd 80(%esp), %xmm1 paddd 96(%esp), %xmm2 diff --git a/src/scrypt-x86_64.S b/src/scrypt-x86_64.S index 8d408fd..f0a3fdd 100644 --- a/src/scrypt-x86_64.S +++ b/src/scrypt-x86_64.S @@ -28,163 +28,151 @@ #if defined(__x86_64__) -.macro scrypt_shuffle src, so, dest, do - movl \so+60(\src), %r8d - movl \so+44(\src), %r9d - movl \so+28(\src), %r10d - movl \so+12(\src), %r11d - movl %r8d, \do+12(\dest) - movl %r9d, \do+28(\dest) - movl %r10d, \do+44(\dest) - movl %r11d, \do+60(\dest) - movl \so+40(\src), %r8d - movl \so+8(\src), %r9d - movl \so+48(\src), %r10d - movl \so+16(\src), %r11d - movl %r8d, \do+8(\dest) - movl %r9d, \do+40(\dest) - movl %r10d, \do+16(\dest) - movl %r11d, \do+48(\dest) - movl \so+20(\src), %r8d - movl \so+4(\src), %r9d - movl \so+52(\src), %r10d - movl \so+36(\src), %r11d - movl %r8d, \do+4(\dest) - movl %r9d, \do+20(\dest) - movl %r10d, \do+36(\dest) - movl %r11d, \do+52(\dest) - movl \so+0(\src), %r8d - movl \so+24(\src), %r9d - movl \so+32(\src), %r10d - movl \so+56(\src), %r11d - movl %r8d, \do+0(\dest) - movl %r9d, \do+24(\dest) - movl %r10d, \do+32(\dest) - movl %r11d, \do+56(\dest) -.endm - -.macro gen_salsa8_core_doubleround - movq 72(%rsp), %r15 - - leaq (%r14, %rdx), %rbp - roll $7, %ebp - xorq %rbp, %r9 - leaq (%rdi, %r15), %rbp - roll $7, %ebp - xorq %rbp, %r10 - leaq (%rdx, %r9), %rbp - roll $9, %ebp - xorq %rbp, %r11 - leaq (%r15, %r10), %rbp - roll $9, %ebp - xorq %rbp, %r13 - leaq (%r9, %r11), %rbp - roll $13, %ebp - xorq %rbp, %r14 - leaq (%r10, %r13), %rbp - roll $13, %ebp - xorq %rbp, %rdi - leaq (%r11, %r14), %rbp - roll $18, %ebp - xorq %rbp, %rdx - leaq (%r13, %rdi), %rbp - roll $18, %ebp - xorq %rbp, %r15 - - movq 48(%rsp), %rbp - movq %r15, 72(%rsp) - - leaq (%rax, %rbp), %r15 - roll $7, %r15d - xorq %r15, %rbx - leaq (%rbp, %rbx), %r15 - roll $9, %r15d - xorq %r15, %rcx - leaq (%rbx, %rcx), %r15 - roll $13, %r15d - xorq %r15, %rax - leaq (%rcx, %rax), %r15 - roll $18, %r15d - xorq %r15, %rbp - - movq 88(%rsp), %r15 - movq %rbp, 48(%rsp) - - leaq (%r12, %r15), %rbp - roll $7, %ebp - xorq %rbp, %rsi - leaq (%r15, %rsi), %rbp - roll $9, %ebp - xorq %rbp, %r8 - leaq (%rsi, %r8), %rbp - roll $13, %ebp - xorq %rbp, %r12 - leaq (%r8, %r12), %rbp - roll $18, %ebp - xorq %rbp, %r15 +#define scrypt_shuffle(src, so, dest, do) \ + movl so+60(src), %r8d; \ + movl so+44(src), %r9d; \ + movl so+28(src), %r10d; \ + movl so+12(src), %r11d; \ + movl %r8d, do+12(dest); \ + movl %r9d, do+28(dest); \ + movl %r10d, do+44(dest); \ + movl %r11d, do+60(dest); 
\ + movl so+40(src), %r8d; \ + movl so+8(src), %r9d; \ + movl so+48(src), %r10d; \ + movl so+16(src), %r11d; \ + movl %r8d, do+8(dest); \ + movl %r9d, do+40(dest); \ + movl %r10d, do+16(dest); \ + movl %r11d, do+48(dest); \ + movl so+20(src), %r8d; \ + movl so+4(src), %r9d; \ + movl so+52(src), %r10d; \ + movl so+36(src), %r11d; \ + movl %r8d, do+4(dest); \ + movl %r9d, do+20(dest); \ + movl %r10d, do+36(dest); \ + movl %r11d, do+52(dest); \ + movl so+0(src), %r8d; \ + movl so+24(src), %r9d; \ + movl so+32(src), %r10d; \ + movl so+56(src), %r11d; \ + movl %r8d, do+0(dest); \ + movl %r9d, do+24(dest); \ + movl %r10d, do+32(dest); \ + movl %r11d, do+56(dest); \ + + +#define gen_salsa8_core_doubleround() \ + movq 72(%rsp), %r15; \ + leaq (%r14, %rdx), %rbp; \ + roll $7, %ebp; \ + xorq %rbp, %r9; \ + leaq (%rdi, %r15), %rbp; \ + roll $7, %ebp; \ + xorq %rbp, %r10; \ + leaq (%rdx, %r9), %rbp; \ + roll $9, %ebp; \ + xorq %rbp, %r11; \ + leaq (%r15, %r10), %rbp; \ + roll $9, %ebp; \ + xorq %rbp, %r13; \ + leaq (%r9, %r11), %rbp; \ + roll $13, %ebp; \ + xorq %rbp, %r14; \ + leaq (%r10, %r13), %rbp; \ + roll $13, %ebp; \ + xorq %rbp, %rdi; \ + leaq (%r11, %r14), %rbp; \ + roll $18, %ebp; \ + xorq %rbp, %rdx; \ + leaq (%r13, %rdi), %rbp; \ + roll $18, %ebp; \ + xorq %rbp, %r15; \ + movq 48(%rsp), %rbp; \ + movq %r15, 72(%rsp); \ + leaq (%rax, %rbp), %r15; \ + roll $7, %r15d; \ + xorq %r15, %rbx; \ + leaq (%rbp, %rbx), %r15; \ + roll $9, %r15d; \ + xorq %r15, %rcx; \ + leaq (%rbx, %rcx), %r15; \ + roll $13, %r15d; \ + xorq %r15, %rax; \ + leaq (%rcx, %rax), %r15; \ + roll $18, %r15d; \ + xorq %r15, %rbp; \ + movq 88(%rsp), %r15; \ + movq %rbp, 48(%rsp); \ + leaq (%r12, %r15), %rbp; \ + roll $7, %ebp; \ + xorq %rbp, %rsi; \ + leaq (%r15, %rsi), %rbp; \ + roll $9, %ebp; \ + xorq %rbp, %r8; \ + leaq (%rsi, %r8), %rbp; \ + roll $13, %ebp; \ + xorq %rbp, %r12; \ + leaq (%r8, %r12), %rbp; \ + roll $18, %ebp; \ + xorq %rbp, %r15; \ + movq %r15, 88(%rsp); \ + movq 72(%rsp), %r15; \ + leaq (%rsi, %rdx), %rbp; \ + roll $7, %ebp; \ + xorq %rbp, %rdi; \ + leaq (%r9, %r15), %rbp; \ + roll $7, %ebp; \ + xorq %rbp, %rax; \ + leaq (%rdx, %rdi), %rbp; \ + roll $9, %ebp; \ + xorq %rbp, %rcx; \ + leaq (%r15, %rax), %rbp; \ + roll $9, %ebp; \ + xorq %rbp, %r8; \ + leaq (%rdi, %rcx), %rbp; \ + roll $13, %ebp; \ + xorq %rbp, %rsi; \ + leaq (%rax, %r8), %rbp; \ + roll $13, %ebp; \ + xorq %rbp, %r9; \ + leaq (%rcx, %rsi), %rbp; \ + roll $18, %ebp; \ + xorq %rbp, %rdx; \ + leaq (%r8, %r9), %rbp; \ + roll $18, %ebp; \ + xorq %rbp, %r15; \ + movq 48(%rsp), %rbp; \ + movq %r15, 72(%rsp); \ + leaq (%r10, %rbp), %r15; \ + roll $7, %r15d; \ + xorq %r15, %r12; \ + leaq (%rbp, %r12), %r15; \ + roll $9, %r15d; \ + xorq %r15, %r11; \ + leaq (%r12, %r11), %r15; \ + roll $13, %r15d; \ + xorq %r15, %r10; \ + leaq (%r11, %r10), %r15; \ + roll $18, %r15d; \ + xorq %r15, %rbp; \ + movq 88(%rsp), %r15; \ + movq %rbp, 48(%rsp); \ + leaq (%rbx, %r15), %rbp; \ + roll $7, %ebp; \ + xorq %rbp, %r14; \ + leaq (%r15, %r14), %rbp; \ + roll $9, %ebp; \ + xorq %rbp, %r13; \ + leaq (%r14, %r13), %rbp; \ + roll $13, %ebp; \ + xorq %rbp, %rbx; \ + leaq (%r13, %rbx), %rbp; \ + roll $18, %ebp; \ + xorq %rbp, %r15; \ + movq %r15, 88(%rsp); \ - movq %r15, 88(%rsp) - movq 72(%rsp), %r15 - - leaq (%rsi, %rdx), %rbp - roll $7, %ebp - xorq %rbp, %rdi - leaq (%r9, %r15), %rbp - roll $7, %ebp - xorq %rbp, %rax - leaq (%rdx, %rdi), %rbp - roll $9, %ebp - xorq %rbp, %rcx - leaq (%r15, %rax), %rbp - roll $9, %ebp - xorq %rbp, %r8 - leaq (%rdi, %rcx), %rbp - roll $13, 
%ebp - xorq %rbp, %rsi - leaq (%rax, %r8), %rbp - roll $13, %ebp - xorq %rbp, %r9 - leaq (%rcx, %rsi), %rbp - roll $18, %ebp - xorq %rbp, %rdx - leaq (%r8, %r9), %rbp - roll $18, %ebp - xorq %rbp, %r15 - - movq 48(%rsp), %rbp - movq %r15, 72(%rsp) - - leaq (%r10, %rbp), %r15 - roll $7, %r15d - xorq %r15, %r12 - leaq (%rbp, %r12), %r15 - roll $9, %r15d - xorq %r15, %r11 - leaq (%r12, %r11), %r15 - roll $13, %r15d - xorq %r15, %r10 - leaq (%r11, %r10), %r15 - roll $18, %r15d - xorq %r15, %rbp - - movq 88(%rsp), %r15 - movq %rbp, 48(%rsp) - - leaq (%rbx, %r15), %rbp - roll $7, %ebp - xorq %rbp, %r14 - leaq (%r15, %r14), %rbp - roll $9, %ebp - xorq %rbp, %r13 - leaq (%r14, %r13), %rbp - roll $13, %ebp - xorq %rbp, %rbx - leaq (%r13, %rbx), %rbp - roll $18, %ebp - xorq %rbp, %r15 - - movq %r15, 88(%rsp) -.endm .text .align 32 @@ -221,10 +209,10 @@ gen_salsa8_core: shrq $32, %r15 movq %r15, 88(%rsp) - gen_salsa8_core_doubleround - gen_salsa8_core_doubleround - gen_salsa8_core_doubleround - gen_salsa8_core_doubleround + gen_salsa8_core_doubleround() + gen_salsa8_core_doubleround() + gen_salsa8_core_doubleround() + gen_salsa8_core_doubleround() movl %edx, %edx shlq $32, %rdi @@ -315,29 +303,14 @@ _scrypt_core: movq %rdx, %rsi #endif -.macro scrypt_core_cleanup -#if defined(WIN64) - popq %rsi - popq %rdi - movdqa 8(%rsp), %xmm6 - movdqa 24(%rsp), %xmm7 - movdqa 40(%rsp), %xmm8 - movdqa 56(%rsp), %xmm9 - movdqa 72(%rsp), %xmm10 - movdqa 88(%rsp), %xmm11 - movdqa 104(%rsp), %xmm12 - movdqa 120(%rsp), %xmm13 - movdqa 136(%rsp), %xmm14 - movdqa 152(%rsp), %xmm15 - addq $176, %rsp -#endif - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx -.endm +#define scrypt_core_cleanup() \ + popq %r15; \ + popq %r14; \ + popq %r13; \ + popq %r12; \ + popq %rbp; \ + popq %rbx; \ + # GenuineIntel processors have fast SIMD xorl %eax, %eax @@ -476,88 +449,81 @@ gen_scrypt_core_loop2: movdqa %xmm15, 112(%rdi) addq $136, %rsp - scrypt_core_cleanup + scrypt_core_cleanup() ret -.macro xmm_salsa8_core_doubleround - movdqa %xmm1, %xmm4 - paddd %xmm0, %xmm4 - movdqa %xmm4, %xmm5 - pslld $7, %xmm4 - psrld $25, %xmm5 - pxor %xmm4, %xmm3 - pxor %xmm5, %xmm3 - movdqa %xmm0, %xmm4 - - paddd %xmm3, %xmm4 - movdqa %xmm4, %xmm5 - pslld $9, %xmm4 - psrld $23, %xmm5 - pxor %xmm4, %xmm2 - movdqa %xmm3, %xmm4 - pshufd $0x93, %xmm3, %xmm3 - pxor %xmm5, %xmm2 - - paddd %xmm2, %xmm4 - movdqa %xmm4, %xmm5 - pslld $13, %xmm4 - psrld $19, %xmm5 - pxor %xmm4, %xmm1 - movdqa %xmm2, %xmm4 - pshufd $0x4e, %xmm2, %xmm2 - pxor %xmm5, %xmm1 +#define xmm_salsa8_core_doubleround() \ + movdqa %xmm1, %xmm4; \ + paddd %xmm0, %xmm4; \ + movdqa %xmm4, %xmm5; \ + pslld $7, %xmm4; \ + psrld $25, %xmm5; \ + pxor %xmm4, %xmm3; \ + pxor %xmm5, %xmm3; \ + movdqa %xmm0, %xmm4; \ + paddd %xmm3, %xmm4; \ + movdqa %xmm4, %xmm5; \ + pslld $9, %xmm4; \ + psrld $23, %xmm5; \ + pxor %xmm4, %xmm2; \ + movdqa %xmm3, %xmm4; \ + pshufd $0x93, %xmm3, %xmm3; \ + pxor %xmm5, %xmm2; \ + paddd %xmm2, %xmm4; \ + movdqa %xmm4, %xmm5; \ + pslld $13, %xmm4; \ + psrld $19, %xmm5; \ + pxor %xmm4, %xmm1; \ + movdqa %xmm2, %xmm4; \ + pshufd $0x4e, %xmm2, %xmm2; \ + pxor %xmm5, %xmm1; \ + paddd %xmm1, %xmm4; \ + movdqa %xmm4, %xmm5; \ + pslld $18, %xmm4; \ + psrld $14, %xmm5; \ + pxor %xmm4, %xmm0; \ + pshufd $0x39, %xmm1, %xmm1; \ + pxor %xmm5, %xmm0; \ + movdqa %xmm3, %xmm4; \ + paddd %xmm0, %xmm4; \ + movdqa %xmm4, %xmm5; \ + pslld $7, %xmm4; \ + psrld $25, %xmm5; \ + pxor %xmm4, %xmm1; \ + pxor %xmm5, %xmm1; \ + movdqa %xmm0, %xmm4; \ + paddd %xmm1, %xmm4; \ + 
movdqa %xmm4, %xmm5; \ + pslld $9, %xmm4; \ + psrld $23, %xmm5; \ + pxor %xmm4, %xmm2; \ + movdqa %xmm1, %xmm4; \ + pshufd $0x93, %xmm1, %xmm1; \ + pxor %xmm5, %xmm2; \ + paddd %xmm2, %xmm4; \ + movdqa %xmm4, %xmm5; \ + pslld $13, %xmm4; \ + psrld $19, %xmm5; \ + pxor %xmm4, %xmm3; \ + movdqa %xmm2, %xmm4; \ + pshufd $0x4e, %xmm2, %xmm2; \ + pxor %xmm5, %xmm3; \ + paddd %xmm3, %xmm4; \ + movdqa %xmm4, %xmm5; \ + pslld $18, %xmm4; \ + psrld $14, %xmm5; \ + pxor %xmm4, %xmm0; \ + pshufd $0x39, %xmm3, %xmm3; \ + pxor %xmm5, %xmm0; \ + + +#define xmm_salsa8_core() \ + xmm_salsa8_core_doubleround(); \ + xmm_salsa8_core_doubleround(); \ + xmm_salsa8_core_doubleround(); \ + xmm_salsa8_core_doubleround(); \ - paddd %xmm1, %xmm4 - movdqa %xmm4, %xmm5 - pslld $18, %xmm4 - psrld $14, %xmm5 - pxor %xmm4, %xmm0 - pshufd $0x39, %xmm1, %xmm1 - pxor %xmm5, %xmm0 - movdqa %xmm3, %xmm4 - - paddd %xmm0, %xmm4 - movdqa %xmm4, %xmm5 - pslld $7, %xmm4 - psrld $25, %xmm5 - pxor %xmm4, %xmm1 - pxor %xmm5, %xmm1 - movdqa %xmm0, %xmm4 - - paddd %xmm1, %xmm4 - movdqa %xmm4, %xmm5 - pslld $9, %xmm4 - psrld $23, %xmm5 - pxor %xmm4, %xmm2 - movdqa %xmm1, %xmm4 - pshufd $0x93, %xmm1, %xmm1 - pxor %xmm5, %xmm2 - - paddd %xmm2, %xmm4 - movdqa %xmm4, %xmm5 - pslld $13, %xmm4 - psrld $19, %xmm5 - pxor %xmm4, %xmm3 - movdqa %xmm2, %xmm4 - pshufd $0x4e, %xmm2, %xmm2 - pxor %xmm5, %xmm3 - - paddd %xmm3, %xmm4 - movdqa %xmm4, %xmm5 - pslld $18, %xmm4 - psrld $14, %xmm5 - pxor %xmm4, %xmm0 - pshufd $0x39, %xmm3, %xmm3 - pxor %xmm5, %xmm0 -.endm - -.macro xmm_salsa8_core - xmm_salsa8_core_doubleround - xmm_salsa8_core_doubleround - xmm_salsa8_core_doubleround - xmm_salsa8_core_doubleround -.endm .align 32 xmm_scrypt_core: @@ -697,7 +663,7 @@ xmm_scrypt_core_loop1: movdqa %xmm9, %xmm1 movdqa %xmm10, %xmm2 movdqa %xmm11, %xmm3 - xmm_salsa8_core + xmm_salsa8_core() paddd %xmm0, %xmm8 paddd %xmm1, %xmm9 paddd %xmm2, %xmm10 @@ -711,7 +677,7 @@ xmm_scrypt_core_loop1: movdqa %xmm13, %xmm1 movdqa %xmm14, %xmm2 movdqa %xmm15, %xmm3 - xmm_salsa8_core + xmm_salsa8_core() paddd %xmm0, %xmm12 paddd %xmm1, %xmm13 paddd %xmm2, %xmm14 @@ -751,7 +717,7 @@ xmm_scrypt_core_loop2: movdqa %xmm9, %xmm1 movdqa %xmm10, %xmm2 movdqa %xmm11, %xmm3 - xmm_salsa8_core + xmm_salsa8_core() paddd %xmm0, %xmm8 paddd %xmm1, %xmm9 paddd %xmm2, %xmm10 @@ -765,7 +731,7 @@ xmm_scrypt_core_loop2: movdqa %xmm13, %xmm1 movdqa %xmm14, %xmm2 movdqa %xmm15, %xmm3 - xmm_salsa8_core + xmm_salsa8_core() paddd %xmm0, %xmm12 paddd %xmm1, %xmm13 paddd %xmm2, %xmm14 @@ -866,7 +832,7 @@ xmm_scrypt_core_loop2: movl %ebx, 92(%rdi) movl %eax, 76(%rdi) - scrypt_core_cleanup + scrypt_core_cleanup() ret @@ -897,146 +863,139 @@ scrypt_best_throughput_exit: ret -.macro xmm_salsa8_core_2way_doubleround - movdqa %xmm1, %xmm4 - movdqa %xmm9, %xmm6 - paddd %xmm0, %xmm4 - paddd %xmm8, %xmm6 - movdqa %xmm4, %xmm5 - movdqa %xmm6, %xmm7 - pslld $7, %xmm4 - pslld $7, %xmm6 - psrld $25, %xmm5 - psrld $25, %xmm7 - pxor %xmm4, %xmm3 - pxor %xmm6, %xmm11 - pxor %xmm5, %xmm3 - pxor %xmm7, %xmm11 - movdqa %xmm0, %xmm4 - movdqa %xmm8, %xmm6 - - paddd %xmm3, %xmm4 - paddd %xmm11, %xmm6 - movdqa %xmm4, %xmm5 - movdqa %xmm6, %xmm7 - pslld $9, %xmm4 - pslld $9, %xmm6 - psrld $23, %xmm5 - psrld $23, %xmm7 - pxor %xmm4, %xmm2 - pxor %xmm6, %xmm10 - movdqa %xmm3, %xmm4 - movdqa %xmm11, %xmm6 - pshufd $0x93, %xmm3, %xmm3 - pshufd $0x93, %xmm11, %xmm11 - pxor %xmm5, %xmm2 - pxor %xmm7, %xmm10 - - paddd %xmm2, %xmm4 - paddd %xmm10, %xmm6 - movdqa %xmm4, %xmm5 - movdqa %xmm6, %xmm7 - pslld $13, %xmm4 - pslld $13, 
%xmm6 - psrld $19, %xmm5 - psrld $19, %xmm7 - pxor %xmm4, %xmm1 - pxor %xmm6, %xmm9 - movdqa %xmm2, %xmm4 - movdqa %xmm10, %xmm6 - pshufd $0x4e, %xmm2, %xmm2 - pshufd $0x4e, %xmm10, %xmm10 - pxor %xmm5, %xmm1 - pxor %xmm7, %xmm9 - - paddd %xmm1, %xmm4 - paddd %xmm9, %xmm6 - movdqa %xmm4, %xmm5 - movdqa %xmm6, %xmm7 - pslld $18, %xmm4 - pslld $18, %xmm6 - psrld $14, %xmm5 - psrld $14, %xmm7 - pxor %xmm4, %xmm0 - pxor %xmm6, %xmm8 - pshufd $0x39, %xmm1, %xmm1 - pshufd $0x39, %xmm9, %xmm9 - pxor %xmm5, %xmm0 - pxor %xmm7, %xmm8 - movdqa %xmm3, %xmm4 - movdqa %xmm11, %xmm6 - - paddd %xmm0, %xmm4 - paddd %xmm8, %xmm6 - movdqa %xmm4, %xmm5 - movdqa %xmm6, %xmm7 - pslld $7, %xmm4 - pslld $7, %xmm6 - psrld $25, %xmm5 - psrld $25, %xmm7 - pxor %xmm4, %xmm1 - pxor %xmm6, %xmm9 - pxor %xmm5, %xmm1 - pxor %xmm7, %xmm9 - movdqa %xmm0, %xmm4 - movdqa %xmm8, %xmm6 - - paddd %xmm1, %xmm4 - paddd %xmm9, %xmm6 - movdqa %xmm4, %xmm5 - movdqa %xmm6, %xmm7 - pslld $9, %xmm4 - pslld $9, %xmm6 - psrld $23, %xmm5 - psrld $23, %xmm7 - pxor %xmm4, %xmm2 - pxor %xmm6, %xmm10 - movdqa %xmm1, %xmm4 - movdqa %xmm9, %xmm6 - pshufd $0x93, %xmm1, %xmm1 - pshufd $0x93, %xmm9, %xmm9 - pxor %xmm5, %xmm2 - pxor %xmm7, %xmm10 - - paddd %xmm2, %xmm4 - paddd %xmm10, %xmm6 - movdqa %xmm4, %xmm5 - movdqa %xmm6, %xmm7 - pslld $13, %xmm4 - pslld $13, %xmm6 - psrld $19, %xmm5 - psrld $19, %xmm7 - pxor %xmm4, %xmm3 - pxor %xmm6, %xmm11 - movdqa %xmm2, %xmm4 - movdqa %xmm10, %xmm6 - pshufd $0x4e, %xmm2, %xmm2 - pshufd $0x4e, %xmm10, %xmm10 - pxor %xmm5, %xmm3 - pxor %xmm7, %xmm11 - - paddd %xmm3, %xmm4 - paddd %xmm11, %xmm6 - movdqa %xmm4, %xmm5 - movdqa %xmm6, %xmm7 - pslld $18, %xmm4 - pslld $18, %xmm6 - psrld $14, %xmm5 - psrld $14, %xmm7 - pxor %xmm4, %xmm0 - pxor %xmm6, %xmm8 - pshufd $0x39, %xmm3, %xmm3 - pshufd $0x39, %xmm11, %xmm11 - pxor %xmm5, %xmm0 - pxor %xmm7, %xmm8 -.endm +#define xmm_salsa8_core_2way_doubleround() \ + movdqa %xmm1, %xmm4; \ + movdqa %xmm9, %xmm6; \ + paddd %xmm0, %xmm4; \ + paddd %xmm8, %xmm6; \ + movdqa %xmm4, %xmm5; \ + movdqa %xmm6, %xmm7; \ + pslld $7, %xmm4; \ + pslld $7, %xmm6; \ + psrld $25, %xmm5; \ + psrld $25, %xmm7; \ + pxor %xmm4, %xmm3; \ + pxor %xmm6, %xmm11; \ + pxor %xmm5, %xmm3; \ + pxor %xmm7, %xmm11; \ + movdqa %xmm0, %xmm4; \ + movdqa %xmm8, %xmm6; \ + paddd %xmm3, %xmm4; \ + paddd %xmm11, %xmm6; \ + movdqa %xmm4, %xmm5; \ + movdqa %xmm6, %xmm7; \ + pslld $9, %xmm4; \ + pslld $9, %xmm6; \ + psrld $23, %xmm5; \ + psrld $23, %xmm7; \ + pxor %xmm4, %xmm2; \ + pxor %xmm6, %xmm10; \ + movdqa %xmm3, %xmm4; \ + movdqa %xmm11, %xmm6; \ + pshufd $0x93, %xmm3, %xmm3; \ + pshufd $0x93, %xmm11, %xmm11; \ + pxor %xmm5, %xmm2; \ + pxor %xmm7, %xmm10; \ + paddd %xmm2, %xmm4; \ + paddd %xmm10, %xmm6; \ + movdqa %xmm4, %xmm5; \ + movdqa %xmm6, %xmm7; \ + pslld $13, %xmm4; \ + pslld $13, %xmm6; \ + psrld $19, %xmm5; \ + psrld $19, %xmm7; \ + pxor %xmm4, %xmm1; \ + pxor %xmm6, %xmm9; \ + movdqa %xmm2, %xmm4; \ + movdqa %xmm10, %xmm6; \ + pshufd $0x4e, %xmm2, %xmm2; \ + pshufd $0x4e, %xmm10, %xmm10; \ + pxor %xmm5, %xmm1; \ + pxor %xmm7, %xmm9; \ + paddd %xmm1, %xmm4; \ + paddd %xmm9, %xmm6; \ + movdqa %xmm4, %xmm5; \ + movdqa %xmm6, %xmm7; \ + pslld $18, %xmm4; \ + pslld $18, %xmm6; \ + psrld $14, %xmm5; \ + psrld $14, %xmm7; \ + pxor %xmm4, %xmm0; \ + pxor %xmm6, %xmm8; \ + pshufd $0x39, %xmm1, %xmm1; \ + pshufd $0x39, %xmm9, %xmm9; \ + pxor %xmm5, %xmm0; \ + pxor %xmm7, %xmm8; \ + movdqa %xmm3, %xmm4; \ + movdqa %xmm11, %xmm6; \ + paddd %xmm0, %xmm4; \ + paddd %xmm8, %xmm6; \ + movdqa %xmm4, %xmm5; \ 
+ movdqa %xmm6, %xmm7; \ + pslld $7, %xmm4; \ + pslld $7, %xmm6; \ + psrld $25, %xmm5; \ + psrld $25, %xmm7; \ + pxor %xmm4, %xmm1; \ + pxor %xmm6, %xmm9; \ + pxor %xmm5, %xmm1; \ + pxor %xmm7, %xmm9; \ + movdqa %xmm0, %xmm4; \ + movdqa %xmm8, %xmm6; \ + paddd %xmm1, %xmm4; \ + paddd %xmm9, %xmm6; \ + movdqa %xmm4, %xmm5; \ + movdqa %xmm6, %xmm7; \ + pslld $9, %xmm4; \ + pslld $9, %xmm6; \ + psrld $23, %xmm5; \ + psrld $23, %xmm7; \ + pxor %xmm4, %xmm2; \ + pxor %xmm6, %xmm10; \ + movdqa %xmm1, %xmm4; \ + movdqa %xmm9, %xmm6; \ + pshufd $0x93, %xmm1, %xmm1; \ + pshufd $0x93, %xmm9, %xmm9; \ + pxor %xmm5, %xmm2; \ + pxor %xmm7, %xmm10; \ + paddd %xmm2, %xmm4; \ + paddd %xmm10, %xmm6; \ + movdqa %xmm4, %xmm5; \ + movdqa %xmm6, %xmm7; \ + pslld $13, %xmm4; \ + pslld $13, %xmm6; \ + psrld $19, %xmm5; \ + psrld $19, %xmm7; \ + pxor %xmm4, %xmm3; \ + pxor %xmm6, %xmm11; \ + movdqa %xmm2, %xmm4; \ + movdqa %xmm10, %xmm6; \ + pshufd $0x4e, %xmm2, %xmm2; \ + pshufd $0x4e, %xmm10, %xmm10; \ + pxor %xmm5, %xmm3; \ + pxor %xmm7, %xmm11; \ + paddd %xmm3, %xmm4; \ + paddd %xmm11, %xmm6; \ + movdqa %xmm4, %xmm5; \ + movdqa %xmm6, %xmm7; \ + pslld $18, %xmm4; \ + pslld $18, %xmm6; \ + psrld $14, %xmm5; \ + psrld $14, %xmm7; \ + pxor %xmm4, %xmm0; \ + pxor %xmm6, %xmm8; \ + pshufd $0x39, %xmm3, %xmm3; \ + pshufd $0x39, %xmm11, %xmm11; \ + pxor %xmm5, %xmm0; \ + pxor %xmm7, %xmm8; \ + + +#define xmm_salsa8_core_2way() \ + xmm_salsa8_core_2way_doubleround(); \ + xmm_salsa8_core_2way_doubleround(); \ + xmm_salsa8_core_2way_doubleround(); \ + xmm_salsa8_core_2way_doubleround(); \ -.macro xmm_salsa8_core_2way - xmm_salsa8_core_2way_doubleround - xmm_salsa8_core_2way_doubleround - xmm_salsa8_core_2way_doubleround - xmm_salsa8_core_2way_doubleround -.endm .text @@ -1067,10 +1026,10 @@ _scrypt_core_2way: #endif subq $264, %rsp - scrypt_shuffle %rdi, 0, %rsp, 0 - scrypt_shuffle %rdi, 64, %rsp, 64 - scrypt_shuffle %rsi, 0, %rsp, 128 - scrypt_shuffle %rsi, 64, %rsp, 192 + scrypt_shuffle(%rdi, 0, %rsp, 0) + scrypt_shuffle(%rdi, 64, %rsp, 64) + scrypt_shuffle(%rsi, 0, %rsp, 128) + scrypt_shuffle(%rsi, 64, %rsp, 192) movdqa 192(%rsp), %xmm12 movdqa 208(%rsp), %xmm13 @@ -1117,7 +1076,7 @@ scrypt_core_2way_loop1: movdqa %xmm14, 224(%rbp) movdqa %xmm15, 240(%rbp) - xmm_salsa8_core_2way + xmm_salsa8_core_2way() paddd 0(%rbp), %xmm0 paddd 16(%rbp), %xmm1 paddd 32(%rbp), %xmm2 @@ -1151,7 +1110,7 @@ scrypt_core_2way_loop1: movdqa %xmm9, %xmm13 movdqa %xmm10, %xmm14 movdqa %xmm11, %xmm15 - xmm_salsa8_core_2way + xmm_salsa8_core_2way() paddd 64(%rsp), %xmm0 paddd 80(%rsp), %xmm1 paddd 96(%rsp), %xmm2 @@ -1215,7 +1174,7 @@ scrypt_core_2way_loop2: movdqa %xmm9, 144(%rsp) movdqa %xmm10, 160(%rsp) movdqa %xmm11, 176(%rsp) - xmm_salsa8_core_2way + xmm_salsa8_core_2way() paddd 0(%rsp), %xmm0 paddd 16(%rsp), %xmm1 paddd 32(%rsp), %xmm2 @@ -1257,7 +1216,7 @@ scrypt_core_2way_loop2: movdqa %xmm9, %xmm13 movdqa %xmm10, %xmm14 movdqa %xmm11, %xmm15 - xmm_salsa8_core_2way + xmm_salsa8_core_2way() paddd 64(%rsp), %xmm0 paddd 80(%rsp), %xmm1 paddd 96(%rsp), %xmm2 @@ -1279,10 +1238,10 @@ scrypt_core_2way_loop2: movdqa %xmm14, 224(%rsp) movdqa %xmm15, 240(%rsp) - scrypt_shuffle %rsp, 0, %rdi, 0 - scrypt_shuffle %rsp, 64, %rdi, 64 - scrypt_shuffle %rsp, 128, %rsi, 0 - scrypt_shuffle %rsp, 192, %rsi, 64 + scrypt_shuffle(%rsp, 0, %rdi, 0) + scrypt_shuffle(%rsp, 64, %rdi, 64) + scrypt_shuffle(%rsp, 128, %rsi, 0) + scrypt_shuffle(%rsp, 192, %rsi, 64) addq $264, %rsp #if defined(WIN64) @@ -1305,208 +1264,201 @@ scrypt_core_2way_loop2: ret -.macro 
xmm_salsa8_core_3way_doubleround - movdqa %xmm1, %xmm4 - movdqa %xmm9, %xmm6 - movdqa %xmm13, %xmm7 - paddd %xmm0, %xmm4 - paddd %xmm8, %xmm6 - paddd %xmm12, %xmm7 - movdqa %xmm4, %xmm5 - pslld $7, %xmm4 - psrld $25, %xmm5 - pxor %xmm4, %xmm3 - pxor %xmm5, %xmm3 - movdqa %xmm0, %xmm4 - movdqa %xmm6, %xmm5 - pslld $7, %xmm6 - psrld $25, %xmm5 - pxor %xmm6, %xmm11 - pxor %xmm5, %xmm11 - movdqa %xmm8, %xmm6 - movdqa %xmm7, %xmm5 - pslld $7, %xmm7 - psrld $25, %xmm5 - pxor %xmm7, %xmm15 - pxor %xmm5, %xmm15 - movdqa %xmm12, %xmm7 - - paddd %xmm3, %xmm4 - paddd %xmm11, %xmm6 - paddd %xmm15, %xmm7 - movdqa %xmm4, %xmm5 - pslld $9, %xmm4 - psrld $23, %xmm5 - pxor %xmm4, %xmm2 - movdqa %xmm3, %xmm4 - pshufd $0x93, %xmm3, %xmm3 - pxor %xmm5, %xmm2 - movdqa %xmm6, %xmm5 - pslld $9, %xmm6 - psrld $23, %xmm5 - pxor %xmm6, %xmm10 - movdqa %xmm11, %xmm6 - pshufd $0x93, %xmm11, %xmm11 - pxor %xmm5, %xmm10 - movdqa %xmm7, %xmm5 - pslld $9, %xmm7 - psrld $23, %xmm5 - pxor %xmm7, %xmm14 - movdqa %xmm15, %xmm7 - pshufd $0x93, %xmm15, %xmm15 - pxor %xmm5, %xmm14 - - paddd %xmm2, %xmm4 - paddd %xmm10, %xmm6 - paddd %xmm14, %xmm7 - movdqa %xmm4, %xmm5 - pslld $13, %xmm4 - psrld $19, %xmm5 - pxor %xmm4, %xmm1 - movdqa %xmm2, %xmm4 - pshufd $0x4e, %xmm2, %xmm2 - pxor %xmm5, %xmm1 - movdqa %xmm6, %xmm5 - pslld $13, %xmm6 - psrld $19, %xmm5 - pxor %xmm6, %xmm9 - movdqa %xmm10, %xmm6 - pshufd $0x4e, %xmm10, %xmm10 - pxor %xmm5, %xmm9 - movdqa %xmm7, %xmm5 - pslld $13, %xmm7 - psrld $19, %xmm5 - pxor %xmm7, %xmm13 - movdqa %xmm14, %xmm7 - pshufd $0x4e, %xmm14, %xmm14 - pxor %xmm5, %xmm13 - - paddd %xmm1, %xmm4 - paddd %xmm9, %xmm6 - paddd %xmm13, %xmm7 - movdqa %xmm4, %xmm5 - pslld $18, %xmm4 - psrld $14, %xmm5 - pxor %xmm4, %xmm0 - pshufd $0x39, %xmm1, %xmm1 - pxor %xmm5, %xmm0 - movdqa %xmm3, %xmm4 - movdqa %xmm6, %xmm5 - pslld $18, %xmm6 - psrld $14, %xmm5 - pxor %xmm6, %xmm8 - pshufd $0x39, %xmm9, %xmm9 - pxor %xmm5, %xmm8 - movdqa %xmm11, %xmm6 - movdqa %xmm7, %xmm5 - pslld $18, %xmm7 - psrld $14, %xmm5 - pxor %xmm7, %xmm12 - pshufd $0x39, %xmm13, %xmm13 - pxor %xmm5, %xmm12 - movdqa %xmm15, %xmm7 - - paddd %xmm0, %xmm4 - paddd %xmm8, %xmm6 - paddd %xmm12, %xmm7 - movdqa %xmm4, %xmm5 - pslld $7, %xmm4 - psrld $25, %xmm5 - pxor %xmm4, %xmm1 - pxor %xmm5, %xmm1 - movdqa %xmm0, %xmm4 - movdqa %xmm6, %xmm5 - pslld $7, %xmm6 - psrld $25, %xmm5 - pxor %xmm6, %xmm9 - pxor %xmm5, %xmm9 - movdqa %xmm8, %xmm6 - movdqa %xmm7, %xmm5 - pslld $7, %xmm7 - psrld $25, %xmm5 - pxor %xmm7, %xmm13 - pxor %xmm5, %xmm13 - movdqa %xmm12, %xmm7 - - paddd %xmm1, %xmm4 - paddd %xmm9, %xmm6 - paddd %xmm13, %xmm7 - movdqa %xmm4, %xmm5 - pslld $9, %xmm4 - psrld $23, %xmm5 - pxor %xmm4, %xmm2 - movdqa %xmm1, %xmm4 - pshufd $0x93, %xmm1, %xmm1 - pxor %xmm5, %xmm2 - movdqa %xmm6, %xmm5 - pslld $9, %xmm6 - psrld $23, %xmm5 - pxor %xmm6, %xmm10 - movdqa %xmm9, %xmm6 - pshufd $0x93, %xmm9, %xmm9 - pxor %xmm5, %xmm10 - movdqa %xmm7, %xmm5 - pslld $9, %xmm7 - psrld $23, %xmm5 - pxor %xmm7, %xmm14 - movdqa %xmm13, %xmm7 - pshufd $0x93, %xmm13, %xmm13 - pxor %xmm5, %xmm14 - - paddd %xmm2, %xmm4 - paddd %xmm10, %xmm6 - paddd %xmm14, %xmm7 - movdqa %xmm4, %xmm5 - pslld $13, %xmm4 - psrld $19, %xmm5 - pxor %xmm4, %xmm3 - movdqa %xmm2, %xmm4 - pshufd $0x4e, %xmm2, %xmm2 - pxor %xmm5, %xmm3 - movdqa %xmm6, %xmm5 - pslld $13, %xmm6 - psrld $19, %xmm5 - pxor %xmm6, %xmm11 - movdqa %xmm10, %xmm6 - pshufd $0x4e, %xmm10, %xmm10 - pxor %xmm5, %xmm11 - movdqa %xmm7, %xmm5 - pslld $13, %xmm7 - psrld $19, %xmm5 - pxor %xmm7, %xmm15 - movdqa %xmm14, %xmm7 - pshufd 
$0x4e, %xmm14, %xmm14 - pxor %xmm5, %xmm15 - - paddd %xmm3, %xmm4 - paddd %xmm11, %xmm6 - paddd %xmm15, %xmm7 - movdqa %xmm4, %xmm5 - pslld $18, %xmm4 - psrld $14, %xmm5 - pxor %xmm4, %xmm0 - pshufd $0x39, %xmm3, %xmm3 - pxor %xmm5, %xmm0 - movdqa %xmm6, %xmm5 - pslld $18, %xmm6 - psrld $14, %xmm5 - pxor %xmm6, %xmm8 - pshufd $0x39, %xmm11, %xmm11 - pxor %xmm5, %xmm8 - movdqa %xmm7, %xmm5 - pslld $18, %xmm7 - psrld $14, %xmm5 - pxor %xmm7, %xmm12 - pshufd $0x39, %xmm15, %xmm15 - pxor %xmm5, %xmm12 -.endm +#define xmm_salsa8_core_3way_doubleround() \ + movdqa %xmm1, %xmm4; \ + movdqa %xmm9, %xmm6; \ + movdqa %xmm13, %xmm7; \ + paddd %xmm0, %xmm4; \ + paddd %xmm8, %xmm6; \ + paddd %xmm12, %xmm7; \ + movdqa %xmm4, %xmm5; \ + pslld $7, %xmm4; \ + psrld $25, %xmm5; \ + pxor %xmm4, %xmm3; \ + pxor %xmm5, %xmm3; \ + movdqa %xmm0, %xmm4; \ + movdqa %xmm6, %xmm5; \ + pslld $7, %xmm6; \ + psrld $25, %xmm5; \ + pxor %xmm6, %xmm11; \ + pxor %xmm5, %xmm11; \ + movdqa %xmm8, %xmm6; \ + movdqa %xmm7, %xmm5; \ + pslld $7, %xmm7; \ + psrld $25, %xmm5; \ + pxor %xmm7, %xmm15; \ + pxor %xmm5, %xmm15; \ + movdqa %xmm12, %xmm7; \ + paddd %xmm3, %xmm4; \ + paddd %xmm11, %xmm6; \ + paddd %xmm15, %xmm7; \ + movdqa %xmm4, %xmm5; \ + pslld $9, %xmm4; \ + psrld $23, %xmm5; \ + pxor %xmm4, %xmm2; \ + movdqa %xmm3, %xmm4; \ + pshufd $0x93, %xmm3, %xmm3; \ + pxor %xmm5, %xmm2; \ + movdqa %xmm6, %xmm5; \ + pslld $9, %xmm6; \ + psrld $23, %xmm5; \ + pxor %xmm6, %xmm10; \ + movdqa %xmm11, %xmm6; \ + pshufd $0x93, %xmm11, %xmm11; \ + pxor %xmm5, %xmm10; \ + movdqa %xmm7, %xmm5; \ + pslld $9, %xmm7; \ + psrld $23, %xmm5; \ + pxor %xmm7, %xmm14; \ + movdqa %xmm15, %xmm7; \ + pshufd $0x93, %xmm15, %xmm15; \ + pxor %xmm5, %xmm14; \ + paddd %xmm2, %xmm4; \ + paddd %xmm10, %xmm6; \ + paddd %xmm14, %xmm7; \ + movdqa %xmm4, %xmm5; \ + pslld $13, %xmm4; \ + psrld $19, %xmm5; \ + pxor %xmm4, %xmm1; \ + movdqa %xmm2, %xmm4; \ + pshufd $0x4e, %xmm2, %xmm2; \ + pxor %xmm5, %xmm1; \ + movdqa %xmm6, %xmm5; \ + pslld $13, %xmm6; \ + psrld $19, %xmm5; \ + pxor %xmm6, %xmm9; \ + movdqa %xmm10, %xmm6; \ + pshufd $0x4e, %xmm10, %xmm10; \ + pxor %xmm5, %xmm9; \ + movdqa %xmm7, %xmm5; \ + pslld $13, %xmm7; \ + psrld $19, %xmm5; \ + pxor %xmm7, %xmm13; \ + movdqa %xmm14, %xmm7; \ + pshufd $0x4e, %xmm14, %xmm14; \ + pxor %xmm5, %xmm13; \ + paddd %xmm1, %xmm4; \ + paddd %xmm9, %xmm6; \ + paddd %xmm13, %xmm7; \ + movdqa %xmm4, %xmm5; \ + pslld $18, %xmm4; \ + psrld $14, %xmm5; \ + pxor %xmm4, %xmm0; \ + pshufd $0x39, %xmm1, %xmm1; \ + pxor %xmm5, %xmm0; \ + movdqa %xmm3, %xmm4; \ + movdqa %xmm6, %xmm5; \ + pslld $18, %xmm6; \ + psrld $14, %xmm5; \ + pxor %xmm6, %xmm8; \ + pshufd $0x39, %xmm9, %xmm9; \ + pxor %xmm5, %xmm8; \ + movdqa %xmm11, %xmm6; \ + movdqa %xmm7, %xmm5; \ + pslld $18, %xmm7; \ + psrld $14, %xmm5; \ + pxor %xmm7, %xmm12; \ + pshufd $0x39, %xmm13, %xmm13; \ + pxor %xmm5, %xmm12; \ + movdqa %xmm15, %xmm7; \ + paddd %xmm0, %xmm4; \ + paddd %xmm8, %xmm6; \ + paddd %xmm12, %xmm7; \ + movdqa %xmm4, %xmm5; \ + pslld $7, %xmm4; \ + psrld $25, %xmm5; \ + pxor %xmm4, %xmm1; \ + pxor %xmm5, %xmm1; \ + movdqa %xmm0, %xmm4; \ + movdqa %xmm6, %xmm5; \ + pslld $7, %xmm6; \ + psrld $25, %xmm5; \ + pxor %xmm6, %xmm9; \ + pxor %xmm5, %xmm9; \ + movdqa %xmm8, %xmm6; \ + movdqa %xmm7, %xmm5; \ + pslld $7, %xmm7; \ + psrld $25, %xmm5; \ + pxor %xmm7, %xmm13; \ + pxor %xmm5, %xmm13; \ + movdqa %xmm12, %xmm7; \ + paddd %xmm1, %xmm4; \ + paddd %xmm9, %xmm6; \ + paddd %xmm13, %xmm7; \ + movdqa %xmm4, %xmm5; \ + pslld $9, %xmm4; \ + psrld $23, %xmm5; \ + 
pxor %xmm4, %xmm2; \ + movdqa %xmm1, %xmm4; \ + pshufd $0x93, %xmm1, %xmm1; \ + pxor %xmm5, %xmm2; \ + movdqa %xmm6, %xmm5; \ + pslld $9, %xmm6; \ + psrld $23, %xmm5; \ + pxor %xmm6, %xmm10; \ + movdqa %xmm9, %xmm6; \ + pshufd $0x93, %xmm9, %xmm9; \ + pxor %xmm5, %xmm10; \ + movdqa %xmm7, %xmm5; \ + pslld $9, %xmm7; \ + psrld $23, %xmm5; \ + pxor %xmm7, %xmm14; \ + movdqa %xmm13, %xmm7; \ + pshufd $0x93, %xmm13, %xmm13; \ + pxor %xmm5, %xmm14; \ + paddd %xmm2, %xmm4; \ + paddd %xmm10, %xmm6; \ + paddd %xmm14, %xmm7; \ + movdqa %xmm4, %xmm5; \ + pslld $13, %xmm4; \ + psrld $19, %xmm5; \ + pxor %xmm4, %xmm3; \ + movdqa %xmm2, %xmm4; \ + pshufd $0x4e, %xmm2, %xmm2; \ + pxor %xmm5, %xmm3; \ + movdqa %xmm6, %xmm5; \ + pslld $13, %xmm6; \ + psrld $19, %xmm5; \ + pxor %xmm6, %xmm11; \ + movdqa %xmm10, %xmm6; \ + pshufd $0x4e, %xmm10, %xmm10; \ + pxor %xmm5, %xmm11; \ + movdqa %xmm7, %xmm5; \ + pslld $13, %xmm7; \ + psrld $19, %xmm5; \ + pxor %xmm7, %xmm15; \ + movdqa %xmm14, %xmm7; \ + pshufd $0x4e, %xmm14, %xmm14; \ + pxor %xmm5, %xmm15; \ + paddd %xmm3, %xmm4; \ + paddd %xmm11, %xmm6; \ + paddd %xmm15, %xmm7; \ + movdqa %xmm4, %xmm5; \ + pslld $18, %xmm4; \ + psrld $14, %xmm5; \ + pxor %xmm4, %xmm0; \ + pshufd $0x39, %xmm3, %xmm3; \ + pxor %xmm5, %xmm0; \ + movdqa %xmm6, %xmm5; \ + pslld $18, %xmm6; \ + psrld $14, %xmm5; \ + pxor %xmm6, %xmm8; \ + pshufd $0x39, %xmm11, %xmm11; \ + pxor %xmm5, %xmm8; \ + movdqa %xmm7, %xmm5; \ + pslld $18, %xmm7; \ + psrld $14, %xmm5; \ + pxor %xmm7, %xmm12; \ + pshufd $0x39, %xmm15, %xmm15; \ + pxor %xmm5, %xmm12; \ + + +#define xmm_salsa8_core_3way() \ + xmm_salsa8_core_3way_doubleround(); \ + xmm_salsa8_core_3way_doubleround(); \ + xmm_salsa8_core_3way_doubleround(); \ + xmm_salsa8_core_3way_doubleround(); \ -.macro xmm_salsa8_core_3way - xmm_salsa8_core_3way_doubleround - xmm_salsa8_core_3way_doubleround - xmm_salsa8_core_3way_doubleround - xmm_salsa8_core_3way_doubleround -.endm .text .align 32 @@ -1537,12 +1489,12 @@ _scrypt_core_3way: #endif subq $392, %rsp - scrypt_shuffle %rdi, 0, %rsp, 0 - scrypt_shuffle %rdi, 64, %rsp, 64 - scrypt_shuffle %rsi, 0, %rsp, 128 - scrypt_shuffle %rsi, 64, %rsp, 192 - scrypt_shuffle %rdx, 0, %rsp, 256 - scrypt_shuffle %rdx, 64, %rsp, 320 + scrypt_shuffle(%rdi, 0, %rsp, 0) + scrypt_shuffle(%rdi, 64, %rsp, 64) + scrypt_shuffle(%rsi, 0, %rsp, 128) + scrypt_shuffle(%rsi, 64, %rsp, 192) + scrypt_shuffle(%rdx, 0, %rsp, 256) + scrypt_shuffle(%rdx, 64, %rsp, 320) movdqa 128+64(%rsp), %xmm8 movdqa 128+80(%rsp), %xmm9 @@ -1613,7 +1565,7 @@ scrypt_core_3way_loop1: movdqa %xmm6, 256+96(%rbp) movdqa %xmm7, 256+112(%rbp) - xmm_salsa8_core_3way + xmm_salsa8_core_3way() paddd 0(%rbp), %xmm0 paddd 16(%rbp), %xmm1 paddd 32(%rbp), %xmm2 @@ -1663,7 +1615,7 @@ scrypt_core_3way_loop1: movdqa %xmm13, 256+80(%rsp) movdqa %xmm14, 256+96(%rsp) movdqa %xmm15, 256+112(%rsp) - xmm_salsa8_core_3way + xmm_salsa8_core_3way() paddd 64(%rsp), %xmm0 paddd 80(%rsp), %xmm1 paddd 96(%rsp), %xmm2 @@ -1758,7 +1710,7 @@ scrypt_core_3way_loop2: movdqa %xmm13, 256+16(%rsp) movdqa %xmm14, 256+32(%rsp) movdqa %xmm15, 256+48(%rsp) - xmm_salsa8_core_3way + xmm_salsa8_core_3way() paddd 0(%rsp), %xmm0 paddd 16(%rsp), %xmm1 paddd 32(%rsp), %xmm2 @@ -1820,7 +1772,7 @@ scrypt_core_3way_loop2: movdqa %xmm13, 256+80(%rsp) movdqa %xmm14, 256+96(%rsp) movdqa %xmm15, 256+112(%rsp) - xmm_salsa8_core_3way + xmm_salsa8_core_3way() paddd 64(%rsp), %xmm0 paddd 80(%rsp), %xmm1 paddd 96(%rsp), %xmm2 @@ -1849,12 +1801,12 @@ scrypt_core_3way_loop2: subq $1, %rax ja scrypt_core_3way_loop2 
- scrypt_shuffle %rsp, 0, %rdi, 0 - scrypt_shuffle %rsp, 64, %rdi, 64 - scrypt_shuffle %rsp, 128, %rsi, 0 - scrypt_shuffle %rsp, 192, %rsi, 64 - scrypt_shuffle %rsp, 256, %rdx, 0 - scrypt_shuffle %rsp, 320, %rdx, 64 + scrypt_shuffle(%rsp, 0, %rdi, 0) + scrypt_shuffle(%rsp, 64, %rdi, 64) + scrypt_shuffle(%rsp, 128, %rsi, 0) + scrypt_shuffle(%rsp, 192, %rsi, 64) + scrypt_shuffle(%rsp, 256, %rdx, 0) + scrypt_shuffle(%rsp, 320, %rdx, 64) addq $392, %rsp #if defined(WIN64) -- 1.7.1
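
Editorial aside on the main change in this patch: the GNU-as `.macro`/`.endm` blocks in src/scrypt-x86.S and src/scrypt-x86_64.S are rewritten as C-preprocessor `#define` macros. Both files keep the `.S` suffix, so they are still run through cpp before assembly; the `#define` form relies only on that preprocessing step rather than on the assembler's own macro support, which appears to be the motivation, since the same commit moves the OS X build to `-arch x86_64` and assemblers other than GNU as handle `.macro` arguments differently. The sketch below is not taken from the patch; `load_xor` and `demo_load_xor` are made-up names used only to illustrate the conversion pattern visible throughout the diff: parameters lose their leading backslash, each instruction ends with `;` plus a `\` line continuation, and call sites gain parentheses.

    /*
     * Hypothetical before/after example (not part of the patch).
     *
     * Before, GNU-as macro syntax:
     *
     *     .macro load_xor p, q
     *     movl   \p(%edi), %eax
     *     xorl   \q(%edi), %eax
     *     .endm
     *
     *             load_xor 0, 64
     *
     * After, cpp macro syntax (the style this patch switches to):
     */
    #define load_xor(p, q) \
    	movl	p(%edi), %eax; \
    	xorl	q(%edi), %eax; \

    	.text
    	.globl	demo_load_xor
    demo_load_xor:
    	load_xor(0, 64)
    	ret

After preprocessing, each invocation expands to a single logical line of `;`-separated statements, which GNU as accepts; that is why every instruction in the converted macro bodies above is terminated with a semicolon.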
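A reading aid for the xmm_salsa8_core, xmm_salsa8_core_2way and xmm_salsa8_core_3way macros in the diff: they implement the Salsa20/8 double round, and because SSE2 has no packed-rotate instruction, every 32-bit left rotate is spelled as a copy, a left shift, the complementary right shift, and two XORs into the target register. The snippet below is a hypothetical, stand-alone illustration of just that idiom; the name `xor_rotl7` and the register choices are mine, not the patch's, and it is written in the same cpp-macro style the patch adopts.

    /*
     * Hypothetical sketch: XOR (x rotated left by 7) into dst, where x is
     * the value in src; src and tmp are clobbered.  Since SSE2 offers
     * pslld/psrld but no packed rotate, the rotate is built from the two
     * shifts, whose bits are disjoint, so XORing both into dst equals
     * XORing the rotated value into dst.
     */
    #define xor_rotl7(dst, src, tmp) \
    	movdqa	src, tmp; \
    	pslld	$7, src; \
    	psrld	$25, tmp; \
    	pxor	src, dst; \
    	pxor	tmp, dst; \

    	.text
    	.globl	demo_xor_rotl7
    demo_xor_rotl7:
    	xor_rotl7(%xmm3, %xmm4, %xmm5)
    	ret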