# Copyright 2011-2012 pooler@litecoinpool.org
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#if defined(__linux__) && defined(__ELF__)
	.section .note.GNU-stack,"",%progbits
#endif

#if defined(__x86_64__)
# Permute the sixteen 32-bit words of a 64-byte block so that each
# Salsa20 diagonal lands in one aligned 16-byte SIMD lane:
# dest word (4*r + c) = src word (4*((c - r) & 3) + c).
.macro scrypt_shuffle src, so, dest, do
	movl	\so+60(\src), %r8d
	movl	\so+44(\src), %r9d
	movl	\so+28(\src), %r10d
	movl	\so+12(\src), %r11d
	movl	%r8d, \do+12(\dest)
	movl	%r9d, \do+28(\dest)
	movl	%r10d, \do+44(\dest)
	movl	%r11d, \do+60(\dest)
	movl	\so+40(\src), %r8d
	movl	\so+8(\src), %r9d
	movl	\so+48(\src), %r10d
	movl	\so+16(\src), %r11d
	movl	%r8d, \do+8(\dest)
	movl	%r9d, \do+40(\dest)
	movl	%r10d, \do+16(\dest)
	movl	%r11d, \do+48(\dest)
	movl	\so+20(\src), %r8d
	movl	\so+4(\src), %r9d
	movl	\so+52(\src), %r10d
	movl	\so+36(\src), %r11d
	movl	%r8d, \do+4(\dest)
	movl	%r9d, \do+20(\dest)
	movl	%r10d, \do+36(\dest)
	movl	%r11d, \do+52(\dest)
	movl	\so+0(\src), %r8d
	movl	\so+24(\src), %r9d
	movl	\so+32(\src), %r10d
	movl	\so+56(\src), %r11d
	movl	%r8d, \do+0(\dest)
	movl	%r9d, \do+24(\dest)
	movl	%r10d, \do+32(\dest)
	movl	%r11d, \do+56(\dest)
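
	# A C sketch of the same permutation, for reference only (the
	# helper name and types are illustrative, not part of this code
	# base); note the permutation is its own inverse:
	#
	#     void scrypt_shuffle(const uint32_t src[16], uint32_t dest[16])
	#     {
	#         for (int i = 0; i < 16; i++) {
	#             int r = i / 4, c = i % 4;
	#             dest[i] = src[4 * ((c - r) & 3) + c];
	#         }
	#     }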
# One Salsa20 double round on the general-purpose registers; each leaq
# forms an a+b sum that the surrounding (elided) rotate-and-xor steps
# consume, per x ^= R(a+b, shift).
.macro gen_salsa8_core_doubleround
	leaq	(%r14, %rdx), %rbp
	leaq	(%rdi, %r15), %rbp
	leaq	(%rdx, %r9), %rbp
	leaq	(%r15, %r10), %rbp
	leaq	(%r9, %r11), %rbp
	leaq	(%r10, %r13), %rbp
	leaq	(%r11, %r14), %rbp
	leaq	(%r13, %rdi), %rbp
	leaq	(%rax, %rbp), %r15
	leaq	(%rbp, %rbx), %r15
	leaq	(%rbx, %rcx), %r15
	leaq	(%rcx, %rax), %r15
	leaq	(%r12, %r15), %rbp
	leaq	(%r15, %rsi), %rbp
	leaq	(%rsi, %r8), %rbp
	leaq	(%r8, %r12), %rbp
	leaq	(%rsi, %rdx), %rbp
	leaq	(%r9, %r15), %rbp
	leaq	(%rdx, %rdi), %rbp
	leaq	(%r15, %rax), %rbp
	leaq	(%rdi, %rcx), %rbp
	leaq	(%rax, %r8), %rbp
	leaq	(%rcx, %rsi), %rbp
	leaq	(%r8, %r9), %rbp
	leaq	(%r10, %rbp), %r15
	leaq	(%rbp, %r12), %r15
	leaq	(%r12, %r11), %r15
	leaq	(%r11, %r10), %r15
	leaq	(%rbx, %r15), %rbp
	leaq	(%r15, %r14), %rbp
	leaq	(%r14, %r13), %rbp
	leaq	(%r13, %rbx), %rbp

	# each line below lists where four consecutive state words live
	# (words that do not fit in registers spill to stack slots):
	# 0: %rdx, %rdi, %rcx, %rsi
	# 1: %r9, 72(%rsp), %rax, %r8
	# 2: %r11, %r10, 48(%rsp), %r12
	# 3: %r14, %r13, %rbx, 88(%rsp)
	# Salsa20/8: four double rounds = eight rounds
	gen_salsa8_core_doubleround
	gen_salsa8_core_doubleround
	gen_salsa8_core_doubleround
	gen_salsa8_core_doubleround
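
	# Reference for one double round, from the Salsa20 specification,
	# with R(a,b) = rotate a left by b bits; the scalar code above and
	# the SSE2 code below both implement this:
	#
	#     /* operate on columns */
	#     x[ 4] ^= R(x[ 0]+x[12], 7);  x[ 9] ^= R(x[ 5]+x[ 1], 7);
	#     x[14] ^= R(x[10]+x[ 6], 7);  x[ 3] ^= R(x[15]+x[11], 7);
	#     x[ 8] ^= R(x[ 4]+x[ 0], 9);  x[13] ^= R(x[ 9]+x[ 5], 9);
	#     x[ 2] ^= R(x[14]+x[10], 9);  x[ 7] ^= R(x[ 3]+x[15], 9);
	#     x[12] ^= R(x[ 8]+x[ 4],13);  x[ 1] ^= R(x[13]+x[ 9],13);
	#     x[ 6] ^= R(x[ 2]+x[14],13);  x[11] ^= R(x[ 7]+x[ 3],13);
	#     x[ 0] ^= R(x[12]+x[ 8],18);  x[ 5] ^= R(x[ 1]+x[13],18);
	#     x[10] ^= R(x[ 6]+x[ 2],18);  x[15] ^= R(x[11]+x[ 7],18);
	#
	#     /* operate on rows */
	#     x[ 1] ^= R(x[ 0]+x[ 3], 7);  x[ 6] ^= R(x[ 5]+x[ 4], 7);
	#     x[11] ^= R(x[10]+x[ 9], 7);  x[12] ^= R(x[15]+x[14], 7);
	#     x[ 2] ^= R(x[ 1]+x[ 0], 9);  x[ 7] ^= R(x[ 6]+x[ 5], 9);
	#     x[ 8] ^= R(x[11]+x[10], 9);  x[13] ^= R(x[12]+x[15], 9);
	#     x[ 3] ^= R(x[ 2]+x[ 1],13);  x[ 4] ^= R(x[ 7]+x[ 6],13);
	#     x[ 9] ^= R(x[ 8]+x[11],13);  x[14] ^= R(x[13]+x[12],13);
	#     x[ 0] ^= R(x[ 3]+x[ 2],18);  x[ 5] ^= R(x[ 4]+x[ 7],18);
	#     x[10] ^= R(x[ 9]+x[ 8],18);  x[15] ^= R(x[14]+x[13],18);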
	# pack the low 64-bit halves of each register pair together
	punpcklqdq	%xmm4, %xmm0
	punpcklqdq	%xmm5, %xmm1
	punpcklqdq	%xmm6, %xmm2
	punpcklqdq	%xmm7, %xmm3
	# spill %xmm6-%xmm15, which are callee-saved under the Win64 ABI
	movdqa	%xmm6, 8(%rsp)
	movdqa	%xmm7, 24(%rsp)
	movdqa	%xmm8, 40(%rsp)
	movdqa	%xmm9, 56(%rsp)
	movdqa	%xmm10, 72(%rsp)
	movdqa	%xmm11, 88(%rsp)
	movdqa	%xmm12, 104(%rsp)
	movdqa	%xmm13, 120(%rsp)
	movdqa	%xmm14, 136(%rsp)
	movdqa	%xmm15, 152(%rsp)
.macro scrypt_core_cleanup
	# restore the callee-saved %xmm registers spilled in the prologue
	movdqa	8(%rsp), %xmm6
	movdqa	24(%rsp), %xmm7
	movdqa	40(%rsp), %xmm8
	movdqa	56(%rsp), %xmm9
	movdqa	72(%rsp), %xmm10
	movdqa	88(%rsp), %xmm11
	movdqa	104(%rsp), %xmm12
	movdqa	120(%rsp), %xmm13
	movdqa	136(%rsp), %xmm14
	movdqa	152(%rsp), %xmm15
	# GenuineIntel processors have fast SIMD; the constants below are
	# the ASCII dwords of the CPUID leaf-0 vendor string:
	# %ebx = "Genu", %edx = "ineI", %ecx = "ntel"
	cmpl	$0x6c65746e, %ecx
	cmpl	$0x49656e69, %edx
	cmpl	$0x756e6547, %ebx
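
	# Hedged C sketch of the same test (GCC <cpuid.h>; variable names
	# are illustrative):
	#
	#     #include <cpuid.h>
	#     unsigned int a, b, c, d;
	#     __cpuid(0, a, b, c, d);
	#     int is_intel = b == 0x756e6547 &&  /* "Genu" */
	#                    d == 0x49656e69 &&  /* "ineI" */
	#                    c == 0x6c65746e;    /* "ntel" */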
	# load the 128-byte input X (two 64-byte blocks) into %xmm8-%xmm15
	movdqa	0(%rdi), %xmm8
	movdqa	16(%rdi), %xmm9
	movdqa	32(%rdi), %xmm10
	movdqa	48(%rdi), %xmm11
	movdqa	64(%rdi), %xmm12
	movdqa	80(%rdi), %xmm13
	movdqa	96(%rdi), %xmm14
	movdqa	112(%rdi), %xmm15

	# %rsi walks the scratchpad V; 131072 = 1024 entries * 128 bytes
	leaq	131072(%rsi), %rcx
gen_scrypt_core_loop1:
	# V[i] = X
	movdqa	%xmm8, 0(%rsi)
	movdqa	%xmm9, 16(%rsi)
	movdqa	%xmm10, 32(%rsi)
	movdqa	%xmm11, 48(%rsi)
	movdqa	%xmm12, 64(%rsi)
	movdqa	%xmm13, 80(%rsi)
	movdqa	%xmm14, 96(%rsi)
	movdqa	%xmm15, 112(%rsi)

	# first 64-byte block of X goes to the stack for the scalar core
	movdqa	%xmm8, 0(%rsp)
	movdqa	%xmm9, 16(%rsp)
	movdqa	%xmm10, 32(%rsp)
	movdqa	%xmm11, 48(%rsp)

	# second block likewise
	movdqa	%xmm12, 0(%rsp)
	movdqa	%xmm13, 16(%rsp)
	movdqa	%xmm14, 32(%rsp)
	movdqa	%xmm15, 48(%rsp)

	jne	gen_scrypt_core_loop1
gen_scrypt_core_loop2:
	# load entry j of V; %rdx holds the byte offset 128*j
	movdqa	0(%rsi, %rdx), %xmm0
	movdqa	16(%rsi, %rdx), %xmm1
	movdqa	32(%rsi, %rdx), %xmm2
	movdqa	48(%rsi, %rdx), %xmm3
	movdqa	64(%rsi, %rdx), %xmm4
	movdqa	80(%rsi, %rdx), %xmm5
	movdqa	96(%rsi, %rdx), %xmm6
	movdqa	112(%rsi, %rdx), %xmm7

	movdqa	%xmm8, 0(%rsp)
	movdqa	%xmm9, 16(%rsp)
	movdqa	%xmm10, 32(%rsp)
	movdqa	%xmm11, 48(%rsp)

	movdqa	%xmm12, 0(%rsp)
	movdqa	%xmm13, 16(%rsp)
	movdqa	%xmm14, 32(%rsp)
	movdqa	%xmm15, 48(%rsp)

	ja	gen_scrypt_core_loop2
	# write the mixed X back to the caller's buffer
	movdqa	%xmm8, 0(%rdi)
	movdqa	%xmm9, 16(%rdi)
	movdqa	%xmm10, 32(%rdi)
	movdqa	%xmm11, 48(%rdi)
	movdqa	%xmm12, 64(%rdi)
	movdqa	%xmm13, 80(%rdi)
	movdqa	%xmm14, 96(%rdi)
	movdqa	%xmm15, 112(%rdi)
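
	# Shape of the whole routine, as C (scrypt core with N = 1024,
	# r = 1, per Percival's scrypt paper; helper names hypothetical):
	#
	#     for (i = 0; i < 1024; i++) {
	#         memcpy(&V[32 * i], X, 128);   /* loop1: fill V */
	#         blockmix_salsa8(X);           /* two Salsa20/8 calls */
	#     }
	#     for (i = 0; i < 1024; i++) {
	#         j = X[16] & 1023;             /* first word, 2nd block */
	#         xor_block(X, &V[32 * j]);     /* loop2: random reads */
	#         blockmix_salsa8(X);
	#     }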
# SSE2 double round: the four state rows sit diagonally in %xmm0-%xmm3,
# so each quarter round is four parallel adds/rotates/xors, and pshufd
# realigns the lanes in between.
.macro xmm_salsa8_core_doubleround
	pshufd	$0x93, %xmm3, %xmm3
	pshufd	$0x4e, %xmm2, %xmm2
	pshufd	$0x39, %xmm1, %xmm1
	pshufd	$0x93, %xmm1, %xmm1
	pshufd	$0x4e, %xmm2, %xmm2
	pshufd	$0x39, %xmm3, %xmm3
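
	# The pshufd immediates select source dwords, two bits per lane:
	#     0x93 = 10 01 00 11  ->  rotate dwords left one lane
	#     0x4e = 01 00 11 10  ->  swap the two 64-bit halves
	#     0x39 = 00 11 10 01  ->  rotate dwords right one lane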
.macro xmm_salsa8_core
	xmm_salsa8_core_doubleround
	xmm_salsa8_core_doubleround
	xmm_salsa8_core_doubleround
	xmm_salsa8_core_doubleround
	# shuffle 1st block into %xmm8-%xmm11
	pshufd	$0x93, %xmm0, %xmm0
	pshufd	$0x93, %xmm1, %xmm1
	pshufd	$0x93, %xmm2, %xmm2
	pshufd	$0x93, %xmm3, %xmm3
	pshufd	$0x93, %xmm0, %xmm0
	pshufd	$0x93, %xmm1, %xmm1
	pshufd	$0x93, %xmm2, %xmm2
	pshufd	$0x93, %xmm3, %xmm3
	pshufd	$0x93, %xmm0, %xmm0
	pshufd	$0x93, %xmm1, %xmm1
	pshufd	$0x93, %xmm2, %xmm2
	pshufd	$0x93, %xmm3, %xmm3
	# shuffle 2nd block into %xmm12-%xmm15
	pshufd	$0x93, %xmm0, %xmm0
	pshufd	$0x93, %xmm1, %xmm1
	pshufd	$0x93, %xmm2, %xmm2
	pshufd	$0x93, %xmm3, %xmm3
	pshufd	$0x93, %xmm0, %xmm0
	pshufd	$0x93, %xmm1, %xmm1
	pshufd	$0x93, %xmm2, %xmm2
	pshufd	$0x93, %xmm3, %xmm3
	pshufd	$0x93, %xmm0, %xmm0
	pshufd	$0x93, %xmm1, %xmm1
	pshufd	$0x93, %xmm2, %xmm2
	pshufd	$0x93, %xmm3, %xmm3
	# %rcx marks the end of the 128 KiB scratchpad
	leaq	131072(%rsi), %rcx
xmm_scrypt_core_loop1:
	# V[i] = X (here %rdx walks V)
	movdqa	%xmm8, 0(%rdx)
	movdqa	%xmm9, 16(%rdx)
	movdqa	%xmm10, 32(%rdx)
	movdqa	%xmm11, 48(%rdx)
	movdqa	%xmm12, 64(%rdx)
	movdqa	%xmm13, 80(%rdx)
	movdqa	%xmm14, 96(%rdx)
	movdqa	%xmm15, 112(%rdx)

	jne	xmm_scrypt_core_loop1
xmm_scrypt_core_loop2:
	# load entry j of V; %rdx holds the byte offset 128*j
	movdqa	0(%rsi, %rdx), %xmm0
	movdqa	16(%rsi, %rdx), %xmm1
	movdqa	32(%rsi, %rdx), %xmm2
	movdqa	48(%rsi, %rdx), %xmm3
	movdqa	64(%rsi, %rdx), %xmm4
	movdqa	80(%rsi, %rdx), %xmm5
	movdqa	96(%rsi, %rdx), %xmm6
	movdqa	112(%rsi, %rdx), %xmm7

	ja	xmm_scrypt_core_loop2
	# re-shuffle 1st block back
	pshufd	$0x39, %xmm8, %xmm8
	pshufd	$0x39, %xmm9, %xmm9
	pshufd	$0x39, %xmm10, %xmm10
	pshufd	$0x39, %xmm11, %xmm11
	pshufd	$0x39, %xmm8, %xmm8
	pshufd	$0x39, %xmm9, %xmm9
	pshufd	$0x39, %xmm10, %xmm10
	pshufd	$0x39, %xmm11, %xmm11
	pshufd	$0x39, %xmm8, %xmm8
	pshufd	$0x39, %xmm9, %xmm9
	pshufd	$0x39, %xmm10, %xmm10
	pshufd	$0x39, %xmm11, %xmm11
	# re-shuffle 2nd block back
	pshufd	$0x39, %xmm12, %xmm12
	pshufd	$0x39, %xmm13, %xmm13
	pshufd	$0x39, %xmm14, %xmm14
	pshufd	$0x39, %xmm15, %xmm15
	pshufd	$0x39, %xmm12, %xmm12
	pshufd	$0x39, %xmm13, %xmm13
	pshufd	$0x39, %xmm14, %xmm14
	pshufd	$0x39, %xmm15, %xmm15
	pshufd	$0x39, %xmm12, %xmm12
	pshufd	$0x39, %xmm13, %xmm13
	pshufd	$0x39, %xmm14, %xmm14
	pshufd	$0x39, %xmm15, %xmm15
	.globl scrypt_best_throughput
	.globl _scrypt_best_throughput
scrypt_best_throughput:
_scrypt_best_throughput:
	# check for an AMD CPU: the CPUID leaf-0 vendor string
	# "AuthenticAMD" is %ebx = "Auth", %edx = "enti", %ecx = "cAMD"
	cmpl	$0x444d4163, %ecx
	jne	scrypt_best_throughput_exit
	cmpl	$0x69746e65, %edx
	jne	scrypt_best_throughput_exit
	cmpl	$0x68747541, %ebx
	jne	scrypt_best_throughput_exit
	# mask the extended-family field (bits 27:20) of leaf-1 %eax;
	# non-zero means family 10h or newer
	andl	$0x0ff00000, %eax
	jnz	scrypt_best_throughput_exit
scrypt_best_throughput_exit:
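
	# Hedged C sketch of the checks above (GCC <cpuid.h>; variable
	# names are illustrative):
	#
	#     unsigned int a, b, c, d;
	#     __cpuid(0, a, b, c, d);
	#     int is_amd = b == 0x68747541 &&  /* "Auth" */
	#                  d == 0x69746e65 &&  /* "enti" */
	#                  c == 0x444d4163;    /* "cAMD" */
	#     __cpuid(1, a, b, c, d);
	#     int ext_family = (a >> 20) & 0xff;  /* 0 only before family 10h */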
# Double round over two independent Salsa20/8 states, interleaved
# instruction-by-instruction to hide SIMD latency: state A in
# %xmm0-%xmm3, state B in %xmm8-%xmm11.
.macro xmm_salsa8_core_2way_doubleround
	pshufd	$0x93, %xmm3, %xmm3
	pshufd	$0x93, %xmm11, %xmm11
	pshufd	$0x4e, %xmm2, %xmm2
	pshufd	$0x4e, %xmm10, %xmm10
	pshufd	$0x39, %xmm1, %xmm1
	pshufd	$0x39, %xmm9, %xmm9
	pshufd	$0x93, %xmm1, %xmm1
	pshufd	$0x93, %xmm9, %xmm9
	movdqa	%xmm10, %xmm6
	pshufd	$0x4e, %xmm2, %xmm2
	pshufd	$0x4e, %xmm10, %xmm10
	pshufd	$0x39, %xmm3, %xmm3
	pshufd	$0x39, %xmm11, %xmm11
.macro xmm_salsa8_core_2way
	xmm_salsa8_core_2way_doubleround
	xmm_salsa8_core_2way_doubleround
	xmm_salsa8_core_2way_doubleround
	xmm_salsa8_core_2way_doubleround
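
	# What the 2-way core computes, in effect (sketch; the helper
	# name is hypothetical):
	#
	#     salsa20_8(Xa);   /* kept in %xmm0-%xmm3  */
	#     salsa20_8(Xb);   /* kept in %xmm8-%xmm11 */
	#
	# Two states keep the pipeline busy while one state's add/rotate
	# chain waits on its own results.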
	.globl scrypt_core_2way
	.globl _scrypt_core_2way

	# spill %xmm6-%xmm15, which are callee-saved under the Win64 ABI
	movdqa	%xmm6, 8(%rsp)
	movdqa	%xmm7, 24(%rsp)
	movdqa	%xmm8, 40(%rsp)
	movdqa	%xmm9, 56(%rsp)
	movdqa	%xmm10, 72(%rsp)
	movdqa	%xmm11, 88(%rsp)
	movdqa	%xmm12, 104(%rsp)
	movdqa	%xmm13, 120(%rsp)
	movdqa	%xmm14, 136(%rsp)
	movdqa	%xmm15, 152(%rsp)
	# shuffle both 128-byte inputs (%rdi and %rsi) into diagonal form
	# on the stack: lane A at 0(%rsp), lane B at 128(%rsp)
	scrypt_shuffle %rdi, 0, %rsp, 0
	scrypt_shuffle %rdi, 64, %rsp, 64
	scrypt_shuffle %rsi, 0, %rsp, 128
	scrypt_shuffle %rsi, 64, %rsp, 192

	movdqa	192(%rsp), %xmm12
	movdqa	208(%rsp), %xmm13
	movdqa	224(%rsp), %xmm14
	movdqa	240(%rsp), %xmm15
	# two lanes share the scratchpad: 262144 = 2 * 1024 * 128 bytes
	leaq	262144(%rdx), %rcx
scrypt_core_2way_loop1:
	movdqa	0(%rsp), %xmm0
	movdqa	16(%rsp), %xmm1
	movdqa	32(%rsp), %xmm2
	movdqa	48(%rsp), %xmm3
	movdqa	64(%rsp), %xmm4
	movdqa	80(%rsp), %xmm5
	movdqa	96(%rsp), %xmm6
	movdqa	112(%rsp), %xmm7
	movdqa	128(%rsp), %xmm8
	movdqa	144(%rsp), %xmm9
	movdqa	160(%rsp), %xmm10
	movdqa	176(%rsp), %xmm11
	# V[i] for both lanes: each 256-byte row holds lane A at +0 and
	# lane B at +128
	movdqa	%xmm0, 0(%rbp)
	movdqa	%xmm1, 16(%rbp)
	movdqa	%xmm2, 32(%rbp)
	movdqa	%xmm3, 48(%rbp)
	movdqa	%xmm4, 64(%rbp)
	movdqa	%xmm5, 80(%rbp)
	movdqa	%xmm6, 96(%rbp)
	movdqa	%xmm7, 112(%rbp)

	movdqa	%xmm8, 128(%rbp)
	movdqa	%xmm9, 144(%rbp)
	movdqa	%xmm10, 160(%rbp)
	movdqa	%xmm11, 176(%rbp)
	movdqa	%xmm12, 192(%rbp)
	movdqa	%xmm13, 208(%rbp)
	movdqa	%xmm14, 224(%rbp)
	movdqa	%xmm15, 240(%rbp)
	xmm_salsa8_core_2way
	# Salsa20/8 feed-forward: add back the saved pre-round values
	paddd	0(%rbp), %xmm0
	paddd	16(%rbp), %xmm1
	paddd	32(%rbp), %xmm2
	paddd	48(%rbp), %xmm3
	paddd	128(%rbp), %xmm8
	paddd	144(%rbp), %xmm9
	paddd	160(%rbp), %xmm10
	paddd	176(%rbp), %xmm11
	movdqa	%xmm0, 0(%rsp)
	movdqa	%xmm1, 16(%rsp)
	movdqa	%xmm2, 32(%rsp)
	movdqa	%xmm3, 48(%rsp)
	movdqa	%xmm8, 128(%rsp)
	movdqa	%xmm9, 144(%rsp)
	movdqa	%xmm10, 160(%rsp)
	movdqa	%xmm11, 176(%rsp)
	# second 64-byte block of each lane: mix in the first block's
	# output, then Salsa20/8 with feed-forward again
	pxor	64(%rsp), %xmm0
	pxor	80(%rsp), %xmm1
	pxor	96(%rsp), %xmm2
	pxor	112(%rsp), %xmm3

	movdqa	%xmm0, 64(%rsp)
	movdqa	%xmm1, 80(%rsp)
	movdqa	%xmm2, 96(%rsp)
	movdqa	%xmm3, 112(%rsp)
	movdqa	%xmm8, %xmm12
	movdqa	%xmm9, %xmm13
	movdqa	%xmm10, %xmm14
	movdqa	%xmm11, %xmm15
	xmm_salsa8_core_2way
	paddd	64(%rsp), %xmm0
	paddd	80(%rsp), %xmm1
	paddd	96(%rsp), %xmm2
	paddd	112(%rsp), %xmm3
	paddd	%xmm10, %xmm14
	paddd	%xmm11, %xmm15
	movdqa	%xmm0, 64(%rsp)
	movdqa	%xmm1, 80(%rsp)
	movdqa	%xmm2, 96(%rsp)
	movdqa	%xmm3, 112(%rsp)
	jne	scrypt_core_2way_loop1

scrypt_core_2way_loop2:
	movdqa	0(%rsp), %xmm0
	movdqa	16(%rsp), %xmm1
	movdqa	32(%rsp), %xmm2
	movdqa	48(%rsp), %xmm3
	movdqa	64(%rsp), %xmm4
	movdqa	80(%rsp), %xmm5
	movdqa	96(%rsp), %xmm6
	movdqa	112(%rsp), %xmm7
	movdqa	128(%rsp), %xmm8
	movdqa	144(%rsp), %xmm9
	movdqa	160(%rsp), %xmm10
	movdqa	176(%rsp), %xmm11
	# X ^= V[j], per lane: %rbp and %rbx hold each lane's own byte
	# offset into the shared scratchpad (computed above)
	pxor	0(%rdx, %rbp), %xmm0
	pxor	16(%rdx, %rbp), %xmm1
	pxor	32(%rdx, %rbp), %xmm2
	pxor	48(%rdx, %rbp), %xmm3

	pxor	0(%rdx, %rbx), %xmm8
	pxor	16(%rdx, %rbx), %xmm9
	pxor	32(%rdx, %rbx), %xmm10
	pxor	48(%rdx, %rbx), %xmm11
	movdqa	%xmm0, 0(%rsp)
	movdqa	%xmm1, 16(%rsp)
	movdqa	%xmm2, 32(%rsp)
	movdqa	%xmm3, 48(%rsp)
	movdqa	%xmm8, 128(%rsp)
	movdqa	%xmm9, 144(%rsp)
	movdqa	%xmm10, 160(%rsp)
	movdqa	%xmm11, 176(%rsp)
	xmm_salsa8_core_2way
	paddd	0(%rsp), %xmm0
	paddd	16(%rsp), %xmm1
	paddd	32(%rsp), %xmm2
	paddd	48(%rsp), %xmm3
	paddd	128(%rsp), %xmm8
	paddd	144(%rsp), %xmm9
	paddd	160(%rsp), %xmm10
	paddd	176(%rsp), %xmm11
	movdqa	%xmm0, 0(%rsp)
	movdqa	%xmm1, 16(%rsp)
	movdqa	%xmm2, 32(%rsp)
	movdqa	%xmm3, 48(%rsp)
	movdqa	%xmm8, 128(%rsp)
	movdqa	%xmm9, 144(%rsp)
	movdqa	%xmm10, 160(%rsp)
	movdqa	%xmm11, 176(%rsp)
	pxor	64(%rdx, %rbp), %xmm0
	pxor	80(%rdx, %rbp), %xmm1
	pxor	96(%rdx, %rbp), %xmm2
	pxor	112(%rdx, %rbp), %xmm3
	pxor	64(%rdx, %rbx), %xmm8
	pxor	80(%rdx, %rbx), %xmm9
	pxor	96(%rdx, %rbx), %xmm10
	pxor	112(%rdx, %rbx), %xmm11
	pxor	64(%rsp), %xmm0
	pxor	80(%rsp), %xmm1
	pxor	96(%rsp), %xmm2
	pxor	112(%rsp), %xmm3

	movdqa	%xmm0, 64(%rsp)
	movdqa	%xmm1, 80(%rsp)
	movdqa	%xmm2, 96(%rsp)
	movdqa	%xmm3, 112(%rsp)
	movdqa	%xmm8, %xmm12
	movdqa	%xmm9, %xmm13
	movdqa	%xmm10, %xmm14
	movdqa	%xmm11, %xmm15
	xmm_salsa8_core_2way
	paddd	64(%rsp), %xmm0
	paddd	80(%rsp), %xmm1
	paddd	96(%rsp), %xmm2
	paddd	112(%rsp), %xmm3
	paddd	%xmm10, %xmm14
	paddd	%xmm11, %xmm15
	movdqa	%xmm0, 64(%rsp)
	movdqa	%xmm1, 80(%rsp)
	movdqa	%xmm2, 96(%rsp)
	movdqa	%xmm3, 112(%rsp)
	ja	scrypt_core_2way_loop2

	movdqa	%xmm12, 192(%rsp)
	movdqa	%xmm13, 208(%rsp)
	movdqa	%xmm14, 224(%rsp)
	movdqa	%xmm15, 240(%rsp)

	# un-shuffle the results back into the callers' buffers
	# (the permutation is its own inverse)
	scrypt_shuffle %rsp, 0, %rdi, 0
	scrypt_shuffle %rsp, 64, %rdi, 64
	scrypt_shuffle %rsp, 128, %rsi, 0
	scrypt_shuffle %rsp, 192, %rsi, 64
	# restore the callee-saved %xmm registers (Win64)
	movdqa	8(%rsp), %xmm6
	movdqa	24(%rsp), %xmm7
	movdqa	40(%rsp), %xmm8
	movdqa	56(%rsp), %xmm9
	movdqa	72(%rsp), %xmm10
	movdqa	88(%rsp), %xmm11
	movdqa	104(%rsp), %xmm12
	movdqa	120(%rsp), %xmm13
	movdqa	136(%rsp), %xmm14
	movdqa	152(%rsp), %xmm15
# Double round over three independent Salsa20/8 states: state A in
# %xmm0-%xmm3, state B in %xmm8-%xmm11, state C in %xmm12-%xmm15,
# with %xmm6/%xmm7 as scratch.
.macro xmm_salsa8_core_3way_doubleround
	movdqa	%xmm13, %xmm7
	movdqa	%xmm12, %xmm7
	pshufd	$0x93, %xmm3, %xmm3
	movdqa	%xmm11, %xmm6
	pshufd	$0x93, %xmm11, %xmm11
	movdqa	%xmm15, %xmm7
	pshufd	$0x93, %xmm15, %xmm15
	pshufd	$0x4e, %xmm2, %xmm2
	movdqa	%xmm10, %xmm6
	pshufd	$0x4e, %xmm10, %xmm10
	movdqa	%xmm14, %xmm7
	pshufd	$0x4e, %xmm14, %xmm14
	pshufd	$0x39, %xmm1, %xmm1
	pshufd	$0x39, %xmm9, %xmm9
	movdqa	%xmm11, %xmm6
	pshufd	$0x39, %xmm13, %xmm13
	movdqa	%xmm15, %xmm7
	movdqa	%xmm12, %xmm7
	pshufd	$0x93, %xmm1, %xmm1
	pshufd	$0x93, %xmm9, %xmm9
	movdqa	%xmm13, %xmm7
	pshufd	$0x93, %xmm13, %xmm13
	pshufd	$0x4e, %xmm2, %xmm2
	movdqa	%xmm10, %xmm6
	pshufd	$0x4e, %xmm10, %xmm10
	movdqa	%xmm14, %xmm7
	pshufd	$0x4e, %xmm14, %xmm14
	pshufd	$0x39, %xmm3, %xmm3
	pshufd	$0x39, %xmm11, %xmm11
	pshufd	$0x39, %xmm15, %xmm15
.macro xmm_salsa8_core_3way
	xmm_salsa8_core_3way_doubleround
	xmm_salsa8_core_3way_doubleround
	xmm_salsa8_core_3way_doubleround
	xmm_salsa8_core_3way_doubleround
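
	# In effect (sketch; the helper name is hypothetical):
	#
	#     salsa20_8(Xa);   /* %xmm0-%xmm3   */
	#     salsa20_8(Xb);   /* %xmm8-%xmm11  */
	#     salsa20_8(Xc);   /* %xmm12-%xmm15 */
	#
	# Three interleaved states use all sixteen %xmm registers and give
	# the out-of-order core even more independent work than 2-way.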
	.globl scrypt_core_3way
	.globl _scrypt_core_3way

	# spill %xmm6-%xmm15, which are callee-saved under the Win64 ABI
	movdqa	%xmm6, 8(%rsp)
	movdqa	%xmm7, 24(%rsp)
	movdqa	%xmm8, 40(%rsp)
	movdqa	%xmm9, 56(%rsp)
	movdqa	%xmm10, 72(%rsp)
	movdqa	%xmm11, 88(%rsp)
	movdqa	%xmm12, 104(%rsp)
	movdqa	%xmm13, 120(%rsp)
	movdqa	%xmm14, 136(%rsp)
	movdqa	%xmm15, 152(%rsp)
	# shuffle the three 128-byte inputs (%rdi, %rsi, %rdx) into
	# diagonal form at offsets 0, 128 and 256 of the stack frame
	scrypt_shuffle %rdi, 0, %rsp, 0
	scrypt_shuffle %rdi, 64, %rsp, 64
	scrypt_shuffle %rsi, 0, %rsp, 128
	scrypt_shuffle %rsi, 64, %rsp, 192
	scrypt_shuffle %rdx, 0, %rsp, 256
	scrypt_shuffle %rdx, 64, %rsp, 320

	movdqa	128+64(%rsp), %xmm8
	movdqa	128+80(%rsp), %xmm9
	movdqa	128+96(%rsp), %xmm10
	movdqa	128+112(%rsp), %xmm11
	# three lanes share the scratchpad: 3 * 128 KiB
	leaq	3*131072(%rcx), %rax
scrypt_core_3way_loop1:
	movdqa	%xmm8, %xmm12
	movdqa	%xmm9, %xmm13
	movdqa	%xmm10, %xmm14
	movdqa	%xmm11, %xmm15
	movdqa	0(%rsp), %xmm0
	movdqa	16(%rsp), %xmm1
	movdqa	32(%rsp), %xmm2
	movdqa	48(%rsp), %xmm3
	movdqa	64(%rsp), %xmm4
	movdqa	80(%rsp), %xmm5
	movdqa	96(%rsp), %xmm6
	movdqa	112(%rsp), %xmm7
	movdqa	128+0(%rsp), %xmm8
	movdqa	128+16(%rsp), %xmm9
	movdqa	128+32(%rsp), %xmm10
	movdqa	128+48(%rsp), %xmm11
	# V[i] for all three lanes: each 384-byte row holds lane A at +0,
	# lane B at +128, lane C at +256
	movdqa	%xmm0, 0(%rbp)
	movdqa	%xmm1, 16(%rbp)
	movdqa	%xmm2, 32(%rbp)
	movdqa	%xmm3, 48(%rbp)
	movdqa	%xmm4, 64(%rbp)
	movdqa	%xmm5, 80(%rbp)
	movdqa	%xmm6, 96(%rbp)
	movdqa	%xmm7, 112(%rbp)

	movdqa	%xmm8, 128+0(%rbp)
	movdqa	%xmm9, 128+16(%rbp)
	movdqa	%xmm10, 128+32(%rbp)
	movdqa	%xmm11, 128+48(%rbp)
	movdqa	%xmm12, 128+64(%rbp)
	movdqa	%xmm13, 128+80(%rbp)
	movdqa	%xmm14, 128+96(%rbp)
	movdqa	%xmm15, 128+112(%rbp)
	movdqa	256+0(%rsp), %xmm12
	movdqa	256+16(%rsp), %xmm13
	movdqa	256+32(%rsp), %xmm14
	movdqa	256+48(%rsp), %xmm15
	movdqa	256+64(%rsp), %xmm4
	movdqa	256+80(%rsp), %xmm5
	movdqa	256+96(%rsp), %xmm6
	movdqa	256+112(%rsp), %xmm7

	movdqa	%xmm12, 256+0(%rbp)
	movdqa	%xmm13, 256+16(%rbp)
	movdqa	%xmm14, 256+32(%rbp)
	movdqa	%xmm15, 256+48(%rbp)
	movdqa	%xmm4, 256+64(%rbp)
	movdqa	%xmm5, 256+80(%rbp)
	movdqa	%xmm6, 256+96(%rbp)
	movdqa	%xmm7, 256+112(%rbp)
	xmm_salsa8_core_3way
	paddd	0(%rbp), %xmm0
	paddd	16(%rbp), %xmm1
	paddd	32(%rbp), %xmm2
	paddd	48(%rbp), %xmm3
	paddd	128+0(%rbp), %xmm8
	paddd	128+16(%rbp), %xmm9
	paddd	128+32(%rbp), %xmm10
	paddd	128+48(%rbp), %xmm11
	paddd	256+0(%rbp), %xmm12
	paddd	256+16(%rbp), %xmm13
	paddd	256+32(%rbp), %xmm14
	paddd	256+48(%rbp), %xmm15
	movdqa	%xmm0, 0(%rsp)
	movdqa	%xmm1, 16(%rsp)
	movdqa	%xmm2, 32(%rsp)
	movdqa	%xmm3, 48(%rsp)
	movdqa	%xmm8, 128+0(%rsp)
	movdqa	%xmm9, 128+16(%rsp)
	movdqa	%xmm10, 128+32(%rsp)
	movdqa	%xmm11, 128+48(%rsp)
	movdqa	%xmm12, 256+0(%rsp)
	movdqa	%xmm13, 256+16(%rsp)
	movdqa	%xmm14, 256+32(%rsp)
	movdqa	%xmm15, 256+48(%rsp)
	pxor	64(%rsp), %xmm0
	pxor	80(%rsp), %xmm1
	pxor	96(%rsp), %xmm2
	pxor	112(%rsp), %xmm3
	pxor	128+64(%rsp), %xmm8
	pxor	128+80(%rsp), %xmm9
	pxor	128+96(%rsp), %xmm10
	pxor	128+112(%rsp), %xmm11
	pxor	256+64(%rsp), %xmm12
	pxor	256+80(%rsp), %xmm13
	pxor	256+96(%rsp), %xmm14
	pxor	256+112(%rsp), %xmm15
	movdqa	%xmm0, 64(%rsp)
	movdqa	%xmm1, 80(%rsp)
	movdqa	%xmm2, 96(%rsp)
	movdqa	%xmm3, 112(%rsp)
	movdqa	%xmm8, 128+64(%rsp)
	movdqa	%xmm9, 128+80(%rsp)
	movdqa	%xmm10, 128+96(%rsp)
	movdqa	%xmm11, 128+112(%rsp)
	movdqa	%xmm12, 256+64(%rsp)
	movdqa	%xmm13, 256+80(%rsp)
	movdqa	%xmm14, 256+96(%rsp)
	movdqa	%xmm15, 256+112(%rsp)
	xmm_salsa8_core_3way
	paddd	64(%rsp), %xmm0
	paddd	80(%rsp), %xmm1
	paddd	96(%rsp), %xmm2
	paddd	112(%rsp), %xmm3
	paddd	128+64(%rsp), %xmm8
	paddd	128+80(%rsp), %xmm9
	paddd	128+96(%rsp), %xmm10
	paddd	128+112(%rsp), %xmm11
	paddd	256+64(%rsp), %xmm12
	paddd	256+80(%rsp), %xmm13
	paddd	256+96(%rsp), %xmm14
	paddd	256+112(%rsp), %xmm15
	movdqa	%xmm0, 64(%rsp)
	movdqa	%xmm1, 80(%rsp)
	movdqa	%xmm2, 96(%rsp)
	movdqa	%xmm3, 112(%rsp)
	movdqa	%xmm8, 128+64(%rsp)
	movdqa	%xmm9, 128+80(%rsp)
	movdqa	%xmm10, 128+96(%rsp)
	movdqa	%xmm11, 128+112(%rsp)
	movdqa	%xmm12, 256+64(%rsp)
	movdqa	%xmm13, 256+80(%rsp)
	movdqa	%xmm14, 256+96(%rsp)
	movdqa	%xmm15, 256+112(%rsp)
	jne	scrypt_core_3way_loop1

scrypt_core_3way_loop2:
	# form each lane's V row index: rows are 384 bytes, so the byte
	# offset is 128 * (3*j + lane); lane A needs no displacement, but
	# lanes B and C need +1 and +2 here so that, after the scale by
	# 128, they land on the +128 and +256 sub-blocks written in loop1
	leal	(%ebp, %ebp, 2), %ebp
	movl	128+64(%rsp), %ebx
	leal	1(%ebx, %ebx, 2), %ebx
	movl	256+64(%rsp), %r8d
	leal	2(%r8d, %r8d, 2), %r8d
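
	# Index math sketch (offsets before the elided scale by 128):
	#
	#     off_a = 128 * (3*j_a + 0);   /* lane A row at +0   */
	#     off_b = 128 * (3*j_b + 1);   /* lane B row at +128 */
	#     off_c = 128 * (3*j_c + 2);   /* lane C row at +256 */
	#
	# so the plain 0..112 displacements in the pxor instructions below
	# already point at the right lane within each 384-byte V row.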
	movdqa	0(%rsp), %xmm0
	movdqa	16(%rsp), %xmm1
	movdqa	32(%rsp), %xmm2
	movdqa	48(%rsp), %xmm3
	movdqa	128+0(%rsp), %xmm8
	movdqa	128+16(%rsp), %xmm9
	movdqa	128+32(%rsp), %xmm10
	movdqa	128+48(%rsp), %xmm11
	movdqa	256+0(%rsp), %xmm12
	movdqa	256+16(%rsp), %xmm13
	movdqa	256+32(%rsp), %xmm14
	movdqa	256+48(%rsp), %xmm15
	# X ^= V[j], first block of each lane
	pxor	0(%rcx, %rbp), %xmm0
	pxor	16(%rcx, %rbp), %xmm1
	pxor	32(%rcx, %rbp), %xmm2
	pxor	48(%rcx, %rbp), %xmm3
	pxor	0(%rcx, %rbx), %xmm8
	pxor	16(%rcx, %rbx), %xmm9
	pxor	32(%rcx, %rbx), %xmm10
	pxor	48(%rcx, %rbx), %xmm11
	pxor	0(%rcx, %r8), %xmm12
	pxor	16(%rcx, %r8), %xmm13
	pxor	32(%rcx, %r8), %xmm14
	pxor	48(%rcx, %r8), %xmm15
	pxor	64(%rsp), %xmm0
	pxor	80(%rsp), %xmm1
	pxor	96(%rsp), %xmm2
	pxor	112(%rsp), %xmm3
	pxor	128+64(%rsp), %xmm8
	pxor	128+80(%rsp), %xmm9
	pxor	128+96(%rsp), %xmm10
	pxor	128+112(%rsp), %xmm11
	pxor	256+64(%rsp), %xmm12
	pxor	256+80(%rsp), %xmm13
	pxor	256+96(%rsp), %xmm14
	pxor	256+112(%rsp), %xmm15
	movdqa	%xmm0, 0(%rsp)
	movdqa	%xmm1, 16(%rsp)
	movdqa	%xmm2, 32(%rsp)
	movdqa	%xmm3, 48(%rsp)
	movdqa	%xmm8, 128+0(%rsp)
	movdqa	%xmm9, 128+16(%rsp)
	movdqa	%xmm10, 128+32(%rsp)
	movdqa	%xmm11, 128+48(%rsp)
	movdqa	%xmm12, 256+0(%rsp)
	movdqa	%xmm13, 256+16(%rsp)
	movdqa	%xmm14, 256+32(%rsp)
	movdqa	%xmm15, 256+48(%rsp)
	xmm_salsa8_core_3way
	paddd	0(%rsp), %xmm0
	paddd	16(%rsp), %xmm1
	paddd	32(%rsp), %xmm2
	paddd	48(%rsp), %xmm3
	paddd	128+0(%rsp), %xmm8
	paddd	128+16(%rsp), %xmm9
	paddd	128+32(%rsp), %xmm10
	paddd	128+48(%rsp), %xmm11
	paddd	256+0(%rsp), %xmm12
	paddd	256+16(%rsp), %xmm13
	paddd	256+32(%rsp), %xmm14
	paddd	256+48(%rsp), %xmm15
	movdqa	%xmm0, 0(%rsp)
	movdqa	%xmm1, 16(%rsp)
	movdqa	%xmm2, 32(%rsp)
	movdqa	%xmm3, 48(%rsp)
	movdqa	%xmm8, 128+0(%rsp)
	movdqa	%xmm9, 128+16(%rsp)
	movdqa	%xmm10, 128+32(%rsp)
	movdqa	%xmm11, 128+48(%rsp)
	movdqa	%xmm12, 256+0(%rsp)
	movdqa	%xmm13, 256+16(%rsp)
	movdqa	%xmm14, 256+32(%rsp)
	movdqa	%xmm15, 256+48(%rsp)
	# X ^= V[j], second block of each lane
	pxor	64(%rcx, %rbp), %xmm0
	pxor	80(%rcx, %rbp), %xmm1
	pxor	96(%rcx, %rbp), %xmm2
	pxor	112(%rcx, %rbp), %xmm3
	pxor	64(%rcx, %rbx), %xmm8
	pxor	80(%rcx, %rbx), %xmm9
	pxor	96(%rcx, %rbx), %xmm10
	pxor	112(%rcx, %rbx), %xmm11
	pxor	64(%rcx, %r8), %xmm12
	pxor	80(%rcx, %r8), %xmm13
	pxor	96(%rcx, %r8), %xmm14
	pxor	112(%rcx, %r8), %xmm15
	pxor	64(%rsp), %xmm0
	pxor	80(%rsp), %xmm1
	pxor	96(%rsp), %xmm2
	pxor	112(%rsp), %xmm3
	pxor	128+64(%rsp), %xmm8
	pxor	128+80(%rsp), %xmm9
	pxor	128+96(%rsp), %xmm10
	pxor	128+112(%rsp), %xmm11
	pxor	256+64(%rsp), %xmm12
	pxor	256+80(%rsp), %xmm13
	pxor	256+96(%rsp), %xmm14
	pxor	256+112(%rsp), %xmm15
	movdqa	%xmm0, 64(%rsp)
	movdqa	%xmm1, 80(%rsp)
	movdqa	%xmm2, 96(%rsp)
	movdqa	%xmm3, 112(%rsp)
	movdqa	%xmm8, 128+64(%rsp)
	movdqa	%xmm9, 128+80(%rsp)
	movdqa	%xmm10, 128+96(%rsp)
	movdqa	%xmm11, 128+112(%rsp)
	movdqa	%xmm12, 256+64(%rsp)
	movdqa	%xmm13, 256+80(%rsp)
	movdqa	%xmm14, 256+96(%rsp)
	movdqa	%xmm15, 256+112(%rsp)
	xmm_salsa8_core_3way
	paddd	64(%rsp), %xmm0
	paddd	80(%rsp), %xmm1
	paddd	96(%rsp), %xmm2
	paddd	112(%rsp), %xmm3
	paddd	128+64(%rsp), %xmm8
	paddd	128+80(%rsp), %xmm9
	paddd	128+96(%rsp), %xmm10
	paddd	128+112(%rsp), %xmm11
	paddd	256+64(%rsp), %xmm12
	paddd	256+80(%rsp), %xmm13
	paddd	256+96(%rsp), %xmm14
	paddd	256+112(%rsp), %xmm15
	movdqa	%xmm0, 64(%rsp)
	movdqa	%xmm1, 80(%rsp)
	movdqa	%xmm2, 96(%rsp)
	movdqa	%xmm3, 112(%rsp)
	movdqa	%xmm8, 128+64(%rsp)
	movdqa	%xmm9, 128+80(%rsp)
	movdqa	%xmm10, 128+96(%rsp)
	movdqa	%xmm11, 128+112(%rsp)
	movdqa	%xmm12, 256+64(%rsp)
	movdqa	%xmm13, 256+80(%rsp)
	movdqa	%xmm14, 256+96(%rsp)
	movdqa	%xmm15, 256+112(%rsp)
	ja	scrypt_core_3way_loop2
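
	# Layout recap: V entry j of lane k (k = 0, 1, 2) lives at byte
	# offset 384*j + 128*k from %rcx, so the three lanes' rows sit
	# side by side and one pass over j touches adjacent memory.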
	# un-shuffle the results back into the three callers' buffers
	scrypt_shuffle %rsp, 0, %rdi, 0
	scrypt_shuffle %rsp, 64, %rdi, 64
	scrypt_shuffle %rsp, 128, %rsi, 0
	scrypt_shuffle %rsp, 192, %rsi, 64
	scrypt_shuffle %rsp, 256, %rdx, 0
	scrypt_shuffle %rsp, 320, %rdx, 64
	# restore the callee-saved %xmm registers (Win64)
	movdqa	8(%rsp), %xmm6
	movdqa	24(%rsp), %xmm7
	movdqa	40(%rsp), %xmm8
	movdqa	56(%rsp), %xmm9
	movdqa	72(%rsp), %xmm10
	movdqa	88(%rsp), %xmm11
	movdqa	104(%rsp), %xmm12
	movdqa	120(%rsp), %xmm13
	movdqa	136(%rsp), %xmm14
	movdqa	152(%rsp), %xmm15