# Copyright 2011-2012 pooler@litecoinpool.org
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#if defined(__linux__) && defined(__ELF__)
	.section .note.GNU-stack,"",%progbits
#endif
#if defined(__x86_64__)
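# Descriptive note: scrypt_shuffle copies one 64-byte Salsa20 block from
# so(src) to do(dest) while permuting its sixteen 32-bit words into the
# order the SIMD salsa8 routines below expect (the mapping can be read off
# the mov offsets). The permutation is its own inverse, so the same macro
# also undoes the shuffle on the way out.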
#define scrypt_shuffle(src, so, dest, do) \
	movl so+60(src), %r8d; \
	movl so+44(src), %r9d; \
	movl so+28(src), %r10d; \
	movl so+12(src), %r11d; \
	movl %r8d, do+12(dest); \
	movl %r9d, do+28(dest); \
	movl %r10d, do+44(dest); \
	movl %r11d, do+60(dest); \
	movl so+40(src), %r8d; \
	movl so+8(src), %r9d; \
	movl so+48(src), %r10d; \
	movl so+16(src), %r11d; \
	movl %r8d, do+8(dest); \
	movl %r9d, do+40(dest); \
	movl %r10d, do+16(dest); \
	movl %r11d, do+48(dest); \
	movl so+20(src), %r8d; \
	movl so+4(src), %r9d; \
	movl so+52(src), %r10d; \
	movl so+36(src), %r11d; \
	movl %r8d, do+4(dest); \
	movl %r9d, do+20(dest); \
	movl %r10d, do+36(dest); \
	movl %r11d, do+52(dest); \
	movl so+0(src), %r8d; \
	movl so+24(src), %r9d; \
	movl so+32(src), %r10d; \
	movl so+56(src), %r11d; \
	movl %r8d, do+0(dest); \
	movl %r9d, do+24(dest); \
	movl %r10d, do+32(dest); \
	movl %r11d, do+56(dest); \

#define gen_salsa8_core_doubleround() \
	movq 72(%rsp), %r15; \
	leaq (%r14, %rdx), %rbp; \
	leaq (%rdi, %r15), %rbp; \
	leaq (%rdx, %r9), %rbp; \
	leaq (%r15, %r10), %rbp; \
	leaq (%r9, %r11), %rbp; \
	leaq (%r10, %r13), %rbp; \
	leaq (%r11, %r14), %rbp; \
	leaq (%r13, %rdi), %rbp; \
	movq 48(%rsp), %rbp; \
	movq %r15, 72(%rsp); \
	leaq (%rax, %rbp), %r15; \
	leaq (%rbp, %rbx), %r15; \
	leaq (%rbx, %rcx), %r15; \
	leaq (%rcx, %rax), %r15; \
	movq 88(%rsp), %r15; \
	movq %rbp, 48(%rsp); \
	leaq (%r12, %r15), %rbp; \
	leaq (%r15, %rsi), %rbp; \
	leaq (%rsi, %r8), %rbp; \
	leaq (%r8, %r12), %rbp; \
	movq %r15, 88(%rsp); \
	movq 72(%rsp), %r15; \
	leaq (%rsi, %rdx), %rbp; \
	leaq (%r9, %r15), %rbp; \
	leaq (%rdx, %rdi), %rbp; \
	leaq (%r15, %rax), %rbp; \
	leaq (%rdi, %rcx), %rbp; \
	leaq (%rax, %r8), %rbp; \
	leaq (%rcx, %rsi), %rbp; \
	leaq (%r8, %r9), %rbp; \
	movq 48(%rsp), %rbp; \
	movq %r15, 72(%rsp); \
	leaq (%r10, %rbp), %r15; \
	leaq (%rbp, %r12), %r15; \
	leaq (%r12, %r11), %r15; \
	leaq (%r11, %r10), %r15; \
	movq 88(%rsp), %r15; \
	movq %rbp, 48(%rsp); \
	leaq (%rbx, %r15), %rbp; \
	leaq (%r15, %r14), %rbp; \
	leaq (%r14, %r13), %rbp; \
	leaq (%r13, %rbx), %rbp; \
	movq %r15, 88(%rsp); \

	# 0: %rdx, %rdi, %rcx, %rsi
	# 1: %r9, 72(%rsp), %rax, %r8
	# 2: %r11, %r10, 48(%rsp), %r12
	# 3: %r14, %r13, %rbx, 88(%rsp)
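	# Descriptive note: the four comment lines above record which
	# general-purpose registers or stack slots hold each group of four
	# words of the Salsa20 state in this scalar implementation; the four
	# invocations of gen_salsa8_core_doubleround below perform the eight
	# rounds of Salsa20/8.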
	gen_salsa8_core_doubleround()
	gen_salsa8_core_doubleround()
	gen_salsa8_core_doubleround()
	gen_salsa8_core_doubleround()

	punpcklqdq %xmm4, %xmm0
	punpcklqdq %xmm5, %xmm1
	punpcklqdq %xmm6, %xmm2
	punpcklqdq %xmm7, %xmm3

	movdqa %xmm6, 8(%rsp)
	movdqa %xmm7, 24(%rsp)
	movdqa %xmm8, 40(%rsp)
	movdqa %xmm9, 56(%rsp)
	movdqa %xmm10, 72(%rsp)
	movdqa %xmm11, 88(%rsp)
	movdqa %xmm12, 104(%rsp)
	movdqa %xmm13, 120(%rsp)
	movdqa %xmm14, 136(%rsp)
	movdqa %xmm15, 152(%rsp)
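	# Descriptive note: %xmm6-%xmm15 are callee-saved under the Win64
	# calling convention, which is presumably why they are spilled to the
	# stack here and restored again before returning.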
#define scrypt_core_cleanup() \

	# GenuineIntel processors have fast SIMD
	cmpl $0x6c65746e, %ecx
	cmpl $0x49656e69, %edx
	cmpl $0x756e6547, %ebx
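	# Descriptive note: the three constants above are the CPUID vendor
	# string "GenuineIntel" compared one dword at a time ("Genu" in %ebx,
	# "ineI" in %edx, "ntel" in %ecx), matching the comment above about
	# fast SIMD.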
	movdqa 0(%rdi), %xmm8
	movdqa 16(%rdi), %xmm9
	movdqa 32(%rdi), %xmm10
	movdqa 48(%rdi), %xmm11
	movdqa 64(%rdi), %xmm12
	movdqa 80(%rdi), %xmm13
	movdqa 96(%rdi), %xmm14
	movdqa 112(%rdi), %xmm15

	leaq 131072(%rsi), %rcx
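	# Descriptive note: 131072 = 1024 * 128, i.e. the end of the
	# scratchpad at %rsi for scrypt with N = 1024 and 128-byte (two
	# Salsa20 block) entries, the parameters used by Litecoin.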
gen_scrypt_core_loop1:
	movdqa %xmm8, 0(%rsi)
	movdqa %xmm9, 16(%rsi)
	movdqa %xmm10, 32(%rsi)
	movdqa %xmm11, 48(%rsi)
	movdqa %xmm12, 64(%rsi)
	movdqa %xmm13, 80(%rsi)
	movdqa %xmm14, 96(%rsi)
	movdqa %xmm15, 112(%rsi)

	movdqa %xmm8, 0(%rsp)
	movdqa %xmm9, 16(%rsp)
	movdqa %xmm10, 32(%rsp)
	movdqa %xmm11, 48(%rsp)

	movdqa %xmm12, 0(%rsp)
	movdqa %xmm13, 16(%rsp)
	movdqa %xmm14, 32(%rsp)
	movdqa %xmm15, 48(%rsp)

	jne gen_scrypt_core_loop1

gen_scrypt_core_loop2:
	movdqa 0(%rsi, %rdx), %xmm0
	movdqa 16(%rsi, %rdx), %xmm1
	movdqa 32(%rsi, %rdx), %xmm2
	movdqa 48(%rsi, %rdx), %xmm3
	movdqa 64(%rsi, %rdx), %xmm4
	movdqa 80(%rsi, %rdx), %xmm5
	movdqa 96(%rsi, %rdx), %xmm6
	movdqa 112(%rsi, %rdx), %xmm7

	movdqa %xmm8, 0(%rsp)
	movdqa %xmm9, 16(%rsp)
	movdqa %xmm10, 32(%rsp)
	movdqa %xmm11, 48(%rsp)

	movdqa %xmm12, 0(%rsp)
	movdqa %xmm13, 16(%rsp)
	movdqa %xmm14, 32(%rsp)
	movdqa %xmm15, 48(%rsp)

	ja gen_scrypt_core_loop2

	movdqa %xmm8, 0(%rdi)
	movdqa %xmm9, 16(%rdi)
	movdqa %xmm10, 32(%rdi)
	movdqa %xmm11, 48(%rdi)
	movdqa %xmm12, 64(%rdi)
	movdqa %xmm13, 80(%rdi)
	movdqa %xmm14, 96(%rdi)
	movdqa %xmm15, 112(%rdi)

	scrypt_core_cleanup()
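# Descriptive note: the macro below implements one Salsa20 double round on a
# state held diagonally in %xmm0-%xmm3; each quarter-round uses paddd into a
# scratch copy in %xmm4/%xmm5, and pshufd with 0x93, 0x4e and 0x39 rotates the
# word lanes between steps. Four double rounds make up Salsa20/8.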
#define xmm_salsa8_core_doubleround() \
	movdqa %xmm1, %xmm4; \
	paddd %xmm0, %xmm4; \
	movdqa %xmm4, %xmm5; \
	movdqa %xmm0, %xmm4; \
	paddd %xmm3, %xmm4; \
	movdqa %xmm4, %xmm5; \
	movdqa %xmm3, %xmm4; \
	pshufd $0x93, %xmm3, %xmm3; \
	paddd %xmm2, %xmm4; \
	movdqa %xmm4, %xmm5; \
	movdqa %xmm2, %xmm4; \
	pshufd $0x4e, %xmm2, %xmm2; \
	paddd %xmm1, %xmm4; \
	movdqa %xmm4, %xmm5; \
	pshufd $0x39, %xmm1, %xmm1; \
	movdqa %xmm3, %xmm4; \
	paddd %xmm0, %xmm4; \
	movdqa %xmm4, %xmm5; \
	movdqa %xmm0, %xmm4; \
	paddd %xmm1, %xmm4; \
	movdqa %xmm4, %xmm5; \
	movdqa %xmm1, %xmm4; \
	pshufd $0x93, %xmm1, %xmm1; \
	paddd %xmm2, %xmm4; \
	movdqa %xmm4, %xmm5; \
	movdqa %xmm2, %xmm4; \
	pshufd $0x4e, %xmm2, %xmm2; \
	paddd %xmm3, %xmm4; \
	movdqa %xmm4, %xmm5; \
	pshufd $0x39, %xmm3, %xmm3; \

#define xmm_salsa8_core() \
	xmm_salsa8_core_doubleround(); \
	xmm_salsa8_core_doubleround(); \
	xmm_salsa8_core_doubleround(); \
	xmm_salsa8_core_doubleround(); \

	# shuffle 1st block into %xmm8-%xmm11

	pshufd $0x93, %xmm0, %xmm0
	pshufd $0x93, %xmm1, %xmm1
	pshufd $0x93, %xmm2, %xmm2
	pshufd $0x93, %xmm3, %xmm3

	pshufd $0x93, %xmm0, %xmm0
	pshufd $0x93, %xmm1, %xmm1
	pshufd $0x93, %xmm2, %xmm2
	pshufd $0x93, %xmm3, %xmm3

	pshufd $0x93, %xmm0, %xmm0
	pshufd $0x93, %xmm1, %xmm1
	pshufd $0x93, %xmm2, %xmm2
	pshufd $0x93, %xmm3, %xmm3

	# shuffle 2nd block into %xmm12-%xmm15

	pshufd $0x93, %xmm0, %xmm0
	pshufd $0x93, %xmm1, %xmm1
	pshufd $0x93, %xmm2, %xmm2
	pshufd $0x93, %xmm3, %xmm3

	pshufd $0x93, %xmm0, %xmm0
	pshufd $0x93, %xmm1, %xmm1
	pshufd $0x93, %xmm2, %xmm2
	pshufd $0x93, %xmm3, %xmm3

	pshufd $0x93, %xmm0, %xmm0
	pshufd $0x93, %xmm1, %xmm1
	pshufd $0x93, %xmm2, %xmm2
	pshufd $0x93, %xmm3, %xmm3

	leaq 131072(%rsi), %rcx
xmm_scrypt_core_loop1:
	movdqa %xmm8, 0(%rdx)
	movdqa %xmm9, 16(%rdx)
	movdqa %xmm10, 32(%rdx)
	movdqa %xmm11, 48(%rdx)
	movdqa %xmm12, 64(%rdx)
	movdqa %xmm13, 80(%rdx)
	movdqa %xmm14, 96(%rdx)
	movdqa %xmm15, 112(%rdx)

	jne xmm_scrypt_core_loop1

xmm_scrypt_core_loop2:
	movdqa 0(%rsi, %rdx), %xmm0
	movdqa 16(%rsi, %rdx), %xmm1
	movdqa 32(%rsi, %rdx), %xmm2
	movdqa 48(%rsi, %rdx), %xmm3
	movdqa 64(%rsi, %rdx), %xmm4
	movdqa 80(%rsi, %rdx), %xmm5
	movdqa 96(%rsi, %rdx), %xmm6
	movdqa 112(%rsi, %rdx), %xmm7

	ja xmm_scrypt_core_loop2

	# re-shuffle 1st block back

	pshufd $0x39, %xmm8, %xmm8
	pshufd $0x39, %xmm9, %xmm9
	pshufd $0x39, %xmm10, %xmm10
	pshufd $0x39, %xmm11, %xmm11

	pshufd $0x39, %xmm8, %xmm8
	pshufd $0x39, %xmm9, %xmm9
	pshufd $0x39, %xmm10, %xmm10
	pshufd $0x39, %xmm11, %xmm11

	pshufd $0x39, %xmm8, %xmm8
	pshufd $0x39, %xmm9, %xmm9
	pshufd $0x39, %xmm10, %xmm10
	pshufd $0x39, %xmm11, %xmm11

	# re-shuffle 2nd block back

	pshufd $0x39, %xmm12, %xmm12
	pshufd $0x39, %xmm13, %xmm13
	pshufd $0x39, %xmm14, %xmm14
	pshufd $0x39, %xmm15, %xmm15

	pshufd $0x39, %xmm12, %xmm12
	pshufd $0x39, %xmm13, %xmm13
	pshufd $0x39, %xmm14, %xmm14
	pshufd $0x39, %xmm15, %xmm15

	pshufd $0x39, %xmm12, %xmm12
	pshufd $0x39, %xmm13, %xmm13
	pshufd $0x39, %xmm14, %xmm14
	pshufd $0x39, %xmm15, %xmm15

	scrypt_core_cleanup()

	.globl scrypt_best_throughput
	.globl _scrypt_best_throughput
scrypt_best_throughput:
_scrypt_best_throughput:
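	# Descriptive note: the constants compared below are the CPUID vendor
	# string "AuthenticAMD" one dword at a time ("Auth" in %ebx, "enti" in
	# %edx, "cAMD" in %ecx), and the 0x0ff00000 mask isolates the
	# extended-family field of CPUID leaf 1; the result of these checks
	# selects how many scrypt lanes to run in parallel on this CPU.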
	cmpl $0x444d4163, %ecx
	jne scrypt_best_throughput_exit
	cmpl $0x69746e65, %edx
	jne scrypt_best_throughput_exit
	cmpl $0x68747541, %ebx
	jne scrypt_best_throughput_exit

	andl $0x0ff00000, %eax
	jnz scrypt_best_throughput_exit

scrypt_best_throughput_exit:

#define xmm_salsa8_core_2way_doubleround() \
	movdqa %xmm1, %xmm4; \
	movdqa %xmm9, %xmm6; \
	paddd %xmm0, %xmm4; \
	paddd %xmm8, %xmm6; \
	movdqa %xmm4, %xmm5; \
	movdqa %xmm6, %xmm7; \
	pxor %xmm6, %xmm11; \
	pxor %xmm7, %xmm11; \
	movdqa %xmm0, %xmm4; \
	movdqa %xmm8, %xmm6; \
	paddd %xmm3, %xmm4; \
	paddd %xmm11, %xmm6; \
	movdqa %xmm4, %xmm5; \
	movdqa %xmm6, %xmm7; \
	pxor %xmm6, %xmm10; \
	movdqa %xmm3, %xmm4; \
	movdqa %xmm11, %xmm6; \
	pshufd $0x93, %xmm3, %xmm3; \
	pshufd $0x93, %xmm11, %xmm11; \
	pxor %xmm7, %xmm10; \
	paddd %xmm2, %xmm4; \
	paddd %xmm10, %xmm6; \
	movdqa %xmm4, %xmm5; \
	movdqa %xmm6, %xmm7; \
	movdqa %xmm2, %xmm4; \
	movdqa %xmm10, %xmm6; \
	pshufd $0x4e, %xmm2, %xmm2; \
	pshufd $0x4e, %xmm10, %xmm10; \
	paddd %xmm1, %xmm4; \
	paddd %xmm9, %xmm6; \
	movdqa %xmm4, %xmm5; \
	movdqa %xmm6, %xmm7; \
	pshufd $0x39, %xmm1, %xmm1; \
	pshufd $0x39, %xmm9, %xmm9; \
	movdqa %xmm3, %xmm4; \
	movdqa %xmm11, %xmm6; \
	paddd %xmm0, %xmm4; \
	paddd %xmm8, %xmm6; \
	movdqa %xmm4, %xmm5; \
	movdqa %xmm6, %xmm7; \
	movdqa %xmm0, %xmm4; \
	movdqa %xmm8, %xmm6; \
	paddd %xmm1, %xmm4; \
	paddd %xmm9, %xmm6; \
	movdqa %xmm4, %xmm5; \
	movdqa %xmm6, %xmm7; \
	pxor %xmm6, %xmm10; \
	movdqa %xmm1, %xmm4; \
	movdqa %xmm9, %xmm6; \
	pshufd $0x93, %xmm1, %xmm1; \
	pshufd $0x93, %xmm9, %xmm9; \
	pxor %xmm7, %xmm10; \
	paddd %xmm2, %xmm4; \
	paddd %xmm10, %xmm6; \
	movdqa %xmm4, %xmm5; \
	movdqa %xmm6, %xmm7; \
	pxor %xmm6, %xmm11; \
	movdqa %xmm2, %xmm4; \
	movdqa %xmm10, %xmm6; \
	pshufd $0x4e, %xmm2, %xmm2; \
	pshufd $0x4e, %xmm10, %xmm10; \
	pxor %xmm7, %xmm11; \
	paddd %xmm3, %xmm4; \
	paddd %xmm11, %xmm6; \
	movdqa %xmm4, %xmm5; \
	movdqa %xmm6, %xmm7; \
	pshufd $0x39, %xmm3, %xmm3; \
	pshufd $0x39, %xmm11, %xmm11; \

#define xmm_salsa8_core_2way() \
	xmm_salsa8_core_2way_doubleround(); \
	xmm_salsa8_core_2way_doubleround(); \
	xmm_salsa8_core_2way_doubleround(); \
	xmm_salsa8_core_2way_doubleround(); \

	.globl scrypt_core_2way
	.globl _scrypt_core_2way
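	# Descriptive note: scrypt_core_2way runs two independent scrypt lanes
	# at once; the 2-way salsa macro above keeps the first lane's state in
	# %xmm0-%xmm3 and the second lane's in %xmm8-%xmm11, interleaving the
	# two computations to hide instruction latency.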
	movdqa %xmm6, 8(%rsp)
	movdqa %xmm7, 24(%rsp)
	movdqa %xmm8, 40(%rsp)
	movdqa %xmm9, 56(%rsp)
	movdqa %xmm10, 72(%rsp)
	movdqa %xmm11, 88(%rsp)
	movdqa %xmm12, 104(%rsp)
	movdqa %xmm13, 120(%rsp)
	movdqa %xmm14, 136(%rsp)
	movdqa %xmm15, 152(%rsp)

	scrypt_shuffle(%rdi, 0, %rsp, 0)
	scrypt_shuffle(%rdi, 64, %rsp, 64)
	scrypt_shuffle(%rsi, 0, %rsp, 128)
	scrypt_shuffle(%rsi, 64, %rsp, 192)

	movdqa 192(%rsp), %xmm12
	movdqa 208(%rsp), %xmm13
	movdqa 224(%rsp), %xmm14
	movdqa 240(%rsp), %xmm15

	leaq 262144(%rdx), %rcx
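	# Descriptive note: 262144 = 1024 * 256, the end of the 2-way
	# scratchpad at %rdx; each loop1 iteration below stores 256 bytes, one
	# 128-byte block per lane, so the two lanes' entries are interleaved.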
scrypt_core_2way_loop1:
	movdqa 0(%rsp), %xmm0
	movdqa 16(%rsp), %xmm1
	movdqa 32(%rsp), %xmm2
	movdqa 48(%rsp), %xmm3
	movdqa 64(%rsp), %xmm4
	movdqa 80(%rsp), %xmm5
	movdqa 96(%rsp), %xmm6
	movdqa 112(%rsp), %xmm7
	movdqa 128(%rsp), %xmm8
	movdqa 144(%rsp), %xmm9
	movdqa 160(%rsp), %xmm10
	movdqa 176(%rsp), %xmm11

	movdqa %xmm0, 0(%rbp)
	movdqa %xmm1, 16(%rbp)
	movdqa %xmm2, 32(%rbp)
	movdqa %xmm3, 48(%rbp)
	movdqa %xmm4, 64(%rbp)
	movdqa %xmm5, 80(%rbp)
	movdqa %xmm6, 96(%rbp)
	movdqa %xmm7, 112(%rbp)

	movdqa %xmm8, 128(%rbp)
	movdqa %xmm9, 144(%rbp)
	movdqa %xmm10, 160(%rbp)
	movdqa %xmm11, 176(%rbp)
	movdqa %xmm12, 192(%rbp)
	movdqa %xmm13, 208(%rbp)
	movdqa %xmm14, 224(%rbp)
	movdqa %xmm15, 240(%rbp)

	xmm_salsa8_core_2way()
	paddd 0(%rbp), %xmm0
	paddd 16(%rbp), %xmm1
	paddd 32(%rbp), %xmm2
	paddd 48(%rbp), %xmm3
	paddd 128(%rbp), %xmm8
	paddd 144(%rbp), %xmm9
	paddd 160(%rbp), %xmm10
	paddd 176(%rbp), %xmm11
	movdqa %xmm0, 0(%rsp)
	movdqa %xmm1, 16(%rsp)
	movdqa %xmm2, 32(%rsp)
	movdqa %xmm3, 48(%rsp)
	movdqa %xmm8, 128(%rsp)
	movdqa %xmm9, 144(%rsp)
	movdqa %xmm10, 160(%rsp)
	movdqa %xmm11, 176(%rsp)

	pxor 64(%rsp), %xmm0
	pxor 80(%rsp), %xmm1
	pxor 96(%rsp), %xmm2
	pxor 112(%rsp), %xmm3

	movdqa %xmm0, 64(%rsp)
	movdqa %xmm1, 80(%rsp)
	movdqa %xmm2, 96(%rsp)
	movdqa %xmm3, 112(%rsp)
	movdqa %xmm8, %xmm12
	movdqa %xmm9, %xmm13
	movdqa %xmm10, %xmm14
	movdqa %xmm11, %xmm15
	xmm_salsa8_core_2way()
	paddd 64(%rsp), %xmm0
	paddd 80(%rsp), %xmm1
	paddd 96(%rsp), %xmm2
	paddd 112(%rsp), %xmm3

	paddd %xmm10, %xmm14
	paddd %xmm11, %xmm15
	movdqa %xmm0, 64(%rsp)
	movdqa %xmm1, 80(%rsp)
	movdqa %xmm2, 96(%rsp)
	movdqa %xmm3, 112(%rsp)

	jne scrypt_core_2way_loop1

scrypt_core_2way_loop2:
	movdqa 0(%rsp), %xmm0
	movdqa 16(%rsp), %xmm1
	movdqa 32(%rsp), %xmm2
	movdqa 48(%rsp), %xmm3
	movdqa 64(%rsp), %xmm4
	movdqa 80(%rsp), %xmm5
	movdqa 96(%rsp), %xmm6
	movdqa 112(%rsp), %xmm7
	movdqa 128(%rsp), %xmm8
	movdqa 144(%rsp), %xmm9
	movdqa 160(%rsp), %xmm10
	movdqa 176(%rsp), %xmm11

	pxor 0(%rdx, %rbp), %xmm0
	pxor 16(%rdx, %rbp), %xmm1
	pxor 32(%rdx, %rbp), %xmm2
	pxor 48(%rdx, %rbp), %xmm3

	pxor 0(%rdx, %rbx), %xmm8
	pxor 16(%rdx, %rbx), %xmm9
	pxor 32(%rdx, %rbx), %xmm10
	pxor 48(%rdx, %rbx), %xmm11

	movdqa %xmm0, 0(%rsp)
	movdqa %xmm1, 16(%rsp)
	movdqa %xmm2, 32(%rsp)
	movdqa %xmm3, 48(%rsp)
	movdqa %xmm8, 128(%rsp)
	movdqa %xmm9, 144(%rsp)
	movdqa %xmm10, 160(%rsp)
	movdqa %xmm11, 176(%rsp)
	xmm_salsa8_core_2way()
	paddd 0(%rsp), %xmm0
	paddd 16(%rsp), %xmm1
	paddd 32(%rsp), %xmm2
	paddd 48(%rsp), %xmm3
	paddd 128(%rsp), %xmm8
	paddd 144(%rsp), %xmm9
	paddd 160(%rsp), %xmm10
	paddd 176(%rsp), %xmm11
	movdqa %xmm0, 0(%rsp)
	movdqa %xmm1, 16(%rsp)
	movdqa %xmm2, 32(%rsp)
	movdqa %xmm3, 48(%rsp)
	movdqa %xmm8, 128(%rsp)
	movdqa %xmm9, 144(%rsp)
	movdqa %xmm10, 160(%rsp)
	movdqa %xmm11, 176(%rsp)

	pxor 64(%rdx, %rbp), %xmm0
	pxor 80(%rdx, %rbp), %xmm1
	pxor 96(%rdx, %rbp), %xmm2
	pxor 112(%rdx, %rbp), %xmm3
	pxor 64(%rdx, %rbx), %xmm8
	pxor 80(%rdx, %rbx), %xmm9
	pxor 96(%rdx, %rbx), %xmm10
	pxor 112(%rdx, %rbx), %xmm11
	pxor 64(%rsp), %xmm0
	pxor 80(%rsp), %xmm1
	pxor 96(%rsp), %xmm2
	pxor 112(%rsp), %xmm3

	movdqa %xmm0, 64(%rsp)
	movdqa %xmm1, 80(%rsp)
	movdqa %xmm2, 96(%rsp)
	movdqa %xmm3, 112(%rsp)
	movdqa %xmm8, %xmm12
	movdqa %xmm9, %xmm13
	movdqa %xmm10, %xmm14
	movdqa %xmm11, %xmm15
	xmm_salsa8_core_2way()
	paddd 64(%rsp), %xmm0
	paddd 80(%rsp), %xmm1
	paddd 96(%rsp), %xmm2
	paddd 112(%rsp), %xmm3

	paddd %xmm10, %xmm14
	paddd %xmm11, %xmm15
	movdqa %xmm0, 64(%rsp)
	movdqa %xmm1, 80(%rsp)
	movdqa %xmm2, 96(%rsp)
	movdqa %xmm3, 112(%rsp)

	ja scrypt_core_2way_loop2

	movdqa %xmm12, 192(%rsp)
	movdqa %xmm13, 208(%rsp)
	movdqa %xmm14, 224(%rsp)
	movdqa %xmm15, 240(%rsp)

	scrypt_shuffle(%rsp, 0, %rdi, 0)
	scrypt_shuffle(%rsp, 64, %rdi, 64)
	scrypt_shuffle(%rsp, 128, %rsi, 0)
	scrypt_shuffle(%rsp, 192, %rsi, 64)
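	# Descriptive note: scrypt_shuffle is its own inverse, so the same
	# macro that packed the input states into the stack workspace copies
	# the finished states back out to the caller's buffers here.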
	movdqa 8(%rsp), %xmm6
	movdqa 24(%rsp), %xmm7
	movdqa 40(%rsp), %xmm8
	movdqa 56(%rsp), %xmm9
	movdqa 72(%rsp), %xmm10
	movdqa 88(%rsp), %xmm11
	movdqa 104(%rsp), %xmm12
	movdqa 120(%rsp), %xmm13
	movdqa 136(%rsp), %xmm14
	movdqa 152(%rsp), %xmm15

#define xmm_salsa8_core_3way_doubleround() \
	movdqa %xmm1, %xmm4; \
	movdqa %xmm9, %xmm6; \
	movdqa %xmm13, %xmm7; \
	paddd %xmm0, %xmm4; \
	paddd %xmm8, %xmm6; \
	paddd %xmm12, %xmm7; \
	movdqa %xmm4, %xmm5; \
	pxor %xmm4, %xmm3; \
	pxor %xmm5, %xmm3; \
	movdqa %xmm0, %xmm4; \
	movdqa %xmm6, %xmm5; \
	pxor %xmm6, %xmm11; \
	pxor %xmm5, %xmm11; \
	movdqa %xmm8, %xmm6; \
	movdqa %xmm7, %xmm5; \
	pxor %xmm7, %xmm15; \
	pxor %xmm5, %xmm15; \
	movdqa %xmm12, %xmm7; \
	paddd %xmm3, %xmm4; \
	paddd %xmm11, %xmm6; \
	paddd %xmm15, %xmm7; \
	movdqa %xmm4, %xmm5; \
	pxor %xmm4, %xmm2; \
	movdqa %xmm3, %xmm4; \
	pshufd $0x93, %xmm3, %xmm3; \
	pxor %xmm5, %xmm2; \
	movdqa %xmm6, %xmm5; \
	pxor %xmm6, %xmm10; \
	movdqa %xmm11, %xmm6; \
	pshufd $0x93, %xmm11, %xmm11; \
	pxor %xmm5, %xmm10; \
	movdqa %xmm7, %xmm5; \
	pxor %xmm7, %xmm14; \
	movdqa %xmm15, %xmm7; \
	pshufd $0x93, %xmm15, %xmm15; \
	pxor %xmm5, %xmm14; \
	paddd %xmm2, %xmm4; \
	paddd %xmm10, %xmm6; \
	paddd %xmm14, %xmm7; \
	movdqa %xmm4, %xmm5; \
	pxor %xmm4, %xmm1; \
	movdqa %xmm2, %xmm4; \
	pshufd $0x4e, %xmm2, %xmm2; \
	pxor %xmm5, %xmm1; \
	movdqa %xmm6, %xmm5; \
	pxor %xmm6, %xmm9; \
	movdqa %xmm10, %xmm6; \
	pshufd $0x4e, %xmm10, %xmm10; \
	pxor %xmm5, %xmm9; \
	movdqa %xmm7, %xmm5; \
	pxor %xmm7, %xmm13; \
	movdqa %xmm14, %xmm7; \
	pshufd $0x4e, %xmm14, %xmm14; \
	pxor %xmm5, %xmm13; \
	paddd %xmm1, %xmm4; \
	paddd %xmm9, %xmm6; \
	paddd %xmm13, %xmm7; \
	movdqa %xmm4, %xmm5; \
	pxor %xmm4, %xmm0; \
	pshufd $0x39, %xmm1, %xmm1; \
	pxor %xmm5, %xmm0; \
	movdqa %xmm3, %xmm4; \
	movdqa %xmm6, %xmm5; \
	pxor %xmm6, %xmm8; \
	pshufd $0x39, %xmm9, %xmm9; \
	pxor %xmm5, %xmm8; \
	movdqa %xmm11, %xmm6; \
	movdqa %xmm7, %xmm5; \
	pxor %xmm7, %xmm12; \
	pshufd $0x39, %xmm13, %xmm13; \
	pxor %xmm5, %xmm12; \
	movdqa %xmm15, %xmm7; \
	paddd %xmm0, %xmm4; \
	paddd %xmm8, %xmm6; \
	paddd %xmm12, %xmm7; \
	movdqa %xmm4, %xmm5; \
	pxor %xmm4, %xmm1; \
	pxor %xmm5, %xmm1; \
	movdqa %xmm0, %xmm4; \
	movdqa %xmm6, %xmm5; \
	pxor %xmm6, %xmm9; \
	pxor %xmm5, %xmm9; \
	movdqa %xmm8, %xmm6; \
	movdqa %xmm7, %xmm5; \
	pxor %xmm7, %xmm13; \
	pxor %xmm5, %xmm13; \
	movdqa %xmm12, %xmm7; \
	paddd %xmm1, %xmm4; \
	paddd %xmm9, %xmm6; \
	paddd %xmm13, %xmm7; \
	movdqa %xmm4, %xmm5; \
	pxor %xmm4, %xmm2; \
	movdqa %xmm1, %xmm4; \
	pshufd $0x93, %xmm1, %xmm1; \
	pxor %xmm5, %xmm2; \
	movdqa %xmm6, %xmm5; \
	pxor %xmm6, %xmm10; \
	movdqa %xmm9, %xmm6; \
	pshufd $0x93, %xmm9, %xmm9; \
	pxor %xmm5, %xmm10; \
	movdqa %xmm7, %xmm5; \
	pxor %xmm7, %xmm14; \
	movdqa %xmm13, %xmm7; \
	pshufd $0x93, %xmm13, %xmm13; \
	pxor %xmm5, %xmm14; \
	paddd %xmm2, %xmm4; \
	paddd %xmm10, %xmm6; \
	paddd %xmm14, %xmm7; \
	movdqa %xmm4, %xmm5; \
	pxor %xmm4, %xmm3; \
	movdqa %xmm2, %xmm4; \
	pshufd $0x4e, %xmm2, %xmm2; \
	pxor %xmm5, %xmm3; \
	movdqa %xmm6, %xmm5; \
	pxor %xmm6, %xmm11; \
	movdqa %xmm10, %xmm6; \
	pshufd $0x4e, %xmm10, %xmm10; \
	pxor %xmm5, %xmm11; \
	movdqa %xmm7, %xmm5; \
	pxor %xmm7, %xmm15; \
	movdqa %xmm14, %xmm7; \
	pshufd $0x4e, %xmm14, %xmm14; \
	pxor %xmm5, %xmm15; \
	paddd %xmm3, %xmm4; \
	paddd %xmm11, %xmm6; \
	paddd %xmm15, %xmm7; \
	movdqa %xmm4, %xmm5; \
	pxor %xmm4, %xmm0; \
	pshufd $0x39, %xmm3, %xmm3; \
	pxor %xmm5, %xmm0; \
	movdqa %xmm6, %xmm5; \
	pxor %xmm6, %xmm8; \
	pshufd $0x39, %xmm11, %xmm11; \
	pxor %xmm5, %xmm8; \
	movdqa %xmm7, %xmm5; \
	pxor %xmm7, %xmm12; \
	pshufd $0x39, %xmm15, %xmm15; \
	pxor %xmm5, %xmm12; \

#define xmm_salsa8_core_3way() \
	xmm_salsa8_core_3way_doubleround(); \
	xmm_salsa8_core_3way_doubleround(); \
	xmm_salsa8_core_3way_doubleround(); \
	xmm_salsa8_core_3way_doubleround(); \

	.globl scrypt_core_3way
	.globl _scrypt_core_3way
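	# Descriptive note: scrypt_core_3way extends the same idea to three
	# lanes, with the lanes' Salsa20 states held in %xmm0-%xmm3,
	# %xmm8-%xmm11 and %xmm12-%xmm15 and their scratchpad blocks stored
	# interleaved in groups of 3 * 128 bytes.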
	movdqa %xmm6, 8(%rsp)
	movdqa %xmm7, 24(%rsp)
	movdqa %xmm8, 40(%rsp)
	movdqa %xmm9, 56(%rsp)
	movdqa %xmm10, 72(%rsp)
	movdqa %xmm11, 88(%rsp)
	movdqa %xmm12, 104(%rsp)
	movdqa %xmm13, 120(%rsp)
	movdqa %xmm14, 136(%rsp)
	movdqa %xmm15, 152(%rsp)

	scrypt_shuffle(%rdi, 0, %rsp, 0)
	scrypt_shuffle(%rdi, 64, %rsp, 64)
	scrypt_shuffle(%rsi, 0, %rsp, 128)
	scrypt_shuffle(%rsi, 64, %rsp, 192)
	scrypt_shuffle(%rdx, 0, %rsp, 256)
	scrypt_shuffle(%rdx, 64, %rsp, 320)

	movdqa 128+64(%rsp), %xmm8
	movdqa 128+80(%rsp), %xmm9
	movdqa 128+96(%rsp), %xmm10
	movdqa 128+112(%rsp), %xmm11

	leaq 3*131072(%rcx), %rax
scrypt_core_3way_loop1:
	movdqa %xmm8, %xmm12
	movdqa %xmm9, %xmm13
	movdqa %xmm10, %xmm14
	movdqa %xmm11, %xmm15
	movdqa 0(%rsp), %xmm0
	movdqa 16(%rsp), %xmm1
	movdqa 32(%rsp), %xmm2
	movdqa 48(%rsp), %xmm3
	movdqa 64(%rsp), %xmm4
	movdqa 80(%rsp), %xmm5
	movdqa 96(%rsp), %xmm6
	movdqa 112(%rsp), %xmm7
	movdqa 128+0(%rsp), %xmm8
	movdqa 128+16(%rsp), %xmm9
	movdqa 128+32(%rsp), %xmm10
	movdqa 128+48(%rsp), %xmm11

	movdqa %xmm0, 0(%rbp)
	movdqa %xmm1, 16(%rbp)
	movdqa %xmm2, 32(%rbp)
	movdqa %xmm3, 48(%rbp)
	movdqa %xmm4, 64(%rbp)
	movdqa %xmm5, 80(%rbp)
	movdqa %xmm6, 96(%rbp)
	movdqa %xmm7, 112(%rbp)

	movdqa %xmm8, 128+0(%rbp)
	movdqa %xmm9, 128+16(%rbp)
	movdqa %xmm10, 128+32(%rbp)
	movdqa %xmm11, 128+48(%rbp)
	movdqa %xmm12, 128+64(%rbp)
	movdqa %xmm13, 128+80(%rbp)
	movdqa %xmm14, 128+96(%rbp)
	movdqa %xmm15, 128+112(%rbp)
	movdqa 256+0(%rsp), %xmm12
	movdqa 256+16(%rsp), %xmm13
	movdqa 256+32(%rsp), %xmm14
	movdqa 256+48(%rsp), %xmm15
	movdqa 256+64(%rsp), %xmm4
	movdqa 256+80(%rsp), %xmm5
	movdqa 256+96(%rsp), %xmm6
	movdqa 256+112(%rsp), %xmm7

	movdqa %xmm12, 256+0(%rbp)
	movdqa %xmm13, 256+16(%rbp)
	movdqa %xmm14, 256+32(%rbp)
	movdqa %xmm15, 256+48(%rbp)
	movdqa %xmm4, 256+64(%rbp)
	movdqa %xmm5, 256+80(%rbp)
	movdqa %xmm6, 256+96(%rbp)
	movdqa %xmm7, 256+112(%rbp)

	xmm_salsa8_core_3way()
	paddd 0(%rbp), %xmm0
	paddd 16(%rbp), %xmm1
	paddd 32(%rbp), %xmm2
	paddd 48(%rbp), %xmm3
	paddd 128+0(%rbp), %xmm8
	paddd 128+16(%rbp), %xmm9
	paddd 128+32(%rbp), %xmm10
	paddd 128+48(%rbp), %xmm11
	paddd 256+0(%rbp), %xmm12
	paddd 256+16(%rbp), %xmm13
	paddd 256+32(%rbp), %xmm14
	paddd 256+48(%rbp), %xmm15
	movdqa %xmm0, 0(%rsp)
	movdqa %xmm1, 16(%rsp)
	movdqa %xmm2, 32(%rsp)
	movdqa %xmm3, 48(%rsp)
	movdqa %xmm8, 128+0(%rsp)
	movdqa %xmm9, 128+16(%rsp)
	movdqa %xmm10, 128+32(%rsp)
	movdqa %xmm11, 128+48(%rsp)
	movdqa %xmm12, 256+0(%rsp)
	movdqa %xmm13, 256+16(%rsp)
	movdqa %xmm14, 256+32(%rsp)
	movdqa %xmm15, 256+48(%rsp)

	pxor 64(%rsp), %xmm0
	pxor 80(%rsp), %xmm1
	pxor 96(%rsp), %xmm2
	pxor 112(%rsp), %xmm3
	pxor 128+64(%rsp), %xmm8
	pxor 128+80(%rsp), %xmm9
	pxor 128+96(%rsp), %xmm10
	pxor 128+112(%rsp), %xmm11
	pxor 256+64(%rsp), %xmm12
	pxor 256+80(%rsp), %xmm13
	pxor 256+96(%rsp), %xmm14
	pxor 256+112(%rsp), %xmm15
	movdqa %xmm0, 64(%rsp)
	movdqa %xmm1, 80(%rsp)
	movdqa %xmm2, 96(%rsp)
	movdqa %xmm3, 112(%rsp)
	movdqa %xmm8, 128+64(%rsp)
	movdqa %xmm9, 128+80(%rsp)
	movdqa %xmm10, 128+96(%rsp)
	movdqa %xmm11, 128+112(%rsp)
	movdqa %xmm12, 256+64(%rsp)
	movdqa %xmm13, 256+80(%rsp)
	movdqa %xmm14, 256+96(%rsp)
	movdqa %xmm15, 256+112(%rsp)
	xmm_salsa8_core_3way()
	paddd 64(%rsp), %xmm0
	paddd 80(%rsp), %xmm1
	paddd 96(%rsp), %xmm2
	paddd 112(%rsp), %xmm3
	paddd 128+64(%rsp), %xmm8
	paddd 128+80(%rsp), %xmm9
	paddd 128+96(%rsp), %xmm10
	paddd 128+112(%rsp), %xmm11
	paddd 256+64(%rsp), %xmm12
	paddd 256+80(%rsp), %xmm13
	paddd 256+96(%rsp), %xmm14
	paddd 256+112(%rsp), %xmm15
	movdqa %xmm0, 64(%rsp)
	movdqa %xmm1, 80(%rsp)
	movdqa %xmm2, 96(%rsp)
	movdqa %xmm3, 112(%rsp)
	movdqa %xmm8, 128+64(%rsp)
	movdqa %xmm9, 128+80(%rsp)
	movdqa %xmm10, 128+96(%rsp)
	movdqa %xmm11, 128+112(%rsp)
	movdqa %xmm12, 256+64(%rsp)
	movdqa %xmm13, 256+80(%rsp)
	movdqa %xmm14, 256+96(%rsp)
	movdqa %xmm15, 256+112(%rsp)

	jne scrypt_core_3way_loop1

scrypt_core_3way_loop2:
	leal (%ebp, %ebp, 2), %ebp
	movl 128+64(%rsp), %ebx
	leal (%ebx, %ebx, 2), %ebx
	movl 256+64(%rsp), %r8d
	leal (%r8d, %r8d, 2), %r8d
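	# Descriptive note: the movl/leal pairs pick up each lane's scratchpad
	# index (scrypt's Integerify value, the first word of that lane's
	# second 64-byte block) and scale it by 3, because the three lanes'
	# 128-byte scratchpad entries are stored interleaved; the pxor loads
	# below then add a per-lane displacement on top of %rcx.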
	movdqa 0(%rsp), %xmm0
	movdqa 16(%rsp), %xmm1
	movdqa 32(%rsp), %xmm2
	movdqa 48(%rsp), %xmm3
	movdqa 128+0(%rsp), %xmm8
	movdqa 128+16(%rsp), %xmm9
	movdqa 128+32(%rsp), %xmm10
	movdqa 128+48(%rsp), %xmm11
	movdqa 256+0(%rsp), %xmm12
	movdqa 256+16(%rsp), %xmm13
	movdqa 256+32(%rsp), %xmm14
	movdqa 256+48(%rsp), %xmm15
	pxor 0(%rcx, %rbp), %xmm0
	pxor 16(%rcx, %rbp), %xmm1
	pxor 32(%rcx, %rbp), %xmm2
	pxor 48(%rcx, %rbp), %xmm3
	pxor 0(%rcx, %rbx), %xmm8
	pxor 16(%rcx, %rbx), %xmm9
	pxor 32(%rcx, %rbx), %xmm10
	pxor 48(%rcx, %rbx), %xmm11
	pxor 0(%rcx, %r8), %xmm12
	pxor 16(%rcx, %r8), %xmm13
	pxor 32(%rcx, %r8), %xmm14
	pxor 48(%rcx, %r8), %xmm15

	pxor 64(%rsp), %xmm0
	pxor 80(%rsp), %xmm1
	pxor 96(%rsp), %xmm2
	pxor 112(%rsp), %xmm3
	pxor 128+64(%rsp), %xmm8
	pxor 128+80(%rsp), %xmm9
	pxor 128+96(%rsp), %xmm10
	pxor 128+112(%rsp), %xmm11
	pxor 256+64(%rsp), %xmm12
	pxor 256+80(%rsp), %xmm13
	pxor 256+96(%rsp), %xmm14
	pxor 256+112(%rsp), %xmm15
	movdqa %xmm0, 0(%rsp)
	movdqa %xmm1, 16(%rsp)
	movdqa %xmm2, 32(%rsp)
	movdqa %xmm3, 48(%rsp)
	movdqa %xmm8, 128+0(%rsp)
	movdqa %xmm9, 128+16(%rsp)
	movdqa %xmm10, 128+32(%rsp)
	movdqa %xmm11, 128+48(%rsp)
	movdqa %xmm12, 256+0(%rsp)
	movdqa %xmm13, 256+16(%rsp)
	movdqa %xmm14, 256+32(%rsp)
	movdqa %xmm15, 256+48(%rsp)
	xmm_salsa8_core_3way()
	paddd 0(%rsp), %xmm0
	paddd 16(%rsp), %xmm1
	paddd 32(%rsp), %xmm2
	paddd 48(%rsp), %xmm3
	paddd 128+0(%rsp), %xmm8
	paddd 128+16(%rsp), %xmm9
	paddd 128+32(%rsp), %xmm10
	paddd 128+48(%rsp), %xmm11
	paddd 256+0(%rsp), %xmm12
	paddd 256+16(%rsp), %xmm13
	paddd 256+32(%rsp), %xmm14
	paddd 256+48(%rsp), %xmm15
	movdqa %xmm0, 0(%rsp)
	movdqa %xmm1, 16(%rsp)
	movdqa %xmm2, 32(%rsp)
	movdqa %xmm3, 48(%rsp)
	movdqa %xmm8, 128+0(%rsp)
	movdqa %xmm9, 128+16(%rsp)
	movdqa %xmm10, 128+32(%rsp)
	movdqa %xmm11, 128+48(%rsp)
	movdqa %xmm12, 256+0(%rsp)
	movdqa %xmm13, 256+16(%rsp)
	movdqa %xmm14, 256+32(%rsp)
	movdqa %xmm15, 256+48(%rsp)

	pxor 64(%rcx, %rbp), %xmm0
	pxor 80(%rcx, %rbp), %xmm1
	pxor 96(%rcx, %rbp), %xmm2
	pxor 112(%rcx, %rbp), %xmm3
	pxor 64(%rcx, %rbx), %xmm8
	pxor 80(%rcx, %rbx), %xmm9
	pxor 96(%rcx, %rbx), %xmm10
	pxor 112(%rcx, %rbx), %xmm11
	pxor 64(%rcx, %r8), %xmm12
	pxor 80(%rcx, %r8), %xmm13
	pxor 96(%rcx, %r8), %xmm14
	pxor 112(%rcx, %r8), %xmm15
	pxor 64(%rsp), %xmm0
	pxor 80(%rsp), %xmm1
	pxor 96(%rsp), %xmm2
	pxor 112(%rsp), %xmm3
	pxor 128+64(%rsp), %xmm8
	pxor 128+80(%rsp), %xmm9
	pxor 128+96(%rsp), %xmm10
	pxor 128+112(%rsp), %xmm11
	pxor 256+64(%rsp), %xmm12
	pxor 256+80(%rsp), %xmm13
	pxor 256+96(%rsp), %xmm14
	pxor 256+112(%rsp), %xmm15
	movdqa %xmm0, 64(%rsp)
	movdqa %xmm1, 80(%rsp)
	movdqa %xmm2, 96(%rsp)
	movdqa %xmm3, 112(%rsp)
	movdqa %xmm8, 128+64(%rsp)
	movdqa %xmm9, 128+80(%rsp)
	movdqa %xmm10, 128+96(%rsp)
	movdqa %xmm11, 128+112(%rsp)
	movdqa %xmm12, 256+64(%rsp)
	movdqa %xmm13, 256+80(%rsp)
	movdqa %xmm14, 256+96(%rsp)
	movdqa %xmm15, 256+112(%rsp)
	xmm_salsa8_core_3way()
	paddd 64(%rsp), %xmm0
	paddd 80(%rsp), %xmm1
	paddd 96(%rsp), %xmm2
	paddd 112(%rsp), %xmm3
	paddd 128+64(%rsp), %xmm8
	paddd 128+80(%rsp), %xmm9
	paddd 128+96(%rsp), %xmm10
	paddd 128+112(%rsp), %xmm11
	paddd 256+64(%rsp), %xmm12
	paddd 256+80(%rsp), %xmm13
	paddd 256+96(%rsp), %xmm14
	paddd 256+112(%rsp), %xmm15
	movdqa %xmm0, 64(%rsp)
	movdqa %xmm1, 80(%rsp)
	movdqa %xmm2, 96(%rsp)
	movdqa %xmm3, 112(%rsp)
	movdqa %xmm8, 128+64(%rsp)
	movdqa %xmm9, 128+80(%rsp)
	movdqa %xmm10, 128+96(%rsp)
	movdqa %xmm11, 128+112(%rsp)
	movdqa %xmm12, 256+64(%rsp)
	movdqa %xmm13, 256+80(%rsp)
	movdqa %xmm14, 256+96(%rsp)
	movdqa %xmm15, 256+112(%rsp)

	ja scrypt_core_3way_loop2

	scrypt_shuffle(%rsp, 0, %rdi, 0)
	scrypt_shuffle(%rsp, 64, %rdi, 64)
	scrypt_shuffle(%rsp, 128, %rsi, 0)
	scrypt_shuffle(%rsp, 192, %rsi, 64)
	scrypt_shuffle(%rsp, 256, %rdx, 0)
	scrypt_shuffle(%rsp, 320, %rdx, 64)

	movdqa 8(%rsp), %xmm6
	movdqa 24(%rsp), %xmm7
	movdqa 40(%rsp), %xmm8
	movdqa 56(%rsp), %xmm9
	movdqa 72(%rsp), %xmm10
	movdqa 88(%rsp), %xmm11
	movdqa 104(%rsp), %xmm12
	movdqa 120(%rsp), %xmm13
	movdqa 136(%rsp), %xmm14
	movdqa 152(%rsp), %xmm15