# Copyright 2011-2012 pooler@litecoinpool.org
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#if defined(__linux__) && defined(__ELF__)
	.section .note.GNU-stack,"",%progbits
#endif

#if defined(__x86_64__)
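
# scrypt_shuffle copies one 64-byte (16-word) Salsa20 block from so(src)
# to do(dest), permuting the words into the diagonal layout used by the
# SSE2 code below: words 0, 6, 8 and 14 stay in place, and the pairs
# (1,5), (2,10), (3,15), (4,12), (7,11) and (9,13) are swapped.  The
# permutation is its own inverse, so the same macro also undoes it.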
#define scrypt_shuffle(src, so, dest, do) \
	movl so+60(src), %r8d; \
	movl so+44(src), %r9d; \
	movl so+28(src), %r10d; \
	movl so+12(src), %r11d; \
	movl %r8d, do+12(dest); \
	movl %r9d, do+28(dest); \
	movl %r10d, do+44(dest); \
	movl %r11d, do+60(dest); \
	movl so+40(src), %r8d; \
	movl so+8(src), %r9d; \
	movl so+48(src), %r10d; \
	movl so+16(src), %r11d; \
	movl %r8d, do+8(dest); \
	movl %r9d, do+40(dest); \
	movl %r10d, do+16(dest); \
	movl %r11d, do+48(dest); \
	movl so+20(src), %r8d; \
	movl so+4(src), %r9d; \
	movl so+52(src), %r10d; \
	movl so+36(src), %r11d; \
	movl %r8d, do+4(dest); \
	movl %r9d, do+20(dest); \
	movl %r10d, do+36(dest); \
	movl %r11d, do+52(dest); \
	movl so+0(src), %r8d; \
	movl so+24(src), %r9d; \
	movl so+32(src), %r10d; \
	movl so+56(src), %r11d; \
	movl %r8d, do+0(dest); \
	movl %r9d, do+24(dest); \
	movl %r10d, do+32(dest); \
	movl %r11d, do+56(dest)
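
# One Salsa20 double round (a column round followed by a row round) on the
# 16-word state, using only general-purpose registers; state words that do
# not fit in registers are kept in the stack slots 48(%rsp), 72(%rsp) and
# 88(%rsp).  Each quarter-round step computes x[b] ^= rol32(x[a] + x[d], k)
# with rotation counts 7, 9, 13 and 18; the leaq instructions perform the
# additions.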
#define gen_salsa8_core_doubleround() \
	movq 72(%rsp), %r15; \
	leaq (%r14, %rdx), %rbp; \
	leaq (%rdi, %r15), %rbp; \
	leaq (%rdx, %r9), %rbp; \
	leaq (%r15, %r10), %rbp; \
	leaq (%r9, %r11), %rbp; \
	leaq (%r10, %r13), %rbp; \
	leaq (%r11, %r14), %rbp; \
	leaq (%r13, %rdi), %rbp; \
	movq 48(%rsp), %rbp; \
	movq %r15, 72(%rsp); \
	leaq (%rax, %rbp), %r15; \
	leaq (%rbp, %rbx), %r15; \
	leaq (%rbx, %rcx), %r15; \
	leaq (%rcx, %rax), %r15; \
	movq 88(%rsp), %r15; \
	movq %rbp, 48(%rsp); \
	leaq (%r12, %r15), %rbp; \
	leaq (%r15, %rsi), %rbp; \
	leaq (%rsi, %r8), %rbp; \
	leaq (%r8, %r12), %rbp; \
	movq %r15, 88(%rsp); \
	movq 72(%rsp), %r15; \
	leaq (%rsi, %rdx), %rbp; \
	leaq (%r9, %r15), %rbp; \
	leaq (%rdx, %rdi), %rbp; \
	leaq (%r15, %rax), %rbp; \
	leaq (%rdi, %rcx), %rbp; \
	leaq (%rax, %r8), %rbp; \
	leaq (%rcx, %rsi), %rbp; \
	leaq (%r8, %r9), %rbp; \
	movq 48(%rsp), %rbp; \
	movq %r15, 72(%rsp); \
	leaq (%r10, %rbp), %r15; \
	leaq (%rbp, %r12), %r15; \
	leaq (%r12, %r11), %r15; \
	leaq (%r11, %r10), %r15; \
	movq 88(%rsp), %r15; \
	movq %rbp, 48(%rsp); \
	leaq (%rbx, %r15), %rbp; \
	leaq (%r15, %r14), %rbp; \
	leaq (%r14, %r13), %rbp; \
	leaq (%r13, %rbx), %rbp; \
	movq %r15, 88(%rsp)

	# 0: %rdx, %rdi, %rcx, %rsi
	# 1: %r9, 72(%rsp), %rax, %r8
	# 2: %r11, %r10, 48(%rsp), %r12
	# 3: %r14, %r13, %rbx, 88(%rsp)
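
	# four double rounds make up the eight rounds of Salsa20/8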
	gen_salsa8_core_doubleround()
	gen_salsa8_core_doubleround()
	gen_salsa8_core_doubleround()
	gen_salsa8_core_doubleround()

	punpcklqdq %xmm4, %xmm0
	punpcklqdq %xmm5, %xmm1
	punpcklqdq %xmm6, %xmm2
	punpcklqdq %xmm7, %xmm3
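
	# preserve %xmm6-%xmm15 on the stack (these registers are
	# callee-saved under the Windows x64 calling convention)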
	movdqa %xmm6, 8(%rsp)
	movdqa %xmm7, 24(%rsp)
	movdqa %xmm8, 40(%rsp)
	movdqa %xmm9, 56(%rsp)
	movdqa %xmm10, 72(%rsp)
	movdqa %xmm11, 88(%rsp)
	movdqa %xmm12, 104(%rsp)
	movdqa %xmm13, 120(%rsp)
	movdqa %xmm14, 136(%rsp)
	movdqa %xmm15, 152(%rsp)

#define scrypt_core_cleanup()

	# GenuineIntel processors have fast SIMD
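	# the CPUID vendor string is compared four bytes at a time:
	# "Genu" = 0x756e6547 (%ebx), "ineI" = 0x49656e69 (%edx),
	# "ntel" = 0x6c65746e (%ecx)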
	cmpl $0x6c65746e, %ecx
	cmpl $0x49656e69, %edx
	cmpl $0x756e6547, %ebx

	movdqa 0(%rdi), %xmm8
	movdqa 16(%rdi), %xmm9
	movdqa 32(%rdi), %xmm10
	movdqa 48(%rdi), %xmm11
	movdqa 64(%rdi), %xmm12
	movdqa 80(%rdi), %xmm13
	movdqa 96(%rdi), %xmm14
	movdqa 112(%rdi), %xmm15

	leaq 131072(%rsi), %rcx
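	# %rcx marks the end of the 128 KiB scratchpad:
	# 131072 = 1024 iterations * 128 bytes per block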
gen_scrypt_core_loop1:
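	# first loop: store the current 128-byte state into the scratchpad,
	# then mix it with scrypt's BlockMix (two Salsa20/8 calls for r = 1)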
	movdqa %xmm8, 0(%rsi)
	movdqa %xmm9, 16(%rsi)
	movdqa %xmm10, 32(%rsi)
	movdqa %xmm11, 48(%rsi)
	movdqa %xmm12, 64(%rsi)
	movdqa %xmm13, 80(%rsi)
	movdqa %xmm14, 96(%rsi)
	movdqa %xmm15, 112(%rsi)

	movdqa %xmm8, 0(%rsp)
	movdqa %xmm9, 16(%rsp)
	movdqa %xmm10, 32(%rsp)
	movdqa %xmm11, 48(%rsp)

	movdqa %xmm12, 0(%rsp)
	movdqa %xmm13, 16(%rsp)
	movdqa %xmm14, 32(%rsp)
	movdqa %xmm15, 48(%rsp)

	jne gen_scrypt_core_loop1

gen_scrypt_core_loop2:
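	# second loop: pick a pseudo-random scratchpad entry using the low
	# bits of the state as the index (%rdx holds the byte offset), XOR
	# it into the state and mix again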
	movdqa 0(%rsi, %rdx), %xmm0
	movdqa 16(%rsi, %rdx), %xmm1
	movdqa 32(%rsi, %rdx), %xmm2
	movdqa 48(%rsi, %rdx), %xmm3
	movdqa 64(%rsi, %rdx), %xmm4
	movdqa 80(%rsi, %rdx), %xmm5
	movdqa 96(%rsi, %rdx), %xmm6
	movdqa 112(%rsi, %rdx), %xmm7

	movdqa %xmm8, 0(%rsp)
	movdqa %xmm9, 16(%rsp)
	movdqa %xmm10, 32(%rsp)
	movdqa %xmm11, 48(%rsp)

	movdqa %xmm12, 0(%rsp)
	movdqa %xmm13, 16(%rsp)
	movdqa %xmm14, 32(%rsp)
	movdqa %xmm15, 48(%rsp)

	ja gen_scrypt_core_loop2

	movdqa %xmm8, 0(%rdi)
	movdqa %xmm9, 16(%rdi)
	movdqa %xmm10, 32(%rdi)
	movdqa %xmm11, 48(%rdi)
	movdqa %xmm12, 64(%rdi)
	movdqa %xmm13, 80(%rdi)
	movdqa %xmm14, 96(%rdi)
	movdqa %xmm15, 112(%rdi)

	scrypt_core_cleanup()
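
# One Salsa20 double round implemented with SSE2, keeping the shuffled
# state in %xmm0-%xmm3, one diagonal of the 4x4 state matrix per register.
# Each quarter-round group adds two registers into the scratch registers
# %xmm4/%xmm5, rotates the result and XORs it back; the pshufd shuffles
# with 0x93, 0x4e and 0x39 rotate the diagonals so that the next
# quarter-round lines up.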
#define xmm_salsa8_core_doubleround() \
	movdqa %xmm1, %xmm4; \
	paddd %xmm0, %xmm4; \
	movdqa %xmm4, %xmm5; \
	movdqa %xmm0, %xmm4; \
	paddd %xmm3, %xmm4; \
	movdqa %xmm4, %xmm5; \
	movdqa %xmm3, %xmm4; \
	pshufd $0x93, %xmm3, %xmm3; \
	paddd %xmm2, %xmm4; \
	movdqa %xmm4, %xmm5; \
	movdqa %xmm2, %xmm4; \
	pshufd $0x4e, %xmm2, %xmm2; \
	paddd %xmm1, %xmm4; \
	movdqa %xmm4, %xmm5; \
	pshufd $0x39, %xmm1, %xmm1; \
	movdqa %xmm3, %xmm4; \
	paddd %xmm0, %xmm4; \
	movdqa %xmm4, %xmm5; \
	movdqa %xmm0, %xmm4; \
	paddd %xmm1, %xmm4; \
	movdqa %xmm4, %xmm5; \
	movdqa %xmm1, %xmm4; \
	pshufd $0x93, %xmm1, %xmm1; \
	paddd %xmm2, %xmm4; \
	movdqa %xmm4, %xmm5; \
	movdqa %xmm2, %xmm4; \
	pshufd $0x4e, %xmm2, %xmm2; \
	paddd %xmm3, %xmm4; \
	movdqa %xmm4, %xmm5; \
	pshufd $0x39, %xmm3, %xmm3

#define xmm_salsa8_core() \
	xmm_salsa8_core_doubleround(); \
	xmm_salsa8_core_doubleround(); \
	xmm_salsa8_core_doubleround(); \
	xmm_salsa8_core_doubleround()

	# shuffle 1st block into %xmm8-%xmm11
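	# (the shuffle puts one diagonal of the Salsa20 state matrix into
	# each XMM register, the layout that xmm_salsa8_core expects)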
	pshufd $0x93, %xmm0, %xmm0
	pshufd $0x93, %xmm1, %xmm1
	pshufd $0x93, %xmm2, %xmm2
	pshufd $0x93, %xmm3, %xmm3

	pshufd $0x93, %xmm0, %xmm0
	pshufd $0x93, %xmm1, %xmm1
	pshufd $0x93, %xmm2, %xmm2
	pshufd $0x93, %xmm3, %xmm3

	pshufd $0x93, %xmm0, %xmm0
	pshufd $0x93, %xmm1, %xmm1
	pshufd $0x93, %xmm2, %xmm2
	pshufd $0x93, %xmm3, %xmm3

	# shuffle 2nd block into %xmm12-%xmm15
	pshufd $0x93, %xmm0, %xmm0
	pshufd $0x93, %xmm1, %xmm1
	pshufd $0x93, %xmm2, %xmm2
	pshufd $0x93, %xmm3, %xmm3

	pshufd $0x93, %xmm0, %xmm0
	pshufd $0x93, %xmm1, %xmm1
	pshufd $0x93, %xmm2, %xmm2
	pshufd $0x93, %xmm3, %xmm3

	pshufd $0x93, %xmm0, %xmm0
	pshufd $0x93, %xmm1, %xmm1
	pshufd $0x93, %xmm2, %xmm2
	pshufd $0x93, %xmm3, %xmm3

	leaq 131072(%rsi), %rcx
xmm_scrypt_core_loop1:
	movdqa %xmm8, 0(%rdx)
	movdqa %xmm9, 16(%rdx)
	movdqa %xmm10, 32(%rdx)
	movdqa %xmm11, 48(%rdx)
	movdqa %xmm12, 64(%rdx)
	movdqa %xmm13, 80(%rdx)
	movdqa %xmm14, 96(%rdx)
	movdqa %xmm15, 112(%rdx)

	jne xmm_scrypt_core_loop1

xmm_scrypt_core_loop2:
	movdqa 0(%rsi, %rdx), %xmm0
	movdqa 16(%rsi, %rdx), %xmm1
	movdqa 32(%rsi, %rdx), %xmm2
	movdqa 48(%rsi, %rdx), %xmm3
	movdqa 64(%rsi, %rdx), %xmm4
	movdqa 80(%rsi, %rdx), %xmm5
	movdqa 96(%rsi, %rdx), %xmm6
	movdqa 112(%rsi, %rdx), %xmm7

	ja xmm_scrypt_core_loop2

	# re-shuffle 1st block back
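	# (pshufd $0x39 rotates the doublewords the opposite way to the
	# $0x93 shuffles used when loading, undoing the diagonal layout)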
	pshufd $0x39, %xmm8, %xmm8
	pshufd $0x39, %xmm9, %xmm9
	pshufd $0x39, %xmm10, %xmm10
	pshufd $0x39, %xmm11, %xmm11

	pshufd $0x39, %xmm8, %xmm8
	pshufd $0x39, %xmm9, %xmm9
	pshufd $0x39, %xmm10, %xmm10
	pshufd $0x39, %xmm11, %xmm11

	pshufd $0x39, %xmm8, %xmm8
	pshufd $0x39, %xmm9, %xmm9
	pshufd $0x39, %xmm10, %xmm10
	pshufd $0x39, %xmm11, %xmm11

	# re-shuffle 2nd block back
	pshufd $0x39, %xmm12, %xmm12
	pshufd $0x39, %xmm13, %xmm13
	pshufd $0x39, %xmm14, %xmm14
	pshufd $0x39, %xmm15, %xmm15

	pshufd $0x39, %xmm12, %xmm12
	pshufd $0x39, %xmm13, %xmm13
	pshufd $0x39, %xmm14, %xmm14
	pshufd $0x39, %xmm15, %xmm15

	pshufd $0x39, %xmm12, %xmm12
	pshufd $0x39, %xmm13, %xmm13
	pshufd $0x39, %xmm14, %xmm14
	pshufd $0x39, %xmm15, %xmm15

	scrypt_core_cleanup()
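
	# restore the callee-saved XMM registers spilled at function entry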
	movdqa 8(%rsp), %xmm6
	movdqa 24(%rsp), %xmm7
	movdqa 40(%rsp), %xmm8
	movdqa 56(%rsp), %xmm9
	movdqa 72(%rsp), %xmm10
	movdqa 88(%rsp), %xmm11
	movdqa 104(%rsp), %xmm12
	movdqa 120(%rsp), %xmm13
	movdqa 136(%rsp), %xmm14
	movdqa 152(%rsp), %xmm15