# Copyright 2011-2012 pooler@litecoinpool.org
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#if defined(OPTIMIZED_SALSA) && defined(__x86_64__)

#if defined(__linux__) && defined(__ELF__)
	.section .note.GNU-stack,"",%progbits
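# scrypt_shuffle copies one 64-byte Salsa20 block from src+so to dest+do while
# applying a self-inverse permutation of its 16 32-bit words (swapping word
# pairs 1/5, 2/10, 3/15, 4/12, 7/11 and 9/13; words 0, 6, 8 and 14 stay put).
# This converts between the standard Salsa20 word order and the shuffled
# layout used by the SIMD salsa code in this file.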
#define scrypt_shuffle(src, so, dest, do) \
	movl so+60(src), %r8d; \
	movl so+44(src), %r9d; \
	movl so+28(src), %r10d; \
	movl so+12(src), %r11d; \
	movl %r8d, do+12(dest); \
	movl %r9d, do+28(dest); \
	movl %r10d, do+44(dest); \
	movl %r11d, do+60(dest); \
	movl so+40(src), %r8d; \
	movl so+8(src), %r9d; \
	movl so+48(src), %r10d; \
	movl so+16(src), %r11d; \
	movl %r8d, do+8(dest); \
	movl %r9d, do+40(dest); \
	movl %r10d, do+16(dest); \
	movl %r11d, do+48(dest); \
	movl so+20(src), %r8d; \
	movl so+4(src), %r9d; \
	movl so+52(src), %r10d; \
	movl so+36(src), %r11d; \
	movl %r8d, do+4(dest); \
	movl %r9d, do+20(dest); \
	movl %r10d, do+36(dest); \
	movl %r11d, do+52(dest); \
	movl so+0(src), %r8d; \
	movl so+24(src), %r9d; \
	movl so+32(src), %r10d; \
	movl so+56(src), %r11d; \
	movl %r8d, do+0(dest); \
	movl %r9d, do+24(dest); \
	movl %r10d, do+32(dest); \
	movl %r11d, do+56(dest); \
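# gen_salsa8_core_doubleround performs one Salsa20 double-round on a state kept
# in general-purpose registers plus the spill slots at 48/72/88(%rsp). Each
# leaq computes the 32-bit sum of two state words, which is then rotated left
# by 7, 9, 13 or 18 and XORed into the target word, completing the Salsa20
# quarter-round step x[d] ^= (x[a] + x[b]) <<< k.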
#define gen_salsa8_core_doubleround() \
	movq 72(%rsp), %r15; \
	leaq (%r14, %rdx), %rbp; \
	leaq (%rdi, %r15), %rbp; \
	leaq (%rdx, %r9), %rbp; \
	leaq (%r15, %r10), %rbp; \
	leaq (%r9, %r11), %rbp; \
	leaq (%r10, %r13), %rbp; \
	leaq (%r11, %r14), %rbp; \
	leaq (%r13, %rdi), %rbp; \
	movq 48(%rsp), %rbp; \
	movq %r15, 72(%rsp); \
	leaq (%rax, %rbp), %r15; \
	leaq (%rbp, %rbx), %r15; \
	leaq (%rbx, %rcx), %r15; \
	leaq (%rcx, %rax), %r15; \
	movq 88(%rsp), %r15; \
	movq %rbp, 48(%rsp); \
	leaq (%r12, %r15), %rbp; \
	leaq (%r15, %rsi), %rbp; \
	leaq (%rsi, %r8), %rbp; \
	leaq (%r8, %r12), %rbp; \
	movq %r15, 88(%rsp); \
	movq 72(%rsp), %r15; \
	leaq (%rsi, %rdx), %rbp; \
	leaq (%r9, %r15), %rbp; \
	leaq (%rdx, %rdi), %rbp; \
	leaq (%r15, %rax), %rbp; \
	leaq (%rdi, %rcx), %rbp; \
	leaq (%rax, %r8), %rbp; \
	leaq (%rcx, %rsi), %rbp; \
	leaq (%r8, %r9), %rbp; \
	movq 48(%rsp), %rbp; \
	movq %r15, 72(%rsp); \
	leaq (%r10, %rbp), %r15; \
	leaq (%rbp, %r12), %r15; \
	leaq (%r12, %r11), %r15; \
	leaq (%r11, %r10), %r15; \
	movq 88(%rsp), %r15; \
	movq %rbp, 48(%rsp); \
	leaq (%rbx, %r15), %rbp; \
	leaq (%r15, %r14), %rbp; \
	leaq (%r14, %r13), %rbp; \
	leaq (%r13, %rbx), %rbp; \
	movq %r15, 88(%rsp); \

# 0: %rdx, %rdi, %rcx, %rsi
# 1: %r9, 72(%rsp), %rax, %r8
# 2: %r11, %r10, 48(%rsp), %r12
# 3: %r14, %r13, %rbx, 88(%rsp)
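# Salsa20/8 consists of eight rounds, i.e. four applications of the
# double-round macro above.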
	gen_salsa8_core_doubleround()
	gen_salsa8_core_doubleround()
	gen_salsa8_core_doubleround()
	gen_salsa8_core_doubleround()

	punpcklqdq %xmm4, %xmm0
	punpcklqdq %xmm5, %xmm1
	punpcklqdq %xmm6, %xmm2
	punpcklqdq %xmm7, %xmm3

	movdqa %xmm6, 8(%rsp)
	movdqa %xmm7, 24(%rsp)
	movdqa %xmm8, 40(%rsp)
	movdqa %xmm9, 56(%rsp)
	movdqa %xmm10, 72(%rsp)
	movdqa %xmm11, 88(%rsp)
	movdqa %xmm12, 104(%rsp)
	movdqa %xmm13, 120(%rsp)
	movdqa %xmm14, 136(%rsp)
	movdqa %xmm15, 152(%rsp)
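# %xmm6-%xmm15 are callee-saved under the Win64 calling convention, so they
# are presumably spilled here on that target and reloaded from the same stack
# slots at the end of the routine.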
#define scrypt_core_cleanup() \

# GenuineIntel processors have fast SIMD
	cmpl $0x6c65746e, %ecx
	cmpl $0x49656e69, %edx
	cmpl $0x756e6547, %ebx
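# With CPUID leaf 0, the vendor string "GenuineIntel" is returned as
# EBX="Genu" (0x756e6547), EDX="ineI" (0x49656e69) and ECX="ntel" (0x6c65746e);
# the three compares check for that vendor string before choosing the SIMD
# code path.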
	movdqa 0(%rdi), %xmm8
	movdqa 16(%rdi), %xmm9
	movdqa 32(%rdi), %xmm10
	movdqa 48(%rdi), %xmm11
	movdqa 64(%rdi), %xmm12
	movdqa 80(%rdi), %xmm13
	movdqa 96(%rdi), %xmm14
	movdqa 112(%rdi), %xmm15

	leaq 131072(%rsi), %rcx
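# The 128-byte working state X (two 64-byte Salsa20 blocks) is loaded from the
# buffer at %rdi into %xmm8-%xmm15. The scratchpad V at %rsi holds N = 1024
# entries of 128 bytes each, so %rcx = %rsi + 131072 marks its end and serves
# as the loop bound.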
gen_scrypt_core_loop1:
	movdqa %xmm8, 0(%rsi)
	movdqa %xmm9, 16(%rsi)
	movdqa %xmm10, 32(%rsi)
	movdqa %xmm11, 48(%rsi)
	movdqa %xmm12, 64(%rsi)
	movdqa %xmm13, 80(%rsi)
	movdqa %xmm14, 96(%rsi)
	movdqa %xmm15, 112(%rsi)

	movdqa %xmm8, 0(%rsp)
	movdqa %xmm9, 16(%rsp)
	movdqa %xmm10, 32(%rsp)
	movdqa %xmm11, 48(%rsp)

	movdqa %xmm12, 0(%rsp)
	movdqa %xmm13, 16(%rsp)
	movdqa %xmm14, 32(%rsp)
	movdqa %xmm15, 48(%rsp)

	jne gen_scrypt_core_loop1
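# First ROMix loop: for i = 0..N-1, V[i] is set to the current X, which is
# then fed through BlockMix (two Salsa20/8 applications). The second loop
# below picks j = Integerify(X) mod N, XORs V[j] into X and runs BlockMix
# again, repeating N times.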
gen_scrypt_core_loop2:

	movdqa 0(%rsi, %rdx), %xmm0
	movdqa 16(%rsi, %rdx), %xmm1
	movdqa 32(%rsi, %rdx), %xmm2
	movdqa 48(%rsi, %rdx), %xmm3
	movdqa 64(%rsi, %rdx), %xmm4
	movdqa 80(%rsi, %rdx), %xmm5
	movdqa 96(%rsi, %rdx), %xmm6
	movdqa 112(%rsi, %rdx), %xmm7
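# %rdx holds the byte offset 128*j of the selected scratchpad entry, so
# %xmm0-%xmm7 now contain V[j]; it is XORed into X before the next BlockMix.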
	movdqa %xmm8, 0(%rsp)
	movdqa %xmm9, 16(%rsp)
	movdqa %xmm10, 32(%rsp)
	movdqa %xmm11, 48(%rsp)

	movdqa %xmm12, 0(%rsp)
	movdqa %xmm13, 16(%rsp)
	movdqa %xmm14, 32(%rsp)
	movdqa %xmm15, 48(%rsp)

	ja gen_scrypt_core_loop2

	movdqa %xmm8, 0(%rdi)
	movdqa %xmm9, 16(%rdi)
	movdqa %xmm10, 32(%rdi)
	movdqa %xmm11, 48(%rdi)
	movdqa %xmm12, 64(%rdi)
	movdqa %xmm13, 80(%rdi)
	movdqa %xmm14, 96(%rdi)
	movdqa %xmm15, 112(%rdi)

	scrypt_core_cleanup()
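# xmm_salsa8_core_doubleround is the SSE2 version of the Salsa20 double-round:
# the state is held in %xmm0-%xmm3, each paddd forms the word sums for one
# quarter-round step, the sums are rotated using the %xmm4/%xmm5 copies and
# XORed into the destination row, and the pshufd instructions rotate the four
# 32-bit lanes of the other rows ($0x93 by one position, $0x4e by two, $0x39
# by three) so that the next diagonal lines up.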
#define xmm_salsa8_core_doubleround() \
	movdqa %xmm1, %xmm4; \
	paddd %xmm0, %xmm4; \
	movdqa %xmm4, %xmm5; \
	movdqa %xmm0, %xmm4; \
	paddd %xmm3, %xmm4; \
	movdqa %xmm4, %xmm5; \
	movdqa %xmm3, %xmm4; \
	pshufd $0x93, %xmm3, %xmm3; \
	paddd %xmm2, %xmm4; \
	movdqa %xmm4, %xmm5; \
	movdqa %xmm2, %xmm4; \
	pshufd $0x4e, %xmm2, %xmm2; \
	paddd %xmm1, %xmm4; \
	movdqa %xmm4, %xmm5; \
	pshufd $0x39, %xmm1, %xmm1; \
	movdqa %xmm3, %xmm4; \
	paddd %xmm0, %xmm4; \
	movdqa %xmm4, %xmm5; \
	movdqa %xmm0, %xmm4; \
	paddd %xmm1, %xmm4; \
	movdqa %xmm4, %xmm5; \
	movdqa %xmm1, %xmm4; \
	pshufd $0x93, %xmm1, %xmm1; \
	paddd %xmm2, %xmm4; \
	movdqa %xmm4, %xmm5; \
	movdqa %xmm2, %xmm4; \
	pshufd $0x4e, %xmm2, %xmm2; \
	paddd %xmm3, %xmm4; \
	movdqa %xmm4, %xmm5; \
	pshufd $0x39, %xmm3, %xmm3; \

#define xmm_salsa8_core() \
	xmm_salsa8_core_doubleround(); \
	xmm_salsa8_core_doubleround(); \
	xmm_salsa8_core_doubleround(); \
	xmm_salsa8_core_doubleround(); \
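# Before the main loops, the two 64-byte halves of X are permuted into the
# word order the SSE2 salsa core expects: the first block is gathered into
# %xmm8-%xmm11 and the second into %xmm12-%xmm15. Each pshufd $0x93 below
# rotates the four 32-bit lanes of %xmm0-%xmm3 by one position as part of
# that permutation.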
# shuffle 1st block into %xmm8-%xmm11
	pshufd $0x93, %xmm0, %xmm0
	pshufd $0x93, %xmm1, %xmm1
	pshufd $0x93, %xmm2, %xmm2
	pshufd $0x93, %xmm3, %xmm3

	pshufd $0x93, %xmm0, %xmm0
	pshufd $0x93, %xmm1, %xmm1
	pshufd $0x93, %xmm2, %xmm2
	pshufd $0x93, %xmm3, %xmm3

	pshufd $0x93, %xmm0, %xmm0
	pshufd $0x93, %xmm1, %xmm1
	pshufd $0x93, %xmm2, %xmm2
	pshufd $0x93, %xmm3, %xmm3

# shuffle 2nd block into %xmm12-%xmm15
	pshufd $0x93, %xmm0, %xmm0
	pshufd $0x93, %xmm1, %xmm1
	pshufd $0x93, %xmm2, %xmm2
	pshufd $0x93, %xmm3, %xmm3

	pshufd $0x93, %xmm0, %xmm0
	pshufd $0x93, %xmm1, %xmm1
	pshufd $0x93, %xmm2, %xmm2
	pshufd $0x93, %xmm3, %xmm3

	pshufd $0x93, %xmm0, %xmm0
	pshufd $0x93, %xmm1, %xmm1
	pshufd $0x93, %xmm2, %xmm2
	pshufd $0x93, %xmm3, %xmm3

	leaq 131072(%rsi), %rcx
xmm_scrypt_core_loop1:
	movdqa %xmm8, 0(%rdx)
	movdqa %xmm9, 16(%rdx)
	movdqa %xmm10, 32(%rdx)
	movdqa %xmm11, 48(%rdx)
	movdqa %xmm12, 64(%rdx)
	movdqa %xmm13, 80(%rdx)
	movdqa %xmm14, 96(%rdx)
	movdqa %xmm15, 112(%rdx)

	jne xmm_scrypt_core_loop1

xmm_scrypt_core_loop2:
	movdqa 0(%rsi, %rdx), %xmm0
	movdqa 16(%rsi, %rdx), %xmm1
	movdqa 32(%rsi, %rdx), %xmm2
	movdqa 48(%rsi, %rdx), %xmm3
	movdqa 64(%rsi, %rdx), %xmm4
	movdqa 80(%rsi, %rdx), %xmm5
	movdqa 96(%rsi, %rdx), %xmm6
	movdqa 112(%rsi, %rdx), %xmm7

	ja xmm_scrypt_core_loop2

# re-shuffle 1st block back
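# pshufd $0x39 is the inverse of the $0x93 lane rotation, so the repeated
# shuffles below restore both blocks to their original word order before the
# result is written back.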
	pshufd $0x39, %xmm8, %xmm8
	pshufd $0x39, %xmm9, %xmm9
	pshufd $0x39, %xmm10, %xmm10
	pshufd $0x39, %xmm11, %xmm11

	pshufd $0x39, %xmm8, %xmm8
	pshufd $0x39, %xmm9, %xmm9
	pshufd $0x39, %xmm10, %xmm10
	pshufd $0x39, %xmm11, %xmm11

	pshufd $0x39, %xmm8, %xmm8
	pshufd $0x39, %xmm9, %xmm9
	pshufd $0x39, %xmm10, %xmm10
	pshufd $0x39, %xmm11, %xmm11

# re-shuffle 2nd block back
	pshufd $0x39, %xmm12, %xmm12
	pshufd $0x39, %xmm13, %xmm13
	pshufd $0x39, %xmm14, %xmm14
	pshufd $0x39, %xmm15, %xmm15

	pshufd $0x39, %xmm12, %xmm12
	pshufd $0x39, %xmm13, %xmm13
	pshufd $0x39, %xmm14, %xmm14
	pshufd $0x39, %xmm15, %xmm15

	pshufd $0x39, %xmm12, %xmm12
	pshufd $0x39, %xmm13, %xmm13
	pshufd $0x39, %xmm14, %xmm14
	pshufd $0x39, %xmm15, %xmm15

	scrypt_core_cleanup()
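# Restore the callee-saved %xmm6-%xmm15 spilled at function entry (needed on
# Win64, where these registers are non-volatile).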
	movdqa 8(%rsp), %xmm6
	movdqa 24(%rsp), %xmm7
	movdqa 40(%rsp), %xmm8
	movdqa 56(%rsp), %xmm9
	movdqa 72(%rsp), %xmm10
	movdqa 88(%rsp), %xmm11
	movdqa 104(%rsp), %xmm12
	movdqa 120(%rsp), %xmm13
	movdqa 136(%rsp), %xmm14
	movdqa 152(%rsp), %xmm15