# Copyright 2011-2012 pooler@litecoinpool.org
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#if defined(__x86_64__)

#if defined(__linux__) && defined(__ELF__)
	.section .note.GNU-stack,"",%progbits
#endif
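
/*
 * scrypt_shuffle(src, so, dest, do): permute the 16 words of a 64-byte
 * Salsa20 block between linear order and a diagonal layout convenient
 * for SIMD processing, i.e. the word groups {0,5,10,15}, {12,1,6,11},
 * {8,13,2,7} and {4,9,14,3}. The permutation is its own inverse, so the
 * same macro converts in both directions. Words are staged four at a
 * time in %r8d-%r11d so loads and stores can overlap.
 */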
#define scrypt_shuffle(src, so, dest, do) \
	movl	so+60(src), %r8d; \
	movl	so+44(src), %r9d; \
	movl	so+28(src), %r10d; \
	movl	so+12(src), %r11d; \
	movl	%r8d, do+12(dest); \
	movl	%r9d, do+28(dest); \
	movl	%r10d, do+44(dest); \
	movl	%r11d, do+60(dest); \
	movl	so+40(src), %r8d; \
	movl	so+8(src), %r9d; \
	movl	so+48(src), %r10d; \
	movl	so+16(src), %r11d; \
	movl	%r8d, do+8(dest); \
	movl	%r9d, do+40(dest); \
	movl	%r10d, do+16(dest); \
	movl	%r11d, do+48(dest); \
	movl	so+20(src), %r8d; \
	movl	so+4(src), %r9d; \
	movl	so+52(src), %r10d; \
	movl	so+36(src), %r11d; \
	movl	%r8d, do+4(dest); \
	movl	%r9d, do+20(dest); \
	movl	%r10d, do+36(dest); \
	movl	%r11d, do+52(dest); \
	movl	so+0(src), %r8d; \
	movl	so+24(src), %r9d; \
	movl	so+32(src), %r10d; \
	movl	so+56(src), %r11d; \
	movl	%r8d, do+0(dest); \
	movl	%r9d, do+24(dest); \
	movl	%r10d, do+32(dest); \
	movl	%r11d, do+56(dest);
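
/*
 * One Salsa20 double round (column round, then row round) on a state
 * spread across the general-purpose registers and three stack slots
 * (48, 72 and 88(%rsp)). Each leaq forms a quarter-round sum a+b; the
 * rotate (by 7, 9, 13 or 18) and xor that complete each step follow
 * the sum.
 */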
#define salsa8_core_gen_doubleround() \
	movq	72(%rsp), %r15; \
	leaq	(%r14, %rdx), %rbp; \
	leaq	(%rdi, %r15), %rbp; \
	leaq	(%rdx, %r9), %rbp; \
	leaq	(%r15, %r10), %rbp; \
	leaq	(%r9, %r11), %rbp; \
	leaq	(%r10, %r13), %rbp; \
	leaq	(%r11, %r14), %rbp; \
	leaq	(%r13, %rdi), %rbp; \
	movq	48(%rsp), %rbp; \
	movq	%r15, 72(%rsp); \
	leaq	(%rax, %rbp), %r15; \
	leaq	(%rbp, %rbx), %r15; \
	leaq	(%rbx, %rcx), %r15; \
	leaq	(%rcx, %rax), %r15; \
	movq	88(%rsp), %r15; \
	movq	%rbp, 48(%rsp); \
	leaq	(%r12, %r15), %rbp; \
	leaq	(%r15, %rsi), %rbp; \
	leaq	(%rsi, %r8), %rbp; \
	leaq	(%r8, %r12), %rbp; \
	movq	%r15, 88(%rsp); \
	movq	72(%rsp), %r15; \
	leaq	(%rsi, %rdx), %rbp; \
	leaq	(%r9, %r15), %rbp; \
	leaq	(%rdx, %rdi), %rbp; \
	leaq	(%r15, %rax), %rbp; \
	leaq	(%rdi, %rcx), %rbp; \
	leaq	(%rax, %r8), %rbp; \
	leaq	(%rcx, %rsi), %rbp; \
	leaq	(%r8, %r9), %rbp; \
	movq	48(%rsp), %rbp; \
	movq	%r15, 72(%rsp); \
	leaq	(%r10, %rbp), %r15; \
	leaq	(%rbp, %r12), %r15; \
	leaq	(%r12, %r11), %r15; \
	leaq	(%r11, %r10), %r15; \
	movq	88(%rsp), %r15; \
	movq	%rbp, 48(%rsp); \
	leaq	(%rbx, %r15), %rbp; \
	leaq	(%r15, %r14), %rbp; \
	leaq	(%r14, %r13), %rbp; \
	leaq	(%r13, %rbx), %rbp; \
	movq	%r15, 88(%rsp);
	/* register/stack assignment of the four state-word groups: */
	/* 0: %rdx, %rdi, %rcx, %rsi */
	/* 1: %r9, 72(%rsp), %rax, %r8 */
	/* 2: %r11, %r10, 48(%rsp), %r12 */
	/* movq %r12, %r13 */
	/* movq %r13, 48(%rsp) */
	/* 3: %r14, %r13, %rbx, 88(%rsp) */
	salsa8_core_gen_doubleround()
	salsa8_core_gen_doubleround()
	salsa8_core_gen_doubleround()
	salsa8_core_gen_doubleround()
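
	/*
	 * The four double rounds above are the eight rounds of
	 * Salsa20/8; scrypt's salsa20_8 then adds the original input
	 * block back into the result. One quarter round, in rough C
	 * (a sketch; ROTL32 is an illustrative rotate-left helper, not
	 * a name from this file):
	 *
	 *	b ^= ROTL32(a + d,  7);
	 *	c ^= ROTL32(b + a,  9);
	 *	d ^= ROTL32(c + b, 13);
	 *	a ^= ROTL32(d + c, 18);
	 *
	 * A double round applies this to the four columns of the 4x4
	 * state, then to its four rows.
	 */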
	movdqa	24(%rsp), %xmm0
	movdqa	40(%rsp), %xmm1
	movdqa	56(%rsp), %xmm2
	movdqa	72(%rsp), %xmm3
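
/*
 * scrypt_core prologue for the Win64/Cygwin calling convention:
 * %xmm6-%xmm15 are callee-saved there, so they are spilled to the
 * stack frame and restored by scrypt_core_cleanup() on the way out.
 */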
#if defined(_WIN64) || defined(__CYGWIN__)
	movdqa	%xmm6, 8(%rsp)
	movdqa	%xmm7, 24(%rsp)
	movdqa	%xmm8, 40(%rsp)
	movdqa	%xmm9, 56(%rsp)
	movdqa	%xmm10, 72(%rsp)
	movdqa	%xmm11, 88(%rsp)
	movdqa	%xmm12, 104(%rsp)
	movdqa	%xmm13, 120(%rsp)
	movdqa	%xmm14, 136(%rsp)
	movdqa	%xmm15, 152(%rsp)
#endif
#if defined(_WIN64) || defined(__CYGWIN__)
/* Restore the callee-saved xmm registers spilled in the prologue. */
#define scrypt_core_cleanup() \
	movdqa	8(%rsp), %xmm6; \
	movdqa	24(%rsp), %xmm7; \
	movdqa	40(%rsp), %xmm8; \
	movdqa	56(%rsp), %xmm9; \
	movdqa	72(%rsp), %xmm10; \
	movdqa	88(%rsp), %xmm11; \
	movdqa	104(%rsp), %xmm12; \
	movdqa	120(%rsp), %xmm13; \
	movdqa	136(%rsp), %xmm14; \
	movdqa	152(%rsp), %xmm15;
#else
/* Nothing to restore: the SysV ABI has no callee-saved xmm registers. */
#define scrypt_core_cleanup()
#endif
	/* GenuineIntel processors have fast SIMD */
	xorl	%eax, %eax
	cpuid
	cmpl	$0x6c65746e, %ecx	/* "ntel" */
	jne scrypt_core_gen
	cmpl	$0x49656e69, %edx	/* "ineI" */
	jne scrypt_core_gen
	cmpl	$0x756e6547, %ebx	/* "Genu" */
	je scrypt_core_xmm

scrypt_core_gen:
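	/* Generic scrypt core, keeping the Salsa20/8 state in
	   general-purpose registers. */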
	/* load X (two 64-byte halves) into %xmm8-%xmm15 */
	movdqa	0(%rdi), %xmm8
	movdqa	16(%rdi), %xmm9
	movdqa	32(%rdi), %xmm10
	movdqa	48(%rdi), %xmm11
	movdqa	64(%rdi), %xmm12
	movdqa	80(%rdi), %xmm13
	movdqa	96(%rdi), %xmm14
	movdqa	112(%rdi), %xmm15
	leaq	131072(%rsi), %rcx	/* end of V: 1024 blocks of 128 bytes */
scrypt_core_gen_loop1:
	/* V[i] = X */
	movdqa	%xmm8, 0(%rsi)
	movdqa	%xmm9, 16(%rsi)
	movdqa	%xmm10, 32(%rsi)
	movdqa	%xmm11, 48(%rsi)
	movdqa	%xmm12, 64(%rsi)
	movdqa	%xmm13, 80(%rsi)
	movdqa	%xmm14, 96(%rsi)
	movdqa	%xmm15, 112(%rsi)
	/* first half of X for the salsa core */
	movdqa	%xmm8, 0(%rsp)
	movdqa	%xmm9, 16(%rsp)
	movdqa	%xmm10, 32(%rsp)
	movdqa	%xmm11, 48(%rsp)

	/* second half */
	movdqa	%xmm12, 0(%rsp)
	movdqa	%xmm13, 16(%rsp)
	movdqa	%xmm14, 32(%rsp)
	movdqa	%xmm15, 48(%rsp)
	jne scrypt_core_gen_loop1
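
	/*
	 * The loop above is scrypt's first pass, in rough C (a sketch;
	 * the names are illustrative, not from this file):
	 *
	 *	for (i = 0; i < 1024; i++) {
	 *		memcpy(&V[32 * i], X, 128);	// V[i] = X
	 *		X0 ^= X1; salsa20_8(X0);	// BlockMix with r = 1
	 *		X1 ^= X0; salsa20_8(X1);
	 *	}
	 *
	 * where X0 and X1 are the two 64-byte halves of X.
	 */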
scrypt_core_gen_loop2:
	/* load V[j] */
	movdqa	0(%rdx), %xmm0
	movdqa	16(%rdx), %xmm1
	movdqa	32(%rdx), %xmm2
	movdqa	48(%rdx), %xmm3
	movdqa	64(%rdx), %xmm4
	movdqa	80(%rdx), %xmm5
	movdqa	96(%rdx), %xmm6
	movdqa	112(%rdx), %xmm7
	/* first half of the updated X for the salsa core */
	movdqa	%xmm8, 0(%rsp)
	movdqa	%xmm9, 16(%rsp)
	movdqa	%xmm10, 32(%rsp)
	movdqa	%xmm11, 48(%rsp)

	/* second half */
	movdqa	%xmm12, 0(%rsp)
	movdqa	%xmm13, 16(%rsp)
	movdqa	%xmm14, 32(%rsp)
	movdqa	%xmm15, 48(%rsp)
	ja scrypt_core_gen_loop2
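
	/*
	 * Second pass, in rough C (a sketch; names are illustrative):
	 *
	 *	for (i = 0; i < 1024; i++) {
	 *		j = X[16] & 1023;		// Integerify(X) mod N
	 *		for (k = 0; k < 32; k++)
	 *			X[k] ^= V[32 * j + k];	// X ^= V[j]
	 *		X0 ^= X1; salsa20_8(X0);
	 *		X1 ^= X0; salsa20_8(X1);
	 *	}
	 */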
	/* write the final X back to the state buffer */
	movdqa	%xmm8, 0(%rdi)
	movdqa	%xmm9, 16(%rdi)
	movdqa	%xmm10, 32(%rdi)
	movdqa	%xmm11, 48(%rdi)
	movdqa	%xmm12, 64(%rdi)
	movdqa	%xmm13, 80(%rdi)
	movdqa	%xmm14, 96(%rdi)
	movdqa	%xmm15, 112(%rdi)
	scrypt_core_cleanup()
	ret
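
/*
 * SSE2 implementation. The Salsa20 state sits in four xmm registers
 * in diagonal order, so each quarter-round step handles four words at
 * once, and pshufd realigns the lanes between steps. Each 32-bit
 * rotate is built from two shifts and two xors; per step, as C
 * intrinsics (a sketch; a, b and d are illustrative):
 *
 *	__m128i t = _mm_add_epi32(a, b);
 *	d = _mm_xor_si128(d, _mm_slli_epi32(t, 7));
 *	d = _mm_xor_si128(d, _mm_srli_epi32(t, 25));
 */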
#define salsa8_core_xmm_doubleround() \
	movdqa	%xmm1, %xmm4; \
	paddd	%xmm0, %xmm4; \
	movdqa	%xmm4, %xmm5; \
	movdqa	%xmm0, %xmm4; \
	paddd	%xmm3, %xmm4; \
	movdqa	%xmm4, %xmm5; \
	movdqa	%xmm3, %xmm4; \
	pshufd	$0x93, %xmm3, %xmm3; /* lanes {x3,x0,x1,x2} */ \
	paddd	%xmm2, %xmm4; \
	movdqa	%xmm4, %xmm5; \
	movdqa	%xmm2, %xmm4; \
	pshufd	$0x4e, %xmm2, %xmm2; /* swap halves: {x2,x3,x0,x1} */ \
	paddd	%xmm1, %xmm4; \
	movdqa	%xmm4, %xmm5; \
	movdqa	%xmm3, %xmm4; \
	pshufd	$0x39, %xmm1, %xmm1; /* lanes {x1,x2,x3,x0} */ \
	paddd	%xmm0, %xmm4; \
	movdqa	%xmm4, %xmm5; \
	movdqa	%xmm0, %xmm4; \
	paddd	%xmm1, %xmm4; \
	movdqa	%xmm4, %xmm5; \
	movdqa	%xmm1, %xmm4; \
	pshufd	$0x93, %xmm1, %xmm1; \
	paddd	%xmm2, %xmm4; \
	movdqa	%xmm4, %xmm5; \
	movdqa	%xmm2, %xmm4; \
	pshufd	$0x4e, %xmm2, %xmm2; \
	paddd	%xmm3, %xmm4; \
	movdqa	%xmm4, %xmm5; \
	pshufd	$0x39, %xmm3, %xmm3;
#define salsa8_core_xmm() \
	salsa8_core_xmm_doubleround(); \
	salsa8_core_xmm_doubleround(); \
	salsa8_core_xmm_doubleround(); \
	salsa8_core_xmm_doubleround();
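
/*
 * salsa8_core_xmm() expands to the eight rounds of Salsa20/8. The
 * scrypt_core_xmm routine below mirrors the generic one: shuffle X
 * into diagonal form, run the two scratchpad loops, shuffle back.
 */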
scrypt_core_xmm:
	/* load the first 64-byte half of X and interleave it into the
	   diagonal form used by salsa8_core_xmm */
	movdqa	0(%rdi), %xmm8
	movdqa	16(%rdi), %xmm11
	movdqa	32(%rdi), %xmm10
	movdqa	48(%rdi), %xmm9

	pshufd	$0x4e, %xmm10, %xmm10
	punpcklqdq	%xmm10, %xmm8
	punpckhqdq	%xmm0, %xmm10

	pshufd	$0x4e, %xmm9, %xmm9
	punpcklqdq	%xmm9, %xmm11
	punpckhqdq	%xmm0, %xmm9

	/* same interleave for the second half */
	movdqa	64(%rdi), %xmm12
	movdqa	80(%rdi), %xmm15
	movdqa	96(%rdi), %xmm14
	movdqa	112(%rdi), %xmm13

	pshufd	$0x4e, %xmm14, %xmm14
	punpcklqdq	%xmm14, %xmm12
	punpckhqdq	%xmm0, %xmm14

	pshufd	$0x4e, %xmm13, %xmm13
	punpcklqdq	%xmm13, %xmm15
	punpckhqdq	%xmm0, %xmm13
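
	/* %xmm8-%xmm11 and %xmm12-%xmm15 now hold the two 64-byte
	   halves of X in shuffled order */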
	leaq	131072(%rsi), %rcx	/* end of V: 1024 blocks of 128 bytes */
scrypt_core_xmm_loop1:
	/* V[i] = X */
	movdqa	%xmm8, 0(%rdx)
	movdqa	%xmm9, 16(%rdx)
	movdqa	%xmm10, 32(%rdx)
	movdqa	%xmm11, 48(%rdx)
	movdqa	%xmm12, 64(%rdx)
	movdqa	%xmm13, 80(%rdx)
	movdqa	%xmm14, 96(%rdx)
	movdqa	%xmm15, 112(%rdx)
	jne scrypt_core_xmm_loop1

scrypt_core_xmm_loop2:
	/* X ^= V[j]; %rdx holds the byte offset 128 * j */
	pxor	0(%rsi, %rdx), %xmm8
	pxor	16(%rsi, %rdx), %xmm9
	pxor	32(%rsi, %rdx), %xmm10
	pxor	48(%rsi, %rdx), %xmm11
	pxor	64(%rsi, %rdx), %xmm12
	pxor	80(%rsi, %rdx), %xmm13
	pxor	96(%rsi, %rdx), %xmm14
	pxor	112(%rsi, %rdx), %xmm15
	ja scrypt_core_xmm_loop2
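
	/* undo the diagonal interleave and write the final X back out */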
	pshufd	$0x4e, %xmm10, %xmm10
	punpcklqdq	%xmm10, %xmm8
	punpckhqdq	%xmm0, %xmm10

	pshufd	$0x4e, %xmm11, %xmm11
	punpcklqdq	%xmm11, %xmm9
	punpckhqdq	%xmm0, %xmm11
	movdqa	%xmm8, 0(%rdi)
	movdqa	%xmm11, 16(%rdi)
	movdqa	%xmm10, 32(%rdi)
	movdqa	%xmm9, 48(%rdi)

	pshufd	$0x4e, %xmm14, %xmm14
	punpcklqdq	%xmm14, %xmm12
	punpckhqdq	%xmm0, %xmm14

	pshufd	$0x4e, %xmm15, %xmm15
	punpcklqdq	%xmm15, %xmm13
	punpckhqdq	%xmm0, %xmm15
	movdqa	%xmm12, 64(%rdi)
	movdqa	%xmm15, 80(%rdi)
	movdqa	%xmm14, 96(%rdi)
	movdqa	%xmm13, 112(%rdi)
	scrypt_core_cleanup()
	ret