/*
 * Copyright 2011-2012 pooler@litecoinpool.org
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#if defined(__linux__) && defined(__ELF__)
	.section .note.GNU-stack,"",%progbits
#endif

#if defined(__x86_64__)
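
/* Copy one 64-byte Salsa20 block from \so(\src) to \do(\dest),
 * permuting its sixteen 32-bit words: pairs 1/5, 2/10, 3/15, 4/12,
 * 7/11 and 9/13 are swapped, and words 0, 6, 8 and 14 stay in place.
 * The permutation is its own inverse, so the same macro converts
 * between the linear layout and the shuffled SIMD layout in both
 * directions.
 */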
.macro scrypt_shuffle src, so, dest, do
	movl \so+60(\src), %r8d
	movl \so+44(\src), %r9d
	movl \so+28(\src), %r10d
	movl \so+12(\src), %r11d
	movl %r8d, \do+12(\dest)
	movl %r9d, \do+28(\dest)
	movl %r10d, \do+44(\dest)
	movl %r11d, \do+60(\dest)
	movl \so+40(\src), %r8d
	movl \so+8(\src), %r9d
	movl \so+48(\src), %r10d
	movl \so+16(\src), %r11d
	movl %r8d, \do+8(\dest)
	movl %r9d, \do+40(\dest)
	movl %r10d, \do+16(\dest)
	movl %r11d, \do+48(\dest)
	movl \so+20(\src), %r8d
	movl \so+4(\src), %r9d
	movl \so+52(\src), %r10d
	movl \so+36(\src), %r11d
	movl %r8d, \do+4(\dest)
	movl %r9d, \do+20(\dest)
	movl %r10d, \do+36(\dest)
	movl %r11d, \do+52(\dest)
	movl \so+0(\src), %r8d
	movl \so+24(\src), %r9d
	movl \so+32(\src), %r10d
	movl \so+56(\src), %r11d
	movl %r8d, \do+0(\dest)
	movl %r9d, \do+24(\dest)
	movl %r10d, \do+32(\dest)
	movl %r11d, \do+56(\dest)
.endm
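
/* One Salsa20 double round (a column round followed by a row round)
 * on a state held in general-purpose registers. Each quarterround
 * step forms a sum with leaq, so neither input is clobbered, then
 * rotates and xors (the rotate/xor instructions are elided between
 * the leaq lines below). A rough C sketch of a single step, for
 * reference only:
 *
 *	x[d] ^= ROTL32(x[a] + x[b], k);   // k cycles through 7, 9, 13, 18
 */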
.macro salsa8_core_gen_doubleround
	leaq (%r14, %rdx), %rbp
	leaq (%rdi, %r15), %rbp
	leaq (%rdx, %r9), %rbp
	leaq (%r15, %r10), %rbp
	leaq (%r9, %r11), %rbp
	leaq (%r10, %r13), %rbp
	leaq (%r11, %r14), %rbp
	leaq (%r13, %rdi), %rbp
	leaq (%rax, %rbp), %r15
	leaq (%rbp, %rbx), %r15
	leaq (%rbx, %rcx), %r15
	leaq (%rcx, %rax), %r15
	leaq (%r12, %r15), %rbp
	leaq (%r15, %rsi), %rbp
	leaq (%rsi, %r8), %rbp
	leaq (%r8, %r12), %rbp
	leaq (%rsi, %rdx), %rbp
	leaq (%r9, %r15), %rbp
	leaq (%rdx, %rdi), %rbp
	leaq (%r15, %rax), %rbp
	leaq (%rdi, %rcx), %rbp
	leaq (%rax, %r8), %rbp
	leaq (%rcx, %rsi), %rbp
	leaq (%r8, %r9), %rbp
	leaq (%r10, %rbp), %r15
	leaq (%rbp, %r12), %r15
	leaq (%r12, %r11), %r15
	leaq (%r11, %r10), %r15
	leaq (%rbx, %r15), %rbp
	leaq (%r15, %r14), %rbp
	leaq (%r14, %r13), %rbp
	leaq (%r13, %rbx), %rbp
.endm
	/* 0: %rdx, %rdi, %rcx, %rsi */
	/* 1: %r9, 72(%rsp), %rax, %r8 */
	/* 2: %r11, %r10, 48(%rsp), %r12 */
	/* movq %r12, %r13 */
	/* movq %r13, 48(%rsp) */
	/* 3: %r14, %r13, %rbx, 88(%rsp) */
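	/* Salsa20/8: four double rounds, i.e. eight rounds in all */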
	salsa8_core_gen_doubleround
	salsa8_core_gen_doubleround
	salsa8_core_gen_doubleround
	salsa8_core_gen_doubleround
	movdqa 24(%rsp), %xmm0
	movdqa 40(%rsp), %xmm1
	movdqa 56(%rsp), %xmm2
	movdqa 72(%rsp), %xmm3
	movdqa %xmm6, 8(%rsp)
	movdqa %xmm7, 24(%rsp)
	movdqa %xmm8, 40(%rsp)
	movdqa %xmm9, 56(%rsp)
	movdqa %xmm10, 72(%rsp)
	movdqa %xmm11, 88(%rsp)
	movdqa %xmm12, 104(%rsp)
	movdqa %xmm13, 120(%rsp)
	movdqa %xmm14, 136(%rsp)
	movdqa %xmm15, 152(%rsp)
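
/* %xmm6-%xmm15 are callee-saved in the Win64 calling convention (but
 * caller-saved in the System V AMD64 ABI), which is why they are
 * spilled to the stack above and reloaded from the same slots in
 * scrypt_core_cleanup below.
 */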
.macro scrypt_core_cleanup
	movdqa 8(%rsp), %xmm6
	movdqa 24(%rsp), %xmm7
	movdqa 40(%rsp), %xmm8
	movdqa 56(%rsp), %xmm9
	movdqa 72(%rsp), %xmm10
	movdqa 88(%rsp), %xmm11
	movdqa 104(%rsp), %xmm12
	movdqa 120(%rsp), %xmm13
	movdqa 136(%rsp), %xmm14
	movdqa 152(%rsp), %xmm15
	/* GenuineIntel processors have fast SIMD */
	cmpl $0x6c65746e, %ecx
	cmpl $0x49656e69, %edx
	cmpl $0x756e6547, %ebx
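	/* the immediates are "ntel", "ineI" and "Genu" as little-endian
	 * dwords: the CPUID leaf-0 vendor string "GenuineIntel" returned
	 * in %ebx:%edx:%ecx */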
	movdqa 0(%rdi), %xmm8
	movdqa 16(%rdi), %xmm9
	movdqa 32(%rdi), %xmm10
	movdqa 48(%rdi), %xmm11
	movdqa 64(%rdi), %xmm12
	movdqa 80(%rdi), %xmm13
	movdqa 96(%rdi), %xmm14
	movdqa 112(%rdi), %xmm15
	leaq 131072(%rsi), %rcx
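	/* 131072 = 1024 * 128: the scratchpad V at %rsi holds N = 1024
	 * blocks of 128 bytes, and %rcx points just past its end. The
	 * first loop fills V sequentially, mixing X (in %xmm8-%xmm15)
	 * with Salsa20/8 after each store. */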
scrypt_core_gen_loop1:
	movdqa %xmm8, 0(%rsi)
	movdqa %xmm9, 16(%rsi)
	movdqa %xmm10, 32(%rsi)
	movdqa %xmm11, 48(%rsi)
	movdqa %xmm12, 64(%rsi)
	movdqa %xmm13, 80(%rsi)
	movdqa %xmm14, 96(%rsi)
	movdqa %xmm15, 112(%rsi)
	movdqa %xmm8, 0(%rsp)
	movdqa %xmm9, 16(%rsp)
	movdqa %xmm10, 32(%rsp)
	movdqa %xmm11, 48(%rsp)
	movdqa %xmm12, 0(%rsp)
	movdqa %xmm13, 16(%rsp)
	movdqa %xmm14, 32(%rsp)
	movdqa %xmm15, 48(%rsp)
	jne scrypt_core_gen_loop1
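
/* Second phase: 1024 pseudo-random reads back from V. Approximately,
 * in C, the two loops compute (sketch only, word indices for r = 1):
 *
 *	for (i = 0; i < 1024; i++) {       // scrypt_core_gen_loop1
 *		V[i] = X;
 *		X = blockmix_salsa8(X);
 *	}
 *	for (i = 0; i < 1024; i++) {       // scrypt_core_gen_loop2
 *		j = X.w[16] & 1023;        // "integerify": first word of
 *		X ^= V[j];                 // X's second 64-byte half
 *		X = blockmix_salsa8(X);
 *	}
 *
 * where X is the 128-byte block from *%rdi and V the scratchpad at
 * %rsi.
 */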
scrypt_core_gen_loop2:
	movdqa 0(%rdx), %xmm0
	movdqa 16(%rdx), %xmm1
	movdqa 32(%rdx), %xmm2
	movdqa 48(%rdx), %xmm3
	movdqa 64(%rdx), %xmm4
	movdqa 80(%rdx), %xmm5
	movdqa 96(%rdx), %xmm6
	movdqa 112(%rdx), %xmm7
	movdqa %xmm8, 0(%rsp)
	movdqa %xmm9, 16(%rsp)
	movdqa %xmm10, 32(%rsp)
	movdqa %xmm11, 48(%rsp)
	movdqa %xmm12, 0(%rsp)
	movdqa %xmm13, 16(%rsp)
	movdqa %xmm14, 32(%rsp)
	movdqa %xmm15, 48(%rsp)
	ja scrypt_core_gen_loop2
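	/* second phase done: write the mixed block X back to *%rdi */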
	movdqa %xmm8, 0(%rdi)
	movdqa %xmm9, 16(%rdi)
	movdqa %xmm10, 32(%rdi)
	movdqa %xmm11, 48(%rdi)
	movdqa %xmm12, 64(%rdi)
	movdqa %xmm13, 80(%rdi)
	movdqa %xmm14, 96(%rdi)
	movdqa %xmm15, 112(%rdi)
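
/* SSE2 Salsa20/8 double round. In the shuffled layout each xmm
 * register holds one diagonal of the 4x4 state, so every quarterround
 * operates on four words at once. The pshufd immediates realign the
 * diagonals between steps: $0x93 moves each word up one lane
 * (i -> i+1 mod 4), $0x39 moves each word down one lane, and $0x4e
 * swaps the two 64-bit halves.
 */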
.macro salsa8_core_xmm_doubleround
	pshufd $0x93, %xmm3, %xmm3
	pshufd $0x4e, %xmm2, %xmm2
	pshufd $0x39, %xmm1, %xmm1
	pshufd $0x93, %xmm1, %xmm1
	pshufd $0x4e, %xmm2, %xmm2
	pshufd $0x39, %xmm3, %xmm3
.endm
.macro salsa8_core_xmm
	salsa8_core_xmm_doubleround
	salsa8_core_xmm_doubleround
	salsa8_core_xmm_doubleround
	salsa8_core_xmm_doubleround
.endm
	movdqa 0(%rdi), %xmm8
	movdqa 16(%rdi), %xmm11
	movdqa 32(%rdi), %xmm10
	movdqa 48(%rdi), %xmm9
	pshufd $0x4e, %xmm10, %xmm10
	punpcklqdq %xmm10, %xmm8
	punpckhqdq %xmm0, %xmm10
	pshufd $0x4e, %xmm9, %xmm9
	punpcklqdq %xmm9, %xmm11
	punpckhqdq %xmm0, %xmm9
	movdqa 64(%rdi), %xmm12
	movdqa 80(%rdi), %xmm15
	movdqa 96(%rdi), %xmm14
	movdqa 112(%rdi), %xmm13
	pshufd $0x4e, %xmm14, %xmm14
	punpcklqdq %xmm14, %xmm12
	punpckhqdq %xmm0, %xmm14
	pshufd $0x4e, %xmm13, %xmm13
	punpcklqdq %xmm13, %xmm15
	punpckhqdq %xmm0, %xmm13
	leaq 131072(%rsi), %rcx
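	/* end-of-scratchpad bound, as in the generic path
	 * (1024 blocks * 128 bytes) */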
scrypt_core_xmm_loop1:
	movdqa %xmm8, 0(%rdx)
	movdqa %xmm9, 16(%rdx)
	movdqa %xmm10, 32(%rdx)
	movdqa %xmm11, 48(%rdx)
	movdqa %xmm12, 64(%rdx)
	movdqa %xmm13, 80(%rdx)
	movdqa %xmm14, 96(%rdx)
	movdqa %xmm15, 112(%rdx)
	jne scrypt_core_xmm_loop1
scrypt_core_xmm_loop2:
	pxor 0(%rsi, %rdx), %xmm8
	pxor 16(%rsi, %rdx), %xmm9
	pxor 32(%rsi, %rdx), %xmm10
	pxor 48(%rsi, %rdx), %xmm11
	pxor 64(%rsi, %rdx), %xmm12
	pxor 80(%rsi, %rdx), %xmm13
	pxor 96(%rsi, %rdx), %xmm14
	pxor 112(%rsi, %rdx), %xmm15
	ja scrypt_core_xmm_loop2
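	/* invert the register-word transposition done at function entry:
	 * the pshufd/punpcklqdq/punpckhqdq triples below interleave the
	 * 64-bit halves back into linear order before the final block is
	 * stored to *%rdi */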
	pshufd $0x4e, %xmm10, %xmm10
	punpcklqdq %xmm10, %xmm8
	punpckhqdq %xmm0, %xmm10
	pshufd $0x4e, %xmm11, %xmm11
	punpcklqdq %xmm11, %xmm9
	punpckhqdq %xmm0, %xmm11
	movdqa %xmm8, 0(%rdi)
	movdqa %xmm11, 16(%rdi)
	movdqa %xmm10, 32(%rdi)
	movdqa %xmm9, 48(%rdi)
	pshufd $0x4e, %xmm14, %xmm14
	punpcklqdq %xmm14, %xmm12
	punpckhqdq %xmm0, %xmm14
	pshufd $0x4e, %xmm15, %xmm15
	punpcklqdq %xmm15, %xmm13
	punpckhqdq %xmm0, %xmm15
	movdqa %xmm12, 64(%rdi)
	movdqa %xmm15, 80(%rdi)
	movdqa %xmm14, 96(%rdi)
	movdqa %xmm13, 112(%rdi)