/*
 * Copyright 2011-2012 pooler@litecoinpool.org
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#if defined(__linux__) && defined(__ELF__)
	.section .note.GNU-stack,"",%progbits
#endif
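
/*
 * scrypt_shuffle copies the 64-byte block at \so(\src) to \do(\dest),
 * permuting its sixteen 32-bit words into the diagonal order used by the
 * SSE2 Salsa20/8 rounds.  The permutation is an involution (it consists
 * only of swaps and fixed points), so the same macro also shuffles the
 * data back at the end of scrypt_core_sse2.
 */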
.macro scrypt_shuffle src, so, dest, do
	movl \so+60(\src), %eax
	movl \so+44(\src), %ebx
	movl \so+28(\src), %ecx
	movl \so+12(\src), %edx
	movl %eax, \do+12(\dest)
	movl %ebx, \do+28(\dest)
	movl %ecx, \do+44(\dest)
	movl %edx, \do+60(\dest)
	movl \so+40(\src), %eax
	movl \so+8(\src), %ebx
	movl \so+48(\src), %ecx
	movl \so+16(\src), %edx
	movl %eax, \do+8(\dest)
	movl %ebx, \do+40(\dest)
	movl %ecx, \do+16(\dest)
	movl %edx, \do+48(\dest)
	movl \so+20(\src), %eax
	movl \so+4(\src), %ebx
	movl \so+52(\src), %ecx
	movl \so+36(\src), %edx
	movl %eax, \do+4(\dest)
	movl %ebx, \do+20(\dest)
	movl %ecx, \do+36(\dest)
	movl %edx, \do+52(\dest)
	movl \so+0(\src), %eax
	movl \so+24(\src), %ebx
	movl \so+32(\src), %ecx
	movl \so+56(\src), %edx
	movl %eax, \do+0(\dest)
	movl %ebx, \do+24(\dest)
	movl %ecx, \do+32(\dest)
	movl %edx, \do+56(\dest)
.endm
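
/*
 * One quadround below is four Salsa20 rounds; salsa8_core_gen applies it
 * twice to get the eight rounds of Salsa20/8.  Each quarter round follows
 * the pattern
 *	b ^= (a + d) <<< 7;  c ^= (b + a) <<< 9;
 *	d ^= (c + b) <<< 13; a ^= (d + c) <<< 18;
 * and the leal instructions serve as three-operand adds that compute
 * those sums without clobbering either source register.
 */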
.macro salsa8_core_gen_quadround
	leal (%ecx, %edx), %edi
	leal (%edx, %ebx), %ebp
	leal (%esi, %ebp), %edi
	leal (%ebp, %ebx), %ecx
	leal (%ecx, %edx), %edi
	leal (%edx, %ebx), %esi
	leal (%esi, %ebp), %edi
	leal (%ebp, %ebx), %ecx
	leal (%ecx, %edx), %edi
	leal (%edx, %ebx), %esi
	leal (%esi, %ebp), %edi
	leal (%ebp, %ebx), %ecx
	leal (%ecx, %edx), %edi
	leal (%edx, %ebx), %esi
	leal (%esi, %ebp), %edi
	leal (%ebp, %ebx), %ecx
	leal (%ecx, %edx), %edi
	leal (%edx, %ebx), %edi
	leal (%edi, %ebp), %esi
	leal (%ebp, %ebx), %ecx
	leal (%ecx, %edx), %esi
	leal (%edx, %ebx), %edi
	leal (%edi, %ebp), %esi
	leal (%ebp, %ebx), %ecx
	leal (%ecx, %edx), %esi
	leal (%edx, %ebx), %edi
	leal (%edi, %ebp), %esi
	leal (%ebp, %ebx), %ecx
	leal (%ecx, %edx), %esi
	leal (%edx, %ebx), %edi
	leal (%edi, %ebp), %esi
	leal (%ebp, %ebx), %ecx
.endm

.macro salsa8_core_gen
	salsa8_core_gen_quadround
	salsa8_core_gen_quadround
.endm
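
/*
 * CPUID leaf 1 reports SSE2 support in bit 26 of %edx; the mask
 * 0x04000000 below isolates that bit to choose between the generic
 * and the SSE2 code paths.
 */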
	/* Check for SSE2 availability */
	andl $0x04000000, %edx
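
/*
 * Macros for the generic scrypt core.  Each one handles a pair of 32-bit
 * words: \p in the first 64-byte half of X and \q in the second.
 * scrypt_core_macro1a saves X into the current scratchpad block before
 * mixing; scrypt_core_macro1b does the same but first XORs in the
 * scratchpad block selected by the byte offset in %edx; macro2 and
 * macro3 fold the Salsa20/8 results back into the two halves of X.
 */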
.macro scrypt_core_macro1a p, q
.endm

.macro scrypt_core_macro1b p, q
	xorl \p(%esi, %edx), %eax
	xorl \q(%esi, %edx), %ebx
.endm

.macro scrypt_core_macro2 p, q
.endm

.macro scrypt_core_macro3 p, q
.endm
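
/*
 * Loop 1 of scrypt: write X to each of the 1024 scratchpad blocks in
 * turn, mixing X with Salsa20/8 after every store.  At 128 bytes per
 * block the scratchpad ends 131072 bytes (128 KiB) past its base, which
 * is the bound computed into %ecx.
 */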
	leal 131072(%esi), %ecx
scrypt_core_gen_loop1:
	scrypt_core_macro1a 0, 64
	scrypt_core_macro1a 4, 68
	scrypt_core_macro1a 8, 72
	scrypt_core_macro1a 12, 76
	scrypt_core_macro1a 16, 80
	scrypt_core_macro1a 20, 84
	scrypt_core_macro1a 24, 88
	scrypt_core_macro1a 28, 92
	scrypt_core_macro1a 32, 96
	scrypt_core_macro1a 36, 100
	scrypt_core_macro1a 40, 104
	scrypt_core_macro1a 44, 108
	scrypt_core_macro1a 48, 112
	scrypt_core_macro1a 52, 116
	scrypt_core_macro1a 56, 120
	scrypt_core_macro1a 60, 124

	scrypt_core_macro2 0, 64
	scrypt_core_macro2 4, 68
	scrypt_core_macro2 8, 72
	scrypt_core_macro2 12, 76
	scrypt_core_macro2 16, 80
	scrypt_core_macro2 20, 84
	scrypt_core_macro2 24, 88
	scrypt_core_macro2 28, 92
	scrypt_core_macro2 32, 96
	scrypt_core_macro2 36, 100
	scrypt_core_macro2 40, 104
	scrypt_core_macro2 44, 108
	scrypt_core_macro2 48, 112
	scrypt_core_macro2 52, 116
	scrypt_core_macro2 56, 120
	scrypt_core_macro2 60, 124

	scrypt_core_macro3 0, 64
	scrypt_core_macro3 4, 68
	scrypt_core_macro3 8, 72
	scrypt_core_macro3 12, 76
	scrypt_core_macro3 16, 80
	scrypt_core_macro3 20, 84
	scrypt_core_macro3 24, 88
	scrypt_core_macro3 28, 92
	scrypt_core_macro3 32, 96
	scrypt_core_macro3 36, 100
	scrypt_core_macro3 40, 104
	scrypt_core_macro3 44, 108
	scrypt_core_macro3 48, 112
	scrypt_core_macro3 52, 116
	scrypt_core_macro3 56, 120
	scrypt_core_macro3 60, 124

	jne scrypt_core_gen_loop1
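
/*
 * Loop 2 of scrypt: for each of 1024 iterations, a word of X picks one
 * of the scratchpad blocks (the byte offset lands in %edx), that block
 * is XORed into X, and X is mixed with Salsa20/8 again.
 */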
scrypt_core_gen_loop2:
	scrypt_core_macro1b 0, 64
	scrypt_core_macro1b 4, 68
	scrypt_core_macro1b 8, 72
	scrypt_core_macro1b 12, 76
	scrypt_core_macro1b 16, 80
	scrypt_core_macro1b 20, 84
	scrypt_core_macro1b 24, 88
	scrypt_core_macro1b 28, 92
	scrypt_core_macro1b 32, 96
	scrypt_core_macro1b 36, 100
	scrypt_core_macro1b 40, 104
	scrypt_core_macro1b 44, 108
	scrypt_core_macro1b 48, 112
	scrypt_core_macro1b 52, 116
	scrypt_core_macro1b 56, 120
	scrypt_core_macro1b 60, 124

	scrypt_core_macro2 0, 64
	scrypt_core_macro2 4, 68
	scrypt_core_macro2 8, 72
	scrypt_core_macro2 12, 76
	scrypt_core_macro2 16, 80
	scrypt_core_macro2 20, 84
	scrypt_core_macro2 24, 88
	scrypt_core_macro2 28, 92
	scrypt_core_macro2 32, 96
	scrypt_core_macro2 36, 100
	scrypt_core_macro2 40, 104
	scrypt_core_macro2 44, 108
	scrypt_core_macro2 48, 112
	scrypt_core_macro2 52, 116
	scrypt_core_macro2 56, 120
	scrypt_core_macro2 60, 124

	scrypt_core_macro3 0, 64
	scrypt_core_macro3 4, 68
	scrypt_core_macro3 8, 72
	scrypt_core_macro3 12, 76
	scrypt_core_macro3 16, 80
	scrypt_core_macro3 20, 84
	scrypt_core_macro3 24, 88
	scrypt_core_macro3 28, 92
	scrypt_core_macro3 32, 96
	scrypt_core_macro3 36, 100
	scrypt_core_macro3 40, 104
	scrypt_core_macro3 44, 108
	scrypt_core_macro3 48, 112
	scrypt_core_macro3 52, 116
	scrypt_core_macro3 56, 120
	scrypt_core_macro3 60, 124

	ja scrypt_core_gen_loop2
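
/*
 * SSE2 Salsa20/8 core.  A doubleround is a column round followed by a
 * row round; the pshufd shuffles ($0x93, $0x4e and $0x39 rotate a
 * register by one, two and three word positions) realign the state
 * diagonals between the two half-rounds so that every quarter round
 * operates on whole XMM registers.
 */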
.macro salsa8_core_sse2_doubleround
	pshufd $0x93, %xmm3, %xmm3

	pshufd $0x4e, %xmm2, %xmm2

	pshufd $0x39, %xmm1, %xmm1

	pshufd $0x93, %xmm1, %xmm1

	pshufd $0x4e, %xmm2, %xmm2

	pshufd $0x39, %xmm3, %xmm3
.endm
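
/* Four doublerounds make up the eight rounds of Salsa20/8. */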
.macro salsa8_core_sse2
	salsa8_core_sse2_doubleround
	salsa8_core_sse2_doubleround
	salsa8_core_sse2_doubleround
	salsa8_core_sse2_doubleround
.endm
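
/*
 * scrypt_core_sse2 keeps the shuffled 128-byte X on the stack; %xmm6 and
 * %xmm7 cache its last 32 bytes across loop iterations so they need not
 * be reloaded on every pass.
 */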
	scrypt_shuffle %edi, 0, %esp, 0
	scrypt_shuffle %edi, 64, %esp, 64

	movdqa 96(%esp), %xmm6
	movdqa 112(%esp), %xmm7

	leal 131072(%esi), %ecx
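
/*
 * Loop 1: copy the current X into the scratchpad block addressed by
 * %edx, then mix X with salsa8_core_sse2.
 */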
scrypt_core_sse2_loop1:
	movdqa 0(%esp), %xmm0
	movdqa 16(%esp), %xmm1
	movdqa 32(%esp), %xmm2
	movdqa 48(%esp), %xmm3
	movdqa 64(%esp), %xmm4
	movdqa 80(%esp), %xmm5

	movdqa %xmm0, 0(%edx)
	movdqa %xmm1, 16(%edx)

	movdqa %xmm2, 32(%edx)
	movdqa %xmm3, 48(%edx)
	movdqa %xmm4, 64(%edx)
	movdqa %xmm5, 80(%edx)
	movdqa %xmm6, 96(%edx)
	movdqa %xmm7, 112(%edx)

	paddd 16(%edx), %xmm1
	paddd 32(%edx), %xmm2
	paddd 48(%edx), %xmm3
	movdqa %xmm0, 0(%esp)
	movdqa %xmm1, 16(%esp)
	movdqa %xmm2, 32(%esp)
	movdqa %xmm3, 48(%esp)

	movdqa %xmm0, 64(%esp)
	movdqa %xmm1, 80(%esp)

	paddd 64(%esp), %xmm0
	paddd 80(%esp), %xmm1

	movdqa %xmm0, 64(%esp)
	movdqa %xmm1, 80(%esp)

	jne scrypt_core_sse2_loop1

	movdqa 64(%esp), %xmm4
	movdqa 80(%esp), %xmm5
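
/*
 * Loop 2: the pxor instructions XOR the data-dependent scratchpad block
 * at (%esi, %edx) into X before each Salsa20/8 mix.
 */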
scrypt_core_sse2_loop2:
	movdqa 0(%esp), %xmm0
	movdqa 16(%esp), %xmm1
	movdqa 32(%esp), %xmm2
	movdqa 48(%esp), %xmm3

	pxor 0(%esi, %edx), %xmm0
	pxor 16(%esi, %edx), %xmm1
	pxor 32(%esi, %edx), %xmm2
	pxor 48(%esi, %edx), %xmm3

	movdqa %xmm0, 0(%esp)
	movdqa %xmm1, 16(%esp)
	movdqa %xmm2, 32(%esp)
	movdqa %xmm3, 48(%esp)

	paddd 16(%esp), %xmm1
	paddd 32(%esp), %xmm2
	paddd 48(%esp), %xmm3
	movdqa %xmm0, 0(%esp)
	movdqa %xmm1, 16(%esp)
	movdqa %xmm2, 32(%esp)
	movdqa %xmm3, 48(%esp)

	pxor 64(%esi, %edx), %xmm0
	pxor 80(%esi, %edx), %xmm1
	pxor 96(%esi, %edx), %xmm2
	pxor 112(%esi, %edx), %xmm3

	movdqa %xmm0, 64(%esp)
	movdqa %xmm1, 80(%esp)

	paddd 64(%esp), %xmm0
	paddd 80(%esp), %xmm1

	movdqa %xmm0, 64(%esp)
	movdqa %xmm1, 80(%esp)

	ja scrypt_core_sse2_loop2

	movdqa %xmm6, 96(%esp)
	movdqa %xmm7, 112(%esp)
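
/* Shuffle the result back into linear order and store it to X. */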
	scrypt_shuffle %esp, 0, %edi, 0
	scrypt_shuffle %esp, 64, %edi, 64