# Copyright 2011 pooler@litecoinpool.org
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#if defined(__linux__) && defined(__ELF__)
        .section .note.GNU-stack,"",%progbits
#endif
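
# Generic x86 (non-SSE2) Salsa20/8 core. The quadround macro below appears to
# run four Salsa20 rounds over a 16-word state kept in the stack frame at
# 4(%esp)..64(%esp), cycling the words through the general-purpose registers;
# the leal instructions form the word sums that feed the rotate-and-xor steps.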
#define gen_salsa8_core_quadround() \
        movl 52(%esp), %ecx; \
        movl 20(%esp), %ebx; \
        leal (%ecx, %edx), %edi; \
        movl 36(%esp), %edi; \
        leal (%edx, %ebx), %ebp; \
        movl 24(%esp), %ebp; \
        movl 40(%esp), %ebx; \
        movl %ecx, 20(%esp); \
        leal (%esi, %ebp), %edi; \
        movl %ebx, 24(%esp); \
        movl 56(%esp), %edi; \
        leal (%ebp, %ebx), %ecx; \
        movl %edi, 36(%esp); \
        movl 28(%esp), %ecx; \
        movl %edx, 28(%esp); \
        movl 44(%esp), %edx; \
        movl 60(%esp), %ebx; \
        movl %esi, 40(%esp); \
        leal (%ecx, %edx), %edi; \
        movl %ebx, 44(%esp); \
        movl 12(%esp), %edi; \
        leal (%edx, %ebx), %esi; \
        movl %edi, 12(%esp); \
        movl 48(%esp), %esi; \
        movl %ebp, 48(%esp); \
        movl 64(%esp), %ebp; \
        movl 16(%esp), %ebx; \
        movl %ecx, 16(%esp); \
        leal (%esi, %ebp), %edi; \
        movl 32(%esp), %edi; \
        leal (%ebp, %ebx), %ecx; \
        movl %edi, 32(%esp); \
        movl %edx, 52(%esp); \
        movl 28(%esp), %edx; \
        movl 40(%esp), %ebx; \
        movl %esi, 28(%esp); \
        leal (%ecx, %edx), %edi; \
        movl %ebx, 40(%esp); \
        movl 12(%esp), %edi; \
        leal (%edx, %ebx), %esi; \
        movl %edi, 12(%esp); \
        movl 4(%esp), %esi; \
        movl %ebp, 4(%esp); \
        movl 48(%esp), %ebp; \
        movl 16(%esp), %ebx; \
        movl %ecx, 16(%esp); \
        leal (%esi, %ebp), %edi; \
        movl %ebx, 48(%esp); \
        movl 32(%esp), %edi; \
        leal (%ebp, %ebx), %ecx; \
        movl %edi, 32(%esp); \
        movl 24(%esp), %ecx; \
        movl %edx, 24(%esp); \
        movl 52(%esp), %edx; \
        movl 28(%esp), %ebx; \
        movl %esi, 28(%esp); \
        leal (%ecx, %edx), %edi; \
        movl %ebx, 52(%esp); \
        movl 8(%esp), %edi; \
        leal (%edx, %ebx), %esi; \
        movl %edi, 8(%esp); \
        movl 44(%esp), %esi; \
        movl %ebp, 44(%esp); \
        movl 4(%esp), %ebp; \
        movl 20(%esp), %ebx; \
        movl %ecx, 4(%esp); \
        leal (%esi, %ebp), %edi; \
        movl 36(%esp), %edi; \
        leal (%ebp, %ebx), %ecx; \
        movl %edi, 20(%esp); \
        movl %edx, 36(%esp); \
        movl 24(%esp), %edx; \
        movl 28(%esp), %ebx; \
        movl %esi, 24(%esp); \
        leal (%ecx, %edx), %edi; \
        movl %ebx, 28(%esp); \
        movl 8(%esp), %esi; \
        leal (%edx, %ebx), %edi; \
        movl 40(%esp), %edi; \
        movl %ebp, 8(%esp); \
        movl 44(%esp), %ebp; \
        movl %esi, 40(%esp); \
        movl 4(%esp), %ebx; \
        movl %ecx, 44(%esp); \
        leal (%edi, %ebp), %esi; \
        movl %ebx, 4(%esp); \
        movl 20(%esp), %esi; \
        leal (%ebp, %ebx), %ecx; \
        movl %esi, 56(%esp); \
        movl 48(%esp), %ecx; \
        movl %edx, 20(%esp); \
        movl 36(%esp), %edx; \
        movl 24(%esp), %ebx; \
        movl %edi, 24(%esp); \
        leal (%ecx, %edx), %esi; \
        movl %ebx, 60(%esp); \
        movl 12(%esp), %esi; \
        leal (%edx, %ebx), %edi; \
        movl %esi, 12(%esp); \
        movl 52(%esp), %edi; \
        movl %ebp, 36(%esp); \
        movl 8(%esp), %ebp; \
        movl 16(%esp), %ebx; \
        movl %ecx, 16(%esp); \
        leal (%edi, %ebp), %esi; \
        movl 32(%esp), %esi; \
        leal (%ebp, %ebx), %ecx; \
        movl %esi, 32(%esp); \
        movl %edx, 48(%esp); \
        movl 20(%esp), %edx; \
        movl 24(%esp), %ebx; \
        movl %edi, 20(%esp); \
        leal (%ecx, %edx), %esi; \
        movl %ebx, 8(%esp); \
        movl 12(%esp), %esi; \
        leal (%edx, %ebx), %edi; \
        movl %esi, 12(%esp); \
        movl 28(%esp), %edi; \
        movl %ebp, 52(%esp); \
        movl 36(%esp), %ebp; \
        movl 16(%esp), %ebx; \
        movl %ecx, 16(%esp); \
        leal (%edi, %ebp), %esi; \
        movl %ebx, 28(%esp); \
        movl 32(%esp), %esi; \
        leal (%ebp, %ebx), %ecx; \
        movl %esi, 32(%esp); \
        movl 4(%esp), %ecx; \
        movl %edx, 4(%esp); \
        movl 48(%esp), %edx; \
        movl 20(%esp), %ebx; \
        movl %edi, 20(%esp); \
        leal (%ecx, %edx), %esi; \
        movl %ebx, 48(%esp); \
        movl 40(%esp), %esi; \
        leal (%edx, %ebx), %edi; \
        movl %esi, 36(%esp); \
        movl 60(%esp), %edi; \
        movl %ebp, 24(%esp); \
        movl 52(%esp), %ebp; \
        movl 44(%esp), %ebx; \
        movl %ecx, 40(%esp); \
        leal (%edi, %ebp), %esi; \
        movl %ebx, 52(%esp); \
        movl 56(%esp), %esi; \
        leal (%ebp, %ebx), %ecx; \
        movl %esi, 56(%esp); \
        movl %edx, 44(%esp); \
        movl %edi, 60(%esp); \
        movl %ebp, 64(%esp); \

        gen_salsa8_core_quadround()
        gen_salsa8_core_quadround()
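        # Two quadrounds make up the eight rounds of Salsa20/8.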
        # Check for SSE2 availability
        andl $0x04000000, %edx
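        # (CPUID leaf 1 reports SSE2 in bit 26 of EDX; 0x04000000 is 1 << 26.
        #  When the bit is set, control presumably branches to the SSE2 path
        #  further down.)

# Generic scrypt_core macros. In this path %edi appears to point at the 32-word
# working state X, %esi at the scratchpad V, and the stack frame holds a
# scratch copy of the words fed to the Salsa20/8 core. macro1a(p, q) copies the
# words of X at byte offsets p and q = p + 64 into the current scratchpad slot
# and leaves their xor at p(%edi) and p(%esp), presumably as the input to the
# first Salsa20/8 pass of BlockMix.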
#define scrypt_core_macro1a(p, q) \
        movl p(%edi), %eax; \
        movl q(%edi), %edx; \
        movl %eax, p(%esi); \
        movl %edx, q(%esi); \
        xorl %edx, %eax; \
        movl %eax, p(%edi); \
        movl %eax, p(%esp); \

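# macro1b(p, q) is the read-side counterpart used by the second loop: it first
# xors the matching words of the selected scratchpad entry (addressed as
# p(%esi, %edx) and q(%esi, %edx)) into X, then leaves the xor of the two
# updated words at p(%edi) and p(%esp).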
#define scrypt_core_macro1b(p, q) \
        movl p(%edi), %eax; \
        xorl p(%esi, %edx), %eax; \
        movl q(%edi), %ebx; \
        xorl q(%esi, %edx), %ebx; \
        movl %ebx, q(%edi); \
        xorl %ebx, %eax; \
        movl %eax, p(%edi); \
        movl %eax, p(%esp); \

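# macro2(p, q) adds the word saved at p(%esp) into p(%edi), xors the sum into
# q(%edi), and keeps the xor result at p(%esp); this appears to be the
# feed-forward after the first Salsa20/8 pass plus the input setup for the
# second pass.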
#define scrypt_core_macro2(p, q) \
        movl p(%esp), %eax; \
        addl p(%edi), %eax; \
        movl %eax, p(%edi); \
        xorl q(%edi), %eax; \
        movl %eax, q(%edi); \
        movl %eax, p(%esp); \

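# macro3(p, q) adds the word saved at p(%esp) into q(%edi), presumably the
# feed-forward after the second Salsa20/8 pass.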
#define scrypt_core_macro3(p, q) \
        movl p(%esp), %eax; \
        addl q(%edi), %eax; \
        movl %eax, q(%edi); \

        leal 131072(%esi), %ecx
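        # 131072 = 1024 * 128: the scratchpad holds N = 1024 entries of 128
        # bytes (two 64-byte Salsa20/8 blocks each), so %ecx presumably marks
        # its end and serves as the bound for the fill loop below.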
gen_scrypt_core_loop1:
        scrypt_core_macro1a(0, 64)
        scrypt_core_macro1a(4, 68)
        scrypt_core_macro1a(8, 72)
        scrypt_core_macro1a(12, 76)
        scrypt_core_macro1a(16, 80)
        scrypt_core_macro1a(20, 84)
        scrypt_core_macro1a(24, 88)
        scrypt_core_macro1a(28, 92)
        scrypt_core_macro1a(32, 96)
        scrypt_core_macro1a(36, 100)
        scrypt_core_macro1a(40, 104)
        scrypt_core_macro1a(44, 108)
        scrypt_core_macro1a(48, 112)
        scrypt_core_macro1a(52, 116)
        scrypt_core_macro1a(56, 120)
        scrypt_core_macro1a(60, 124)

        scrypt_core_macro2(0, 64)
        scrypt_core_macro2(4, 68)
        scrypt_core_macro2(8, 72)
        scrypt_core_macro2(12, 76)
        scrypt_core_macro2(16, 80)
        scrypt_core_macro2(20, 84)
        scrypt_core_macro2(24, 88)
        scrypt_core_macro2(28, 92)
        scrypt_core_macro2(32, 96)
        scrypt_core_macro2(36, 100)
        scrypt_core_macro2(40, 104)
        scrypt_core_macro2(44, 108)
        scrypt_core_macro2(48, 112)
        scrypt_core_macro2(52, 116)
        scrypt_core_macro2(56, 120)
        scrypt_core_macro2(60, 124)

        scrypt_core_macro3(0, 64)
        scrypt_core_macro3(4, 68)
        scrypt_core_macro3(8, 72)
        scrypt_core_macro3(12, 76)
        scrypt_core_macro3(16, 80)
        scrypt_core_macro3(20, 84)
        scrypt_core_macro3(24, 88)
        scrypt_core_macro3(28, 92)
        scrypt_core_macro3(32, 96)
        scrypt_core_macro3(36, 100)
        scrypt_core_macro3(40, 104)
        scrypt_core_macro3(44, 108)
        scrypt_core_macro3(48, 112)
        scrypt_core_macro3(52, 116)
        scrypt_core_macro3(56, 120)
        scrypt_core_macro3(60, 124)

        jne gen_scrypt_core_loop1
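        # End of the first scrypt loop: each pass stores X into the next
        # scratchpad slot (macro1a), runs the two Salsa20/8 passes with their
        # feed-forward adds (macro2, macro3), and presumably advances %esi by
        # 128 bytes until it reaches the bound kept in %ecx.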
gen_scrypt_core_loop2:
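        # Second scrypt loop: 1024 iterations of X ^= V[j]; X = BlockMix(X).
        # The index j is presumably taken from word 16 of X, masked to 0..1023
        # and scaled by 128 to form the byte offset kept in %edx.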
        scrypt_core_macro1b(0, 64)
        scrypt_core_macro1b(4, 68)
        scrypt_core_macro1b(8, 72)
        scrypt_core_macro1b(12, 76)
        scrypt_core_macro1b(16, 80)
        scrypt_core_macro1b(20, 84)
        scrypt_core_macro1b(24, 88)
        scrypt_core_macro1b(28, 92)
        scrypt_core_macro1b(32, 96)
        scrypt_core_macro1b(36, 100)
        scrypt_core_macro1b(40, 104)
        scrypt_core_macro1b(44, 108)
        scrypt_core_macro1b(48, 112)
        scrypt_core_macro1b(52, 116)
        scrypt_core_macro1b(56, 120)
        scrypt_core_macro1b(60, 124)

        scrypt_core_macro2(0, 64)
        scrypt_core_macro2(4, 68)
        scrypt_core_macro2(8, 72)
        scrypt_core_macro2(12, 76)
        scrypt_core_macro2(16, 80)
        scrypt_core_macro2(20, 84)
        scrypt_core_macro2(24, 88)
        scrypt_core_macro2(28, 92)
        scrypt_core_macro2(32, 96)
        scrypt_core_macro2(36, 100)
        scrypt_core_macro2(40, 104)
        scrypt_core_macro2(44, 108)
        scrypt_core_macro2(48, 112)
        scrypt_core_macro2(52, 116)
        scrypt_core_macro2(56, 120)
        scrypt_core_macro2(60, 124)

        scrypt_core_macro3(0, 64)
        scrypt_core_macro3(4, 68)
        scrypt_core_macro3(8, 72)
        scrypt_core_macro3(12, 76)
        scrypt_core_macro3(16, 80)
        scrypt_core_macro3(20, 84)
        scrypt_core_macro3(24, 88)
        scrypt_core_macro3(28, 92)
        scrypt_core_macro3(32, 96)
        scrypt_core_macro3(36, 100)
        scrypt_core_macro3(40, 104)
        scrypt_core_macro3(44, 108)
        scrypt_core_macro3(48, 112)
        scrypt_core_macro3(52, 116)
        scrypt_core_macro3(56, 120)
        scrypt_core_macro3(60, 124)

        ja gen_scrypt_core_loop2
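
# SSE2 implementation. The Salsa20/8 state is kept in %xmm0..%xmm3, one
# shuffled "diagonal" of four words per register; the pshufd instructions in
# the double-round macro below rotate the diagonals between half-rounds, and
# each movdqa %xmm4, %xmm5 copy presumably feeds a shift pair that realizes
# the 32-bit rotate.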
#define xmm_salsa8_core_doubleround() \
        movdqa %xmm1, %xmm4; \
        paddd %xmm0, %xmm4; \
        movdqa %xmm4, %xmm5; \
        movdqa %xmm0, %xmm4; \
        paddd %xmm3, %xmm4; \
        movdqa %xmm4, %xmm5; \
        movdqa %xmm3, %xmm4; \
        pshufd $0x93, %xmm3, %xmm3; \
        paddd %xmm2, %xmm4; \
        movdqa %xmm4, %xmm5; \
        movdqa %xmm2, %xmm4; \
        pshufd $0x4e, %xmm2, %xmm2; \
        paddd %xmm1, %xmm4; \
        movdqa %xmm4, %xmm5; \
        pshufd $0x39, %xmm1, %xmm1; \
        movdqa %xmm3, %xmm4; \
        paddd %xmm0, %xmm4; \
        movdqa %xmm4, %xmm5; \
        movdqa %xmm0, %xmm4; \
        paddd %xmm1, %xmm4; \
        movdqa %xmm4, %xmm5; \
        movdqa %xmm1, %xmm4; \
        pshufd $0x93, %xmm1, %xmm1; \
        paddd %xmm2, %xmm4; \
        movdqa %xmm4, %xmm5; \
        movdqa %xmm2, %xmm4; \
        pshufd $0x4e, %xmm2, %xmm2; \
        paddd %xmm3, %xmm4; \
        movdqa %xmm4, %xmm5; \
        pshufd $0x39, %xmm3, %xmm3; \

#define xmm_salsa8_core() \
        xmm_salsa8_core_doubleround(); \
        xmm_salsa8_core_doubleround(); \
        xmm_salsa8_core_doubleround(); \
        xmm_salsa8_core_doubleround(); \

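# Four double-rounds give the eight rounds of Salsa20/8.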
        # shuffle 1st block to (%esp)
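        # (rearrange the 16 words of the block into the diagonal order expected
        # by xmm_salsa8_core, so that each of %xmm0..%xmm3 can hold one
        # diagonal of the Salsa20 matrix)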
        # shuffle 2nd block to 64(%esp)

        leal 131072(%esi), %ecx
xmm_scrypt_core_loop1:
        movdqa 0(%esp), %xmm0
        movdqa 16(%esp), %xmm1
        movdqa 32(%esp), %xmm2
        movdqa 48(%esp), %xmm3
        movdqa 64(%esp), %xmm4
        movdqa 80(%esp), %xmm5
        movdqa 96(%esp), %xmm6
        movdqa 112(%esp), %xmm7
        movdqa %xmm0, 0(%edx)
        movdqa %xmm1, 16(%edx)
        movdqa %xmm2, 32(%edx)
        movdqa %xmm3, 48(%edx)
        movdqa %xmm4, 64(%edx)
        movdqa %xmm5, 80(%edx)
        movdqa %xmm6, 96(%edx)
        movdqa %xmm7, 112(%edx)
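        # X (two 64-byte blocks) has just been written into the current
        # scratchpad slot at (%edx). What follows appears to be BlockMix: the
        # first block is xored with the second and run through Salsa20/8 with a
        # paddd feed-forward, then the second block is xored with the updated
        # first block and mixed the same way.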
        movdqa %xmm0, 0(%esp)
        movdqa %xmm1, 16(%esp)
        movdqa %xmm2, 32(%esp)
        movdqa %xmm3, 48(%esp)
        paddd 16(%esp), %xmm1
        paddd 32(%esp), %xmm2
        paddd 48(%esp), %xmm3
        movdqa %xmm0, 0(%esp)
        movdqa %xmm1, 16(%esp)
        movdqa %xmm2, 32(%esp)
        movdqa %xmm3, 48(%esp)
        pxor 112(%esp), %xmm3
        movdqa %xmm0, 64(%esp)
        movdqa %xmm1, 80(%esp)
        movdqa %xmm2, 96(%esp)
        movdqa %xmm3, 112(%esp)
        paddd 64(%esp), %xmm0
        paddd 80(%esp), %xmm1
        paddd 96(%esp), %xmm2
        paddd 112(%esp), %xmm3
        movdqa %xmm0, 64(%esp)
        movdqa %xmm1, 80(%esp)
        movdqa %xmm2, 96(%esp)
        movdqa %xmm3, 112(%esp)
        jne xmm_scrypt_core_loop1
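
        # End of the fill loop; the second loop below performs the 1024
        # scratchpad lookups, mixing a pseudo-random entry V[j] into X on each
        # iteration.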
xmm_scrypt_core_loop2:
        movdqa 0(%esp), %xmm0
        movdqa 16(%esp), %xmm1
        movdqa 32(%esp), %xmm2
        movdqa 48(%esp), %xmm3
        movdqa 64(%esp), %xmm4
        movdqa 80(%esp), %xmm5
        movdqa 96(%esp), %xmm6
        movdqa 112(%esp), %xmm7
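        # %edx presumably holds the scratchpad byte offset j * 128, with j
        # derived from word 16 of X masked to 0..1023, matching the generic
        # path above.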
        pxor 0(%esi, %edx), %xmm0
        pxor 16(%esi, %edx), %xmm1
        pxor 32(%esi, %edx), %xmm2
        pxor 48(%esi, %edx), %xmm3
        pxor 64(%esi, %edx), %xmm4
        pxor 80(%esi, %edx), %xmm5
        pxor 96(%esi, %edx), %xmm6
        pxor 112(%esi, %edx), %xmm7
        movdqa %xmm4, 64(%esp)
        movdqa %xmm5, 80(%esp)
        movdqa %xmm6, 96(%esp)
        movdqa %xmm7, 112(%esp)
        movdqa %xmm0, 0(%esp)
        movdqa %xmm1, 16(%esp)
        movdqa %xmm2, 32(%esp)
        movdqa %xmm3, 48(%esp)
        paddd 16(%esp), %xmm1
        paddd 32(%esp), %xmm2
        paddd 48(%esp), %xmm3
        movdqa %xmm0, 0(%esp)
        movdqa %xmm1, 16(%esp)
        movdqa %xmm2, 32(%esp)
        movdqa %xmm3, 48(%esp)
        pxor 112(%esp), %xmm3
        movdqa %xmm0, 64(%esp)
        movdqa %xmm1, 80(%esp)
        movdqa %xmm2, 96(%esp)
        movdqa %xmm3, 112(%esp)
        paddd 64(%esp), %xmm0
        paddd 80(%esp), %xmm1
        paddd 96(%esp), %xmm2
        paddd 112(%esp), %xmm3
        movdqa %xmm0, 64(%esp)
        movdqa %xmm1, 80(%esp)
        movdqa %xmm2, 96(%esp)
        movdqa %xmm3, 112(%esp)
        ja xmm_scrypt_core_loop2

        # re-shuffle 1st block back

        # re-shuffle 2nd block back
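        # (undo the diagonal reordering applied on entry before the results are
        # written back for the caller)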