# Copyright 2011 pooler@litecoinpool.org
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
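
# The quadround macro below is four rounds of the Salsa20 core operating on
# sixteen 32-bit words; each `leal` computes the two-word sum that feeds a
# rotate-and-xor. As a rough C sketch of one quarter-round (R and the
# variable names are illustrative, not part of this file):
#
#   #define R(x, c) (((x) << (c)) | ((x) >> (32 - (c))))
#   b ^= R(a + d,  7);
#   c ^= R(b + a,  9);
#   d ^= R(c + b, 13);
#   a ^= R(d + c, 18);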
.macro gen_salsa8_core_quadround
	leal (%ecx, %edx), %edi
	leal (%edx, %ebx), %ebp
	leal (%esi, %ebp), %edi
	leal (%ebp, %ebx), %ecx
	leal (%ecx, %edx), %edi
	leal (%edx, %ebx), %esi
	leal (%esi, %ebp), %edi
	leal (%ebp, %ebx), %ecx
	leal (%ecx, %edx), %edi
	leal (%edx, %ebx), %esi
	leal (%esi, %ebp), %edi
	leal (%ebp, %ebx), %ecx
	leal (%ecx, %edx), %edi
	leal (%edx, %ebx), %esi
	leal (%esi, %ebp), %edi
	leal (%ebp, %ebx), %ecx
	leal (%ecx, %edx), %edi
	leal (%edx, %ebx), %edi
	leal (%edi, %ebp), %esi
	leal (%ebp, %ebx), %ecx
	leal (%ecx, %edx), %esi
	leal (%edx, %ebx), %edi
	leal (%edi, %ebp), %esi
	leal (%ebp, %ebx), %ecx
	leal (%ecx, %edx), %esi
	leal (%edx, %ebx), %edi
	leal (%edi, %ebp), %esi
	leal (%ebp, %ebx), %ecx
	leal (%ecx, %edx), %esi
	leal (%edx, %ebx), %edi
	leal (%edi, %ebp), %esi
	leal (%ebp, %ebx), %ecx
	gen_salsa8_core_quadround
	gen_salsa8_core_quadround
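	# two quadrounds = eight rounds, i.e. the Salsa20/8 core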
	# Check for SSE2 availability (CPUID function 1, EDX bit 26)
	andl $0x04000000, %edx
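	# The equivalent check from C, as a sketch (has_sse2 is an
	# illustrative name, not part of this file):
	#
	#   #include <cpuid.h>
	#   static int has_sse2(void) {
	#       unsigned int a, b, c, d;
	#       if (!__get_cpuid(1, &a, &b, &c, &d))
	#           return 0;
	#       return (d >> 26) & 1;  /* EDX bit 26 = SSE2 */
	#   }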
# used by the first loop, which feeds X into the scratchpad
.macro scrypt_core_macro1a p, q

# used by the second loop: additionally xors in two words of the
# scratchpad entry selected by %edx
.macro scrypt_core_macro1b p, q
	xorl \p(%esi, %edx), %eax
	xorl \q(%esi, %edx), %ebx

.macro scrypt_core_macro2 p, q

.macro scrypt_core_macro3 p, q
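# The two loops below implement the scrypt core. In C-like pseudocode,
# assuming N = 1024 and 128-byte blocks (hence the 131072-byte scratchpad;
# V, X, X32, blockmix_salsa8 and xor_block are illustrative names):
#
#   for (i = 0; i < 1024; i++) {        /* loop1: fill the scratchpad */
#       memcpy(&V[128 * i], X, 128);
#       blockmix_salsa8(X);
#   }
#   for (i = 0; i < 1024; i++) {        /* loop2: pseudo-random reads */
#       j = X32[16] & 1023;             /* Integerify(X) mod N */
#       xor_block(X, &V[128 * j]);
#       blockmix_salsa8(X);
#   }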
	leal 131072(%esi), %ecx	# end of the 128 KiB (1024 x 128-byte) scratchpad
gen_scrypt_core_loop1:
	scrypt_core_macro1a 0, 64
	scrypt_core_macro1a 4, 68
	scrypt_core_macro1a 8, 72
	scrypt_core_macro1a 12, 76
	scrypt_core_macro1a 16, 80
	scrypt_core_macro1a 20, 84
	scrypt_core_macro1a 24, 88
	scrypt_core_macro1a 28, 92
	scrypt_core_macro1a 32, 96
	scrypt_core_macro1a 36, 100
	scrypt_core_macro1a 40, 104
	scrypt_core_macro1a 44, 108
	scrypt_core_macro1a 48, 112
	scrypt_core_macro1a 52, 116
	scrypt_core_macro1a 56, 120
	scrypt_core_macro1a 60, 124
	scrypt_core_macro2 0, 64
	scrypt_core_macro2 4, 68
	scrypt_core_macro2 8, 72
	scrypt_core_macro2 12, 76
	scrypt_core_macro2 16, 80
	scrypt_core_macro2 20, 84
	scrypt_core_macro2 24, 88
	scrypt_core_macro2 28, 92
	scrypt_core_macro2 32, 96
	scrypt_core_macro2 36, 100
	scrypt_core_macro2 40, 104
	scrypt_core_macro2 44, 108
	scrypt_core_macro2 48, 112
	scrypt_core_macro2 52, 116
	scrypt_core_macro2 56, 120
	scrypt_core_macro2 60, 124
	scrypt_core_macro3 0, 64
	scrypt_core_macro3 4, 68
	scrypt_core_macro3 8, 72
	scrypt_core_macro3 12, 76
	scrypt_core_macro3 16, 80
	scrypt_core_macro3 20, 84
	scrypt_core_macro3 24, 88
	scrypt_core_macro3 28, 92
	scrypt_core_macro3 32, 96
	scrypt_core_macro3 36, 100
	scrypt_core_macro3 40, 104
	scrypt_core_macro3 44, 108
	scrypt_core_macro3 48, 112
	scrypt_core_macro3 52, 116
	scrypt_core_macro3 56, 120
	scrypt_core_macro3 60, 124
	jne gen_scrypt_core_loop1
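	# second loop: xor in scratchpad entries selected by Integerify(X)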
gen_scrypt_core_loop2:
	scrypt_core_macro1b 0, 64
	scrypt_core_macro1b 4, 68
	scrypt_core_macro1b 8, 72
	scrypt_core_macro1b 12, 76
	scrypt_core_macro1b 16, 80
	scrypt_core_macro1b 20, 84
	scrypt_core_macro1b 24, 88
	scrypt_core_macro1b 28, 92
	scrypt_core_macro1b 32, 96
	scrypt_core_macro1b 36, 100
	scrypt_core_macro1b 40, 104
	scrypt_core_macro1b 44, 108
	scrypt_core_macro1b 48, 112
	scrypt_core_macro1b 52, 116
	scrypt_core_macro1b 56, 120
	scrypt_core_macro1b 60, 124
	scrypt_core_macro2 0, 64
	scrypt_core_macro2 4, 68
	scrypt_core_macro2 8, 72
	scrypt_core_macro2 12, 76
	scrypt_core_macro2 16, 80
	scrypt_core_macro2 20, 84
	scrypt_core_macro2 24, 88
	scrypt_core_macro2 28, 92
	scrypt_core_macro2 32, 96
	scrypt_core_macro2 36, 100
	scrypt_core_macro2 40, 104
	scrypt_core_macro2 44, 108
	scrypt_core_macro2 48, 112
	scrypt_core_macro2 52, 116
	scrypt_core_macro2 56, 120
	scrypt_core_macro2 60, 124
	scrypt_core_macro3 0, 64
	scrypt_core_macro3 4, 68
	scrypt_core_macro3 8, 72
	scrypt_core_macro3 12, 76
	scrypt_core_macro3 16, 80
	scrypt_core_macro3 20, 84
	scrypt_core_macro3 24, 88
	scrypt_core_macro3 28, 92
	scrypt_core_macro3 32, 96
	scrypt_core_macro3 36, 100
	scrypt_core_macro3 40, 104
	scrypt_core_macro3 44, 108
	scrypt_core_macro3 48, 112
	scrypt_core_macro3 52, 116
	scrypt_core_macro3 56, 120
	scrypt_core_macro3 60, 124
	ja gen_scrypt_core_loop2
.macro xmm_salsa8_core_doubleround
	pshufd $0x93, %xmm3, %xmm3	# rotate lanes up by one
	pshufd $0x4e, %xmm2, %xmm2	# swap the two 64-bit halves
	pshufd $0x39, %xmm1, %xmm1	# rotate lanes down by one
	pshufd $0x93, %xmm1, %xmm1
	pshufd $0x4e, %xmm2, %xmm2
	pshufd $0x39, %xmm3, %xmm3
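
# SSE2 has no packed rotate, so the 32-bit rotations in the double-round are
# built from two shifts and an OR; as a C intrinsics sketch (rotl_epi32 is an
# illustrative helper, not part of this file):
#
#   #include <emmintrin.h>
#   static inline __m128i rotl_epi32(__m128i x, int c) {
#       return _mm_or_si128(_mm_slli_epi32(x, c),
#                           _mm_srli_epi32(x, 32 - c));
#   }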
# eight Salsa20 rounds: four double-rounds
.macro xmm_salsa8_core
	xmm_salsa8_core_doubleround
	xmm_salsa8_core_doubleround
	xmm_salsa8_core_doubleround
	xmm_salsa8_core_doubleround
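
# The shuffles below rearrange each 64-byte block so that every xmm register
# holds one diagonal of the 4x4 Salsa20 state (e.g. {x0, x5, x10, x15}),
# letting a whole round run on packed words without crossing lanes.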
	# shuffle 1st block to (%esp)
	# shuffle 2nd block to 64(%esp)
	leal 131072(%esi), %ecx
xmm_scrypt_core_loop1:
	movdqa 0(%esp), %xmm0
	movdqa 16(%esp), %xmm1
	movdqa 32(%esp), %xmm2
	movdqa 48(%esp), %xmm3
	movdqa 64(%esp), %xmm4
	movdqa 80(%esp), %xmm5
	movdqa 96(%esp), %xmm6
	movdqa 112(%esp), %xmm7
	# store the current 128-byte block into the scratchpad entry at %edx
	movdqa %xmm0, 0(%edx)
	movdqa %xmm1, 16(%edx)
	movdqa %xmm2, 32(%edx)
	movdqa %xmm3, 48(%edx)
	movdqa %xmm4, 64(%edx)
	movdqa %xmm5, 80(%edx)
	movdqa %xmm6, 96(%edx)
	movdqa %xmm7, 112(%edx)
	movdqa %xmm0, 0(%esp)
	movdqa %xmm1, 16(%esp)
	movdqa %xmm2, 32(%esp)
	movdqa %xmm3, 48(%esp)
	# feed-forward: add the saved input back in after the salsa core
	paddd 16(%esp), %xmm1
	paddd 32(%esp), %xmm2
	paddd 48(%esp), %xmm3
	movdqa %xmm0, 0(%esp)
	movdqa %xmm1, 16(%esp)
	movdqa %xmm2, 32(%esp)
	movdqa %xmm3, 48(%esp)
	pxor 112(%esp), %xmm3
	movdqa %xmm0, 64(%esp)
	movdqa %xmm1, 80(%esp)
	movdqa %xmm2, 96(%esp)
	movdqa %xmm3, 112(%esp)
	# feed-forward for the second block
	paddd 64(%esp), %xmm0
	paddd 80(%esp), %xmm1
	paddd 96(%esp), %xmm2
	paddd 112(%esp), %xmm3
	movdqa %xmm0, 64(%esp)
	movdqa %xmm1, 80(%esp)
	movdqa %xmm2, 96(%esp)
	movdqa %xmm3, 112(%esp)
	jne xmm_scrypt_core_loop1
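	# second loop: read back pseudo-randomly selected scratchpad entries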
xmm_scrypt_core_loop2:
	movdqa 0(%esp), %xmm0
	movdqa 16(%esp), %xmm1
	movdqa 32(%esp), %xmm2
	movdqa 48(%esp), %xmm3
	movdqa 64(%esp), %xmm4
	movdqa 80(%esp), %xmm5
	movdqa 96(%esp), %xmm6
	movdqa 112(%esp), %xmm7
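	# xor in scratchpad entry V[j]; the offset in %edx is derived
	# from Integerify(X) mod N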
	pxor 0(%esi, %edx), %xmm0
	pxor 16(%esi, %edx), %xmm1
	pxor 32(%esi, %edx), %xmm2
	pxor 48(%esi, %edx), %xmm3
	pxor 64(%esi, %edx), %xmm4
	pxor 80(%esi, %edx), %xmm5
	pxor 96(%esi, %edx), %xmm6
	pxor 112(%esi, %edx), %xmm7
	movdqa %xmm4, 64(%esp)
	movdqa %xmm5, 80(%esp)
	movdqa %xmm6, 96(%esp)
	movdqa %xmm7, 112(%esp)
	movdqa %xmm0, 0(%esp)
	movdqa %xmm1, 16(%esp)
	movdqa %xmm2, 32(%esp)
	movdqa %xmm3, 48(%esp)
	# feed-forward additions after the salsa core
	paddd 16(%esp), %xmm1
	paddd 32(%esp), %xmm2
	paddd 48(%esp), %xmm3
	movdqa %xmm0, 0(%esp)
	movdqa %xmm1, 16(%esp)
	movdqa %xmm2, 32(%esp)
	movdqa %xmm3, 48(%esp)
	pxor 112(%esp), %xmm3
	movdqa %xmm0, 64(%esp)
	movdqa %xmm1, 80(%esp)
	movdqa %xmm2, 96(%esp)
	movdqa %xmm3, 112(%esp)
	# feed-forward for the second block
	paddd 64(%esp), %xmm0
	paddd 80(%esp), %xmm1
	paddd 96(%esp), %xmm2
	paddd 112(%esp), %xmm3
	movdqa %xmm0, 64(%esp)
	movdqa %xmm1, 80(%esp)
	movdqa %xmm2, 96(%esp)
	movdqa %xmm3, 112(%esp)
	ja xmm_scrypt_core_loop2
	# re-shuffle 1st block back
	# re-shuffle 2nd block back