# Copyright 2011 pooler@litecoinpool.org
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
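
# Generic x86 (non-SSE2) Salsa20/8 helper: one expansion of the macro below
# performs a "quadround", i.e. two Salsa20 double-rounds, on the 16-word
# state kept at 4(%esp)..64(%esp). Each leal forms the a+b sum of a Salsa20
# quarter-round; the sum is then rotated and xored back into the state.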
#define gen_salsa8_core_quadround() \
	movl 52(%esp), %ecx; \
	movl 20(%esp), %ebx; \
	leal (%ecx, %edx), %edi; \
	movl 36(%esp), %edi; \
	leal (%edx, %ebx), %ebp; \
	movl 24(%esp), %ebp; \
	movl 40(%esp), %ebx; \
	movl %ecx, 20(%esp); \
	leal (%esi, %ebp), %edi; \
	movl %ebx, 24(%esp); \
	movl 56(%esp), %edi; \
	leal (%ebp, %ebx), %ecx; \
	movl %edi, 36(%esp); \
	movl 28(%esp), %ecx; \
	movl %edx, 28(%esp); \
	movl 44(%esp), %edx; \
	movl 60(%esp), %ebx; \
	movl %esi, 40(%esp); \
	leal (%ecx, %edx), %edi; \
	movl %ebx, 44(%esp); \
	movl 12(%esp), %edi; \
	leal (%edx, %ebx), %esi; \
	movl %edi, 12(%esp); \
	movl 48(%esp), %esi; \
	movl %ebp, 48(%esp); \
	movl 64(%esp), %ebp; \
	movl 16(%esp), %ebx; \
	movl %ecx, 16(%esp); \
	leal (%esi, %ebp), %edi; \
	movl 32(%esp), %edi; \
	leal (%ebp, %ebx), %ecx; \
	movl %edi, 32(%esp); \
	movl %edx, 52(%esp); \
	movl 28(%esp), %edx; \
	movl 40(%esp), %ebx; \
	movl %esi, 28(%esp); \
	leal (%ecx, %edx), %edi; \
	movl %ebx, 40(%esp); \
	movl 12(%esp), %edi; \
	leal (%edx, %ebx), %esi; \
	movl %edi, 12(%esp); \
	movl 4(%esp), %esi; \
	movl %ebp, 4(%esp); \
	movl 48(%esp), %ebp; \
	movl 16(%esp), %ebx; \
	movl %ecx, 16(%esp); \
	leal (%esi, %ebp), %edi; \
	movl %ebx, 48(%esp); \
	movl 32(%esp), %edi; \
	leal (%ebp, %ebx), %ecx; \
	movl %edi, 32(%esp); \
	movl 24(%esp), %ecx; \
	movl %edx, 24(%esp); \
	movl 52(%esp), %edx; \
	movl 28(%esp), %ebx; \
	movl %esi, 28(%esp); \
	leal (%ecx, %edx), %edi; \
	movl %ebx, 52(%esp); \
	movl 8(%esp), %edi; \
	leal (%edx, %ebx), %esi; \
	movl %edi, 8(%esp); \
	movl 44(%esp), %esi; \
	movl %ebp, 44(%esp); \
	movl 4(%esp), %ebp; \
	movl 20(%esp), %ebx; \
	movl %ecx, 4(%esp); \
	leal (%esi, %ebp), %edi; \
	movl 36(%esp), %edi; \
	leal (%ebp, %ebx), %ecx; \
	movl %edi, 20(%esp); \
	movl %edx, 36(%esp); \
	movl 24(%esp), %edx; \
	movl 28(%esp), %ebx; \
	movl %esi, 24(%esp); \
	leal (%ecx, %edx), %edi; \
	movl %ebx, 28(%esp); \
	movl 8(%esp), %esi; \
	leal (%edx, %ebx), %edi; \
	movl 40(%esp), %edi; \
	movl %ebp, 8(%esp); \
	movl 44(%esp), %ebp; \
	movl %esi, 40(%esp); \
	movl 4(%esp), %ebx; \
	movl %ecx, 44(%esp); \
	leal (%edi, %ebp), %esi; \
	movl %ebx, 4(%esp); \
	movl 20(%esp), %esi; \
	leal (%ebp, %ebx), %ecx; \
	movl %esi, 56(%esp); \
	movl 48(%esp), %ecx; \
	movl %edx, 20(%esp); \
	movl 36(%esp), %edx; \
	movl 24(%esp), %ebx; \
	movl %edi, 24(%esp); \
	leal (%ecx, %edx), %esi; \
	movl %ebx, 60(%esp); \
	movl 12(%esp), %esi; \
	leal (%edx, %ebx), %edi; \
	movl %esi, 12(%esp); \
	movl 52(%esp), %edi; \
	movl %ebp, 36(%esp); \
	movl 8(%esp), %ebp; \
	movl 16(%esp), %ebx; \
	movl %ecx, 16(%esp); \
	leal (%edi, %ebp), %esi; \
	movl 32(%esp), %esi; \
	leal (%ebp, %ebx), %ecx; \
	movl %esi, 32(%esp); \
	movl %edx, 48(%esp); \
	movl 20(%esp), %edx; \
	movl 24(%esp), %ebx; \
	movl %edi, 20(%esp); \
	leal (%ecx, %edx), %esi; \
	movl %ebx, 8(%esp); \
	movl 12(%esp), %esi; \
	leal (%edx, %ebx), %edi; \
	movl %esi, 12(%esp); \
	movl 28(%esp), %edi; \
	movl %ebp, 52(%esp); \
	movl 36(%esp), %ebp; \
	movl 16(%esp), %ebx; \
	movl %ecx, 16(%esp); \
	leal (%edi, %ebp), %esi; \
	movl %ebx, 28(%esp); \
	movl 32(%esp), %esi; \
	leal (%ebp, %ebx), %ecx; \
	movl %esi, 32(%esp); \
	movl 4(%esp), %ecx; \
	movl %edx, 4(%esp); \
	movl 48(%esp), %edx; \
	movl 20(%esp), %ebx; \
	movl %edi, 20(%esp); \
	leal (%ecx, %edx), %esi; \
	movl %ebx, 48(%esp); \
	movl 40(%esp), %esi; \
	leal (%edx, %ebx), %edi; \
	movl %esi, 36(%esp); \
	movl 60(%esp), %edi; \
	movl %ebp, 24(%esp); \
	movl 52(%esp), %ebp; \
	movl 44(%esp), %ebx; \
	movl %ecx, 40(%esp); \
	leal (%edi, %ebp), %esi; \
	movl %ebx, 52(%esp); \
	movl 56(%esp), %esi; \
	leal (%ebp, %ebx), %ecx; \
	movl %esi, 56(%esp); \
	movl %edx, 44(%esp); \
	movl %edi, 60(%esp); \
	movl %ebp, 64(%esp); \
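
# The Salsa20/8 core is eight rounds, i.e. four double-rounds: two
# expansions of the quadround macro above.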
	gen_salsa8_core_quadround()
	gen_salsa8_core_quadround()
	# Check for SSE2 availability
	andl $0x04000000, %edx
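	# CPUID.1:EDX bit 26 (mask 0x04000000) is the SSE2 feature flag; when it
	# is set, the SSE2 (xmm) implementation further below is used instead of
	# the generic integer one.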
#define scrypt_core_macro1a(p, q) \
	movl p(%edi), %eax; \
	movl q(%edi), %edx; \
	movl %eax, p(%esi); \
	movl %edx, q(%esi); \
	xorl %edx, %eax; \
	movl %eax, p(%edi); \
	movl %eax, p(%esp); \
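
# scrypt_core_macro1a(p, q) (above): save the word pair X[p]/X[q] of the
# current block X (at %edi) into the scratchpad V (at %esi), then replace
# X[p] with X[p] ^ X[q] and stage the result at p(%esp) as input to the
# first Salsa20/8 core.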
#define scrypt_core_macro1b(p, q) \
	movl p(%edi), %eax; \
	xorl p(%esi, %edx), %eax; \
	movl q(%edi), %ebx; \
	xorl q(%esi, %edx), %ebx; \
	movl %ebx, q(%edi); \
	xorl %ebx, %eax; \
	movl %eax, p(%edi); \
	movl %eax, p(%esp); \
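
# scrypt_core_macro1b(p, q) (above): second-loop variant of macro1a. It also
# xors in the corresponding words of the scratchpad entry selected by %edx
# (V[j], addressed as %esi + %edx) before forming X[p] ^ X[q] for the
# Salsa20/8 input.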
#define scrypt_core_macro2(p, q) \
	movl p(%esp), %eax; \
	addl p(%edi), %eax; \
	movl %eax, p(%edi); \
	xorl q(%edi), %eax; \
	movl %eax, q(%edi); \
	movl %eax, p(%esp); \
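
# scrypt_core_macro2(p, q) (above): feed-forward for the first half of X.
# Add the Salsa20/8 round output at p(%esp) to the saved input X[p], store
# the new X[p], then xor it into X[q] and stage that as input to the second
# Salsa20/8 core.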
#define scrypt_core_macro3(p, q) \
	movl p(%esp), %eax; \
	addl q(%edi), %eax; \
	movl %eax, q(%edi); \
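
# scrypt_core_macro3(p, q) (above): feed-forward for the second half of X,
# adding the round output at p(%esp) into X[q].

# First scrypt loop: fill the scratchpad V. It holds 1024 entries of 128
# bytes, so %ecx marks its end at %esi + 131072.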
	leal 131072(%esi), %ecx
gen_scrypt_core_loop1:
	scrypt_core_macro1a(0, 64)
	scrypt_core_macro1a(4, 68)
	scrypt_core_macro1a(8, 72)
	scrypt_core_macro1a(12, 76)
	scrypt_core_macro1a(16, 80)
	scrypt_core_macro1a(20, 84)
	scrypt_core_macro1a(24, 88)
	scrypt_core_macro1a(28, 92)
	scrypt_core_macro1a(32, 96)
	scrypt_core_macro1a(36, 100)
	scrypt_core_macro1a(40, 104)
	scrypt_core_macro1a(44, 108)
	scrypt_core_macro1a(48, 112)
	scrypt_core_macro1a(52, 116)
	scrypt_core_macro1a(56, 120)
	scrypt_core_macro1a(60, 124)
	scrypt_core_macro2(0, 64)
	scrypt_core_macro2(4, 68)
	scrypt_core_macro2(8, 72)
	scrypt_core_macro2(12, 76)
	scrypt_core_macro2(16, 80)
	scrypt_core_macro2(20, 84)
	scrypt_core_macro2(24, 88)
	scrypt_core_macro2(28, 92)
	scrypt_core_macro2(32, 96)
	scrypt_core_macro2(36, 100)
	scrypt_core_macro2(40, 104)
	scrypt_core_macro2(44, 108)
	scrypt_core_macro2(48, 112)
	scrypt_core_macro2(52, 116)
	scrypt_core_macro2(56, 120)
	scrypt_core_macro2(60, 124)
	scrypt_core_macro3(0, 64)
	scrypt_core_macro3(4, 68)
	scrypt_core_macro3(8, 72)
	scrypt_core_macro3(12, 76)
	scrypt_core_macro3(16, 80)
	scrypt_core_macro3(20, 84)
	scrypt_core_macro3(24, 88)
	scrypt_core_macro3(28, 92)
	scrypt_core_macro3(32, 96)
	scrypt_core_macro3(36, 100)
	scrypt_core_macro3(40, 104)
	scrypt_core_macro3(44, 108)
	scrypt_core_macro3(48, 112)
	scrypt_core_macro3(52, 116)
	scrypt_core_macro3(56, 120)
	scrypt_core_macro3(60, 124)
	jne gen_scrypt_core_loop1
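
# Second scrypt loop: each iteration xors a pseudorandomly selected 128-byte
# scratchpad entry V[j] into X (scrypt_core_macro1b) and mixes X again with
# two Salsa20/8 cores.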
gen_scrypt_core_loop2:
	scrypt_core_macro1b(0, 64)
	scrypt_core_macro1b(4, 68)
	scrypt_core_macro1b(8, 72)
	scrypt_core_macro1b(12, 76)
	scrypt_core_macro1b(16, 80)
	scrypt_core_macro1b(20, 84)
	scrypt_core_macro1b(24, 88)
	scrypt_core_macro1b(28, 92)
	scrypt_core_macro1b(32, 96)
	scrypt_core_macro1b(36, 100)
	scrypt_core_macro1b(40, 104)
	scrypt_core_macro1b(44, 108)
	scrypt_core_macro1b(48, 112)
	scrypt_core_macro1b(52, 116)
	scrypt_core_macro1b(56, 120)
	scrypt_core_macro1b(60, 124)
	scrypt_core_macro2(0, 64)
	scrypt_core_macro2(4, 68)
	scrypt_core_macro2(8, 72)
	scrypt_core_macro2(12, 76)
	scrypt_core_macro2(16, 80)
	scrypt_core_macro2(20, 84)
	scrypt_core_macro2(24, 88)
	scrypt_core_macro2(28, 92)
	scrypt_core_macro2(32, 96)
	scrypt_core_macro2(36, 100)
	scrypt_core_macro2(40, 104)
	scrypt_core_macro2(44, 108)
	scrypt_core_macro2(48, 112)
	scrypt_core_macro2(52, 116)
	scrypt_core_macro2(56, 120)
	scrypt_core_macro2(60, 124)
	scrypt_core_macro3(0, 64)
	scrypt_core_macro3(4, 68)
	scrypt_core_macro3(8, 72)
	scrypt_core_macro3(12, 76)
	scrypt_core_macro3(16, 80)
	scrypt_core_macro3(20, 84)
	scrypt_core_macro3(24, 88)
	scrypt_core_macro3(28, 92)
	scrypt_core_macro3(32, 96)
	scrypt_core_macro3(36, 100)
	scrypt_core_macro3(40, 104)
	scrypt_core_macro3(44, 108)
	scrypt_core_macro3(48, 112)
	scrypt_core_macro3(52, 116)
	scrypt_core_macro3(56, 120)
	scrypt_core_macro3(60, 124)
	ja gen_scrypt_core_loop2
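
# SSE2 Salsa20/8 double-round. The 16-word state is held in %xmm0-%xmm3 in a
# diagonal layout; each step sums two state registers with paddd into the
# scratch registers %xmm4/%xmm5, rotates the sum, and xors it back into the
# state, while the pshufd shuffles realign the diagonals between the column
# and row halves of the double-round.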
#define xmm_salsa8_core_doubleround() \
	movdqa %xmm1, %xmm4; \
	paddd %xmm0, %xmm4; \
	movdqa %xmm4, %xmm5; \
	movdqa %xmm0, %xmm4; \
	paddd %xmm3, %xmm4; \
	movdqa %xmm4, %xmm5; \
	movdqa %xmm3, %xmm4; \
	pshufd $0x93, %xmm3, %xmm3; \
	paddd %xmm2, %xmm4; \
	movdqa %xmm4, %xmm5; \
	movdqa %xmm2, %xmm4; \
	pshufd $0x4e, %xmm2, %xmm2; \
	paddd %xmm1, %xmm4; \
	movdqa %xmm4, %xmm5; \
	pshufd $0x39, %xmm1, %xmm1; \
	movdqa %xmm3, %xmm4; \
	paddd %xmm0, %xmm4; \
	movdqa %xmm4, %xmm5; \
	movdqa %xmm0, %xmm4; \
	paddd %xmm1, %xmm4; \
	movdqa %xmm4, %xmm5; \
	movdqa %xmm1, %xmm4; \
	pshufd $0x93, %xmm1, %xmm1; \
	paddd %xmm2, %xmm4; \
	movdqa %xmm4, %xmm5; \
	movdqa %xmm2, %xmm4; \
	pshufd $0x4e, %xmm2, %xmm2; \
	paddd %xmm3, %xmm4; \
	movdqa %xmm4, %xmm5; \
	pshufd $0x39, %xmm3, %xmm3; \
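
# Salsa20/8: eight rounds, i.e. four double-rounds.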
#define xmm_salsa8_core() \
	xmm_salsa8_core_doubleround(); \
	xmm_salsa8_core_doubleround(); \
	xmm_salsa8_core_doubleround(); \
	xmm_salsa8_core_doubleround(); \
	# shuffle 1st block to (%esp)
	# shuffle 2nd block to 64(%esp)
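	# The shuffles permute each 64-byte block from its linear in-memory order
	# into the diagonal register layout expected by xmm_salsa8_core; the
	# inverse permutation is applied by the re-shuffle blocks at the end.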
	leal 131072(%esi), %ecx
xmm_scrypt_core_loop1:
	movdqa 0(%esp), %xmm0
	movdqa 16(%esp), %xmm1
	movdqa 32(%esp), %xmm2
	movdqa 48(%esp), %xmm3
	movdqa 64(%esp), %xmm4
	movdqa 80(%esp), %xmm5
	movdqa 96(%esp), %xmm6
	movdqa 112(%esp), %xmm7
	movdqa %xmm0, 0(%edx)
	movdqa %xmm1, 16(%edx)
	movdqa %xmm2, 32(%edx)
	movdqa %xmm3, 48(%edx)
	movdqa %xmm4, 64(%edx)
	movdqa %xmm5, 80(%edx)
	movdqa %xmm6, 96(%edx)
	movdqa %xmm7, 112(%edx)
	movdqa %xmm0, 0(%esp)
	movdqa %xmm1, 16(%esp)
	movdqa %xmm2, 32(%esp)
	movdqa %xmm3, 48(%esp)
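	# 0(%esp)..48(%esp) now hold the Salsa20/8 input for the first half; the
	# paddd group below is the feed-forward that adds it back into the round
	# output.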
	paddd 16(%esp), %xmm1
	paddd 32(%esp), %xmm2
	paddd 48(%esp), %xmm3
	movdqa %xmm0, 0(%esp)
	movdqa %xmm1, 16(%esp)
	movdqa %xmm2, 32(%esp)
	movdqa %xmm3, 48(%esp)
	pxor 112(%esp), %xmm3
	movdqa %xmm0, 64(%esp)
	movdqa %xmm1, 80(%esp)
	movdqa %xmm2, 96(%esp)
	movdqa %xmm3, 112(%esp)
	paddd 64(%esp), %xmm0
	paddd 80(%esp), %xmm1
	paddd 96(%esp), %xmm2
	paddd 112(%esp), %xmm3
	movdqa %xmm0, 64(%esp)
	movdqa %xmm1, 80(%esp)
	movdqa %xmm2, 96(%esp)
	movdqa %xmm3, 112(%esp)
	jne xmm_scrypt_core_loop1
xmm_scrypt_core_loop2:
	movdqa 0(%esp), %xmm0
	movdqa 16(%esp), %xmm1
	movdqa 32(%esp), %xmm2
	movdqa 48(%esp), %xmm3
	movdqa 64(%esp), %xmm4
	movdqa 80(%esp), %xmm5
	movdqa 96(%esp), %xmm6
	movdqa 112(%esp), %xmm7
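	# Xor in the 128-byte scratchpad entry selected by %edx (the entry index
	# is derived from the current value of X, per the scrypt specification).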
	pxor 0(%esi, %edx), %xmm0
	pxor 16(%esi, %edx), %xmm1
	pxor 32(%esi, %edx), %xmm2
	pxor 48(%esi, %edx), %xmm3
	pxor 64(%esi, %edx), %xmm4
	pxor 80(%esi, %edx), %xmm5
	pxor 96(%esi, %edx), %xmm6
	pxor 112(%esi, %edx), %xmm7
	movdqa %xmm4, 64(%esp)
	movdqa %xmm5, 80(%esp)
	movdqa %xmm6, 96(%esp)
	movdqa %xmm7, 112(%esp)
	movdqa %xmm0, 0(%esp)
	movdqa %xmm1, 16(%esp)
	movdqa %xmm2, 32(%esp)
	movdqa %xmm3, 48(%esp)
	paddd 16(%esp), %xmm1
	paddd 32(%esp), %xmm2
	paddd 48(%esp), %xmm3
	movdqa %xmm0, 0(%esp)
	movdqa %xmm1, 16(%esp)
	movdqa %xmm2, 32(%esp)
	movdqa %xmm3, 48(%esp)
	pxor 112(%esp), %xmm3
	movdqa %xmm0, 64(%esp)
	movdqa %xmm1, 80(%esp)
	movdqa %xmm2, 96(%esp)
	movdqa %xmm3, 112(%esp)
	paddd 64(%esp), %xmm0
	paddd 80(%esp), %xmm1
	paddd 96(%esp), %xmm2
	paddd 112(%esp), %xmm3
	movdqa %xmm0, 64(%esp)
	movdqa %xmm1, 80(%esp)
	movdqa %xmm2, 96(%esp)
	movdqa %xmm3, 112(%esp)
	ja xmm_scrypt_core_loop2
	# re-shuffle 1st block back
	# re-shuffle 2nd block back