4 #ifndef ED25519_LOAD_BYTES
5 #define ED25519_LOAD_BYTES
/* Assembles the bytes in[0], in[1], in[2] into one 64-bit value, with in[0] as the least significant byte (little-endian order). */
10 static uint64_t load_3(const unsigned char *in) {
/* Each byte is widened to uint64_t before it is shifted into place. */
13 result = (uint64_t) in[0];
14 result |= ((uint64_t) in[1]) << 8;
15 result |= ((uint64_t) in[2]) << 16;
/* Assembles the bytes in[0]..in[3] into one 64-bit value, with in[0] as the least significant byte (little-endian order). */
20 static uint64_t load_4(const unsigned char *in) {
23 result = (uint64_t) in[0];
24 result |= ((uint64_t) in[1]) << 8;
25 result |= ((uint64_t) in[2]) << 16;
/* The cast happens before the shift, so the 24-bit shift is done in 64 bits rather than in a promoted int. */
26 result |= ((uint64_t) in[3]) << 24;
73 Can overlap h with f or g.
76 |f| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
77 |g| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
80 |h| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
/* Limb-wise addition: each h_i below is f_i + g_i, computed in 32-bit arithmetic. */
83 void fe_add(fe h, const fe f, const fe g) {
/* No carry is propagated between limbs in these statements; the bounds stated above are what keep the sums inside int32_t. */
104 int32_t h0 = f0 + g0;
105 int32_t h1 = f1 + g1;
106 int32_t h2 = f2 + g2;
107 int32_t h3 = f3 + g3;
108 int32_t h4 = f4 + g4;
109 int32_t h5 = f5 + g5;
110 int32_t h6 = f6 + g6;
111 int32_t h7 = f7 + g7;
112 int32_t h8 = f8 + g8;
113 int32_t h9 = f9 + g9;
130 Replace (f,g) with (g,g) if b == 1;
131 replace (f,g) with (f,g) if b == 0.
133 Preconditions: b in {0,1}.
/* Conditional move controlled by b (0 or 1, see the preconditions above); built from XOR differences and a mask instead of a branch. */
136 void fe_cmov(fe f, const fe g, unsigned int b) {
/* x_i is the bitwise difference between limb i of f and of g. */
157 int32_t x0 = f0 ^ g0;
158 int32_t x1 = f1 ^ g1;
159 int32_t x2 = f2 ^ g2;
160 int32_t x3 = f3 ^ g3;
161 int32_t x4 = f4 ^ g4;
162 int32_t x5 = f5 ^ g5;
163 int32_t x6 = f6 ^ g6;
164 int32_t x7 = f7 ^ g7;
165 int32_t x8 = f8 ^ g8;
166 int32_t x9 = f9 ^ g9;
/* Negation turns b == 1 into an all-ones word and leaves b == 0 as zero; presumably this mask is then applied to the x_i - TODO confirm. */
168 b = (unsigned int) (- (int) b); /* silence warning */
193 Replace (f,g) with (g,f) if b == 1;
194 replace (f,g) with (f,g) if b == 0.
196 Preconditions: b in {0,1}.
/* Conditional swap controlled by b (0 or 1, see the preconditions above); both f and g are writable. */
199 void fe_cswap(fe f,fe g,unsigned int b) {
/* x_i is the bitwise difference between limb i of f and of g; XOR-ing it back into a limb of either operand yields the other operand's limb. */
220 int32_t x0 = f0 ^ g0;
221 int32_t x1 = f1 ^ g1;
222 int32_t x2 = f2 ^ g2;
223 int32_t x3 = f3 ^ g3;
224 int32_t x4 = f4 ^ g4;
225 int32_t x5 = f5 ^ g5;
226 int32_t x6 = f6 ^ g6;
227 int32_t x7 = f7 ^ g7;
228 int32_t x8 = f8 ^ g8;
229 int32_t x9 = f9 ^ g9;
/* f is const and is only read; presumably its limbs are copied into h (h = f) - TODO confirm against the body. */
269 void fe_copy(fe h, const fe f) {
296 Ignores top bit of h.
/* Unpacks the 32 bytes at s into ten limbs, then runs rounding carries over them. */
299 void fe_frombytes(fe h, const unsigned char *s) {
/*
 * Limb i is read from the byte offset shown and shifted left so that it lines
 * up with its bit position (0, 26, 51, 77, 102, 128, 153, 179, 204, 230);
 * the shift is 8 * offset minus that position.
 */
300 int64_t h0 = load_4(s);
301 int64_t h1 = load_3(s + 4) << 6;
302 int64_t h2 = load_3(s + 7) << 5;
303 int64_t h3 = load_3(s + 10) << 3;
304 int64_t h4 = load_3(s + 13) << 2;
305 int64_t h5 = load_4(s + 16);
306 int64_t h6 = load_3(s + 20) << 7;
307 int64_t h7 = load_3(s + 23) << 5;
308 int64_t h8 = load_3(s + 26) << 4;
/* 8388607 is 2^23 - 1: the mask keeps 23 of the 24 loaded bits, dropping the top bit of s[31]. */
309 int64_t h9 = (load_3(s + 29) & 8388607) << 2;
/* Odd limbs: adding 2^24 before the shift by 25 rounds the carry to nearest instead of truncating. */
321 carry9 = (h9 + (int64_t) (1 << 24)) >> 25;
324 carry1 = (h1 + (int64_t) (1 << 24)) >> 25;
327 carry3 = (h3 + (int64_t) (1 << 24)) >> 25;
330 carry5 = (h5 + (int64_t) (1 << 24)) >> 25;
333 carry7 = (h7 + (int64_t) (1 << 24)) >> 25;
/* Even limbs: same rounding, with 2^25 and a shift by 26. */
336 carry0 = (h0 + (int64_t) (1 << 25)) >> 26;
339 carry2 = (h2 + (int64_t) (1 << 25)) >> 26;
342 carry4 = (h4 + (int64_t) (1 << 25)) >> 26;
345 carry6 = (h6 + (int64_t) (1 << 25)) >> 26;
348 carry8 = (h8 + (int64_t) (1 << 25)) >> 26;
/* z is const and is only read; the result goes to out. Presumably a fixed exponentiation chain computing the inverse of z - TODO confirm against the loop bodies. */
366 void fe_invert(fe out, const fe z) {
/* Every loop starts at i = 1, so a loop with bound N runs N - 1 times; the two loops with bound 1 run zero times. */
375 for (i = 1; i < 1; ++i) {
381 for (i = 1; i < 2; ++i) {
389 for (i = 1; i < 1; ++i) {
396 for (i = 1; i < 5; ++i) {
403 for (i = 1; i < 10; ++i) {
410 for (i = 1; i < 20; ++i) {
417 for (i = 1; i < 10; ++i) {
424 for (i = 1; i < 50; ++i) {
431 for (i = 1; i < 100; ++i) {
438 for (i = 1; i < 50; ++i) {
445 for (i = 1; i < 5; ++i) {
455 return 1 if f is in {1,3,5,...,q-2}
456 return 0 if f is in {0,2,4,...,q-1}
459 |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
/* Returns 1 or 0 according to the two value sets listed above (odd values give 1, even values give 0); f is const and is only read. */
462 int fe_isnegative(const fe f) {
477 |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
/* f is const and is only read. Presumably f is serialised to bytes s[] first and the result says whether any byte is nonzero - TODO confirm. */
480 int fe_isnonzero(const fe f) {
/* F(i) ORs byte s[i] into the accumulator r, so r stays 0 only while every folded byte is 0. */
487 #define F(i) r |= s[i]
528 Can overlap h with f or g.
531 |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
532 |g| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
535 |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc.
539 Notes on implementation strategy:
541 Using schoolbook multiplication.
542 Karatsuba would save a little in some cost models.
544 Most multiplications by 2 and 19 are 32-bit precomputations;
545 cheaper than 64-bit postcomputations.
547 There is one remaining multiplication by 19 in the carry chain;
548 one *19 precomputation can be merged into this,
549 but the resulting data flow is considerably less clean.
551 There are 12 carries below.
552 10 of them are 2-way parallelizable and vectorizable.
553 Can get away with 11 carries, but then data flow is much deeper.
555 With tighter constraints on inputs can squeeze carries into int32.
/* Schoolbook product of f and g: 100 limb products in 64 bits, summed into ten columns h0..h9, followed by 12 rounding carries. */
558 void fe_mul(fe h, const fe f, const fe g) {
/* 32-bit precomputation of 19*g_i: products whose limb indices sum to 10 or more wrap around modulo 2^255-19 and pick up a factor 19. */
579 int32_t g1_19 = 19 * g1; /* 1.959375*2^29 */
580 int32_t g2_19 = 19 * g2; /* 1.959375*2^30; still ok */
581 int32_t g3_19 = 19 * g3;
582 int32_t g4_19 = 19 * g4;
583 int32_t g5_19 = 19 * g5;
584 int32_t g6_19 = 19 * g6;
585 int32_t g7_19 = 19 * g7;
586 int32_t g8_19 = 19 * g8;
587 int32_t g9_19 = 19 * g9;
/* 32-bit precomputation of 2*f_i for odd i: used below whenever both limb indices of a product are odd. */
588 int32_t f1_2 = 2 * f1;
589 int32_t f3_2 = 2 * f3;
590 int32_t f5_2 = 2 * f5;
591 int32_t f7_2 = 2 * f7;
592 int32_t f9_2 = 2 * f9;
/* The 100 products f_i * g_j. One operand is cast to int64_t so the multiply is done in 64 bits; the name suffix (_2, _19, _38) is the extra factor folded in. */
593 int64_t f0g0 = f0 * (int64_t) g0;
594 int64_t f0g1 = f0 * (int64_t) g1;
595 int64_t f0g2 = f0 * (int64_t) g2;
596 int64_t f0g3 = f0 * (int64_t) g3;
597 int64_t f0g4 = f0 * (int64_t) g4;
598 int64_t f0g5 = f0 * (int64_t) g5;
599 int64_t f0g6 = f0 * (int64_t) g6;
600 int64_t f0g7 = f0 * (int64_t) g7;
601 int64_t f0g8 = f0 * (int64_t) g8;
602 int64_t f0g9 = f0 * (int64_t) g9;
603 int64_t f1g0 = f1 * (int64_t) g0;
604 int64_t f1g1_2 = f1_2 * (int64_t) g1;
605 int64_t f1g2 = f1 * (int64_t) g2;
606 int64_t f1g3_2 = f1_2 * (int64_t) g3;
607 int64_t f1g4 = f1 * (int64_t) g4;
608 int64_t f1g5_2 = f1_2 * (int64_t) g5;
609 int64_t f1g6 = f1 * (int64_t) g6;
610 int64_t f1g7_2 = f1_2 * (int64_t) g7;
611 int64_t f1g8 = f1 * (int64_t) g8;
612 int64_t f1g9_38 = f1_2 * (int64_t) g9_19;
613 int64_t f2g0 = f2 * (int64_t) g0;
614 int64_t f2g1 = f2 * (int64_t) g1;
615 int64_t f2g2 = f2 * (int64_t) g2;
616 int64_t f2g3 = f2 * (int64_t) g3;
617 int64_t f2g4 = f2 * (int64_t) g4;
618 int64_t f2g5 = f2 * (int64_t) g5;
619 int64_t f2g6 = f2 * (int64_t) g6;
620 int64_t f2g7 = f2 * (int64_t) g7;
621 int64_t f2g8_19 = f2 * (int64_t) g8_19;
622 int64_t f2g9_19 = f2 * (int64_t) g9_19;
623 int64_t f3g0 = f3 * (int64_t) g0;
624 int64_t f3g1_2 = f3_2 * (int64_t) g1;
625 int64_t f3g2 = f3 * (int64_t) g2;
626 int64_t f3g3_2 = f3_2 * (int64_t) g3;
627 int64_t f3g4 = f3 * (int64_t) g4;
628 int64_t f3g5_2 = f3_2 * (int64_t) g5;
629 int64_t f3g6 = f3 * (int64_t) g6;
630 int64_t f3g7_38 = f3_2 * (int64_t) g7_19;
631 int64_t f3g8_19 = f3 * (int64_t) g8_19;
632 int64_t f3g9_38 = f3_2 * (int64_t) g9_19;
633 int64_t f4g0 = f4 * (int64_t) g0;
634 int64_t f4g1 = f4 * (int64_t) g1;
635 int64_t f4g2 = f4 * (int64_t) g2;
636 int64_t f4g3 = f4 * (int64_t) g3;
637 int64_t f4g4 = f4 * (int64_t) g4;
638 int64_t f4g5 = f4 * (int64_t) g5;
639 int64_t f4g6_19 = f4 * (int64_t) g6_19;
640 int64_t f4g7_19 = f4 * (int64_t) g7_19;
641 int64_t f4g8_19 = f4 * (int64_t) g8_19;
642 int64_t f4g9_19 = f4 * (int64_t) g9_19;
643 int64_t f5g0 = f5 * (int64_t) g0;
644 int64_t f5g1_2 = f5_2 * (int64_t) g1;
645 int64_t f5g2 = f5 * (int64_t) g2;
646 int64_t f5g3_2 = f5_2 * (int64_t) g3;
647 int64_t f5g4 = f5 * (int64_t) g4;
648 int64_t f5g5_38 = f5_2 * (int64_t) g5_19;
649 int64_t f5g6_19 = f5 * (int64_t) g6_19;
650 int64_t f5g7_38 = f5_2 * (int64_t) g7_19;
651 int64_t f5g8_19 = f5 * (int64_t) g8_19;
652 int64_t f5g9_38 = f5_2 * (int64_t) g9_19;
653 int64_t f6g0 = f6 * (int64_t) g0;
654 int64_t f6g1 = f6 * (int64_t) g1;
655 int64_t f6g2 = f6 * (int64_t) g2;
656 int64_t f6g3 = f6 * (int64_t) g3;
657 int64_t f6g4_19 = f6 * (int64_t) g4_19;
658 int64_t f6g5_19 = f6 * (int64_t) g5_19;
659 int64_t f6g6_19 = f6 * (int64_t) g6_19;
660 int64_t f6g7_19 = f6 * (int64_t) g7_19;
661 int64_t f6g8_19 = f6 * (int64_t) g8_19;
662 int64_t f6g9_19 = f6 * (int64_t) g9_19;
663 int64_t f7g0 = f7 * (int64_t) g0;
664 int64_t f7g1_2 = f7_2 * (int64_t) g1;
665 int64_t f7g2 = f7 * (int64_t) g2;
666 int64_t f7g3_38 = f7_2 * (int64_t) g3_19;
667 int64_t f7g4_19 = f7 * (int64_t) g4_19;
668 int64_t f7g5_38 = f7_2 * (int64_t) g5_19;
669 int64_t f7g6_19 = f7 * (int64_t) g6_19;
670 int64_t f7g7_38 = f7_2 * (int64_t) g7_19;
671 int64_t f7g8_19 = f7 * (int64_t) g8_19;
672 int64_t f7g9_38 = f7_2 * (int64_t) g9_19;
673 int64_t f8g0 = f8 * (int64_t) g0;
674 int64_t f8g1 = f8 * (int64_t) g1;
675 int64_t f8g2_19 = f8 * (int64_t) g2_19;
676 int64_t f8g3_19 = f8 * (int64_t) g3_19;
677 int64_t f8g4_19 = f8 * (int64_t) g4_19;
678 int64_t f8g5_19 = f8 * (int64_t) g5_19;
679 int64_t f8g6_19 = f8 * (int64_t) g6_19;
680 int64_t f8g7_19 = f8 * (int64_t) g7_19;
681 int64_t f8g8_19 = f8 * (int64_t) g8_19;
682 int64_t f8g9_19 = f8 * (int64_t) g9_19;
683 int64_t f9g0 = f9 * (int64_t) g0;
684 int64_t f9g1_38 = f9_2 * (int64_t) g1_19;
685 int64_t f9g2_19 = f9 * (int64_t) g2_19;
686 int64_t f9g3_38 = f9_2 * (int64_t) g3_19;
687 int64_t f9g4_19 = f9 * (int64_t) g4_19;
688 int64_t f9g5_38 = f9_2 * (int64_t) g5_19;
689 int64_t f9g6_19 = f9 * (int64_t) g6_19;
690 int64_t f9g7_38 = f9_2 * (int64_t) g7_19;
691 int64_t f9g8_19 = f9 * (int64_t) g8_19;
692 int64_t f9g9_38 = f9_2 * (int64_t) g9_19;
/* Column sums: h_k collects the ten products f_i * g_j whose indices satisfy i + j = k or i + j = k + 10. */
693 int64_t h0 = f0g0 + f1g9_38 + f2g8_19 + f3g7_38 + f4g6_19 + f5g5_38 + f6g4_19 + f7g3_38 + f8g2_19 + f9g1_38;
694 int64_t h1 = f0g1 + f1g0 + f2g9_19 + f3g8_19 + f4g7_19 + f5g6_19 + f6g5_19 + f7g4_19 + f8g3_19 + f9g2_19;
695 int64_t h2 = f0g2 + f1g1_2 + f2g0 + f3g9_38 + f4g8_19 + f5g7_38 + f6g6_19 + f7g5_38 + f8g4_19 + f9g3_38;
696 int64_t h3 = f0g3 + f1g2 + f2g1 + f3g0 + f4g9_19 + f5g8_19 + f6g7_19 + f7g6_19 + f8g5_19 + f9g4_19;
697 int64_t h4 = f0g4 + f1g3_2 + f2g2 + f3g1_2 + f4g0 + f5g9_38 + f6g8_19 + f7g7_38 + f8g6_19 + f9g5_38;
698 int64_t h5 = f0g5 + f1g4 + f2g3 + f3g2 + f4g1 + f5g0 + f6g9_19 + f7g8_19 + f8g7_19 + f9g6_19;
699 int64_t h6 = f0g6 + f1g5_2 + f2g4 + f3g3_2 + f4g2 + f5g1_2 + f6g0 + f7g9_38 + f8g8_19 + f9g7_38;
700 int64_t h7 = f0g7 + f1g6 + f2g5 + f3g4 + f4g3 + f5g2 + f6g1 + f7g0 + f8g9_19 + f9g8_19;
701 int64_t h8 = f0g8 + f1g7_2 + f2g6 + f3g5_2 + f4g4 + f5g3_2 + f6g2 + f7g1_2 + f8g0 + f9g9_38;
702 int64_t h9 = f0g9 + f1g8 + f2g7 + f3g6 + f4g5 + f5g4 + f6g3 + f7g2 + f8g1 + f9g0 ;
/* The 12 carries, in the order 0,4,1,5,2,6,3,7,4,8,9,0. Even limbs round with 2^25 and shift by 26; odd limbs round with 2^24 and shift by 25. */
714 carry0 = (h0 + (int64_t) (1 << 25)) >> 26;
717 carry4 = (h4 + (int64_t) (1 << 25)) >> 26;
721 carry1 = (h1 + (int64_t) (1 << 24)) >> 25;
724 carry5 = (h5 + (int64_t) (1 << 24)) >> 25;
728 carry2 = (h2 + (int64_t) (1 << 25)) >> 26;
731 carry6 = (h6 + (int64_t) (1 << 25)) >> 26;
735 carry3 = (h3 + (int64_t) (1 << 24)) >> 25;
738 carry7 = (h7 + (int64_t) (1 << 24)) >> 25;
742 carry4 = (h4 + (int64_t) (1 << 25)) >> 26;
745 carry8 = (h8 + (int64_t) (1 << 25)) >> 26;
749 carry9 = (h9 + (int64_t) (1 << 24)) >> 25;
753 carry0 = (h0 + (int64_t) (1 << 25)) >> 26;
772 Can overlap h with f.
775 |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
778 |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
/* Multiplies every limb of f by the constant 121666 in 64-bit arithmetic, then runs one round of rounding carries over the ten limbs. */
781 void fe_mul121666(fe h, fe f) {
/* The constant is cast to int64_t so each product is formed in 64 bits. */
792 int64_t h0 = f0 * (int64_t) 121666;
793 int64_t h1 = f1 * (int64_t) 121666;
794 int64_t h2 = f2 * (int64_t) 121666;
795 int64_t h3 = f3 * (int64_t) 121666;
796 int64_t h4 = f4 * (int64_t) 121666;
797 int64_t h5 = f5 * (int64_t) 121666;
798 int64_t h6 = f6 * (int64_t) 121666;
799 int64_t h7 = f7 * (int64_t) 121666;
800 int64_t h8 = f8 * (int64_t) 121666;
801 int64_t h9 = f9 * (int64_t) 121666;
/*
 * Odd limbs: carry is rounded at 2^25; the carry out of h9 wraps into h0 times 19.
 * The carried amount is removed by multiplying by a power of two rather than by
 * left-shifting: limbs may be negative, so the carry may be negative, and
 * left-shifting a negative signed value is undefined behaviour in C.
 */
813 carry9 = (h9 + (int64_t) (1<<24)) >> 25; h0 += carry9 * 19; h9 -= carry9 * ((int64_t) 1 << 25);
814 carry1 = (h1 + (int64_t) (1<<24)) >> 25; h2 += carry1; h1 -= carry1 * ((int64_t) 1 << 25);
815 carry3 = (h3 + (int64_t) (1<<24)) >> 25; h4 += carry3; h3 -= carry3 * ((int64_t) 1 << 25);
816 carry5 = (h5 + (int64_t) (1<<24)) >> 25; h6 += carry5; h5 -= carry5 * ((int64_t) 1 << 25);
817 carry7 = (h7 + (int64_t) (1<<24)) >> 25; h8 += carry7; h7 -= carry7 * ((int64_t) 1 << 25);
/* Even limbs: same scheme, rounded at 2^26. */
819 carry0 = (h0 + (int64_t) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 * ((int64_t) 1 << 26);
820 carry2 = (h2 + (int64_t) (1<<25)) >> 26; h3 += carry2; h2 -= carry2 * ((int64_t) 1 << 26);
821 carry4 = (h4 + (int64_t) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 * ((int64_t) 1 << 26);
822 carry6 = (h6 + (int64_t) (1<<25)) >> 26; h7 += carry6; h6 -= carry6 * ((int64_t) 1 << 26);
823 carry8 = (h8 + (int64_t) (1<<25)) >> 26; h9 += carry8; h8 -= carry8 * ((int64_t) 1 << 26);
842 |f| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
845 |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
/* f is const and is only read; presumably each limb of h is set to the negated limb of f (h = -f) - TODO confirm against the body. */
848 void fe_neg(fe h, const fe f) {
/* z is const and is only read; the result goes to out. Presumably a fixed exponentiation chain raising z to (2^255-19-5)/8 = 2^252-3, as the name suggests - TODO confirm against the loop bodies. */
883 void fe_pow22523(fe out, const fe z) {
/* Every loop starts at i = 1, so a loop with bound N runs N - 1 times; the two loops with bound 1 run zero times. */
890 for (i = 1; i < 1; ++i) {
896 for (i = 1; i < 2; ++i) {
904 for (i = 1; i < 1; ++i) {
911 for (i = 1; i < 5; ++i) {
918 for (i = 1; i < 10; ++i) {
925 for (i = 1; i < 20; ++i) {
932 for (i = 1; i < 10; ++i) {
939 for (i = 1; i < 50; ++i) {
946 for (i = 1; i < 100; ++i) {
953 for (i = 1; i < 50; ++i) {
960 for (i = 1; i < 2; ++i) {
971 Can overlap h with f.
974 |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
977 |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc.
981 See the notes above fe_mul for discussion of implementation strategy.
/* Squares f: the 55 distinct limb products are formed in 64 bits, summed into ten columns h0..h9, carried 12 times and stored into h as int32_t. */
984 void fe_sq(fe h, const fe f) {
/* 32-bit precomputation of doubled limbs: each cross product f_i * f_j with i != j occurs twice in the square and is computed once from a doubled operand. */
995 int32_t f0_2 = 2 * f0;
996 int32_t f1_2 = 2 * f1;
997 int32_t f2_2 = 2 * f2;
998 int32_t f3_2 = 2 * f3;
999 int32_t f4_2 = 2 * f4;
1000 int32_t f5_2 = 2 * f5;
1001 int32_t f6_2 = 2 * f6;
1002 int32_t f7_2 = 2 * f7;
/* 32-bit precomputation of the 19x and 38x multiples used by products that wrap around modulo 2^255-19. */
1003 int32_t f5_38 = 38 * f5; /* 1.959375*2^30 */
1004 int32_t f6_19 = 19 * f6; /* 1.959375*2^30 */
1005 int32_t f7_38 = 38 * f7; /* 1.959375*2^30 */
1006 int32_t f8_19 = 19 * f8; /* 1.959375*2^30 */
1007 int32_t f9_38 = 38 * f9; /* 1.959375*2^30 */
/* The products f_i * f_j for i <= j. One operand is cast to int64_t so the multiply is done in 64 bits; the name suffix is the total factor folded in. */
1008 int64_t f0f0 = f0 * (int64_t) f0;
1009 int64_t f0f1_2 = f0_2 * (int64_t) f1;
1010 int64_t f0f2_2 = f0_2 * (int64_t) f2;
1011 int64_t f0f3_2 = f0_2 * (int64_t) f3;
1012 int64_t f0f4_2 = f0_2 * (int64_t) f4;
1013 int64_t f0f5_2 = f0_2 * (int64_t) f5;
1014 int64_t f0f6_2 = f0_2 * (int64_t) f6;
1015 int64_t f0f7_2 = f0_2 * (int64_t) f7;
1016 int64_t f0f8_2 = f0_2 * (int64_t) f8;
1017 int64_t f0f9_2 = f0_2 * (int64_t) f9;
1018 int64_t f1f1_2 = f1_2 * (int64_t) f1;
1019 int64_t f1f2_2 = f1_2 * (int64_t) f2;
1020 int64_t f1f3_4 = f1_2 * (int64_t) f3_2;
1021 int64_t f1f4_2 = f1_2 * (int64_t) f4;
1022 int64_t f1f5_4 = f1_2 * (int64_t) f5_2;
1023 int64_t f1f6_2 = f1_2 * (int64_t) f6;
1024 int64_t f1f7_4 = f1_2 * (int64_t) f7_2;
1025 int64_t f1f8_2 = f1_2 * (int64_t) f8;
1026 int64_t f1f9_76 = f1_2 * (int64_t) f9_38;
1027 int64_t f2f2 = f2 * (int64_t) f2;
1028 int64_t f2f3_2 = f2_2 * (int64_t) f3;
1029 int64_t f2f4_2 = f2_2 * (int64_t) f4;
1030 int64_t f2f5_2 = f2_2 * (int64_t) f5;
1031 int64_t f2f6_2 = f2_2 * (int64_t) f6;
1032 int64_t f2f7_2 = f2_2 * (int64_t) f7;
1033 int64_t f2f8_38 = f2_2 * (int64_t) f8_19;
1034 int64_t f2f9_38 = f2 * (int64_t) f9_38;
1035 int64_t f3f3_2 = f3_2 * (int64_t) f3;
1036 int64_t f3f4_2 = f3_2 * (int64_t) f4;
1037 int64_t f3f5_4 = f3_2 * (int64_t) f5_2;
1038 int64_t f3f6_2 = f3_2 * (int64_t) f6;
1039 int64_t f3f7_76 = f3_2 * (int64_t) f7_38;
1040 int64_t f3f8_38 = f3_2 * (int64_t) f8_19;
1041 int64_t f3f9_76 = f3_2 * (int64_t) f9_38;
1042 int64_t f4f4 = f4 * (int64_t) f4;
1043 int64_t f4f5_2 = f4_2 * (int64_t) f5;
1044 int64_t f4f6_38 = f4_2 * (int64_t) f6_19;
1045 int64_t f4f7_38 = f4 * (int64_t) f7_38;
1046 int64_t f4f8_38 = f4_2 * (int64_t) f8_19;
1047 int64_t f4f9_38 = f4 * (int64_t) f9_38;
1048 int64_t f5f5_38 = f5 * (int64_t) f5_38;
1049 int64_t f5f6_38 = f5_2 * (int64_t) f6_19;
1050 int64_t f5f7_76 = f5_2 * (int64_t) f7_38;
1051 int64_t f5f8_38 = f5_2 * (int64_t) f8_19;
1052 int64_t f5f9_76 = f5_2 * (int64_t) f9_38;
1053 int64_t f6f6_19 = f6 * (int64_t) f6_19;
1054 int64_t f6f7_38 = f6 * (int64_t) f7_38;
1055 int64_t f6f8_38 = f6_2 * (int64_t) f8_19;
1056 int64_t f6f9_38 = f6 * (int64_t) f9_38;
1057 int64_t f7f7_38 = f7 * (int64_t) f7_38;
1058 int64_t f7f8_38 = f7_2 * (int64_t) f8_19;
1059 int64_t f7f9_76 = f7_2 * (int64_t) f9_38;
1060 int64_t f8f8_19 = f8 * (int64_t) f8_19;
1061 int64_t f8f9_38 = f8 * (int64_t) f9_38;
1062 int64_t f9f9_38 = f9 * (int64_t) f9_38;
/* Column sums: h_k collects the products whose limb indices sum to k or to k + 10. */
1063 int64_t h0 = f0f0 + f1f9_76 + f2f8_38 + f3f7_76 + f4f6_38 + f5f5_38;
1064 int64_t h1 = f0f1_2 + f2f9_38 + f3f8_38 + f4f7_38 + f5f6_38;
1065 int64_t h2 = f0f2_2 + f1f1_2 + f3f9_76 + f4f8_38 + f5f7_76 + f6f6_19;
1066 int64_t h3 = f0f3_2 + f1f2_2 + f4f9_38 + f5f8_38 + f6f7_38;
1067 int64_t h4 = f0f4_2 + f1f3_4 + f2f2 + f5f9_76 + f6f8_38 + f7f7_38;
1068 int64_t h5 = f0f5_2 + f1f4_2 + f2f3_2 + f6f9_38 + f7f8_38;
1069 int64_t h6 = f0f6_2 + f1f5_4 + f2f4_2 + f3f3_2 + f7f9_76 + f8f8_19;
1070 int64_t h7 = f0f7_2 + f1f6_2 + f2f5_2 + f3f4_2 + f8f9_38;
1071 int64_t h8 = f0f8_2 + f1f7_4 + f2f6_2 + f3f5_4 + f4f4 + f9f9_38;
1072 int64_t h9 = f0f9_2 + f1f8_2 + f2f7_2 + f3f6_2 + f4f5_2;
/* The 12 carries, in the order 0,4,1,5,2,6,3,7,4,8,9,0. Even limbs round with 2^25 and shift by 26; odd limbs round with 2^24 and shift by 25. */
1083 carry0 = (h0 + (int64_t) (1 << 25)) >> 26;
1086 carry4 = (h4 + (int64_t) (1 << 25)) >> 26;
1089 carry1 = (h1 + (int64_t) (1 << 24)) >> 25;
1092 carry5 = (h5 + (int64_t) (1 << 24)) >> 25;
1095 carry2 = (h2 + (int64_t) (1 << 25)) >> 26;
1098 carry6 = (h6 + (int64_t) (1 << 25)) >> 26;
1101 carry3 = (h3 + (int64_t) (1 << 24)) >> 25;
1104 carry7 = (h7 + (int64_t) (1 << 24)) >> 25;
1107 carry4 = (h4 + (int64_t) (1 << 25)) >> 26;
1110 carry8 = (h8 + (int64_t) (1 << 25)) >> 26;
1113 carry9 = (h9 + (int64_t) (1 << 24)) >> 25;
1116 carry0 = (h0 + (int64_t) (1 << 25)) >> 26;
/* Store the ten 64-bit columns into the output as int32_t limbs. */
1119 h[0] = (int32_t) h0;
1120 h[1] = (int32_t) h1;
1121 h[2] = (int32_t) h2;
1122 h[3] = (int32_t) h3;
1123 h[4] = (int32_t) h4;
1124 h[5] = (int32_t) h5;
1125 h[6] = (int32_t) h6;
1126 h[7] = (int32_t) h7;
1127 h[8] = (int32_t) h8;
1128 h[9] = (int32_t) h9;
1134 Can overlap h with f.
1137 |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
1140 |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc.
1144 See the notes above fe_mul for discussion of implementation strategy.
/* Squaring-based routine using the same products, column sums and carry order as fe_sq; presumably the result is 2*f*f - TODO confirm where the doubling is applied. */
1147 void fe_sq2(fe h, const fe f) {
/* 32-bit precomputation of doubled limbs: each cross product f_i * f_j with i != j occurs twice in the square and is computed once from a doubled operand. */
1158 int32_t f0_2 = 2 * f0;
1159 int32_t f1_2 = 2 * f1;
1160 int32_t f2_2 = 2 * f2;
1161 int32_t f3_2 = 2 * f3;
1162 int32_t f4_2 = 2 * f4;
1163 int32_t f5_2 = 2 * f5;
1164 int32_t f6_2 = 2 * f6;
1165 int32_t f7_2 = 2 * f7;
/* 32-bit precomputation of the 19x and 38x multiples used by products that wrap around modulo 2^255-19. */
1166 int32_t f5_38 = 38 * f5; /* 1.959375*2^30 */
1167 int32_t f6_19 = 19 * f6; /* 1.959375*2^30 */
1168 int32_t f7_38 = 38 * f7; /* 1.959375*2^30 */
1169 int32_t f8_19 = 19 * f8; /* 1.959375*2^30 */
1170 int32_t f9_38 = 38 * f9; /* 1.959375*2^30 */
/* The products f_i * f_j for i <= j. One operand is cast to int64_t so the multiply is done in 64 bits; the name suffix is the total factor folded in. */
1171 int64_t f0f0 = f0 * (int64_t) f0;
1172 int64_t f0f1_2 = f0_2 * (int64_t) f1;
1173 int64_t f0f2_2 = f0_2 * (int64_t) f2;
1174 int64_t f0f3_2 = f0_2 * (int64_t) f3;
1175 int64_t f0f4_2 = f0_2 * (int64_t) f4;
1176 int64_t f0f5_2 = f0_2 * (int64_t) f5;
1177 int64_t f0f6_2 = f0_2 * (int64_t) f6;
1178 int64_t f0f7_2 = f0_2 * (int64_t) f7;
1179 int64_t f0f8_2 = f0_2 * (int64_t) f8;
1180 int64_t f0f9_2 = f0_2 * (int64_t) f9;
1181 int64_t f1f1_2 = f1_2 * (int64_t) f1;
1182 int64_t f1f2_2 = f1_2 * (int64_t) f2;
1183 int64_t f1f3_4 = f1_2 * (int64_t) f3_2;
1184 int64_t f1f4_2 = f1_2 * (int64_t) f4;
1185 int64_t f1f5_4 = f1_2 * (int64_t) f5_2;
1186 int64_t f1f6_2 = f1_2 * (int64_t) f6;
1187 int64_t f1f7_4 = f1_2 * (int64_t) f7_2;
1188 int64_t f1f8_2 = f1_2 * (int64_t) f8;
1189 int64_t f1f9_76 = f1_2 * (int64_t) f9_38;
1190 int64_t f2f2 = f2 * (int64_t) f2;
1191 int64_t f2f3_2 = f2_2 * (int64_t) f3;
1192 int64_t f2f4_2 = f2_2 * (int64_t) f4;
1193 int64_t f2f5_2 = f2_2 * (int64_t) f5;
1194 int64_t f2f6_2 = f2_2 * (int64_t) f6;
1195 int64_t f2f7_2 = f2_2 * (int64_t) f7;
1196 int64_t f2f8_38 = f2_2 * (int64_t) f8_19;
1197 int64_t f2f9_38 = f2 * (int64_t) f9_38;
1198 int64_t f3f3_2 = f3_2 * (int64_t) f3;
1199 int64_t f3f4_2 = f3_2 * (int64_t) f4;
1200 int64_t f3f5_4 = f3_2 * (int64_t) f5_2;
1201 int64_t f3f6_2 = f3_2 * (int64_t) f6;
1202 int64_t f3f7_76 = f3_2 * (int64_t) f7_38;
1203 int64_t f3f8_38 = f3_2 * (int64_t) f8_19;
1204 int64_t f3f9_76 = f3_2 * (int64_t) f9_38;
1205 int64_t f4f4 = f4 * (int64_t) f4;
1206 int64_t f4f5_2 = f4_2 * (int64_t) f5;
1207 int64_t f4f6_38 = f4_2 * (int64_t) f6_19;
1208 int64_t f4f7_38 = f4 * (int64_t) f7_38;
1209 int64_t f4f8_38 = f4_2 * (int64_t) f8_19;
1210 int64_t f4f9_38 = f4 * (int64_t) f9_38;
1211 int64_t f5f5_38 = f5 * (int64_t) f5_38;
1212 int64_t f5f6_38 = f5_2 * (int64_t) f6_19;
1213 int64_t f5f7_76 = f5_2 * (int64_t) f7_38;
1214 int64_t f5f8_38 = f5_2 * (int64_t) f8_19;
1215 int64_t f5f9_76 = f5_2 * (int64_t) f9_38;
1216 int64_t f6f6_19 = f6 * (int64_t) f6_19;
1217 int64_t f6f7_38 = f6 * (int64_t) f7_38;
1218 int64_t f6f8_38 = f6_2 * (int64_t) f8_19;
1219 int64_t f6f9_38 = f6 * (int64_t) f9_38;
1220 int64_t f7f7_38 = f7 * (int64_t) f7_38;
1221 int64_t f7f8_38 = f7_2 * (int64_t) f8_19;
1222 int64_t f7f9_76 = f7_2 * (int64_t) f9_38;
1223 int64_t f8f8_19 = f8 * (int64_t) f8_19;
1224 int64_t f8f9_38 = f8 * (int64_t) f9_38;
1225 int64_t f9f9_38 = f9 * (int64_t) f9_38;
/* Column sums: h_k collects the products whose limb indices sum to k or to k + 10. */
1226 int64_t h0 = f0f0 + f1f9_76 + f2f8_38 + f3f7_76 + f4f6_38 + f5f5_38;
1227 int64_t h1 = f0f1_2 + f2f9_38 + f3f8_38 + f4f7_38 + f5f6_38;
1228 int64_t h2 = f0f2_2 + f1f1_2 + f3f9_76 + f4f8_38 + f5f7_76 + f6f6_19;
1229 int64_t h3 = f0f3_2 + f1f2_2 + f4f9_38 + f5f8_38 + f6f7_38;
1230 int64_t h4 = f0f4_2 + f1f3_4 + f2f2 + f5f9_76 + f6f8_38 + f7f7_38;
1231 int64_t h5 = f0f5_2 + f1f4_2 + f2f3_2 + f6f9_38 + f7f8_38;
1232 int64_t h6 = f0f6_2 + f1f5_4 + f2f4_2 + f3f3_2 + f7f9_76 + f8f8_19;
1233 int64_t h7 = f0f7_2 + f1f6_2 + f2f5_2 + f3f4_2 + f8f9_38;
1234 int64_t h8 = f0f8_2 + f1f7_4 + f2f6_2 + f3f5_4 + f4f4 + f9f9_38;
1235 int64_t h9 = f0f9_2 + f1f8_2 + f2f7_2 + f3f6_2 + f4f5_2;
/* The 12 carries, in the order 0,4,1,5,2,6,3,7,4,8,9,0. Even limbs round with 2^25 and shift by 26; odd limbs round with 2^24 and shift by 25. */
1256 carry0 = (h0 + (int64_t) (1 << 25)) >> 26;
1259 carry4 = (h4 + (int64_t) (1 << 25)) >> 26;
1262 carry1 = (h1 + (int64_t) (1 << 24)) >> 25;
1265 carry5 = (h5 + (int64_t) (1 << 24)) >> 25;
1268 carry2 = (h2 + (int64_t) (1 << 25)) >> 26;
1271 carry6 = (h6 + (int64_t) (1 << 25)) >> 26;
1274 carry3 = (h3 + (int64_t) (1 << 24)) >> 25;
1277 carry7 = (h7 + (int64_t) (1 << 24)) >> 25;
1280 carry4 = (h4 + (int64_t) (1 << 25)) >> 26;
1283 carry8 = (h8 + (int64_t) (1 << 25)) >> 26;
1286 carry9 = (h9 + (int64_t) (1 << 24)) >> 25;
1289 carry0 = (h0 + (int64_t) (1 << 25)) >> 26;
/* Store the ten 64-bit columns into the output as int32_t limbs. */
1292 h[0] = (int32_t) h0;
1293 h[1] = (int32_t) h1;
1294 h[2] = (int32_t) h2;
1295 h[3] = (int32_t) h3;
1296 h[4] = (int32_t) h4;
1297 h[5] = (int32_t) h5;
1298 h[6] = (int32_t) h6;
1299 h[7] = (int32_t) h7;
1300 h[8] = (int32_t) h8;
1301 h[9] = (int32_t) h9;
1307 Can overlap h with f or g.
1310 |f| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
1311 |g| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
1314 |h| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
/* Limb-wise subtraction: each h_i below is f_i - g_i, computed in 32-bit arithmetic. */
1317 void fe_sub(fe h, const fe f, const fe g) {
/* No borrow is propagated between limbs in these statements; the bounds stated above are what keep the differences inside int32_t. */
1338 int32_t h0 = f0 - g0;
1339 int32_t h1 = f1 - g1;
1340 int32_t h2 = f2 - g2;
1341 int32_t h3 = f3 - g3;
1342 int32_t h4 = f4 - g4;
1343 int32_t h5 = f5 - g5;
1344 int32_t h6 = f6 - g6;
1345 int32_t h7 = f7 - g7;
1346 int32_t h8 = f8 - g8;
1347 int32_t h9 = f9 - g9;
1365 |h| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
1367 Write p=2^255-19; q=floor(h/p).
1368 Basic claim: q = floor(2^(-255)(h + 19 2^(-25)h9 + 2^(-1))).
1371 Have |h|<=p so |q|<=1 so |19^2 2^(-255) q|<1/4.
1372 Also have |h-2^230 h9|<2^231 so |19 2^(-255)(h-2^230 h9)|<1/4.
1374 Write y=2^(-1)-19^2 2^(-255)q-19 2^(-255)(h-2^230 h9).
1378 Have 0<=r<=p-1=2^255-20.
1379 Thus 0<=r+19(2^-255)r<r+19(2^-255)2^255<=2^255-1.
1381 Write x=r+19(2^-255)r+y.
1382 Then 0<x<2^255 so floor(2^(-255)x) = 0 so floor(q+2^(-255)x) = q.
1384 Have q+2^(-255)x = 2^(-255)(h + 19 2^(-25) h9 + 2^(-1))
1385 so floor(2^(-255)(h + 19 2^(-25) h9 + 2^(-1))) = q.
/* Writes h as 32 bytes at s, least significant byte first: a quotient estimate q is used to reduce h, then the ten limbs are packed back to back. */
1388 void fe_tobytes(unsigned char *s, const fe h) {
/* First step of the quotient estimate from the proof above: 19*h9 rounded at 2^25. */
1410 q = (19 * h9 + (((int32_t) 1) << 24)) >> 25;
1421 /* Goal: Output h-(2^255-19)q, which is between 0 and 2^255-20. */
1423 /* Goal: Output h-2^255 q, which is between 0 and 2^255-20. */
1456 Goal: Output h0+...+2^255 h10-2^255 q, which is between 0 and 2^255-20.
1457 Have h0+...+2^230 h9 between 0 and 2^255-1;
1458 evidently 2^255 h10-2^255 q = 0.
1459 Goal: Output h0+...+2^230 h9.
/*
 * Pack the limbs (26, 25, 26, 25, ... bits wide) into consecutive bits of s.
 * Where a byte straddles two limbs, the low bits of the next limb are shifted
 * up and OR-ed in. Those left shifts are done on uint32_t: a 26-bit limb
 * shifted left by 6 can reach 2^32, which overflows a signed 32-bit value and
 * is undefined behaviour. Only the low 8 bits of each expression are stored,
 * so the bytes written are unchanged.
 */
1461 s[0] = (unsigned char) (h0 >> 0);
1462 s[1] = (unsigned char) (h0 >> 8);
1463 s[2] = (unsigned char) (h0 >> 16);
1464 s[3] = (unsigned char) ((h0 >> 24) | ((uint32_t) h1 << 2));
1465 s[4] = (unsigned char) (h1 >> 6);
1466 s[5] = (unsigned char) (h1 >> 14);
1467 s[6] = (unsigned char) ((h1 >> 22) | ((uint32_t) h2 << 3));
1468 s[7] = (unsigned char) (h2 >> 5);
1469 s[8] = (unsigned char) (h2 >> 13);
1470 s[9] = (unsigned char) ((h2 >> 21) | ((uint32_t) h3 << 5));
1471 s[10] = (unsigned char) (h3 >> 3);
1472 s[11] = (unsigned char) (h3 >> 11);
1473 s[12] = (unsigned char) ((h3 >> 19) | ((uint32_t) h4 << 6));
1474 s[13] = (unsigned char) (h4 >> 2);
1475 s[14] = (unsigned char) (h4 >> 10);
1476 s[15] = (unsigned char) (h4 >> 18);
1477 s[16] = (unsigned char) (h5 >> 0);
1478 s[17] = (unsigned char) (h5 >> 8);
1479 s[18] = (unsigned char) (h5 >> 16);
1480 s[19] = (unsigned char) ((h5 >> 24) | ((uint32_t) h6 << 1));
1481 s[20] = (unsigned char) (h6 >> 7);
1482 s[21] = (unsigned char) (h6 >> 15);
1483 s[22] = (unsigned char) ((h6 >> 23) | ((uint32_t) h7 << 3));
1484 s[23] = (unsigned char) (h7 >> 5);
1485 s[24] = (unsigned char) (h7 >> 13);
1486 s[25] = (unsigned char) ((h7 >> 21) | ((uint32_t) h8 << 4));
1487 s[26] = (unsigned char) (h8 >> 4);
1488 s[27] = (unsigned char) (h8 >> 12);
1489 s[28] = (unsigned char) ((h8 >> 20) | ((uint32_t) h9 << 6));
1490 s[29] = (unsigned char) (h9 >> 2);
1491 s[30] = (unsigned char) (h9 >> 10);
1492 s[31] = (unsigned char) (h9 >> 18);