gitweb.ps.run Git - matrix_esp_thesis/blob - ext/olm/lib/ed25519/src/fe.c

   1 #include "fixedint.h"
   2 #include "fe.h"
   3
   4 #ifndef ED25519_LOAD_BYTES
   5 #define ED25519_LOAD_BYTES
   6
   7 /*
   8     helper functions
   9 */
  10 static uint64_t load_3(const unsigned char *in) {
  11     uint64_t result;
  12
  13     result = (uint64_t) in[0];
  14     result |= ((uint64_t) in[1]) << 8;
  15     result |= ((uint64_t) in[2]) << 16;
  16
  17     return result;
  18 }
  19
  20 static uint64_t load_4(const unsigned char *in) {
  21     uint64_t result;
  22
  23     result = (uint64_t) in[0];
  24     result |= ((uint64_t) in[1]) << 8;
  25     result |= ((uint64_t) in[2]) << 16;
  26     result |= ((uint64_t) in[3]) << 24;
  27
  28     return result;
  29 }
  30
  31 #endif
  32
  33 /*
  34     h = 0
  35 */
  36
  37 void fe_0(fe h) {
  38     h[0] = 0;
  39     h[1] = 0;
  40     h[2] = 0;
  41     h[3] = 0;
  42     h[4] = 0;
  43     h[5] = 0;
  44     h[6] = 0;
  45     h[7] = 0;
  46     h[8] = 0;
  47     h[9] = 0;
  48 }
  49
  50
  51
  52 /*
  53     h = 1
  54 */
  55
  56 void fe_1(fe h) {
  57     h[0] = 1;
  58     h[1] = 0;
  59     h[2] = 0;
  60     h[3] = 0;
  61     h[4] = 0;
  62     h[5] = 0;
  63     h[6] = 0;
  64     h[7] = 0;
  65     h[8] = 0;
  66     h[9] = 0;
  67 }
  68
  69
  70
  71 /*
  72     h = f + g
  73     Can overlap h with f or g.
  74
  75     Preconditions:
  76        |f| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
  77        |g| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
  78
  79     Postconditions:
  80        |h| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
  81 */
  82
  83 void fe_add(fe h, const fe f, const fe g) {
  84     int32_t f0 = f[0];
  85     int32_t f1 = f[1];
  86     int32_t f2 = f[2];
  87     int32_t f3 = f[3];
  88     int32_t f4 = f[4];
  89     int32_t f5 = f[5];
  90     int32_t f6 = f[6];
  91     int32_t f7 = f[7];
  92     int32_t f8 = f[8];
  93     int32_t f9 = f[9];
  94     int32_t g0 = g[0];
  95     int32_t g1 = g[1];
  96     int32_t g2 = g[2];
  97     int32_t g3 = g[3];
  98     int32_t g4 = g[4];
  99     int32_t g5 = g[5];
 100     int32_t g6 = g[6];
 101     int32_t g7 = g[7];
 102     int32_t g8 = g[8];
 103     int32_t g9 = g[9];
 104     int32_t h0 = f0 + g0;
 105     int32_t h1 = f1 + g1;
 106     int32_t h2 = f2 + g2;
 107     int32_t h3 = f3 + g3;
 108     int32_t h4 = f4 + g4;
 109     int32_t h5 = f5 + g5;
 110     int32_t h6 = f6 + g6;
 111     int32_t h7 = f7 + g7;
 112     int32_t h8 = f8 + g8;
 113     int32_t h9 = f9 + g9;
 114
 115     h[0] = h0;
 116     h[1] = h1;
 117     h[2] = h2;
 118     h[3] = h3;
 119     h[4] = h4;
 120     h[5] = h5;
 121     h[6] = h6;
 122     h[7] = h7;
 123     h[8] = h8;
 124     h[9] = h9;
 125 }
 126
 127
 128
 129 /*
 130     Replace (f,g) with (g,g) if b == 1;
 131     replace (f,g) with (f,g) if b == 0.
 132
 133     Preconditions: b in {0,1}.
 134 */
 135
 136 void fe_cmov(fe f, const fe g, unsigned int b) {
 137     int32_t f0 = f[0];
 138     int32_t f1 = f[1];
 139     int32_t f2 = f[2];
 140     int32_t f3 = f[3];
 141     int32_t f4 = f[4];
 142     int32_t f5 = f[5];
 143     int32_t f6 = f[6];
 144     int32_t f7 = f[7];
 145     int32_t f8 = f[8];
 146     int32_t f9 = f[9];
 147     int32_t g0 = g[0];
 148     int32_t g1 = g[1];
 149     int32_t g2 = g[2];
 150     int32_t g3 = g[3];
 151     int32_t g4 = g[4];
 152     int32_t g5 = g[5];
 153     int32_t g6 = g[6];
 154     int32_t g7 = g[7];
 155     int32_t g8 = g[8];
 156     int32_t g9 = g[9];
 157     int32_t x0 = f0 ^ g0;
 158     int32_t x1 = f1 ^ g1;
 159     int32_t x2 = f2 ^ g2;
 160     int32_t x3 = f3 ^ g3;
 161     int32_t x4 = f4 ^ g4;
 162     int32_t x5 = f5 ^ g5;
 163     int32_t x6 = f6 ^ g6;
 164     int32_t x7 = f7 ^ g7;
 165     int32_t x8 = f8 ^ g8;
 166     int32_t x9 = f9 ^ g9;
 167
 168     b = (unsigned int) (- (int) b); /* silence warning */
 169     x0 &= b;
 170     x1 &= b;
 171     x2 &= b;
 172     x3 &= b;
 173     x4 &= b;
 174     x5 &= b;
 175     x6 &= b;
 176     x7 &= b;
 177     x8 &= b;
 178     x9 &= b;
 179
 180     f[0] = f0 ^ x0;
 181     f[1] = f1 ^ x1;
 182     f[2] = f2 ^ x2;
 183     f[3] = f3 ^ x3;
 184     f[4] = f4 ^ x4;
 185     f[5] = f5 ^ x5;
 186     f[6] = f6 ^ x6;
 187     f[7] = f7 ^ x7;
 188     f[8] = f8 ^ x8;
 189     f[9] = f9 ^ x9;
 190 }
 191
 192 /*
 193     Replace (f,g) with (g,f) if b == 1;
 194     replace (f,g) with (f,g) if b == 0.
 195
 196     Preconditions: b in {0,1}.
 197 */
 198
 199 void fe_cswap(fe f,fe g,unsigned int b) {
 200     int32_t f0 = f[0];
 201     int32_t f1 = f[1];
 202     int32_t f2 = f[2];
 203     int32_t f3 = f[3];
 204     int32_t f4 = f[4];
 205     int32_t f5 = f[5];
 206     int32_t f6 = f[6];
 207     int32_t f7 = f[7];
 208     int32_t f8 = f[8];
 209     int32_t f9 = f[9];
 210     int32_t g0 = g[0];
 211     int32_t g1 = g[1];
 212     int32_t g2 = g[2];
 213     int32_t g3 = g[3];
 214     int32_t g4 = g[4];
 215     int32_t g5 = g[5];
 216     int32_t g6 = g[6];
 217     int32_t g7 = g[7];
 218     int32_t g8 = g[8];
 219     int32_t g9 = g[9];
 220     int32_t x0 = f0 ^ g0;
 221     int32_t x1 = f1 ^ g1;
 222     int32_t x2 = f2 ^ g2;
 223     int32_t x3 = f3 ^ g3;
 224     int32_t x4 = f4 ^ g4;
 225     int32_t x5 = f5 ^ g5;
 226     int32_t x6 = f6 ^ g6;
 227     int32_t x7 = f7 ^ g7;
 228     int32_t x8 = f8 ^ g8;
 229     int32_t x9 = f9 ^ g9;
 230     b = -b;
 231     x0 &= b;
 232     x1 &= b;
 233     x2 &= b;
 234     x3 &= b;
 235     x4 &= b;
 236     x5 &= b;
 237     x6 &= b;
 238     x7 &= b;
 239     x8 &= b;
 240     x9 &= b;
 241     f[0] = f0 ^ x0;
 242     f[1] = f1 ^ x1;
 243     f[2] = f2 ^ x2;
 244     f[3] = f3 ^ x3;
 245     f[4] = f4 ^ x4;
 246     f[5] = f5 ^ x5;
 247     f[6] = f6 ^ x6;
 248     f[7] = f7 ^ x7;
 249     f[8] = f8 ^ x8;
 250     f[9] = f9 ^ x9;
 251     g[0] = g0 ^ x0;
 252     g[1] = g1 ^ x1;
 253     g[2] = g2 ^ x2;
 254     g[3] = g3 ^ x3;
 255     g[4] = g4 ^ x4;
 256     g[5] = g5 ^ x5;
 257     g[6] = g6 ^ x6;
 258     g[7] = g7 ^ x7;
 259     g[8] = g8 ^ x8;
 260     g[9] = g9 ^ x9;
 261 }
 262
 263
 264
 265 /*
 266     h = f
 267 */
 268
 269 void fe_copy(fe h, const fe f) {
 270     int32_t f0 = f[0];
 271     int32_t f1 = f[1];
 272     int32_t f2 = f[2];
 273     int32_t f3 = f[3];
 274     int32_t f4 = f[4];
 275     int32_t f5 = f[5];
 276     int32_t f6 = f[6];
 277     int32_t f7 = f[7];
 278     int32_t f8 = f[8];
 279     int32_t f9 = f[9];
 280
 281     h[0] = f0;
 282     h[1] = f1;
 283     h[2] = f2;
 284     h[3] = f3;
 285     h[4] = f4;
 286     h[5] = f5;
 287     h[6] = f6;
 288     h[7] = f7;
 289     h[8] = f8;
 290     h[9] = f9;
 291 }
 292
 293
 294
 295 /*
 296     Ignores top bit of h.
 297 */
 298
 299 void fe_frombytes(fe h, const unsigned char *s) {
 300     int64_t h0 = load_4(s);
 301     int64_t h1 = load_3(s + 4) << 6;
 302     int64_t h2 = load_3(s + 7) << 5;
 303     int64_t h3 = load_3(s + 10) << 3;
 304     int64_t h4 = load_3(s + 13) << 2;
 305     int64_t h5 = load_4(s + 16);
 306     int64_t h6 = load_3(s + 20) << 7;
 307     int64_t h7 = load_3(s + 23) << 5;
 308     int64_t h8 = load_3(s + 26) << 4;
 309     int64_t h9 = (load_3(s + 29) & 8388607) << 2;
 310     int64_t carry0;
 311     int64_t carry1;
 312     int64_t carry2;
 313     int64_t carry3;
 314     int64_t carry4;
 315     int64_t carry5;
 316     int64_t carry6;
 317     int64_t carry7;
 318     int64_t carry8;
 319     int64_t carry9;
 320
 321     carry9 = (h9 + (int64_t) (1 << 24)) >> 25;
 322     h0 += carry9 * 19;
 323     h9 -= carry9 << 25;
 324     carry1 = (h1 + (int64_t) (1 << 24)) >> 25;
 325     h2 += carry1;
 326     h1 -= carry1 << 25;
 327     carry3 = (h3 + (int64_t) (1 << 24)) >> 25;
 328     h4 += carry3;
 329     h3 -= carry3 << 25;
 330     carry5 = (h5 + (int64_t) (1 << 24)) >> 25;
 331     h6 += carry5;
 332     h5 -= carry5 << 25;
 333     carry7 = (h7 + (int64_t) (1 << 24)) >> 25;
 334     h8 += carry7;
 335     h7 -= carry7 << 25;
 336     carry0 = (h0 + (int64_t) (1 << 25)) >> 26;
 337     h1 += carry0;
 338     h0 -= carry0 << 26;
 339     carry2 = (h2 + (int64_t) (1 << 25)) >> 26;
 340     h3 += carry2;
 341     h2 -= carry2 << 26;
 342     carry4 = (h4 + (int64_t) (1 << 25)) >> 26;
 343     h5 += carry4;
 344     h4 -= carry4 << 26;
 345     carry6 = (h6 + (int64_t) (1 << 25)) >> 26;
 346     h7 += carry6;
 347     h6 -= carry6 << 26;
 348     carry8 = (h8 + (int64_t) (1 << 25)) >> 26;
 349     h9 += carry8;
 350     h8 -= carry8 << 26;
 351
 352     h[0] = (int32_t) h0;
 353     h[1] = (int32_t) h1;
 354     h[2] = (int32_t) h2;
 355     h[3] = (int32_t) h3;
 356     h[4] = (int32_t) h4;
 357     h[5] = (int32_t) h5;
 358     h[6] = (int32_t) h6;
 359     h[7] = (int32_t) h7;
 360     h[8] = (int32_t) h8;
 361     h[9] = (int32_t) h9;
 362 }
 363
 364
 365
 366 void fe_invert(fe out, const fe z) {
 367     fe t0;
 368     fe t1;
 369     fe t2;
 370     fe t3;
 371     int i;
 372
 373     fe_sq(t0, z);
 374
 375     for (i = 1; i < 1; ++i) {
 376         fe_sq(t0, t0);
 377     }
 378
 379     fe_sq(t1, t0);
 380
 381     for (i = 1; i < 2; ++i) {
 382         fe_sq(t1, t1);
 383     }
 384
 385     fe_mul(t1, z, t1);
 386     fe_mul(t0, t0, t1);
 387     fe_sq(t2, t0);
 388
 389     for (i = 1; i < 1; ++i) {
 390         fe_sq(t2, t2);
 391     }
 392
 393     fe_mul(t1, t1, t2);
 394     fe_sq(t2, t1);
 395
 396     for (i = 1; i < 5; ++i) {
 397         fe_sq(t2, t2);
 398     }
 399
 400     fe_mul(t1, t2, t1);
 401     fe_sq(t2, t1);
 402
 403     for (i = 1; i < 10; ++i) {
 404         fe_sq(t2, t2);
 405     }
 406
 407     fe_mul(t2, t2, t1);
 408     fe_sq(t3, t2);
 409
 410     for (i = 1; i < 20; ++i) {
 411         fe_sq(t3, t3);
 412     }
 413
 414     fe_mul(t2, t3, t2);
 415     fe_sq(t2, t2);
 416
 417     for (i = 1; i < 10; ++i) {
 418         fe_sq(t2, t2);
 419     }
 420
 421     fe_mul(t1, t2, t1);
 422     fe_sq(t2, t1);
 423
 424     for (i = 1; i < 50; ++i) {
 425         fe_sq(t2, t2);
 426     }
 427
 428     fe_mul(t2, t2, t1);
 429     fe_sq(t3, t2);
 430
 431     for (i = 1; i < 100; ++i) {
 432         fe_sq(t3, t3);
 433     }
 434
 435     fe_mul(t2, t3, t2);
 436     fe_sq(t2, t2);
 437
 438     for (i = 1; i < 50; ++i) {
 439         fe_sq(t2, t2);
 440     }
 441
 442     fe_mul(t1, t2, t1);
 443     fe_sq(t1, t1);
 444
 445     for (i = 1; i < 5; ++i) {
 446         fe_sq(t1, t1);
 447     }
 448
 449     fe_mul(out, t1, t0);
 450 }
 451
 452
 453
 454 /*
 455     return 1 if f is in {1,3,5,...,q-2}
 456     return 0 if f is in {0,2,4,...,q-1}
 457
 458     Preconditions:
 459        |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
 460 */
 461
 462 int fe_isnegative(const fe f) {
 463     unsigned char s[32];
 464
 465     fe_tobytes(s, f);
 466
 467     return s[0] & 1;
 468 }
 469
 470
 471
 472 /*
 473     return 1 if f == 0
 474     return 0 if f != 0
 475
 476     Preconditions:
 477        |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
 478 */
 479
 480 int fe_isnonzero(const fe f) {
 481     unsigned char s[32];
 482     unsigned char r;
 483
 484     fe_tobytes(s, f);
 485
 486     r = s[0];
 487     #define F(i) r |= s[i]
 488     F(1);
 489     F(2);
 490     F(3);
 491     F(4);
 492     F(5);
 493     F(6);
 494     F(7);
 495     F(8);
 496     F(9);
 497     F(10);
 498     F(11);
 499     F(12);
 500     F(13);
 501     F(14);
 502     F(15);
 503     F(16);
 504     F(17);
 505     F(18);
 506     F(19);
 507     F(20);
 508     F(21);
 509     F(22);
 510     F(23);
 511     F(24);
 512     F(25);
 513     F(26);
 514     F(27);
 515     F(28);
 516     F(29);
 517     F(30);
 518     F(31);
 519     #undef F
 520
 521     return r != 0;
 522 }
 523
 524
 525
 526 /*
 527     h = f * g
 528     Can overlap h with f or g.
 529
 530     Preconditions:
 531        |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
 532        |g| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
 533
 534     Postconditions:
 535        |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc.
 536     */
 537
 538     /*
 539     Notes on implementation strategy:
 540
 541     Using schoolbook multiplication.
 542     Karatsuba would save a little in some cost models.
 543
 544     Most multiplications by 2 and 19 are 32-bit precomputations;
 545     cheaper than 64-bit postcomputations.
 546
 547     There is one remaining multiplication by 19 in the carry chain;
 548     one *19 precomputation can be merged into this,
 549     but the resulting data flow is considerably less clean.
 550
 551     There are 12 carries below.
 552     10 of them are 2-way parallelizable and vectorizable.
 553     Can get away with 11 carries, but then data flow is much deeper.
 554
 555     With tighter constraints on inputs can squeeze carries into int32.
 556 */
 557
 558 void fe_mul(fe h, const fe f, const fe g) {
 559     int32_t f0 = f[0];
 560     int32_t f1 = f[1];
 561     int32_t f2 = f[2];
 562     int32_t f3 = f[3];
 563     int32_t f4 = f[4];
 564     int32_t f5 = f[5];
 565     int32_t f6 = f[6];
 566     int32_t f7 = f[7];
 567     int32_t f8 = f[8];
 568     int32_t f9 = f[9];
 569     int32_t g0 = g[0];
 570     int32_t g1 = g[1];
 571     int32_t g2 = g[2];
 572     int32_t g3 = g[3];
 573     int32_t g4 = g[4];
 574     int32_t g5 = g[5];
 575     int32_t g6 = g[6];
 576     int32_t g7 = g[7];
 577     int32_t g8 = g[8];
 578     int32_t g9 = g[9];
 579     int32_t g1_19 = 19 * g1; /* 1.959375*2^29 */
 580     int32_t g2_19 = 19 * g2; /* 1.959375*2^30; still ok */
 581     int32_t g3_19 = 19 * g3;
 582     int32_t g4_19 = 19 * g4;
 583     int32_t g5_19 = 19 * g5;
 584     int32_t g6_19 = 19 * g6;
 585     int32_t g7_19 = 19 * g7;
 586     int32_t g8_19 = 19 * g8;
 587     int32_t g9_19 = 19 * g9;
 588     int32_t f1_2 = 2 * f1;
 589     int32_t f3_2 = 2 * f3;
 590     int32_t f5_2 = 2 * f5;
 591     int32_t f7_2 = 2 * f7;
 592     int32_t f9_2 = 2 * f9;
 593     int64_t f0g0    = f0   * (int64_t) g0;
 594     int64_t f0g1    = f0   * (int64_t) g1;
 595     int64_t f0g2    = f0   * (int64_t) g2;
 596     int64_t f0g3    = f0   * (int64_t) g3;
 597     int64_t f0g4    = f0   * (int64_t) g4;
 598     int64_t f0g5    = f0   * (int64_t) g5;
 599     int64_t f0g6    = f0   * (int64_t) g6;
 600     int64_t f0g7    = f0   * (int64_t) g7;
 601     int64_t f0g8    = f0   * (int64_t) g8;
 602     int64_t f0g9    = f0   * (int64_t) g9;
 603     int64_t f1g0    = f1   * (int64_t) g0;
 604     int64_t f1g1_2  = f1_2 * (int64_t) g1;
 605     int64_t f1g2    = f1   * (int64_t) g2;
 606     int64_t f1g3_2  = f1_2 * (int64_t) g3;
 607     int64_t f1g4    = f1   * (int64_t) g4;
 608     int64_t f1g5_2  = f1_2 * (int64_t) g5;
 609     int64_t f1g6    = f1   * (int64_t) g6;
 610     int64_t f1g7_2  = f1_2 * (int64_t) g7;
 611     int64_t f1g8    = f1   * (int64_t) g8;
 612     int64_t f1g9_38 = f1_2 * (int64_t) g9_19;
 613     int64_t f2g0    = f2   * (int64_t) g0;
 614     int64_t f2g1    = f2   * (int64_t) g1;
 615     int64_t f2g2    = f2   * (int64_t) g2;
 616     int64_t f2g3    = f2   * (int64_t) g3;
 617     int64_t f2g4    = f2   * (int64_t) g4;
 618     int64_t f2g5    = f2   * (int64_t) g5;
 619     int64_t f2g6    = f2   * (int64_t) g6;
 620     int64_t f2g7    = f2   * (int64_t) g7;
 621     int64_t f2g8_19 = f2   * (int64_t) g8_19;
 622     int64_t f2g9_19 = f2   * (int64_t) g9_19;
 623     int64_t f3g0    = f3   * (int64_t) g0;
 624     int64_t f3g1_2  = f3_2 * (int64_t) g1;
 625     int64_t f3g2    = f3   * (int64_t) g2;
 626     int64_t f3g3_2  = f3_2 * (int64_t) g3;
 627     int64_t f3g4    = f3   * (int64_t) g4;
 628     int64_t f3g5_2  = f3_2 * (int64_t) g5;
 629     int64_t f3g6    = f3   * (int64_t) g6;
 630     int64_t f3g7_38 = f3_2 * (int64_t) g7_19;
 631     int64_t f3g8_19 = f3   * (int64_t) g8_19;
 632     int64_t f3g9_38 = f3_2 * (int64_t) g9_19;
 633     int64_t f4g0    = f4   * (int64_t) g0;
 634     int64_t f4g1    = f4   * (int64_t) g1;
 635     int64_t f4g2    = f4   * (int64_t) g2;
 636     int64_t f4g3    = f4   * (int64_t) g3;
 637     int64_t f4g4    = f4   * (int64_t) g4;
 638     int64_t f4g5    = f4   * (int64_t) g5;
 639     int64_t f4g6_19 = f4   * (int64_t) g6_19;
 640     int64_t f4g7_19 = f4   * (int64_t) g7_19;
 641     int64_t f4g8_19 = f4   * (int64_t) g8_19;
 642     int64_t f4g9_19 = f4   * (int64_t) g9_19;
 643     int64_t f5g0    = f5   * (int64_t) g0;
 644     int64_t f5g1_2  = f5_2 * (int64_t) g1;
 645     int64_t f5g2    = f5   * (int64_t) g2;
 646     int64_t f5g3_2  = f5_2 * (int64_t) g3;
 647     int64_t f5g4    = f5   * (int64_t) g4;
 648     int64_t f5g5_38 = f5_2 * (int64_t) g5_19;
 649     int64_t f5g6_19 = f5   * (int64_t) g6_19;
 650     int64_t f5g7_38 = f5_2 * (int64_t) g7_19;
 651     int64_t f5g8_19 = f5   * (int64_t) g8_19;
 652     int64_t f5g9_38 = f5_2 * (int64_t) g9_19;
 653     int64_t f6g0    = f6   * (int64_t) g0;
 654     int64_t f6g1    = f6   * (int64_t) g1;
 655     int64_t f6g2    = f6   * (int64_t) g2;
 656     int64_t f6g3    = f6   * (int64_t) g3;
 657     int64_t f6g4_19 = f6   * (int64_t) g4_19;
 658     int64_t f6g5_19 = f6   * (int64_t) g5_19;
 659     int64_t f6g6_19 = f6   * (int64_t) g6_19;
 660     int64_t f6g7_19 = f6   * (int64_t) g7_19;
 661     int64_t f6g8_19 = f6   * (int64_t) g8_19;
 662     int64_t f6g9_19 = f6   * (int64_t) g9_19;
 663     int64_t f7g0    = f7   * (int64_t) g0;
 664     int64_t f7g1_2  = f7_2 * (int64_t) g1;
 665     int64_t f7g2    = f7   * (int64_t) g2;
 666     int64_t f7g3_38 = f7_2 * (int64_t) g3_19;
 667     int64_t f7g4_19 = f7   * (int64_t) g4_19;
 668     int64_t f7g5_38 = f7_2 * (int64_t) g5_19;
 669     int64_t f7g6_19 = f7   * (int64_t) g6_19;
 670     int64_t f7g7_38 = f7_2 * (int64_t) g7_19;
 671     int64_t f7g8_19 = f7   * (int64_t) g8_19;
 672     int64_t f7g9_38 = f7_2 * (int64_t) g9_19;
 673     int64_t f8g0    = f8   * (int64_t) g0;
 674     int64_t f8g1    = f8   * (int64_t) g1;
 675     int64_t f8g2_19 = f8   * (int64_t) g2_19;
 676     int64_t f8g3_19 = f8   * (int64_t) g3_19;
 677     int64_t f8g4_19 = f8   * (int64_t) g4_19;
 678     int64_t f8g5_19 = f8   * (int64_t) g5_19;
 679     int64_t f8g6_19 = f8   * (int64_t) g6_19;
 680     int64_t f8g7_19 = f8   * (int64_t) g7_19;
 681     int64_t f8g8_19 = f8   * (int64_t) g8_19;
 682     int64_t f8g9_19 = f8   * (int64_t) g9_19;
 683     int64_t f9g0    = f9   * (int64_t) g0;
 684     int64_t f9g1_38 = f9_2 * (int64_t) g1_19;
 685     int64_t f9g2_19 = f9   * (int64_t) g2_19;
 686     int64_t f9g3_38 = f9_2 * (int64_t) g3_19;
 687     int64_t f9g4_19 = f9   * (int64_t) g4_19;
 688     int64_t f9g5_38 = f9_2 * (int64_t) g5_19;
 689     int64_t f9g6_19 = f9   * (int64_t) g6_19;
 690     int64_t f9g7_38 = f9_2 * (int64_t) g7_19;
 691     int64_t f9g8_19 = f9   * (int64_t) g8_19;
 692     int64_t f9g9_38 = f9_2 * (int64_t) g9_19;
 693     int64_t h0 = f0g0 + f1g9_38 + f2g8_19 + f3g7_38 + f4g6_19 + f5g5_38 + f6g4_19 + f7g3_38 + f8g2_19 + f9g1_38;
 694     int64_t h1 = f0g1 + f1g0   + f2g9_19 + f3g8_19 + f4g7_19 + f5g6_19 + f6g5_19 + f7g4_19 + f8g3_19 + f9g2_19;
 695     int64_t h2 = f0g2 + f1g1_2 + f2g0   + f3g9_38 + f4g8_19 + f5g7_38 + f6g6_19 + f7g5_38 + f8g4_19 + f9g3_38;
 696     int64_t h3 = f0g3 + f1g2   + f2g1   + f3g0   + f4g9_19 + f5g8_19 + f6g7_19 + f7g6_19 + f8g5_19 + f9g4_19;
 697     int64_t h4 = f0g4 + f1g3_2 + f2g2   + f3g1_2 + f4g0   + f5g9_38 + f6g8_19 + f7g7_38 + f8g6_19 + f9g5_38;
 698     int64_t h5 = f0g5 + f1g4   + f2g3   + f3g2   + f4g1   + f5g0   + f6g9_19 + f7g8_19 + f8g7_19 + f9g6_19;
 699     int64_t h6 = f0g6 + f1g5_2 + f2g4   + f3g3_2 + f4g2   + f5g1_2 + f6g0   + f7g9_38 + f8g8_19 + f9g7_38;
 700     int64_t h7 = f0g7 + f1g6   + f2g5   + f3g4   + f4g3   + f5g2   + f6g1   + f7g0   + f8g9_19 + f9g8_19;
 701     int64_t h8 = f0g8 + f1g7_2 + f2g6   + f3g5_2 + f4g4   + f5g3_2 + f6g2   + f7g1_2 + f8g0   + f9g9_38;
 702     int64_t h9 = f0g9 + f1g8   + f2g7   + f3g6   + f4g5   + f5g4   + f6g3   + f7g2   + f8g1   + f9g0   ;
 703     int64_t carry0;
 704     int64_t carry1;
 705     int64_t carry2;
 706     int64_t carry3;
 707     int64_t carry4;
 708     int64_t carry5;
 709     int64_t carry6;
 710     int64_t carry7;
 711     int64_t carry8;
 712     int64_t carry9;
 713
 714     carry0 = (h0 + (int64_t) (1 << 25)) >> 26;
 715     h1 += carry0;
 716     h0 -= carry0 << 26;
 717     carry4 = (h4 + (int64_t) (1 << 25)) >> 26;
 718     h5 += carry4;
 719     h4 -= carry4 << 26;
 720
 721     carry1 = (h1 + (int64_t) (1 << 24)) >> 25;
 722     h2 += carry1;
 723     h1 -= carry1 << 25;
 724     carry5 = (h5 + (int64_t) (1 << 24)) >> 25;
 725     h6 += carry5;
 726     h5 -= carry5 << 25;
 727
 728     carry2 = (h2 + (int64_t) (1 << 25)) >> 26;
 729     h3 += carry2;
 730     h2 -= carry2 << 26;
 731     carry6 = (h6 + (int64_t) (1 << 25)) >> 26;
 732     h7 += carry6;
 733     h6 -= carry6 << 26;
 734
 735     carry3 = (h3 + (int64_t) (1 << 24)) >> 25;
 736     h4 += carry3;
 737     h3 -= carry3 << 25;
 738     carry7 = (h7 + (int64_t) (1 << 24)) >> 25;
 739     h8 += carry7;
 740     h7 -= carry7 << 25;
 741
 742     carry4 = (h4 + (int64_t) (1 << 25)) >> 26;
 743     h5 += carry4;
 744     h4 -= carry4 << 26;
 745     carry8 = (h8 + (int64_t) (1 << 25)) >> 26;
 746     h9 += carry8;
 747     h8 -= carry8 << 26;
 748
 749     carry9 = (h9 + (int64_t) (1 << 24)) >> 25;
 750     h0 += carry9 * 19;
 751     h9 -= carry9 << 25;
 752
 753     carry0 = (h0 + (int64_t) (1 << 25)) >> 26;
 754     h1 += carry0;
 755     h0 -= carry0 << 26;
 756
 757     h[0] = (int32_t) h0;
 758     h[1] = (int32_t) h1;
 759     h[2] = (int32_t) h2;
 760     h[3] = (int32_t) h3;
 761     h[4] = (int32_t) h4;
 762     h[5] = (int32_t) h5;
 763     h[6] = (int32_t) h6;
 764     h[7] = (int32_t) h7;
 765     h[8] = (int32_t) h8;
 766     h[9] = (int32_t) h9;
 767 }
 768
 769
 770 /*
 771 h = f * 121666
 772 Can overlap h with f.
 773
 774 Preconditions:
 775    |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
 776
 777 Postconditions:
 778    |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
 779 */
 780
 781 void fe_mul121666(fe h, fe f) {
 782     int32_t f0 = f[0];
 783     int32_t f1 = f[1];
 784     int32_t f2 = f[2];
 785     int32_t f3 = f[3];
 786     int32_t f4 = f[4];
 787     int32_t f5 = f[5];
 788     int32_t f6 = f[6];
 789     int32_t f7 = f[7];
 790     int32_t f8 = f[8];
 791     int32_t f9 = f[9];
 792     int64_t h0 = f0 * (int64_t) 121666;
 793     int64_t h1 = f1 * (int64_t) 121666;
 794     int64_t h2 = f2 * (int64_t) 121666;
 795     int64_t h3 = f3 * (int64_t) 121666;
 796     int64_t h4 = f4 * (int64_t) 121666;
 797     int64_t h5 = f5 * (int64_t) 121666;
 798     int64_t h6 = f6 * (int64_t) 121666;
 799     int64_t h7 = f7 * (int64_t) 121666;
 800     int64_t h8 = f8 * (int64_t) 121666;
 801     int64_t h9 = f9 * (int64_t) 121666;
 802     int64_t carry0;
 803     int64_t carry1;
 804     int64_t carry2;
 805     int64_t carry3;
 806     int64_t carry4;
 807     int64_t carry5;
 808     int64_t carry6;
 809     int64_t carry7;
 810     int64_t carry8;
 811     int64_t carry9;
 812
 813     carry9 = (h9 + (int64_t) (1<<24)) >> 25; h0 += carry9 * 19; h9 -= carry9 << 25;
 814     carry1 = (h1 + (int64_t) (1<<24)) >> 25; h2 += carry1; h1 -= carry1 << 25;
 815     carry3 = (h3 + (int64_t) (1<<24)) >> 25; h4 += carry3; h3 -= carry3 << 25;
 816     carry5 = (h5 + (int64_t) (1<<24)) >> 25; h6 += carry5; h5 -= carry5 << 25;
 817     carry7 = (h7 + (int64_t) (1<<24)) >> 25; h8 += carry7; h7 -= carry7 << 25;
 818
 819     carry0 = (h0 + (int64_t) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26;
 820     carry2 = (h2 + (int64_t) (1<<25)) >> 26; h3 += carry2; h2 -= carry2 << 26;
 821     carry4 = (h4 + (int64_t) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26;
 822     carry6 = (h6 + (int64_t) (1<<25)) >> 26; h7 += carry6; h6 -= carry6 << 26;
 823     carry8 = (h8 + (int64_t) (1<<25)) >> 26; h9 += carry8; h8 -= carry8 << 26;
 824
 825     h[0] = h0;
 826     h[1] = h1;
 827     h[2] = h2;
 828     h[3] = h3;
 829     h[4] = h4;
 830     h[5] = h5;
 831     h[6] = h6;
 832     h[7] = h7;
 833     h[8] = h8;
 834     h[9] = h9;
 835 }
 836
 837
 838 /*
 839 h = -f
 840
 841 Preconditions:
 842    |f| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
 843
 844 Postconditions:
 845    |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
 846 */
 847
 848 void fe_neg(fe h, const fe f) {
 849     int32_t f0 = f[0];
 850     int32_t f1 = f[1];
 851     int32_t f2 = f[2];
 852     int32_t f3 = f[3];
 853     int32_t f4 = f[4];
 854     int32_t f5 = f[5];
 855     int32_t f6 = f[6];
 856     int32_t f7 = f[7];
 857     int32_t f8 = f[8];
 858     int32_t f9 = f[9];
 859     int32_t h0 = -f0;
 860     int32_t h1 = -f1;
 861     int32_t h2 = -f2;
 862     int32_t h3 = -f3;
 863     int32_t h4 = -f4;
 864     int32_t h5 = -f5;
 865     int32_t h6 = -f6;
 866     int32_t h7 = -f7;
 867     int32_t h8 = -f8;
 868     int32_t h9 = -f9;
 869
 870     h[0] = h0;
 871     h[1] = h1;
 872     h[2] = h2;
 873     h[3] = h3;
 874     h[4] = h4;
 875     h[5] = h5;
 876     h[6] = h6;
 877     h[7] = h7;
 878     h[8] = h8;
 879     h[9] = h9;
 880 }
 881
 882
 883 void fe_pow22523(fe out, const fe z) {
 884     fe t0;
 885     fe t1;
 886     fe t2;
 887     int i;
 888     fe_sq(t0, z);
 889
 890     for (i = 1; i < 1; ++i) {
 891         fe_sq(t0, t0);
 892     }
 893
 894     fe_sq(t1, t0);
 895
 896     for (i = 1; i < 2; ++i) {
 897         fe_sq(t1, t1);
 898     }
 899
 900     fe_mul(t1, z, t1);
 901     fe_mul(t0, t0, t1);
 902     fe_sq(t0, t0);
 903
 904     for (i = 1; i < 1; ++i) {
 905         fe_sq(t0, t0);
 906     }
 907
 908     fe_mul(t0, t1, t0);
 909     fe_sq(t1, t0);
 910
 911     for (i = 1; i < 5; ++i) {
 912         fe_sq(t1, t1);
 913     }
 914
 915     fe_mul(t0, t1, t0);
 916     fe_sq(t1, t0);
 917
 918     for (i = 1; i < 10; ++i) {
 919         fe_sq(t1, t1);
 920     }
 921
 922     fe_mul(t1, t1, t0);
 923     fe_sq(t2, t1);
 924
 925     for (i = 1; i < 20; ++i) {
 926         fe_sq(t2, t2);
 927     }
 928
 929     fe_mul(t1, t2, t1);
 930     fe_sq(t1, t1);
 931
 932     for (i = 1; i < 10; ++i) {
 933         fe_sq(t1, t1);
 934     }
 935
 936     fe_mul(t0, t1, t0);
 937     fe_sq(t1, t0);
 938
 939     for (i = 1; i < 50; ++i) {
 940         fe_sq(t1, t1);
 941     }
 942
 943     fe_mul(t1, t1, t0);
 944     fe_sq(t2, t1);
 945
 946     for (i = 1; i < 100; ++i) {
 947         fe_sq(t2, t2);
 948     }
 949
 950     fe_mul(t1, t2, t1);
 951     fe_sq(t1, t1);
 952
 953     for (i = 1; i < 50; ++i) {
 954         fe_sq(t1, t1);
 955     }
 956
 957     fe_mul(t0, t1, t0);
 958     fe_sq(t0, t0);
 959
 960     for (i = 1; i < 2; ++i) {
 961         fe_sq(t0, t0);
 962     }
 963
 964     fe_mul(out, t0, z);
 965     return;
 966 }
 967
 968
 969 /*
 970 h = f * f
 971 Can overlap h with f.
 972
 973 Preconditions:
 974    |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
 975
 976 Postconditions:
 977    |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc.
 978 */
 979
 980 /*
 981 See fe_mul.c for discussion of implementation strategy.
 982 */
 983
 984 void fe_sq(fe h, const fe f) {
 985     int32_t f0 = f[0];
 986     int32_t f1 = f[1];
 987     int32_t f2 = f[2];
 988     int32_t f3 = f[3];
 989     int32_t f4 = f[4];
 990     int32_t f5 = f[5];
 991     int32_t f6 = f[6];
 992     int32_t f7 = f[7];
 993     int32_t f8 = f[8];
 994     int32_t f9 = f[9];
 995     int32_t f0_2 = 2 * f0;
 996     int32_t f1_2 = 2 * f1;
 997     int32_t f2_2 = 2 * f2;
 998     int32_t f3_2 = 2 * f3;
 999     int32_t f4_2 = 2 * f4;
1000     int32_t f5_2 = 2 * f5;
1001     int32_t f6_2 = 2 * f6;
1002     int32_t f7_2 = 2 * f7;
1003     int32_t f5_38 = 38 * f5; /* 1.959375*2^30 */
1004     int32_t f6_19 = 19 * f6; /* 1.959375*2^30 */
1005     int32_t f7_38 = 38 * f7; /* 1.959375*2^30 */
1006     int32_t f8_19 = 19 * f8; /* 1.959375*2^30 */
1007     int32_t f9_38 = 38 * f9; /* 1.959375*2^30 */
1008     int64_t f0f0    = f0   * (int64_t) f0;
1009     int64_t f0f1_2  = f0_2 * (int64_t) f1;
1010     int64_t f0f2_2  = f0_2 * (int64_t) f2;
1011     int64_t f0f3_2  = f0_2 * (int64_t) f3;
1012     int64_t f0f4_2  = f0_2 * (int64_t) f4;
1013     int64_t f0f5_2  = f0_2 * (int64_t) f5;
1014     int64_t f0f6_2  = f0_2 * (int64_t) f6;
1015     int64_t f0f7_2  = f0_2 * (int64_t) f7;
1016     int64_t f0f8_2  = f0_2 * (int64_t) f8;
1017     int64_t f0f9_2  = f0_2 * (int64_t) f9;
1018     int64_t f1f1_2  = f1_2 * (int64_t) f1;
1019     int64_t f1f2_2  = f1_2 * (int64_t) f2;
1020     int64_t f1f3_4  = f1_2 * (int64_t) f3_2;
1021     int64_t f1f4_2  = f1_2 * (int64_t) f4;
1022     int64_t f1f5_4  = f1_2 * (int64_t) f5_2;
1023     int64_t f1f6_2  = f1_2 * (int64_t) f6;
1024     int64_t f1f7_4  = f1_2 * (int64_t) f7_2;
1025     int64_t f1f8_2  = f1_2 * (int64_t) f8;
1026     int64_t f1f9_76 = f1_2 * (int64_t) f9_38;
1027     int64_t f2f2    = f2   * (int64_t) f2;
1028     int64_t f2f3_2  = f2_2 * (int64_t) f3;
1029     int64_t f2f4_2  = f2_2 * (int64_t) f4;
1030     int64_t f2f5_2  = f2_2 * (int64_t) f5;
1031     int64_t f2f6_2  = f2_2 * (int64_t) f6;
1032     int64_t f2f7_2  = f2_2 * (int64_t) f7;
1033     int64_t f2f8_38 = f2_2 * (int64_t) f8_19;
1034     int64_t f2f9_38 = f2   * (int64_t) f9_38;
1035     int64_t f3f3_2  = f3_2 * (int64_t) f3;
1036     int64_t f3f4_2  = f3_2 * (int64_t) f4;
1037     int64_t f3f5_4  = f3_2 * (int64_t) f5_2;
1038     int64_t f3f6_2  = f3_2 * (int64_t) f6;
1039     int64_t f3f7_76 = f3_2 * (int64_t) f7_38;
1040     int64_t f3f8_38 = f3_2 * (int64_t) f8_19;
1041     int64_t f3f9_76 = f3_2 * (int64_t) f9_38;
1042     int64_t f4f4    = f4   * (int64_t) f4;
1043     int64_t f4f5_2  = f4_2 * (int64_t) f5;
1044     int64_t f4f6_38 = f4_2 * (int64_t) f6_19;
1045     int64_t f4f7_38 = f4   * (int64_t) f7_38;
1046     int64_t f4f8_38 = f4_2 * (int64_t) f8_19;
1047     int64_t f4f9_38 = f4   * (int64_t) f9_38;
1048     int64_t f5f5_38 = f5   * (int64_t) f5_38;
1049     int64_t f5f6_38 = f5_2 * (int64_t) f6_19;
1050     int64_t f5f7_76 = f5_2 * (int64_t) f7_38;
1051     int64_t f5f8_38 = f5_2 * (int64_t) f8_19;
1052     int64_t f5f9_76 = f5_2 * (int64_t) f9_38;
1053     int64_t f6f6_19 = f6   * (int64_t) f6_19;
1054     int64_t f6f7_38 = f6   * (int64_t) f7_38;
1055     int64_t f6f8_38 = f6_2 * (int64_t) f8_19;
1056     int64_t f6f9_38 = f6   * (int64_t) f9_38;
1057     int64_t f7f7_38 = f7   * (int64_t) f7_38;
1058     int64_t f7f8_38 = f7_2 * (int64_t) f8_19;
1059     int64_t f7f9_76 = f7_2 * (int64_t) f9_38;
1060     int64_t f8f8_19 = f8   * (int64_t) f8_19;
1061     int64_t f8f9_38 = f8   * (int64_t) f9_38;
1062     int64_t f9f9_38 = f9   * (int64_t) f9_38;
1063     int64_t h0 = f0f0  + f1f9_76 + f2f8_38 + f3f7_76 + f4f6_38 + f5f5_38;
1064     int64_t h1 = f0f1_2 + f2f9_38 + f3f8_38 + f4f7_38 + f5f6_38;
1065     int64_t h2 = f0f2_2 + f1f1_2 + f3f9_76 + f4f8_38 + f5f7_76 + f6f6_19;
1066     int64_t h3 = f0f3_2 + f1f2_2 + f4f9_38 + f5f8_38 + f6f7_38;
1067     int64_t h4 = f0f4_2 + f1f3_4 + f2f2   + f5f9_76 + f6f8_38 + f7f7_38;
1068     int64_t h5 = f0f5_2 + f1f4_2 + f2f3_2 + f6f9_38 + f7f8_38;
1069     int64_t h6 = f0f6_2 + f1f5_4 + f2f4_2 + f3f3_2 + f7f9_76 + f8f8_19;
1070     int64_t h7 = f0f7_2 + f1f6_2 + f2f5_2 + f3f4_2 + f8f9_38;
1071     int64_t h8 = f0f8_2 + f1f7_4 + f2f6_2 + f3f5_4 + f4f4   + f9f9_38;
1072     int64_t h9 = f0f9_2 + f1f8_2 + f2f7_2 + f3f6_2 + f4f5_2;
1073     int64_t carry0;
1074     int64_t carry1;
1075     int64_t carry2;
1076     int64_t carry3;
1077     int64_t carry4;
1078     int64_t carry5;
1079     int64_t carry6;
1080     int64_t carry7;
1081     int64_t carry8;
1082     int64_t carry9;
1083     carry0 = (h0 + (int64_t) (1 << 25)) >> 26;
1084     h1 += carry0;
1085     h0 -= carry0 << 26;
1086     carry4 = (h4 + (int64_t) (1 << 25)) >> 26;
1087     h5 += carry4;
1088     h4 -= carry4 << 26;
1089     carry1 = (h1 + (int64_t) (1 << 24)) >> 25;
1090     h2 += carry1;
1091     h1 -= carry1 << 25;
1092     carry5 = (h5 + (int64_t) (1 << 24)) >> 25;
1093     h6 += carry5;
1094     h5 -= carry5 << 25;
1095     carry2 = (h2 + (int64_t) (1 << 25)) >> 26;
1096     h3 += carry2;
1097     h2 -= carry2 << 26;
1098     carry6 = (h6 + (int64_t) (1 << 25)) >> 26;
1099     h7 += carry6;
1100     h6 -= carry6 << 26;
1101     carry3 = (h3 + (int64_t) (1 << 24)) >> 25;
1102     h4 += carry3;
1103     h3 -= carry3 << 25;
1104     carry7 = (h7 + (int64_t) (1 << 24)) >> 25;
1105     h8 += carry7;
1106     h7 -= carry7 << 25;
1107     carry4 = (h4 + (int64_t) (1 << 25)) >> 26;
1108     h5 += carry4;
1109     h4 -= carry4 << 26;
1110     carry8 = (h8 + (int64_t) (1 << 25)) >> 26;
1111     h9 += carry8;
1112     h8 -= carry8 << 26;
1113     carry9 = (h9 + (int64_t) (1 << 24)) >> 25;
1114     h0 += carry9 * 19;
1115     h9 -= carry9 << 25;
1116     carry0 = (h0 + (int64_t) (1 << 25)) >> 26;
1117     h1 += carry0;
1118     h0 -= carry0 << 26;
1119     h[0] = (int32_t) h0;
1120     h[1] = (int32_t) h1;
1121     h[2] = (int32_t) h2;
1122     h[3] = (int32_t) h3;
1123     h[4] = (int32_t) h4;
1124     h[5] = (int32_t) h5;
1125     h[6] = (int32_t) h6;
1126     h[7] = (int32_t) h7;
1127     h[8] = (int32_t) h8;
1128     h[9] = (int32_t) h9;
1129 }
1130
1131
1132 /*
1133 h = 2 * f * f
1134 Can overlap h with f.
1135
1136 Preconditions:
1137    |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
1138
1139 Postconditions:
1140    |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc.
1141 */
1142
1143 /*
1144 See fe_mul.c for discussion of implementation strategy.
1145 */
1146
1147 void fe_sq2(fe h, const fe f) {
1148     int32_t f0 = f[0];
1149     int32_t f1 = f[1];
1150     int32_t f2 = f[2];
1151     int32_t f3 = f[3];
1152     int32_t f4 = f[4];
1153     int32_t f5 = f[5];
1154     int32_t f6 = f[6];
1155     int32_t f7 = f[7];
1156     int32_t f8 = f[8];
1157     int32_t f9 = f[9];
1158     int32_t f0_2 = 2 * f0;
1159     int32_t f1_2 = 2 * f1;
1160     int32_t f2_2 = 2 * f2;
1161     int32_t f3_2 = 2 * f3;
1162     int32_t f4_2 = 2 * f4;
1163     int32_t f5_2 = 2 * f5;
1164     int32_t f6_2 = 2 * f6;
1165     int32_t f7_2 = 2 * f7;
1166     int32_t f5_38 = 38 * f5; /* 1.959375*2^30 */
1167     int32_t f6_19 = 19 * f6; /* 1.959375*2^30 */
1168     int32_t f7_38 = 38 * f7; /* 1.959375*2^30 */
1169     int32_t f8_19 = 19 * f8; /* 1.959375*2^30 */
1170     int32_t f9_38 = 38 * f9; /* 1.959375*2^30 */
1171     int64_t f0f0    = f0   * (int64_t) f0;
1172     int64_t f0f1_2  = f0_2 * (int64_t) f1;
1173     int64_t f0f2_2  = f0_2 * (int64_t) f2;
1174     int64_t f0f3_2  = f0_2 * (int64_t) f3;
1175     int64_t f0f4_2  = f0_2 * (int64_t) f4;
1176     int64_t f0f5_2  = f0_2 * (int64_t) f5;
1177     int64_t f0f6_2  = f0_2 * (int64_t) f6;
1178     int64_t f0f7_2  = f0_2 * (int64_t) f7;
1179     int64_t f0f8_2  = f0_2 * (int64_t) f8;
1180     int64_t f0f9_2  = f0_2 * (int64_t) f9;
1181     int64_t f1f1_2  = f1_2 * (int64_t) f1;
1182     int64_t f1f2_2  = f1_2 * (int64_t) f2;
1183     int64_t f1f3_4  = f1_2 * (int64_t) f3_2;
1184     int64_t f1f4_2  = f1_2 * (int64_t) f4;
1185     int64_t f1f5_4  = f1_2 * (int64_t) f5_2;
1186     int64_t f1f6_2  = f1_2 * (int64_t) f6;
1187     int64_t f1f7_4  = f1_2 * (int64_t) f7_2;
1188     int64_t f1f8_2  = f1_2 * (int64_t) f8;
1189     int64_t f1f9_76 = f1_2 * (int64_t) f9_38;
1190     int64_t f2f2    = f2   * (int64_t) f2;
1191     int64_t f2f3_2  = f2_2 * (int64_t) f3;
1192     int64_t f2f4_2  = f2_2 * (int64_t) f4;
1193     int64_t f2f5_2  = f2_2 * (int64_t) f5;
1194     int64_t f2f6_2  = f2_2 * (int64_t) f6;
1195     int64_t f2f7_2  = f2_2 * (int64_t) f7;
1196     int64_t f2f8_38 = f2_2 * (int64_t) f8_19;
1197     int64_t f2f9_38 = f2   * (int64_t) f9_38;
1198     int64_t f3f3_2  = f3_2 * (int64_t) f3;
1199     int64_t f3f4_2  = f3_2 * (int64_t) f4;
1200     int64_t f3f5_4  = f3_2 * (int64_t) f5_2;
1201     int64_t f3f6_2  = f3_2 * (int64_t) f6;
1202     int64_t f3f7_76 = f3_2 * (int64_t) f7_38;
1203     int64_t f3f8_38 = f3_2 * (int64_t) f8_19;
1204     int64_t f3f9_76 = f3_2 * (int64_t) f9_38;
1205     int64_t f4f4    = f4   * (int64_t) f4;
1206     int64_t f4f5_2  = f4_2 * (int64_t) f5;
1207     int64_t f4f6_38 = f4_2 * (int64_t) f6_19;
1208     int64_t f4f7_38 = f4   * (int64_t) f7_38;
1209     int64_t f4f8_38 = f4_2 * (int64_t) f8_19;
1210     int64_t f4f9_38 = f4   * (int64_t) f9_38;
1211     int64_t f5f5_38 = f5   * (int64_t) f5_38;
1212     int64_t f5f6_38 = f5_2 * (int64_t) f6_19;
1213     int64_t f5f7_76 = f5_2 * (int64_t) f7_38;
1214     int64_t f5f8_38 = f5_2 * (int64_t) f8_19;
1215     int64_t f5f9_76 = f5_2 * (int64_t) f9_38;
1216     int64_t f6f6_19 = f6   * (int64_t) f6_19;
1217     int64_t f6f7_38 = f6   * (int64_t) f7_38;
1218     int64_t f6f8_38 = f6_2 * (int64_t) f8_19;
1219     int64_t f6f9_38 = f6   * (int64_t) f9_38;
1220     int64_t f7f7_38 = f7   * (int64_t) f7_38;
1221     int64_t f7f8_38 = f7_2 * (int64_t) f8_19;
1222     int64_t f7f9_76 = f7_2 * (int64_t) f9_38;
1223     int64_t f8f8_19 = f8   * (int64_t) f8_19;
1224     int64_t f8f9_38 = f8   * (int64_t) f9_38;
1225     int64_t f9f9_38 = f9   * (int64_t) f9_38;
1226     int64_t h0 = f0f0  + f1f9_76 + f2f8_38 + f3f7_76 + f4f6_38 + f5f5_38;
1227     int64_t h1 = f0f1_2 + f2f9_38 + f3f8_38 + f4f7_38 + f5f6_38;
1228     int64_t h2 = f0f2_2 + f1f1_2 + f3f9_76 + f4f8_38 + f5f7_76 + f6f6_19;
1229     int64_t h3 = f0f3_2 + f1f2_2 + f4f9_38 + f5f8_38 + f6f7_38;
1230     int64_t h4 = f0f4_2 + f1f3_4 + f2f2   + f5f9_76 + f6f8_38 + f7f7_38;
1231     int64_t h5 = f0f5_2 + f1f4_2 + f2f3_2 + f6f9_38 + f7f8_38;
1232     int64_t h6 = f0f6_2 + f1f5_4 + f2f4_2 + f3f3_2 + f7f9_76 + f8f8_19;
1233     int64_t h7 = f0f7_2 + f1f6_2 + f2f5_2 + f3f4_2 + f8f9_38;
1234     int64_t h8 = f0f8_2 + f1f7_4 + f2f6_2 + f3f5_4 + f4f4   + f9f9_38;
1235     int64_t h9 = f0f9_2 + f1f8_2 + f2f7_2 + f3f6_2 + f4f5_2;
1236     int64_t carry0;
1237     int64_t carry1;
1238     int64_t carry2;
1239     int64_t carry3;
1240     int64_t carry4;
1241     int64_t carry5;
1242     int64_t carry6;
1243     int64_t carry7;
1244     int64_t carry8;
1245     int64_t carry9;
1246     h0 += h0;
1247     h1 += h1;
1248     h2 += h2;
1249     h3 += h3;
1250     h4 += h4;
1251     h5 += h5;
1252     h6 += h6;
1253     h7 += h7;
1254     h8 += h8;
1255     h9 += h9;
1256     carry0 = (h0 + (int64_t) (1 << 25)) >> 26;
1257     h1 += carry0;
1258     h0 -= carry0 << 26;
1259     carry4 = (h4 + (int64_t) (1 << 25)) >> 26;
1260     h5 += carry4;
1261     h4 -= carry4 << 26;
1262     carry1 = (h1 + (int64_t) (1 << 24)) >> 25;
1263     h2 += carry1;
1264     h1 -= carry1 << 25;
1265     carry5 = (h5 + (int64_t) (1 << 24)) >> 25;
1266     h6 += carry5;
1267     h5 -= carry5 << 25;
1268     carry2 = (h2 + (int64_t) (1 << 25)) >> 26;
1269     h3 += carry2;
1270     h2 -= carry2 << 26;
1271     carry6 = (h6 + (int64_t) (1 << 25)) >> 26;
1272     h7 += carry6;
1273     h6 -= carry6 << 26;
1274     carry3 = (h3 + (int64_t) (1 << 24)) >> 25;
1275     h4 += carry3;
1276     h3 -= carry3 << 25;
1277     carry7 = (h7 + (int64_t) (1 << 24)) >> 25;
1278     h8 += carry7;
1279     h7 -= carry7 << 25;
1280     carry4 = (h4 + (int64_t) (1 << 25)) >> 26;
1281     h5 += carry4;
1282     h4 -= carry4 << 26;
1283     carry8 = (h8 + (int64_t) (1 << 25)) >> 26;
1284     h9 += carry8;
1285     h8 -= carry8 << 26;
1286     carry9 = (h9 + (int64_t) (1 << 24)) >> 25;
1287     h0 += carry9 * 19;
1288     h9 -= carry9 << 25;
1289     carry0 = (h0 + (int64_t) (1 << 25)) >> 26;
1290     h1 += carry0;
1291     h0 -= carry0 << 26;
1292     h[0] = (int32_t) h0;
1293     h[1] = (int32_t) h1;
1294     h[2] = (int32_t) h2;
1295     h[3] = (int32_t) h3;
1296     h[4] = (int32_t) h4;
1297     h[5] = (int32_t) h5;
1298     h[6] = (int32_t) h6;
1299     h[7] = (int32_t) h7;
1300     h[8] = (int32_t) h8;
1301     h[9] = (int32_t) h9;
1302 }
1303
1304
1305 /*
1306 h = f - g
1307 Can overlap h with f or g.
1308
1309 Preconditions:
1310    |f| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
1311    |g| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
1312
1313 Postconditions:
1314    |h| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
1315 */
1316
1317 void fe_sub(fe h, const fe f, const fe g) {
1318     int32_t f0 = f[0];
1319     int32_t f1 = f[1];
1320     int32_t f2 = f[2];
1321     int32_t f3 = f[3];
1322     int32_t f4 = f[4];
1323     int32_t f5 = f[5];
1324     int32_t f6 = f[6];
1325     int32_t f7 = f[7];
1326     int32_t f8 = f[8];
1327     int32_t f9 = f[9];
1328     int32_t g0 = g[0];
1329     int32_t g1 = g[1];
1330     int32_t g2 = g[2];
1331     int32_t g3 = g[3];
1332     int32_t g4 = g[4];
1333     int32_t g5 = g[5];
1334     int32_t g6 = g[6];
1335     int32_t g7 = g[7];
1336     int32_t g8 = g[8];
1337     int32_t g9 = g[9];
1338     int32_t h0 = f0 - g0;
1339     int32_t h1 = f1 - g1;
1340     int32_t h2 = f2 - g2;
1341     int32_t h3 = f3 - g3;
1342     int32_t h4 = f4 - g4;
1343     int32_t h5 = f5 - g5;
1344     int32_t h6 = f6 - g6;
1345     int32_t h7 = f7 - g7;
1346     int32_t h8 = f8 - g8;
1347     int32_t h9 = f9 - g9;
1348
1349     h[0] = h0;
1350     h[1] = h1;
1351     h[2] = h2;
1352     h[3] = h3;
1353     h[4] = h4;
1354     h[5] = h5;
1355     h[6] = h6;
1356     h[7] = h7;
1357     h[8] = h8;
1358     h[9] = h9;
1359 }
1360
1361
1362
1363 /*
1364 Preconditions:
1365   |h| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
1366
1367 Write p=2^255-19; q=floor(h/p).
1368 Basic claim: q = floor(2^(-255)(h + 19 2^(-25)h9 + 2^(-1))).
1369
1370 Proof:
1371   Have |h|<=p so |q|<=1 so |19^2 2^(-255) q|<1/4.
1372   Also have |h-2^230 h9|<2^231 so |19 2^(-255)(h-2^230 h9)|<1/4.
1373
1374   Write y=2^(-1)-19^2 2^(-255)q-19 2^(-255)(h-2^230 h9).
1375   Then 0<y<1.
1376
1377   Write r=h-pq.
1378   Have 0<=r<=p-1=2^255-20.
1379   Thus 0<=r+19(2^-255)r<r+19(2^-255)2^255<=2^255-1.
1380
1381   Write x=r+19(2^-255)r+y.
1382   Then 0<x<2^255 so floor(2^(-255)x) = 0 so floor(q+2^(-255)x) = q.
1383
1384   Have q+2^(-255)x = 2^(-255)(h + 19 2^(-25) h9 + 2^(-1))
1385   so floor(2^(-255)(h + 19 2^(-25) h9 + 2^(-1))) = q.
1386 */
1387
1388 void fe_tobytes(unsigned char *s, const fe h) {
1389     int32_t h0 = h[0];
1390     int32_t h1 = h[1];
1391     int32_t h2 = h[2];
1392     int32_t h3 = h[3];
1393     int32_t h4 = h[4];
1394     int32_t h5 = h[5];
1395     int32_t h6 = h[6];
1396     int32_t h7 = h[7];
1397     int32_t h8 = h[8];
1398     int32_t h9 = h[9];
1399     int32_t q;
1400     int32_t carry0;
1401     int32_t carry1;
1402     int32_t carry2;
1403     int32_t carry3;
1404     int32_t carry4;
1405     int32_t carry5;
1406     int32_t carry6;
1407     int32_t carry7;
1408     int32_t carry8;
1409     int32_t carry9;
1410     q = (19 * h9 + (((int32_t) 1) << 24)) >> 25;
1411     q = (h0 + q) >> 26;
1412     q = (h1 + q) >> 25;
1413     q = (h2 + q) >> 26;
1414     q = (h3 + q) >> 25;
1415     q = (h4 + q) >> 26;
1416     q = (h5 + q) >> 25;
1417     q = (h6 + q) >> 26;
1418     q = (h7 + q) >> 25;
1419     q = (h8 + q) >> 26;
1420     q = (h9 + q) >> 25;
1421     /* Goal: Output h-(2^255-19)q, which is between 0 and 2^255-20. */
1422     h0 += 19 * q;
1423     /* Goal: Output h-2^255 q, which is between 0 and 2^255-20. */
1424     carry0 = h0 >> 26;
1425     h1 += carry0;
1426     h0 -= carry0 << 26;
1427     carry1 = h1 >> 25;
1428     h2 += carry1;
1429     h1 -= carry1 << 25;
1430     carry2 = h2 >> 26;
1431     h3 += carry2;
1432     h2 -= carry2 << 26;
1433     carry3 = h3 >> 25;
1434     h4 += carry3;
1435     h3 -= carry3 << 25;
1436     carry4 = h4 >> 26;
1437     h5 += carry4;
1438     h4 -= carry4 << 26;
1439     carry5 = h5 >> 25;
1440     h6 += carry5;
1441     h5 -= carry5 << 25;
1442     carry6 = h6 >> 26;
1443     h7 += carry6;
1444     h6 -= carry6 << 26;
1445     carry7 = h7 >> 25;
1446     h8 += carry7;
1447     h7 -= carry7 << 25;
1448     carry8 = h8 >> 26;
1449     h9 += carry8;
1450     h8 -= carry8 << 26;
1451     carry9 = h9 >> 25;
1452     h9 -= carry9 << 25;
1453
1454     /* h10 = carry9 */
1455     /*
1456     Goal: Output h0+...+2^255 h10-2^255 q, which is between 0 and 2^255-20.
1457     Have h0+...+2^230 h9 between 0 and 2^255-1;
1458     evidently 2^255 h10-2^255 q = 0.
1459     Goal: Output h0+...+2^230 h9.
1460     */
1461     s[0] = (unsigned char) (h0 >> 0);
1462     s[1] = (unsigned char) (h0 >> 8);
1463     s[2] = (unsigned char) (h0 >> 16);
1464     s[3] = (unsigned char) ((h0 >> 24) | (h1 << 2));
1465     s[4] = (unsigned char) (h1 >> 6);
1466     s[5] = (unsigned char) (h1 >> 14);
1467     s[6] = (unsigned char) ((h1 >> 22) | (h2 << 3));
1468     s[7] = (unsigned char) (h2 >> 5);
1469     s[8] = (unsigned char) (h2 >> 13);
1470     s[9] = (unsigned char) ((h2 >> 21) | (h3 << 5));
1471     s[10] = (unsigned char) (h3 >> 3);
1472     s[11] = (unsigned char) (h3 >> 11);
1473     s[12] = (unsigned char) ((h3 >> 19) | (h4 << 6));
1474     s[13] = (unsigned char) (h4 >> 2);
1475     s[14] = (unsigned char) (h4 >> 10);
1476     s[15] = (unsigned char) (h4 >> 18);
1477     s[16] = (unsigned char) (h5 >> 0);
1478     s[17] = (unsigned char) (h5 >> 8);
1479     s[18] = (unsigned char) (h5 >> 16);
1480     s[19] = (unsigned char) ((h5 >> 24) | (h6 << 1));
1481     s[20] = (unsigned char) (h6 >> 7);
1482     s[21] = (unsigned char) (h6 >> 15);
1483     s[22] = (unsigned char) ((h6 >> 23) | (h7 << 3));
1484     s[23] = (unsigned char) (h7 >> 5);
1485     s[24] = (unsigned char) (h7 >> 13);
1486     s[25] = (unsigned char) ((h7 >> 21) | (h8 << 4));
1487     s[26] = (unsigned char) (h8 >> 4);
1488     s[27] = (unsigned char) (h8 >> 12);
1489     s[28] = (unsigned char) ((h8 >> 20) | (h9 << 6));
1490     s[29] = (unsigned char) (h9 >> 2);
1491     s[30] = (unsigned char) (h9 >> 10);
1492     s[31] = (unsigned char) (h9 >> 18);
1493 }