/*
 * Tiny arbitrary precision floating point library
 *
 * Copyright (c) 2017-2021 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include <stdlib.h>
#include <stdio.h>
#include <inttypes.h>
#include <math.h>
#include <string.h>
#include <assert.h>

#ifdef __AVX2__
#include <immintrin.h>
#endif

#include "cutils.h"
#include "libbf.h"

/* enable it to check the multiplication result */
//#define USE_MUL_CHECK
#ifdef CONFIG_BIGNUM
/* enable it to use FFT/NTT multiplication */
#define USE_FFT_MUL
/* enable decimal floating point support */
#define USE_BF_DEC
#endif

//#define inline __attribute__((always_inline))

#ifdef __AVX2__
#define FFT_MUL_THRESHOLD 100 /* in limbs of the smallest factor */
#else
#define FFT_MUL_THRESHOLD 100 /* in limbs of the smallest factor */
#endif

/* XXX: adjust */
#define DIVNORM_LARGE_THRESHOLD 50
#define UDIV1NORM_THRESHOLD 3

#if LIMB_BITS == 64
#define FMT_LIMB1 "%" PRIx64
#define FMT_LIMB "%016" PRIx64
#define PRId_LIMB PRId64
#define PRIu_LIMB PRIu64

#else

#define FMT_LIMB1 "%x"
#define FMT_LIMB "%08x"
#define PRId_LIMB "d"
#define PRIu_LIMB "u"

#endif

typedef intptr_t mp_size_t;

typedef int bf_op2_func_t(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec,
                          bf_flags_t flags);

#ifdef USE_FFT_MUL

#define FFT_MUL_R_OVERLAP_A (1 << 0)
#define FFT_MUL_R_OVERLAP_B (1 << 1)
#define FFT_MUL_R_NORESIZE  (1 << 2)

static no_inline int fft_mul(bf_context_t *s,
                             bf_t *res, limb_t *a_tab, limb_t a_len,
                             limb_t *b_tab, limb_t b_len, int mul_flags);
static void fft_clear_cache(bf_context_t *s);
#endif
#ifdef USE_BF_DEC
static limb_t get_digit(const limb_t *tab, limb_t len, slimb_t pos);
#endif


/* count leading zeros */
static inline int clz(limb_t a)
{
    if (a == 0) {
        return LIMB_BITS;
    } else {
#if LIMB_BITS == 64
        return clz64(a);
#else
        return clz32(a);
#endif
    }
}

static inline int ctz(limb_t a)
{
    if (a == 0) {
        return LIMB_BITS;
    } else {
#if LIMB_BITS == 64
        return ctz64(a);
#else
        return ctz32(a);
#endif
    }
}

static inline int ceil_log2(limb_t a)
{
    if (a <= 1)
        return 0;
    else
        return LIMB_BITS - clz(a - 1);
}
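
/* Worked examples for the integer helpers below (illustrative only,
   not from the original source): with b = 4, ceil_div rounds the
   quotient towards +infinity for a >= 0 and truncates for a < 0,
   which is the correct ceiling in both cases:
     ceil_div(9, 4)  = 3   (ceil(2.25) = 3)
     ceil_div(-9, 4) = -2  (ceil(-2.25) = -2)
   floor_div and smod round towards -infinity instead:
     floor_div(-9, 4) = -3 and smod(-9, 4) = 3, so that
     a == floor_div(a, b) * b + smod(a, b) holds. */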

/* b must be >= 1 */
static inline slimb_t ceil_div(slimb_t a, slimb_t b)
{
    if (a >= 0)
        return (a + b - 1) / b;
    else
        return a / b;
}

#ifdef USE_BF_DEC
/* b must be >= 1 */
static inline slimb_t floor_div(slimb_t a, slimb_t b)
{
    if (a >= 0) {
        return a / b;
    } else {
        return (a - b + 1) / b;
    }
}
#endif

/* return r = a modulo b (0 <= r <= b - 1). b must be >= 1 */
static inline limb_t smod(slimb_t a, slimb_t b)
{
    a = a % (slimb_t)b;
    if (a < 0)
        a += b;
    return a;
}

/* signed addition with saturation */
static inline slimb_t sat_add(slimb_t a, slimb_t b)
{
    slimb_t r;
    r = a + b;
    /* overflow ? */
    if (((a ^ r) & (b ^ r)) < 0)
        r = (a >> (LIMB_BITS - 1)) ^ (((limb_t)1 << (LIMB_BITS - 1)) - 1);
    return r;
}

static inline __maybe_unused limb_t shrd(limb_t low, limb_t high, long shift)
{
    if (shift != 0)
        low = (low >> shift) | (high << (LIMB_BITS - shift));
    return low;
}

static inline __maybe_unused limb_t shld(limb_t a1, limb_t a0, long shift)
{
    if (shift != 0)
        return (a1 << shift) | (a0 >> (LIMB_BITS - shift));
    else
        return a1;
}

#define malloc(s) malloc_is_forbidden(s)
#define free(p) free_is_forbidden(p)
#define realloc(p, s) realloc_is_forbidden(p, s)

void bf_context_init(bf_context_t *s, bf_realloc_func_t *realloc_func,
                     void *realloc_opaque)
{
    memset(s, 0, sizeof(*s));
    s->realloc_func = realloc_func;
    s->realloc_opaque = realloc_opaque;
}

void bf_context_end(bf_context_t *s)
{
    bf_clear_cache(s);
}

void bf_init(bf_context_t *s, bf_t *r)
{
    r->ctx = s;
    r->sign = 0;
    r->expn = BF_EXP_ZERO;
    r->len = 0;
    r->tab = NULL;
}

/* return 0 if OK, -1 if alloc error */
int bf_resize(bf_t *r, limb_t len)
{
    limb_t *tab;

    if (len != r->len) {
        tab = bf_realloc(r->ctx, r->tab, len * sizeof(limb_t));
        if (!tab && len != 0)
            return -1;
        r->tab = tab;
        r->len = len;
    }
    return 0;
}

/* return 0 or BF_ST_MEM_ERROR */
int bf_set_ui(bf_t *r, uint64_t a)
{
    r->sign = 0;
    if (a == 0) {
        r->expn = BF_EXP_ZERO;
        bf_resize(r, 0); /* cannot fail */
    }
#if LIMB_BITS == 32
    else if (a <= 0xffffffff)
#else
    else
#endif
    {
        int shift;
        if (bf_resize(r, 1))
            goto fail;
        shift = clz(a);
        r->tab[0] = a << shift;
        r->expn = LIMB_BITS - shift;
    }
#if LIMB_BITS == 32
    else {
        uint32_t a1, a0;
        int shift;
        if (bf_resize(r, 2))
            goto fail;
        a0 = a;
        a1 = a >> 32;
        shift = clz(a1);
        r->tab[0] = a0 << shift;
        r->tab[1] = shld(a1, a0, shift);
        r->expn = 2 * LIMB_BITS - shift;
    }
#endif
    return 0;
 fail:
    bf_set_nan(r);
    return BF_ST_MEM_ERROR;
}

/* return 0 or BF_ST_MEM_ERROR */
int bf_set_si(bf_t *r, int64_t a)
{
    int ret;

    if (a < 0) {
        ret = bf_set_ui(r, -a);
        r->sign = 1;
    } else {
        ret = bf_set_ui(r, a);
    }
    return ret;
}

void bf_set_nan(bf_t *r)
{
    bf_resize(r, 0); /* cannot fail */
    r->expn = BF_EXP_NAN;
    r->sign = 0;
}

void bf_set_zero(bf_t *r, int is_neg)
{
    bf_resize(r, 0); /* cannot fail */
    r->expn = BF_EXP_ZERO;
    r->sign = is_neg;
}

void bf_set_inf(bf_t *r, int is_neg)
{
    bf_resize(r, 0); /* cannot fail */
    r->expn = BF_EXP_INF;
    r->sign = is_neg;
}
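
/* Illustrative note (not from the original source): a finite non-zero
   bf_t stores its mantissa MSB-aligned, i.e. the value is
   sign * 0.mantissa * 2^expn with the top bit of tab[len-1] set.
   For example bf_set_ui(r, 6) yields, with 64-bit limbs:
     r->tab[0] = 0xc000000000000000   (0.75 in fixed point)
     r->expn   = 3                    (0.75 * 2^3 = 6) */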

/* return 0 or BF_ST_MEM_ERROR */
int bf_set(bf_t *r, const bf_t *a)
{
    if (r == a)
        return 0;
    if (bf_resize(r, a->len)) {
        bf_set_nan(r);
        return BF_ST_MEM_ERROR;
    }
    r->sign = a->sign;
    r->expn = a->expn;
    memcpy_no_ub(r->tab, a->tab, a->len * sizeof(limb_t));
    return 0;
}

/* equivalent to bf_set(r, a); bf_delete(a) */
void bf_move(bf_t *r, bf_t *a)
{
    bf_context_t *s = r->ctx;
    if (r == a)
        return;
    bf_free(s, r->tab);
    *r = *a;
}

static limb_t get_limbz(const bf_t *a, limb_t idx)
{
    if (idx >= a->len)
        return 0;
    else
        return a->tab[idx];
}

/* get LIMB_BITS at bit position 'pos' in tab */
static inline limb_t get_bits(const limb_t *tab, limb_t len, slimb_t pos)
{
    limb_t i, a0, a1;
    int p;

    i = pos >> LIMB_LOG2_BITS;
    p = pos & (LIMB_BITS - 1);
    if (i < len)
        a0 = tab[i];
    else
        a0 = 0;
    if (p == 0) {
        return a0;
    } else {
        i++;
        if (i < len)
            a1 = tab[i];
        else
            a1 = 0;
        return (a0 >> p) | (a1 << (LIMB_BITS - p));
    }
}

static inline limb_t get_bit(const limb_t *tab, limb_t len, slimb_t pos)
{
    slimb_t i;
    i = pos >> LIMB_LOG2_BITS;
    if (i < 0 || i >= len)
        return 0;
    return (tab[i] >> (pos & (LIMB_BITS - 1))) & 1;
}

static inline limb_t limb_mask(int start, int last)
{
    limb_t v;
    int n;
    n = last - start + 1;
    if (n == LIMB_BITS)
        v = -1;
    else
        v = (((limb_t)1 << n) - 1) << start;
    return v;
}

static limb_t mp_scan_nz(const limb_t *tab, mp_size_t n)
{
    mp_size_t i;
    for(i = 0; i < n; i++) {
        if (tab[i] != 0)
            return 1;
    }
    return 0;
}

/* return != 0 if one bit between 0 and bit_pos inclusive is not zero. */
static inline limb_t scan_bit_nz(const bf_t *r, slimb_t bit_pos)
{
    slimb_t pos;
    limb_t v;

    pos = bit_pos >> LIMB_LOG2_BITS;
    if (pos < 0)
        return 0;
    v = r->tab[pos] & limb_mask(0, bit_pos & (LIMB_BITS - 1));
    if (v != 0)
        return 1;
    pos--;
    while (pos >= 0) {
        if (r->tab[pos] != 0)
            return 1;
        pos--;
    }
    return 0;
}
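
/* Worked example for the rounding helper below (illustrative only):
   rounding the 4-bit mantissa 0b1011 to prec = 2 bits with BF_RNDN:
   bit1 (the first discarded bit) is 1 and bit0 (the OR of all bits
   after it) is 1, so add_one = 1 and the result is 0b11. For 0b1010,
   bit1 = 1 but bit0 = 0: the tie is broken to even by looking at the
   last kept bit (0), so add_one = 0 and the result is 0b10. */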

/* return the addend for rounding. Note that prec can be <= 0 (for
   BF_FLAG_RADPNT_PREC) */
static int bf_get_rnd_add(int *pret, const bf_t *r, limb_t l,
                          slimb_t prec, int rnd_mode)
{
    int add_one, inexact;
    limb_t bit1, bit0;

    if (rnd_mode == BF_RNDF) {
        bit0 = 1; /* faithful rounding does not honor the INEXACT flag */
    } else {
        /* starting limb for bit 'prec + 1' */
        bit0 = scan_bit_nz(r, l * LIMB_BITS - 1 - bf_max(0, prec + 1));
    }

    /* get the bit at 'prec' */
    bit1 = get_bit(r->tab, l, l * LIMB_BITS - 1 - prec);
    inexact = (bit1 | bit0) != 0;

    add_one = 0;
    switch(rnd_mode) {
    case BF_RNDZ:
        break;
    case BF_RNDN:
        if (bit1) {
            if (bit0) {
                add_one = 1;
            } else {
                /* round to even */
                add_one =
                    get_bit(r->tab, l, l * LIMB_BITS - 1 - (prec - 1));
            }
        }
        break;
    case BF_RNDD:
    case BF_RNDU:
        if (r->sign == (rnd_mode == BF_RNDD))
            add_one = inexact;
        break;
    case BF_RNDA:
        add_one = inexact;
        break;
    case BF_RNDNA:
    case BF_RNDF:
        add_one = bit1;
        break;
    default:
        abort();
    }

    if (inexact)
        *pret |= BF_ST_INEXACT;
    return add_one;
}

static int bf_set_overflow(bf_t *r, int sign, limb_t prec, bf_flags_t flags)
{
    slimb_t i, l, e_max;
    int rnd_mode;

    rnd_mode = flags & BF_RND_MASK;
    if (prec == BF_PREC_INF ||
        rnd_mode == BF_RNDN ||
        rnd_mode == BF_RNDNA ||
        rnd_mode == BF_RNDA ||
        (rnd_mode == BF_RNDD && sign == 1) ||
        (rnd_mode == BF_RNDU && sign == 0)) {
        bf_set_inf(r, sign);
    } else {
        /* set to maximum finite number */
        l = (prec + LIMB_BITS - 1) / LIMB_BITS;
        if (bf_resize(r, l)) {
            bf_set_nan(r);
            return BF_ST_MEM_ERROR;
        }
        r->tab[0] = limb_mask((-prec) & (LIMB_BITS - 1),
                              LIMB_BITS - 1);
        for(i = 1; i < l; i++)
            r->tab[i] = (limb_t)-1;
        e_max = (limb_t)1 << (bf_get_exp_bits(flags) - 1);
        r->expn = e_max;
        r->sign = sign;
    }
    return BF_ST_OVERFLOW | BF_ST_INEXACT;
}

/* round to prec1 bits assuming 'r' is non zero and finite. 'r' is
   assumed to have length 'l' (1 <= l <= r->len). Note: 'prec1' can be
   infinite (BF_PREC_INF). 'ret' is 0 or BF_ST_INEXACT if the result
   is known to be inexact. Can fail with BF_ST_MEM_ERROR in case of
   overflow not returning infinity. */
static int __bf_round(bf_t *r, limb_t prec1, bf_flags_t flags, limb_t l,
                      int ret)
{
    limb_t v, a;
    int shift, add_one, rnd_mode;
    slimb_t i, bit_pos, pos, e_min, e_max, e_range, prec;

    /* e_min and e_max are computed to match the IEEE 754 conventions */
    e_range = (limb_t)1 << (bf_get_exp_bits(flags) - 1);
    e_min = -e_range + 3;
    e_max = e_range;

    if (flags & BF_FLAG_RADPNT_PREC) {
        /* 'prec' is the precision after the radix point */
        if (prec1 != BF_PREC_INF)
            prec = r->expn + prec1;
        else
            prec = prec1;
    } else if (unlikely(r->expn < e_min) && (flags & BF_FLAG_SUBNORMAL)) {
        /* restrict the precision in case of potentially subnormal
           result */
        assert(prec1 != BF_PREC_INF);
        prec = prec1 - (e_min - r->expn);
    } else {
        prec = prec1;
    }

    /* round to prec bits */
    rnd_mode = flags & BF_RND_MASK;
    add_one = bf_get_rnd_add(&ret, r, l, prec, rnd_mode);

    if (prec <= 0) {
        if (add_one) {
            bf_resize(r, 1); /* cannot fail */
            r->tab[0] = (limb_t)1 << (LIMB_BITS - 1);
            r->expn += 1 - prec;
            ret |= BF_ST_UNDERFLOW | BF_ST_INEXACT;
            return ret;
        } else {
            goto underflow;
        }
    } else if (add_one) {
        limb_t carry;

        /* add one starting at digit 'prec - 1' */
        bit_pos = l * LIMB_BITS - 1 - (prec - 1);
        pos = bit_pos >> LIMB_LOG2_BITS;
        carry = (limb_t)1 << (bit_pos & (LIMB_BITS - 1));

        for(i = pos; i < l; i++) {
            v = r->tab[i] + carry;
            carry = (v < carry);
            r->tab[i] = v;
            if (carry == 0)
                break;
        }
        if (carry) {
            /* shift right by one digit */
            v = 1;
            for(i = l - 1; i >= pos; i--) {
                a = r->tab[i];
                r->tab[i] = (a >> 1) | (v << (LIMB_BITS - 1));
                v = a;
            }
            r->expn++;
        }
    }

    /* check underflow */
    if (unlikely(r->expn < e_min)) {
        if (flags & BF_FLAG_SUBNORMAL) {
            /* if inexact, also set the underflow flag */
            if (ret & BF_ST_INEXACT)
                ret |= BF_ST_UNDERFLOW;
        } else {
        underflow:
            ret |= BF_ST_UNDERFLOW | BF_ST_INEXACT;
            bf_set_zero(r, r->sign);
            return ret;
        }
    }

    /* check overflow */
    if (unlikely(r->expn > e_max))
        return bf_set_overflow(r, r->sign, prec1, flags);

    /* keep the bits starting at 'prec - 1' */
    bit_pos = l * LIMB_BITS - 1 - (prec - 1);
    i = bit_pos >> LIMB_LOG2_BITS;
    if (i >= 0) {
        shift = bit_pos & (LIMB_BITS - 1);
        if (shift != 0)
            r->tab[i] &= limb_mask(shift, LIMB_BITS - 1);
    } else {
        i = 0;
    }
    /* remove trailing zeros */
    while (r->tab[i] == 0)
        i++;
    if (i > 0) {
        l -= i;
        memmove(r->tab, r->tab + i, l * sizeof(limb_t));
    }
    bf_resize(r, l); /* cannot fail */
    return ret;
}
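
/* Illustrative note (not from the original source): the arithmetic
   operations in this file first build a raw, possibly unnormalized
   limb array in 'r' (leading zero limbs, MSB of the top limb not
   necessarily set), then call bf_normalize_and_round() below, which
   strips leading zero limbs, shifts the mantissa left so that the MSB
   is set, adjusts expn accordingly and finally applies __bf_round(). */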

/* 'r' must be a finite number. */
int bf_normalize_and_round(bf_t *r, limb_t prec1, bf_flags_t flags)
{
    limb_t l, v, a;
    int shift, ret;
    slimb_t i;

    //    bf_print_str("bf_renorm", r);
    l = r->len;
    while (l > 0 && r->tab[l - 1] == 0)
        l--;
    if (l == 0) {
        /* zero */
        r->expn = BF_EXP_ZERO;
        bf_resize(r, 0); /* cannot fail */
        ret = 0;
    } else {
        r->expn -= (r->len - l) * LIMB_BITS;
        /* shift to have the MSB set to '1' */
        v = r->tab[l - 1];
        shift = clz(v);
        if (shift != 0) {
            v = 0;
            for(i = 0; i < l; i++) {
                a = r->tab[i];
                r->tab[i] = (a << shift) | (v >> (LIMB_BITS - shift));
                v = a;
            }
            r->expn -= shift;
        }
        ret = __bf_round(r, prec1, flags, l, 0);
    }
    //    bf_print_str("r_final", r);
    return ret;
}

/* return true if rounding can be done at precision 'prec' assuming
   the exact result r is such that |r-a| <= 2^(EXP(a)-k). */
/* XXX: check the case where the exponent would be incremented by the
   rounding */
int bf_can_round(const bf_t *a, slimb_t prec, bf_rnd_t rnd_mode, slimb_t k)
{
    BOOL is_rndn;
    slimb_t bit_pos, n;
    limb_t bit;

    if (a->expn == BF_EXP_INF || a->expn == BF_EXP_NAN)
        return FALSE;
    if (rnd_mode == BF_RNDF) {
        return (k >= (prec + 1));
    }
    if (a->expn == BF_EXP_ZERO)
        return FALSE;
    is_rndn = (rnd_mode == BF_RNDN || rnd_mode == BF_RNDNA);
    if (k < (prec + 2))
        return FALSE;
    bit_pos = a->len * LIMB_BITS - 1 - prec;
    n = k - prec;
    /* bit pattern for RNDN or RNDNA: 0111.. or 1000...
       for other rounding modes: 000... or 111...
    */
    bit = get_bit(a->tab, a->len, bit_pos);
    bit_pos--;
    n--;
    bit ^= is_rndn;
    /* XXX: slow, but a few iterations on average */
    while (n != 0) {
        if (get_bit(a->tab, a->len, bit_pos) != bit)
            return TRUE;
        bit_pos--;
        n--;
    }
    return FALSE;
}

/* Cannot fail with BF_ST_MEM_ERROR. */
int bf_round(bf_t *r, limb_t prec, bf_flags_t flags)
{
    if (r->len == 0)
        return 0;
    return __bf_round(r, prec, flags, r->len, 0);
}

/* for debugging */
static __maybe_unused void dump_limbs(const char *str, const limb_t *tab, limb_t n)
{
    limb_t i;
    printf("%s: len=%" PRId_LIMB "\n", str, n);
    for(i = 0; i < n; i++) {
        printf("%" PRId_LIMB ": " FMT_LIMB "\n",
               i, tab[i]);
    }
}

void mp_print_str(const char *str, const limb_t *tab, limb_t n)
{
    slimb_t i;
    printf("%s= 0x", str);
    for(i = n - 1; i >= 0; i--) {
        if (i != (n - 1))
            printf("_");
        printf(FMT_LIMB, tab[i]);
    }
    printf("\n");
}

static __maybe_unused void mp_print_str_h(const char *str,
                                          const limb_t *tab, limb_t n,
                                          limb_t high)
{
    slimb_t i;
    printf("%s= 0x", str);
    printf(FMT_LIMB, high);
    for(i = n - 1; i >= 0; i--) {
        printf("_");
        printf(FMT_LIMB, tab[i]);
    }
    printf("\n");
}

/* for debugging */
void bf_print_str(const char *str, const bf_t *a)
{
    slimb_t i;
    printf("%s=", str);

    if (a->expn == BF_EXP_NAN) {
        printf("NaN");
    } else {
        if (a->sign)
            putchar('-');
        if (a->expn == BF_EXP_ZERO) {
            putchar('0');
        } else if (a->expn == BF_EXP_INF) {
            printf("Inf");
        } else {
            printf("0x0.");
            for(i = a->len - 1; i >= 0; i--)
                printf(FMT_LIMB, a->tab[i]);
            printf("p%" PRId_LIMB, a->expn);
        }
    }
    printf("\n");
}
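
/* Illustrative usage sketch (not part of the original source): with
   64-bit limbs, printing the value 6 set up as in the representation
   note above gives the mantissa/exponent form used by bf_print_str():

     bf_t x;
     bf_init(ctx, &x);        // 'ctx' is an initialized bf_context_t
     bf_set_ui(&x, 6);
     bf_print_str("x", &x);   // prints: x=0x0.c000000000000000p3
     bf_delete(&x);
*/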

/* compare the absolute value of 'a' and 'b'. Return < 0 if a < b, 0
   if a = b and > 0 otherwise. */
int bf_cmpu(const bf_t *a, const bf_t *b)
{
    slimb_t i;
    limb_t len, v1, v2;

    if (a->expn != b->expn) {
        if (a->expn < b->expn)
            return -1;
        else
            return 1;
    }
    len = bf_max(a->len, b->len);
    for(i = len - 1; i >= 0; i--) {
        v1 = get_limbz(a, a->len - len + i);
        v2 = get_limbz(b, b->len - len + i);
        if (v1 != v2) {
            if (v1 < v2)
                return -1;
            else
                return 1;
        }
    }
    return 0;
}

/* Full order: -0 < 0, NaN == NaN and NaN is larger than all other numbers */
int bf_cmp_full(const bf_t *a, const bf_t *b)
{
    int res;

    if (a->expn == BF_EXP_NAN || b->expn == BF_EXP_NAN) {
        if (a->expn == b->expn)
            res = 0;
        else if (a->expn == BF_EXP_NAN)
            res = 1;
        else
            res = -1;
    } else if (a->sign != b->sign) {
        res = 1 - 2 * a->sign;
    } else {
        res = bf_cmpu(a, b);
        if (a->sign)
            res = -res;
    }
    return res;
}

/* Standard floating point comparison: return 2 if one of the operands
   is NaN (unordered) or -1, 0, 1 depending on the ordering assuming
   -0 == +0 */
int bf_cmp(const bf_t *a, const bf_t *b)
{
    int res;

    if (a->expn == BF_EXP_NAN || b->expn == BF_EXP_NAN) {
        res = 2;
    } else if (a->sign != b->sign) {
        if (a->expn == BF_EXP_ZERO && b->expn == BF_EXP_ZERO)
            res = 0;
        else
            res = 1 - 2 * a->sign;
    } else {
        res = bf_cmpu(a, b);
        if (a->sign)
            res = -res;
    }
    return res;
}

/* Compute the number of bits 'n' matching the pattern:
   a= X1000..0
   b= X0111..1

   When computing a-b, the result will have at least n leading zero
   bits.

   Precondition: a > b and a.expn - b.expn = 0 or 1
*/
static limb_t count_cancelled_bits(const bf_t *a, const bf_t *b)
{
    slimb_t bit_offset, b_offset, n;
    int p, p1;
    limb_t v1, v2, mask;

    bit_offset = a->len * LIMB_BITS - 1;
    b_offset = (b->len - a->len) * LIMB_BITS - (LIMB_BITS - 1) +
        a->expn - b->expn;
    n = 0;

    /* first search the equal bits */
    for(;;) {
        v1 = get_limbz(a, bit_offset >> LIMB_LOG2_BITS);
        v2 = get_bits(b->tab, b->len, bit_offset + b_offset);
        //        printf("v1=" FMT_LIMB " v2=" FMT_LIMB "\n", v1, v2);
        if (v1 != v2)
            break;
        n += LIMB_BITS;
        bit_offset -= LIMB_BITS;
    }
    /* find the position of the first different bit */
    p = clz(v1 ^ v2) + 1;
    n += p;
    /* then search for '0' in a and '1' in b */
    p = LIMB_BITS - p;
    if (p > 0) {
        /* search in the trailing p bits of v1 and v2 */
        mask = limb_mask(0, p - 1);
        p1 = bf_min(clz(v1 & mask), clz((~v2) & mask)) - (LIMB_BITS - p);
        n += p1;
        if (p1 != p)
            goto done;
    }
    bit_offset -= LIMB_BITS;
    for(;;) {
        v1 = get_limbz(a, bit_offset >> LIMB_LOG2_BITS);
        v2 = get_bits(b->tab, b->len, bit_offset + b_offset);
        //        printf("v1=" FMT_LIMB " v2=" FMT_LIMB "\n", v1, v2);
        if (v1 != 0 || v2 != -1) {
            /* different: count the matching bits */
            p1 = bf_min(clz(v1), clz(~v2));
            n += p1;
            break;
        }
        n += LIMB_BITS;
        bit_offset -= LIMB_BITS;
    }
 done:
    return n;
}
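
/* Worked example for count_cancelled_bits() (illustrative only): with
   a = 0b1.0 (expn = 1) and b = 0b0.11111111 (expn = 0), the leading
   bits follow the X1000/X0111 pattern, so the function reports that
   a - b (= 2^-8) has at least 8 leading zero bits. bf_add_internal()
   below uses this count to widen the working precision so that a
   catastrophic cancellation does not silently discard significant
   bits of the result. */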

static int bf_add_internal(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec,
                           bf_flags_t flags, int b_neg)
{
    const bf_t *tmp;
    int is_sub, ret, cmp_res, a_sign, b_sign;

    a_sign = a->sign;
    b_sign = b->sign ^ b_neg;
    is_sub = a_sign ^ b_sign;
    cmp_res = bf_cmpu(a, b);
    if (cmp_res < 0) {
        tmp = a;
        a = b;
        b = tmp;
        a_sign = b_sign; /* b_sign is never used later */
    }
    /* abs(a) >= abs(b) */
    if (cmp_res == 0 && is_sub && a->expn < BF_EXP_INF) {
        /* zero result */
        bf_set_zero(r, (flags & BF_RND_MASK) == BF_RNDD);
        ret = 0;
    } else if (a->len == 0 || b->len == 0) {
        ret = 0;
        if (a->expn >= BF_EXP_INF) {
            if (a->expn == BF_EXP_NAN) {
                /* at least one operand is NaN */
                bf_set_nan(r);
            } else if (b->expn == BF_EXP_INF && is_sub) {
                /* infinities with different signs */
                bf_set_nan(r);
                ret = BF_ST_INVALID_OP;
            } else {
                bf_set_inf(r, a_sign);
            }
        } else {
            /* at least one operand is zero and it is not a subtraction
               of equal magnitudes */
            bf_set(r, a);
            r->sign = a_sign;
            goto renorm;
        }
    } else {
        slimb_t d, a_offset, b_bit_offset, i, cancelled_bits;
        limb_t carry, v1, v2, u, r_len, carry1, precl, tot_len, z, sub_mask;

        r->sign = a_sign;
        r->expn = a->expn;
        d = a->expn - b->expn;
        /* must add more precision for the leading cancelled bits in
           subtraction */
        if (is_sub) {
            if (d <= 1)
                cancelled_bits = count_cancelled_bits(a, b);
            else
                cancelled_bits = 1;
        } else {
            cancelled_bits = 0;
        }

        /* add two extra bits for rounding */
        precl = (cancelled_bits + prec + 2 + LIMB_BITS - 1) / LIMB_BITS;
        tot_len = bf_max(a->len, b->len + (d + LIMB_BITS - 1) / LIMB_BITS);
        r_len = bf_min(precl, tot_len);
        if (bf_resize(r, r_len))
            goto fail;
        a_offset = a->len - r_len;
        b_bit_offset = (b->len - r_len) * LIMB_BITS + d;

        /* compute the bits below the kept ones, for the rounding */
        carry = is_sub;
        z = 0;
        sub_mask = -is_sub;
        i = r_len - tot_len;
        while (i < 0) {
            slimb_t ap, bp;
            BOOL inflag;

            ap = a_offset + i;
            bp = b_bit_offset + i * LIMB_BITS;
            inflag = FALSE;
            if (ap >= 0 && ap < a->len) {
                v1 = a->tab[ap];
                inflag = TRUE;
            } else {
                v1 = 0;
            }
            if (bp + LIMB_BITS > 0 && bp < (slimb_t)(b->len * LIMB_BITS)) {
                v2 = get_bits(b->tab, b->len, bp);
                inflag = TRUE;
            } else {
                v2 = 0;
            }
            if (!inflag) {
                /* outside 'a' and 'b': go directly to the next value
                   inside a or b so that the running time does not
                   depend on the exponent difference */
                i = 0;
                if (ap < 0)
                    i = bf_min(i, -a_offset);
                /* b_bit_offset + i * LIMB_BITS + LIMB_BITS >= 1
                   equivalent to
                   i >= ceil(-b_bit_offset + 1 - LIMB_BITS) / LIMB_BITS)
                */
                if (bp + LIMB_BITS <= 0)
                    i = bf_min(i, (-b_bit_offset) >> LIMB_LOG2_BITS);
            } else {
                i++;
            }
            v2 ^= sub_mask;
            u = v1 + v2;
            carry1 = u < v1;
            u += carry;
            carry = (u < carry) | carry1;
            z |= u;
        }
        /* and the result */
        for(i = 0; i < r_len; i++) {
            v1 = get_limbz(a, a_offset + i);
            v2 = get_bits(b->tab, b->len, b_bit_offset + i * LIMB_BITS);
            v2 ^= sub_mask;
            u = v1 + v2;
            carry1 = u < v1;
            u += carry;
            carry = (u < carry) | carry1;
            r->tab[i] = u;
        }
        /* set the extra bits for the rounding */
        r->tab[0] |= (z != 0);

        /* carry is only possible in add case */
        if (!is_sub && carry) {
            if (bf_resize(r, r_len + 1))
                goto fail;
            r->tab[r_len] = 1;
            r->expn += LIMB_BITS;
        }
    renorm:
        ret = bf_normalize_and_round(r, prec, flags);
    }
    return ret;
 fail:
    bf_set_nan(r);
    return BF_ST_MEM_ERROR;
}
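
/* Implementation note (illustrative, not from the original source):
   in bf_add_internal() the subtraction path reuses the addition loop
   unchanged. With sub_mask = -is_sub, each limb of 'b' is XORed with
   an all-ones mask and the initial carry is set to 1, so the loop
   computes a + ~b + 1 = a - b in two's complement, limb by limb. */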

static int __bf_add(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec,
                    bf_flags_t flags)
{
    return bf_add_internal(r, a, b, prec, flags, 0);
}

static int __bf_sub(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec,
                    bf_flags_t flags)
{
    return bf_add_internal(r, a, b, prec, flags, 1);
}

limb_t mp_add(limb_t *res, const limb_t *op1, const limb_t *op2,
              limb_t n, limb_t carry)
{
    slimb_t i;
    limb_t k, a, v, k1;

    k = carry;
    for(i=0;i<n;i++) {
        v = op1[i];
        a = v + op2[i];
        k1 = a < v;
        a = a + k;
        k = (a < k) | k1;
        res[i] = a;
    }
    return k;
}

limb_t mp_add_ui(limb_t *tab, limb_t b, size_t n)
{
    size_t i;
    limb_t k, a;

    k=b;
    for(i=0;i<n;i++) {
        if (k == 0)
            break;
        a = tab[i] + k;
        k = (a < k);
        tab[i] = a;
    }
    return k;
}

limb_t mp_sub(limb_t *res, const limb_t *op1, const limb_t *op2,
              mp_size_t n, limb_t carry)
{
    int i;
    limb_t k, a, v, k1;

    k = carry;
    for(i=0;i<n;i++) {
        v = op1[i];
        a = v - op2[i];
        k1 = a > v;
        v = a - k;
        k = (v > a) | k1;
        res[i] = v;
    }
    return k;
}

/* compute 0 - op2 */
static limb_t mp_neg(limb_t *res, const limb_t *op2, mp_size_t n, limb_t carry)
{
    int i;
    limb_t k, a, v, k1;

    k = carry;
    for(i=0;i<n;i++) {
        v = 0;
        a = v - op2[i];
        k1 = a > v;
        v = a - k;
        k = (v > a) | k1;
        res[i] = v;
    }
    return k;
}

limb_t mp_sub_ui(limb_t *tab, limb_t b, mp_size_t n)
{
    mp_size_t i;
    limb_t k, a, v;

    k=b;
    for(i=0;i<n;i++) {
        v = tab[i];
        a = v - k;
        k = a > v;
        tab[i] = a;
        if (k == 0)
            break;
    }
    return k;
}

/* r = (a + high*B^n) >> shift. Return the remainder r (0 <= r < 2^shift).
   1 <= shift <= LIMB_BITS - 1 */
static limb_t mp_shr(limb_t *tab_r, const limb_t *tab, mp_size_t n,
                     int shift, limb_t high)
{
    mp_size_t i;
    limb_t l, a;

    assert(shift >= 1 && shift < LIMB_BITS);
    l = high;
    for(i = n - 1; i >= 0; i--) {
        a = tab[i];
        tab_r[i] = (a >> shift) | (l << (LIMB_BITS - shift));
        l = a;
    }
    return l & (((limb_t)1 << shift) - 1);
}

/* tabr[] = taba[] * b + l. Return the high carry */
static limb_t mp_mul1(limb_t *tabr, const limb_t *taba, limb_t n,
                      limb_t b, limb_t l)
{
    limb_t i;
    dlimb_t t;

    for(i = 0; i < n; i++) {
        t = (dlimb_t)taba[i] * (dlimb_t)b + l;
        tabr[i] = t;
        l = t >> LIMB_BITS;
    }
    return l;
}

/* tabr[] += taba[] * b, return the high word. */
static limb_t mp_add_mul1(limb_t *tabr, const limb_t *taba, limb_t n,
                          limb_t b)
{
    limb_t i, l;
    dlimb_t t;

    l = 0;
    for(i = 0; i < n; i++) {
        t = (dlimb_t)taba[i] * (dlimb_t)b + l + tabr[i];
        tabr[i] = t;
        l = t >> LIMB_BITS;
    }
    return l;
}
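
/* Illustrative note (not from the original source): mp_mul_basecase()
   below is the schoolbook O(n*m) algorithm built from the two
   single-limb primitives above. In base B = 2^LIMB_BITS, e.g.
   (a1*B + a0) * (b1*B + b0) is accumulated row by row: the first row
   a * b0 is written with mp_mul1(), then each following row a * bi is
   added in place, shifted by i limbs, with mp_add_mul1(). */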

/* size of the result: op1_size + op2_size limbs. */
static void mp_mul_basecase(limb_t *result,
                            const limb_t *op1, limb_t op1_size,
                            const limb_t *op2, limb_t op2_size)
{
    limb_t i, r;

    result[op1_size] = mp_mul1(result, op1, op1_size, op2[0], 0);
    for(i=1;i<op2_size;i++) {
        r = mp_add_mul1(result + i, op1, op1_size, op2[i]);
        result[i + op1_size] = r;
    }
}

/* return 0 if OK, -1 if memory error */
/* XXX: change API so that result can be allocated */
int mp_mul(bf_context_t *s, limb_t *result,
           const limb_t *op1, limb_t op1_size,
           const limb_t *op2, limb_t op2_size)
{
#ifdef USE_FFT_MUL
    if (unlikely(bf_min(op1_size, op2_size) >= FFT_MUL_THRESHOLD)) {
        bf_t r_s, *r = &r_s;
        r->tab = result;
        /* XXX: optimize memory usage in API */
        if (fft_mul(s, r, (limb_t *)op1, op1_size,
                    (limb_t *)op2, op2_size, FFT_MUL_R_NORESIZE))
            return -1;
    } else
#endif
    {
        mp_mul_basecase(result, op1, op1_size, op2, op2_size);
    }
    return 0;
}

/* tabr[] -= taba[] * b. Return the value to subtract from the high
   word. */
static limb_t mp_sub_mul1(limb_t *tabr, const limb_t *taba, limb_t n,
                          limb_t b)
{
    limb_t i, l;
    dlimb_t t;

    l = 0;
    for(i = 0; i < n; i++) {
        t = tabr[i] - (dlimb_t)taba[i] * (dlimb_t)b - l;
        tabr[i] = t;
        l = -(t >> LIMB_BITS);
    }
    return l;
}

/* WARNING: d must be >= 2^(LIMB_BITS-1) */
static inline limb_t udiv1norm_init(limb_t d)
{
    limb_t a0, a1;
    a1 = -d - 1;
    a0 = -1;
    return (((dlimb_t)a1 << LIMB_BITS) | a0) / d;
}

/* return the quotient and the remainder in '*pr' of 'a1*2^LIMB_BITS+a0
   / d' with 0 <= a1 < d. */
static inline limb_t udiv1norm(limb_t *pr, limb_t a1, limb_t a0,
                               limb_t d, limb_t d_inv)
{
    limb_t n1m, n_adj, q, r, ah;
    dlimb_t a;
    n1m = ((slimb_t)a0 >> (LIMB_BITS - 1));
    n_adj = a0 + (n1m & d);
    a = (dlimb_t)d_inv * (a1 - n1m) + n_adj;
    q = (a >> LIMB_BITS) + a1;
    /* compute a - q * d and update q so that the remainder is
       between 0 and d - 1 */
    a = ((dlimb_t)a1 << LIMB_BITS) | a0;
    a = a - (dlimb_t)q * d - d;
    ah = a >> LIMB_BITS;
    q += 1 + ah;
    r = (limb_t)a + (ah & d);
    *pr = r;
    return q;
}
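
/* Illustrative note (not from the original source): with
   B = 2^LIMB_BITS, udiv1norm_init() computes
   d_inv = floor((B^2 - 1) / d) - B, the classical precomputed
   reciprocal for division by an invariant, normalized divisor (in the
   style of Granlund-Montgomery); udiv1norm() then derives a quotient
   estimate from a double-limb multiply by d_inv and repairs it with a
   small constant number of corrections. */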

/* b must be >= 1 << (LIMB_BITS - 1) */
static limb_t mp_div1norm(limb_t *tabr, const limb_t *taba, limb_t n,
                          limb_t b, limb_t r)
{
    slimb_t i;

    if (n >= UDIV1NORM_THRESHOLD) {
        limb_t b_inv;
        b_inv = udiv1norm_init(b);
        for(i = n - 1; i >= 0; i--) {
            tabr[i] = udiv1norm(&r, r, taba[i], b, b_inv);
        }
    } else {
        dlimb_t a1;
        for(i = n - 1; i >= 0; i--) {
            a1 = ((dlimb_t)r << LIMB_BITS) | taba[i];
            tabr[i] = a1 / b;
            r = a1 % b;
        }
    }
    return r;
}

static int mp_divnorm_large(bf_context_t *s,
                            limb_t *tabq, limb_t *taba, limb_t na,
                            const limb_t *tabb, limb_t nb);

/* base case division: divides taba[0..na-1] by tabb[0..nb-1]. tabb[nb
   - 1] must be >= 1 << (LIMB_BITS - 1). na - nb must be >= 0. 'taba'
   is modified and contains the remainder (nb limbs). tabq[0..na-nb]
   contains the quotient with tabq[na - nb] <= 1. */
static int mp_divnorm(bf_context_t *s, limb_t *tabq, limb_t *taba, limb_t na,
                      const limb_t *tabb, limb_t nb)
{
    limb_t r, a, c, q, v, b1, b1_inv, n, dummy_r;
    slimb_t i, j;

    b1 = tabb[nb - 1];
    if (nb == 1) {
        taba[0] = mp_div1norm(tabq, taba, na, b1, 0);
        return 0;
    }
    n = na - nb;
    if (bf_min(n, nb) >= DIVNORM_LARGE_THRESHOLD) {
        return mp_divnorm_large(s, tabq, taba, na, tabb, nb);
    }

    if (n >= UDIV1NORM_THRESHOLD)
        b1_inv = udiv1norm_init(b1);
    else
        b1_inv = 0;

    /* first iteration: the quotient is only 0 or 1 */
    q = 1;
    for(j = nb - 1; j >= 0; j--) {
        if (taba[n + j] != tabb[j]) {
            if (taba[n + j] < tabb[j])
                q = 0;
            break;
        }
    }
    tabq[n] = q;
    if (q) {
        mp_sub(taba + n, taba + n, tabb, nb, 0);
    }

    for(i = n - 1; i >= 0; i--) {
        if (unlikely(taba[i + nb] >= b1)) {
            q = -1;
        } else if (b1_inv) {
            q = udiv1norm(&dummy_r, taba[i + nb], taba[i + nb - 1], b1, b1_inv);
        } else {
            dlimb_t al;
            al = ((dlimb_t)taba[i + nb] << LIMB_BITS) | taba[i + nb - 1];
            q = al / b1;
            r = al % b1;
        }
        r = mp_sub_mul1(taba + i, tabb, nb, q);

        v = taba[i + nb];
        a = v - r;
        c = (a > v);
        taba[i + nb] = a;

        if (c != 0) {
            /* negative result */
            for(;;) {
                q--;
                c = mp_add(taba + i, taba + i, tabb, nb, 0);
                /* propagate carry and test if positive result */
                if (c != 0) {
                    if (++taba[i + nb] == 0) {
                        break;
                    }
                }
            }
        }
        tabq[i] = q;
    }
    return 0;
}
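
/* Illustrative note (not from the original source): mp_recip() below
   follows the classical Newton iteration for reciprocals,
   X_{k+1} = X_k + X_k * (1 - A * X_k), which doubles the number of
   correct limbs per step: the reciprocal of the high half of 'a' is
   computed recursively, then one multiplication and one correction
   pass extend it to the full length. */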

/* compute r=B^(2*n)/a such that a*r < B^(2*n) < a*r + 2 with n >= 1. 'a'
   has n limbs with a[n-1] >= B/2 and 'r' has n+1 limbs with r[n] = 1.

   See Modern Computer Arithmetic by Richard P. Brent and Paul
   Zimmermann, algorithm 3.5 */
int mp_recip(bf_context_t *s, limb_t *tabr, const limb_t *taba, limb_t n)
{
    mp_size_t l, h, k, i;
    limb_t *tabxh, *tabt, c, *tabu;

    if (n <= 2) {
        /* return ceil(B^(2*n)/a) - 1 */
        /* XXX: could avoid allocation */
        tabu = bf_malloc(s, sizeof(limb_t) * (2 * n + 1));
        tabt = bf_malloc(s, sizeof(limb_t) * (n + 2));
        if (!tabt || !tabu)
            goto fail;
        for(i = 0; i < 2 * n; i++)
            tabu[i] = 0;
        tabu[2 * n] = 1;
        if (mp_divnorm(s, tabt, tabu, 2 * n + 1, taba, n))
            goto fail;
        for(i = 0; i < n + 1; i++)
            tabr[i] = tabt[i];
        if (mp_scan_nz(tabu, n) == 0) {
            /* only happens for a=B^n/2 */
            mp_sub_ui(tabr, 1, n + 1);
        }
    } else {
        l = (n - 1) / 2;
        h = n - l;
        /* n=2p -> l=p-1, h = p + 1, k = p + 3
           n=2p+1-> l=p, h = p + 1; k = p + 2
        */
        tabt = bf_malloc(s, sizeof(limb_t) * (n + h + 1));
        tabu = bf_malloc(s, sizeof(limb_t) * (n + 2 * h - l + 2));
        if (!tabt || !tabu)
            goto fail;
        tabxh = tabr + l;
        if (mp_recip(s, tabxh, taba + l, h))
            goto fail;
        if (mp_mul(s, tabt, taba, n, tabxh, h + 1)) /* n + h + 1 limbs */
            goto fail;
        while (tabt[n + h] != 0) {
            mp_sub_ui(tabxh, 1, h + 1);
            c = mp_sub(tabt, tabt, taba, n, 0);
            mp_sub_ui(tabt + n, c, h + 1);
        }
        /* T = B^(n+h) - T */
        mp_neg(tabt, tabt, n + h + 1, 0);
        tabt[n + h]++;
        if (mp_mul(s, tabu, tabt + l, n + h + 1 - l, tabxh, h + 1))
            goto fail;
        /* n + 2*h - l + 2 limbs */
        k = 2 * h - l;
        for(i = 0; i < l; i++)
            tabr[i] = tabu[i + k];
        mp_add(tabr + l, tabr + l, tabu + 2 * h, h, 0);
    }
    bf_free(s, tabt);
    bf_free(s, tabu);
    return 0;
 fail:
    bf_free(s, tabt);
    bf_free(s, tabu);
    return -1;
}

/* return -1, 0 or 1 */
static int mp_cmp(const limb_t *taba, const limb_t *tabb, mp_size_t n)
{
    mp_size_t i;
    for(i = n - 1; i >= 0; i--) {
        if (taba[i] != tabb[i]) {
            if (taba[i] < tabb[i])
                return -1;
            else
                return 1;
        }
    }
    return 0;
}

//#define DEBUG_DIVNORM_LARGE
//#define DEBUG_DIVNORM_LARGE2

/* subquadratic divnorm */
static int mp_divnorm_large(bf_context_t *s,
                            limb_t *tabq, limb_t *taba, limb_t na,
                            const limb_t *tabb, limb_t nb)
{
    limb_t *tabb_inv, nq, *tabt, i, n;
    nq = na - nb;
#ifdef DEBUG_DIVNORM_LARGE
    printf("na=%d nb=%d nq=%d\n", (int)na, (int)nb, (int)nq);
    mp_print_str("a", taba, na);
    mp_print_str("b", tabb, nb);
#endif
    assert(nq >= 1);
    n = nq;
    if (nq < nb)
        n++;
    tabb_inv = bf_malloc(s, sizeof(limb_t) * (n + 1));
    tabt = bf_malloc(s, sizeof(limb_t) * 2 * (n + 1));
    if (!tabb_inv || !tabt)
        goto fail;

    if (n >= nb) {
        for(i = 0; i < n - nb; i++)
            tabt[i] = 0;
        for(i = 0; i < nb; i++)
            tabt[i + n - nb] = tabb[i];
    } else {
        /* truncate B: need to increment it so that the approximate
           inverse is smaller than the exact inverse */
        for(i = 0; i < n; i++)
            tabt[i] = tabb[i + nb - n];
        if (mp_add_ui(tabt, 1, n)) {
            /* tabt = B^n : tabb_inv = B^n */
            memset(tabb_inv, 0, n * sizeof(limb_t));
            tabb_inv[n] = 1;
            goto recip_done;
        }
    }
    if (mp_recip(s, tabb_inv, tabt, n))
        goto fail;
 recip_done:
    /* Q=A*B^-1 */
    if (mp_mul(s, tabt, tabb_inv, n + 1, taba + na - (n + 1), n + 1))
        goto fail;

    for(i = 0; i < nq + 1; i++)
        tabq[i] = tabt[i + 2 * (n + 1) - (nq + 1)];
#ifdef DEBUG_DIVNORM_LARGE
    mp_print_str("q", tabq, nq + 1);
#endif

    bf_free(s, tabt);
    bf_free(s, tabb_inv);
    tabb_inv = NULL;

    /* R=A-B*Q */
    tabt = bf_malloc(s, sizeof(limb_t) * (na + 1));
    if (!tabt)
        goto fail;
    if (mp_mul(s, tabt, tabq, nq + 1, tabb, nb))
        goto fail;
    /* we add one more limb for the result */
    mp_sub(taba, taba, tabt, nb + 1, 0);
    bf_free(s, tabt);
    /* the approximated quotient is smaller than the exact one,
       hence we may have to increment it */
#ifdef DEBUG_DIVNORM_LARGE2
    int cnt = 0;
    static int cnt_max;
#endif
    for(;;) {
        if (taba[nb] == 0 && mp_cmp(taba, tabb, nb) < 0)
            break;
        taba[nb] -= mp_sub(taba, taba, tabb, nb, 0);
        mp_add_ui(tabq, 1, nq + 1);
#ifdef DEBUG_DIVNORM_LARGE2
        cnt++;
#endif
    }
#ifdef DEBUG_DIVNORM_LARGE2
    if (cnt > cnt_max) {
        cnt_max = cnt;
        printf("\ncnt=%d nq=%d nb=%d\n", cnt_max, (int)nq, (int)nb);
    }
#endif
    return 0;
 fail:
    bf_free(s, tabb_inv);
    bf_free(s, tabt);
    return -1;
}

int bf_mul(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec,
           bf_flags_t flags)
{
    int ret, r_sign;

    if (a->len < b->len) {
        const bf_t *tmp = a;
        a = b;
        b = tmp;
    }
    r_sign = a->sign ^ b->sign;
    /* here b->len <= a->len */
    if (b->len == 0) {
        if (a->expn == BF_EXP_NAN || b->expn == BF_EXP_NAN) {
            bf_set_nan(r);
            ret = 0;
        } else if (a->expn == BF_EXP_INF || b->expn == BF_EXP_INF) {
            if ((a->expn == BF_EXP_INF && b->expn == BF_EXP_ZERO) ||
                (a->expn == BF_EXP_ZERO && b->expn == BF_EXP_INF)) {
                bf_set_nan(r);
                ret = BF_ST_INVALID_OP;
            } else {
                bf_set_inf(r, r_sign);
                ret = 0;
            }
        } else {
            bf_set_zero(r, r_sign);
            ret = 0;
        }
    } else {
        bf_t tmp, *r1 = NULL;
        limb_t a_len, b_len, precl;
        limb_t *a_tab, *b_tab;

        a_len = a->len;
        b_len = b->len;

        if ((flags & BF_RND_MASK) == BF_RNDF) {
            /* faithful rounding does not require using the full inputs */
            precl = (prec + 2 + LIMB_BITS - 1) / LIMB_BITS;
            a_len = bf_min(a_len, precl);
            b_len = bf_min(b_len, precl);
        }
        a_tab = a->tab + a->len - a_len;
        b_tab = b->tab + b->len - b_len;

#ifdef USE_FFT_MUL
        if (b_len >= FFT_MUL_THRESHOLD) {
            int mul_flags = 0;
            if (r == a)
                mul_flags |= FFT_MUL_R_OVERLAP_A;
            if (r == b)
                mul_flags |= FFT_MUL_R_OVERLAP_B;
            if (fft_mul(r->ctx, r, a_tab, a_len, b_tab, b_len, mul_flags))
                goto fail;
        } else
#endif
        {
            if (r == a || r == b) {
                bf_init(r->ctx, &tmp);
                r1 = r;
                r = &tmp;
            }
            if (bf_resize(r, a_len + b_len)) {
#ifdef USE_FFT_MUL
            fail:
#endif
                bf_set_nan(r);
                ret = BF_ST_MEM_ERROR;
                goto done;
            }
            mp_mul_basecase(r->tab, a_tab, a_len, b_tab, b_len);
        }
        r->sign = r_sign;
        r->expn = a->expn + b->expn;
        ret = bf_normalize_and_round(r, prec, flags);
    done:
        if (r == &tmp)
            bf_move(r1, &tmp);
    }
    return ret;
}
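
/* Illustrative note (not from the original source): under BF_RNDF the
   result only has to be one of the two floating point numbers
   enclosing the exact product, so bf_mul() above may truncate both
   operands to about prec + 2 bits first: the discarded low limbs
   perturb the product by less than the faithful-rounding tolerance,
   which makes the multiplication much cheaper for short precisions. */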

/* multiply 'r' by 2^e */
int bf_mul_2exp(bf_t *r, slimb_t e, limb_t prec, bf_flags_t flags)
{
    slimb_t e_max;
    if (r->len == 0)
        return 0;
    e_max = ((limb_t)1 << BF_EXT_EXP_BITS_MAX) - 1;
    e = bf_max(e, -e_max);
    e = bf_min(e, e_max);
    r->expn += e;
    return __bf_round(r, prec, flags, r->len, 0);
}

/* Return e such that a=m*2^e with m odd integer. Return 0 if a is zero,
   infinite or NaN. */
slimb_t bf_get_exp_min(const bf_t *a)
{
    slimb_t i;
    limb_t v;
    int k;

    for(i = 0; i < a->len; i++) {
        v = a->tab[i];
        if (v != 0) {
            k = ctz(v);
            return a->expn - (a->len - i) * LIMB_BITS + k;
        }
    }
    return 0;
}

/* a and b must be finite numbers with a >= 0 and b > 0. 'q' is the
   integer defined as floor(a/b) and r = a - q * b. */
static void bf_tdivremu(bf_t *q, bf_t *r,
                        const bf_t *a, const bf_t *b)
{
    if (bf_cmpu(a, b) < 0) {
        bf_set_ui(q, 0);
        bf_set(r, a);
    } else {
        bf_div(q, a, b, bf_max(a->expn - b->expn + 1, 2), BF_RNDZ);
        bf_rint(q, BF_RNDZ);
        bf_mul(r, q, b, BF_PREC_INF, BF_RNDZ);
        bf_sub(r, a, r, BF_PREC_INF, BF_RNDZ);
    }
}

static int __bf_div(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec,
                    bf_flags_t flags)
{
    bf_context_t *s = r->ctx;
    int ret, r_sign;
    limb_t n, nb, precl;

    r_sign = a->sign ^ b->sign;
    if (a->expn >= BF_EXP_INF || b->expn >= BF_EXP_INF) {
        if (a->expn == BF_EXP_NAN || b->expn == BF_EXP_NAN) {
            bf_set_nan(r);
            return 0;
        } else if (a->expn == BF_EXP_INF && b->expn == BF_EXP_INF) {
            bf_set_nan(r);
            return BF_ST_INVALID_OP;
        } else if (a->expn == BF_EXP_INF) {
            bf_set_inf(r, r_sign);
            return 0;
        } else {
            bf_set_zero(r, r_sign);
            return 0;
        }
    } else if (a->expn == BF_EXP_ZERO) {
        if (b->expn == BF_EXP_ZERO) {
            bf_set_nan(r);
            return BF_ST_INVALID_OP;
        } else {
            bf_set_zero(r, r_sign);
            return 0;
        }
    } else if (b->expn == BF_EXP_ZERO) {
        bf_set_inf(r, r_sign);
        return BF_ST_DIVIDE_ZERO;
    }

    /* number of limbs of the quotient (2 extra bits for rounding) */
    precl = (prec + 2 + LIMB_BITS - 1) / LIMB_BITS;
    nb = b->len;
    n = bf_max(a->len, precl);

    {
        limb_t *taba, na;
        slimb_t d;

        na = n + nb;
        taba = bf_malloc(s, (na + 1) * sizeof(limb_t));
        if (!taba)
            goto fail;
        d = na - a->len;
        memset(taba, 0, d * sizeof(limb_t));
        memcpy(taba + d, a->tab, a->len * sizeof(limb_t));
        if (bf_resize(r, n + 1))
            goto fail1;
        if (mp_divnorm(s, r->tab, taba, na, b->tab, nb)) {
        fail1:
            bf_free(s, taba);
            goto fail;
        }
        /* see if non zero remainder */
        if (mp_scan_nz(taba, nb))
            r->tab[0] |= 1;
        bf_free(r->ctx, taba);
        r->expn = a->expn - b->expn + LIMB_BITS;
        r->sign = r_sign;
        ret = bf_normalize_and_round(r, prec, flags);
    }
    return ret;
 fail:
    bf_set_nan(r);
    return BF_ST_MEM_ERROR;
}
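
/* Worked example for bf_divrem() (illustrative only): with a = -7 and
   b = 2, the truncating mode BF_RNDZ gives q = -3, r = -1, while
   BF_DIVREM_EUCLIDIAN gives q = -4, r = 1, i.e. the remainder is made
   non-negative by rounding the quotient away from zero when a is
   negative. In both cases a == q * b + r holds exactly. */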

/* division and remainder.

   rnd_mode is the rounding mode for the quotient. The additional
   rounding mode BF_DIVREM_EUCLIDIAN is supported.

   'q' is an integer. 'r' is rounded with prec and flags (prec can be
   BF_PREC_INF).
*/
int bf_divrem(bf_t *q, bf_t *r, const bf_t *a, const bf_t *b,
              limb_t prec, bf_flags_t flags, int rnd_mode)
{
    bf_t a1_s, *a1 = &a1_s;
    bf_t b1_s, *b1 = &b1_s;
    int q_sign, ret;
    BOOL is_ceil, is_rndn;

    assert(q != a && q != b);
    assert(r != a && r != b);
    assert(q != r);

    if (a->len == 0 || b->len == 0) {
        bf_set_zero(q, 0);
        if (a->expn == BF_EXP_NAN || b->expn == BF_EXP_NAN) {
            bf_set_nan(r);
            return 0;
        } else if (a->expn == BF_EXP_INF || b->expn == BF_EXP_ZERO) {
            bf_set_nan(r);
            return BF_ST_INVALID_OP;
        } else {
            bf_set(r, a);
            return bf_round(r, prec, flags);
        }
    }

    q_sign = a->sign ^ b->sign;
    is_rndn = (rnd_mode == BF_RNDN || rnd_mode == BF_RNDNA);
    switch(rnd_mode) {
    default:
    case BF_RNDZ:
    case BF_RNDN:
    case BF_RNDNA:
        is_ceil = FALSE;
        break;
    case BF_RNDD:
        is_ceil = q_sign;
        break;
    case BF_RNDU:
        is_ceil = q_sign ^ 1;
        break;
    case BF_RNDA:
        is_ceil = TRUE;
        break;
    case BF_DIVREM_EUCLIDIAN:
        is_ceil = a->sign;
        break;
    }

    a1->expn = a->expn;
    a1->tab = a->tab;
    a1->len = a->len;
    a1->sign = 0;

    b1->expn = b->expn;
    b1->tab = b->tab;
    b1->len = b->len;
    b1->sign = 0;

    /* XXX: could improve to avoid having a large 'q' */
    bf_tdivremu(q, r, a1, b1);
    if (bf_is_nan(q) || bf_is_nan(r))
        goto fail;

    if (r->len != 0) {
        if (is_rndn) {
            int res;
            b1->expn--;
            res = bf_cmpu(r, b1);
            b1->expn++;
            if (res > 0 ||
                (res == 0 &&
                 (rnd_mode == BF_RNDNA ||
                  get_bit(q->tab, q->len, q->len * LIMB_BITS - q->expn)))) {
                goto do_sub_r;
            }
        } else if (is_ceil) {
        do_sub_r:
            ret = bf_add_si(q, q, 1, BF_PREC_INF, BF_RNDZ);
            ret |= bf_sub(r, r, b1, BF_PREC_INF, BF_RNDZ);
            if (ret & BF_ST_MEM_ERROR)
                goto fail;
        }
    }

    r->sign ^= a->sign;
    q->sign = q_sign;
    return bf_round(r, prec, flags);
 fail:
    bf_set_nan(q);
    bf_set_nan(r);
    return BF_ST_MEM_ERROR;
}

int bf_rem(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec,
           bf_flags_t flags, int rnd_mode)
{
    bf_t q_s, *q = &q_s;
    int ret;

    bf_init(r->ctx, q);
    ret = bf_divrem(q, r, a, b, prec, flags, rnd_mode);
    bf_delete(q);
    return ret;
}

static inline int bf_get_limb(slimb_t *pres, const bf_t *a, int flags)
{
#if LIMB_BITS == 32
    return bf_get_int32(pres, a, flags);
#else
    return bf_get_int64(pres, a, flags);
#endif
}

int bf_remquo(slimb_t *pq, bf_t *r, const bf_t *a, const bf_t *b, limb_t prec,
              bf_flags_t flags, int rnd_mode)
{
    bf_t q_s, *q = &q_s;
    int ret;

    bf_init(r->ctx, q);
    ret = bf_divrem(q, r, a, b, prec, flags, rnd_mode);
    bf_get_limb(pq, q, BF_GET_INT_MOD);
    bf_delete(q);
    return ret;
}

static __maybe_unused inline limb_t mul_mod(limb_t a, limb_t b, limb_t m)
{
    dlimb_t t;
    t = (dlimb_t)a * (dlimb_t)b;
    return t % m;
}

#if defined(USE_MUL_CHECK)
static limb_t mp_mod1(const limb_t *tab, limb_t n, limb_t m, limb_t r)
{
    slimb_t i;
    dlimb_t t;

    for(i = n - 1; i >= 0; i--) {
        t = ((dlimb_t)r << LIMB_BITS) | tab[i];
        r = t % m;
    }
    return r;
}
#endif
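
/* Illustrative note (not from the original source): the table below
   gives the first 8 bits of the square root: for an index i in
   [0, 191], sqrt_table[i] = floor(sqrt((i + 64) * 256)), i.e. the
   integer square root of a normalized 16-bit value whose top 8 bits
   are i + 64. mp_sqrtrem1() then doubles the number of valid root
   bits per refinement step (an 8-bit root for a 16-bit input, then
   16, then 32). */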
static const uint16_t sqrt_table[192] = {
128,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,144,145,146,147,148,149,150,150,151,152,153,154,155,155,156,157,158,159,160,160,161,162,163,163,164,165,166,167,167,168,169,170,170,171,172,173,173,174,175,176,176,177,178,178,179,180,181,181,182,183,183,184,185,185,186,187,187,188,189,189,190,191,192,192,193,193,194,195,195,196,197,197,198,199,199,200,201,201,202,203,203,204,204,205,206,206,207,208,208,209,209,210,211,211,212,212,213,214,214,215,215,216,217,217,218,218,219,219,220,221,221,222,222,223,224,224,225,225,226,226,227,227,228,229,229,230,230,231,231,232,232,233,234,234,235,235,236,236,237,237,238,238,239,240,240,241,241,242,242,243,243,244,244,245,245,246,246,247,247,248,248,249,249,250,250,251,251,252,252,253,253,254,254,255,
};

/* a >= 2^(LIMB_BITS - 2). Return (s, r) with s=floor(sqrt(a)) and
   r=a-s^2. 0 <= r <= 2 * s */
static limb_t mp_sqrtrem1(limb_t *pr, limb_t a)
{
    limb_t s1, r1, s, r, q, u, num;

    /* use a table for the 16 -> 8 bit sqrt */
    s1 = sqrt_table[(a >> (LIMB_BITS - 8)) - 64];
    r1 = (a >> (LIMB_BITS - 16)) - s1 * s1;
    if (r1 > 2 * s1) {
        r1 -= 2 * s1 + 1;
        s1++;
    }

    /* one iteration to get a 32 -> 16 bit sqrt */
    num = (r1 << 8) | ((a >> (LIMB_BITS - 32 + 8)) & 0xff);
    q = num / (2 * s1); /* q <= 2^8 */
    u = num % (2 * s1);
    s = (s1 << 8) + q;
    r = (u << 8) | ((a >> (LIMB_BITS - 32)) & 0xff);
    r -= q * q;
    if ((slimb_t)r < 0) {
        s--;
        r += 2 * s + 1;
    }

#if LIMB_BITS == 64
    s1 = s;
    r1 = r;
    /* one more iteration for 64 -> 32 bit sqrt */
    num = (r1 << 16) | ((a >> (LIMB_BITS - 64 + 16)) & 0xffff);
    q = num / (2 * s1); /* q <= 2^16 */
    u = num % (2 * s1);
    s = (s1 << 16) + q;
    r = (u << 16) | ((a >> (LIMB_BITS - 64)) & 0xffff);
    r -= q * q;
    if ((slimb_t)r < 0) {
        s--;
        r += 2 * s + 1;
    }
#endif
    *pr = r;
    return s;
}

/* return floor(sqrt(a)) */
limb_t bf_isqrt(limb_t a)
{
    limb_t s, r;
    int k;

    if (a == 0)
        return 0;
    k = clz(a) & ~1;
    s = mp_sqrtrem1(&r, a << k);
    s >>= (k >> 1);
    return s;
}

static limb_t mp_sqrtrem2(limb_t *tabs, limb_t *taba)
{
    limb_t s1, r1, s, q, u, a0, a1;
    dlimb_t r, num;
    int l;

    a0 = taba[0];
    a1 = taba[1];
    s1 = mp_sqrtrem1(&r1, a1);
    l = LIMB_BITS / 2;
    num = ((dlimb_t)r1 << l) | (a0 >> l);
    q = num / (2 * s1);
    u = num % (2 * s1);
    s = (s1 << l) + q;
    r = ((dlimb_t)u << l) | (a0 & (((limb_t)1 << l) - 1));
    if (unlikely((q >> l) != 0))
        r -= (dlimb_t)1 << LIMB_BITS; /* special case when q=2^l */
    else
        r -= q * q;
    if ((slimb_t)(r >> LIMB_BITS) < 0) {
        s--;
        r += 2 * (dlimb_t)s + 1;
    }
    tabs[0] = s;
    taba[0] = r;
    return r >> LIMB_BITS;
}

//#define DEBUG_SQRTREM
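
/* Illustrative sketch (not from the original source) of the recursive
   step implemented below, following the "Karatsuba Square Root"
   decomposition: split a = a_high * B^(2l) + a_low. If
   s' = floor(sqrt(a_high)) with remainder r', a good candidate root is
   s = s' * B^l + q with q = floor((r' * B^l + top of a_low) / (2s'));
   a final correction subtracts q^2 and, if the remainder went
   negative, decrements s once. */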

/* tmp_buf must contain (n / 2 + 1) limbs. *prh contains the highest
   limb of the remainder. */
static int mp_sqrtrem_rec(bf_context_t *s, limb_t *tabs, limb_t *taba, limb_t n,
                          limb_t *tmp_buf, limb_t *prh)
{
    limb_t l, h, rh, ql, qh, c, i;

    if (n == 1) {
        *prh = mp_sqrtrem2(tabs, taba);
        return 0;
    }
#ifdef DEBUG_SQRTREM
    mp_print_str("a", taba, 2 * n);
#endif
    l = n / 2;
    h = n - l;
    if (mp_sqrtrem_rec(s, tabs + l, taba + 2 * l, h, tmp_buf, &qh))
        return -1;
#ifdef DEBUG_SQRTREM
    mp_print_str("s1", tabs + l, h);
    mp_print_str_h("r1", taba + 2 * l, h, qh);
    mp_print_str_h("r2", taba + l, n, qh);
#endif

    /* the remainder is in taba + 2 * l. Its high bit is in qh */
    if (qh) {
        mp_sub(taba + 2 * l, taba + 2 * l, tabs + l, h, 0);
    }
    /* instead of dividing by 2*s, divide by s (which is normalized)
       and update q and r */
    if (mp_divnorm(s, tmp_buf, taba + l, n, tabs + l, h))
        return -1;
    qh += tmp_buf[l];
    for(i = 0; i < l; i++)
        tabs[i] = tmp_buf[i];
    ql = mp_shr(tabs, tabs, l, 1, qh & 1);
    qh = qh >> 1; /* 0 or 1 */
    if (ql)
        rh = mp_add(taba + l, taba + l, tabs + l, h, 0);
    else
        rh = 0;
#ifdef DEBUG_SQRTREM
    mp_print_str_h("q", tabs, l, qh);
    mp_print_str_h("u", taba + l, h, rh);
#endif

    mp_add_ui(tabs + l, qh, h);
#ifdef DEBUG_SQRTREM
    mp_print_str_h("s2", tabs, n, sh);
#endif

    /* q = qh, tabs[l - 1 ... 0], r = taba[n - 1 ... l] */
    /* subtract q^2. if qh = 1 then q = B^l, so we can take shortcuts */
    if (qh) {
        c = qh;
    } else {
        if (mp_mul(s, taba + n, tabs, l, tabs, l))
            return -1;
        c = mp_sub(taba, taba, taba + n, 2 * l, 0);
    }
    rh -= mp_sub_ui(taba + 2 * l, c, n - 2 * l);
    if ((slimb_t)rh < 0) {
        mp_sub_ui(tabs, 1, n);
        rh += mp_add_mul1(taba, tabs, n, 2);
        rh += mp_add_ui(taba, 1, n);
    }
    *prh = rh;
    return 0;
}

/* 'taba' has 2*n limbs with n >= 1 and taba[2*n-1] >= 2 ^ (LIMB_BITS
   - 2). Return (s, r) with s=floor(sqrt(a)) and r=a-s^2. 0 <= r <= 2
   * s. tabs has n limbs. r is returned in the lower n limbs of
   taba. The high limb r[n] is the return value of the function. */
/* Algorithm from the article "Karatsuba Square Root" by Paul Zimmermann,
   inspired by its GMP implementation */
int mp_sqrtrem(bf_context_t *s, limb_t *tabs, limb_t *taba, limb_t n)
{
    limb_t tmp_buf1[8];
    limb_t *tmp_buf;
    mp_size_t n2;
    int ret;
    n2 = n / 2 + 1;
    if (n2 <= countof(tmp_buf1)) {
        tmp_buf = tmp_buf1;
    } else {
        tmp_buf = bf_malloc(s, sizeof(limb_t) * n2);
        if (!tmp_buf)
            return -1;
    }
    ret = mp_sqrtrem_rec(s, tabs, taba, n, tmp_buf, taba + n);
    if (tmp_buf != tmp_buf1)
        bf_free(s, tmp_buf);
    return ret;
}
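
/* Worked example for bf_sqrtrem() (illustrative only): for a = 10 it
   returns r = 3 and rem = 1, since floor(sqrt(10)) = 3 and
   10 - 3^2 = 1; the non-zero remainder also raises BF_ST_INEXACT. */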

/* Integer square root with remainder. 'a' must be an integer. r =
   floor(sqrt(a)) and rem = a - r^2. BF_ST_INEXACT is set if the result
   is inexact. 'rem' can be NULL if the remainder is not needed. */
int bf_sqrtrem(bf_t *r, bf_t *rem1, const bf_t *a)
{
    int ret;

    if (a->len == 0) {
        if (a->expn == BF_EXP_NAN) {
            bf_set_nan(r);
        } else if (a->expn == BF_EXP_INF && a->sign) {
            goto invalid_op;
        } else {
            bf_set(r, a);
        }
        if (rem1)
            bf_set_ui(rem1, 0);
        ret = 0;
    } else if (a->sign) {
    invalid_op:
        bf_set_nan(r);
        if (rem1)
            bf_set_ui(rem1, 0);
        ret = BF_ST_INVALID_OP;
    } else {
        bf_t rem_s, *rem;

        bf_sqrt(r, a, (a->expn + 1) / 2, BF_RNDZ);
        bf_rint(r, BF_RNDZ);
        /* see if the result is exact by computing the remainder */
        if (rem1) {
            rem = rem1;
        } else {
            rem = &rem_s;
            bf_init(r->ctx, rem);
        }
        /* XXX: could avoid recomputing the remainder */
        bf_mul(rem, r, r, BF_PREC_INF, BF_RNDZ);
        bf_neg(rem);
        bf_add(rem, rem, a, BF_PREC_INF, BF_RNDZ);
        if (bf_is_nan(rem)) {
            ret = BF_ST_MEM_ERROR;
            goto done;
        }
        if (rem->len != 0) {
            ret = BF_ST_INEXACT;
        } else {
            ret = 0;
        }
    done:
        if (!rem1)
            bf_delete(rem);
    }
    return ret;
}

int bf_sqrt(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags)
{
    bf_context_t *s = a->ctx;
    int ret;

    assert(r != a);

    if (a->len == 0) {
        if (a->expn == BF_EXP_NAN) {
            bf_set_nan(r);
        } else if (a->expn == BF_EXP_INF && a->sign) {
            goto invalid_op;
        } else {
            bf_set(r, a);
        }
        ret = 0;
    } else if (a->sign) {
    invalid_op:
        bf_set_nan(r);
        ret = BF_ST_INVALID_OP;
    } else {
        limb_t *a1;
        slimb_t n, n1;
        limb_t res;

        /* convert the mantissa to an integer with at least 2 *
           prec + 4 bits */
        n = (2 * (prec + 2) + 2 * LIMB_BITS - 1) / (2 * LIMB_BITS);
        if (bf_resize(r, n))
            goto fail;
        a1 = bf_malloc(s, sizeof(limb_t) * 2 * n);
        if (!a1)
            goto fail;
        n1 = bf_min(2 * n, a->len);
        memset(a1, 0, (2 * n - n1) * sizeof(limb_t));
        memcpy(a1 + 2 * n - n1, a->tab + a->len - n1, n1 * sizeof(limb_t));
        if (a->expn & 1) {
            res = mp_shr(a1, a1, 2 * n, 1, 0);
        } else {
            res = 0;
        }
        if (mp_sqrtrem(s, r->tab, a1, n)) {
            bf_free(s, a1);
            goto fail;
        }
        if (!res) {
            res = mp_scan_nz(a1, n + 1);
        }
        bf_free(s, a1);
        if (!res) {
            res = mp_scan_nz(a->tab, a->len - n1);
        }
        if (res != 0)
            r->tab[0] |= 1;
        r->sign = 0;
        r->expn = (a->expn + 1) >> 1;
        ret = bf_round(r, prec, flags);
    }
    return ret;
 fail:
    bf_set_nan(r);
    return BF_ST_MEM_ERROR;
}

static no_inline int bf_op2(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec,
                            bf_flags_t flags, bf_op2_func_t *func)
{
    bf_t tmp;
    int ret;

    if (r == a || r == b) {
        bf_init(r->ctx, &tmp);
        ret = func(&tmp, a, b, prec, flags);
        bf_move(r, &tmp);
    } else {
        ret = func(r, a, b, prec, flags);
    }
    return ret;
}

int bf_add(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec,
           bf_flags_t flags)
{
    return bf_op2(r, a, b, prec, flags, __bf_add);
}

int bf_sub(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec,
           bf_flags_t flags)
{
    return bf_op2(r, a, b, prec, flags, __bf_sub);
}

int bf_div(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec,
           bf_flags_t flags)
{
    return bf_op2(r, a, b, prec, flags, __bf_div);
}
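
/* Illustrative usage note (not from the original source): thanks to
   the bf_op2() wrapper above, the public entry points accept aliased
   operands; e.g. bf_add(&x, &x, &y, prec, flags) is safe because the
   result is first computed into a temporary and then moved into 'x'. */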

int bf_mul_ui(bf_t *r, const bf_t *a, uint64_t b1, limb_t prec,
              bf_flags_t flags)
{
    bf_t b;
    int ret;
    bf_init(r->ctx, &b);
    ret = bf_set_ui(&b, b1);
    ret |= bf_mul(r, a, &b, prec, flags);
    bf_delete(&b);
    return ret;
}

int bf_mul_si(bf_t *r, const bf_t *a, int64_t b1, limb_t prec,
              bf_flags_t flags)
{
    bf_t b;
    int ret;
    bf_init(r->ctx, &b);
    ret = bf_set_si(&b, b1);
    ret |= bf_mul(r, a, &b, prec, flags);
    bf_delete(&b);
    return ret;
}

int bf_add_si(bf_t *r, const bf_t *a, int64_t b1, limb_t prec,
              bf_flags_t flags)
{
    bf_t b;
    int ret;

    bf_init(r->ctx, &b);
    ret = bf_set_si(&b, b1);
    ret |= bf_add(r, a, &b, prec, flags);
    bf_delete(&b);
    return ret;
}

static int bf_pow_ui(bf_t *r, const bf_t *a, limb_t b, limb_t prec,
                     bf_flags_t flags)
{
    int ret, n_bits, i;

    assert(r != a);
    if (b == 0)
        return bf_set_ui(r, 1);
    ret = bf_set(r, a);
    n_bits = LIMB_BITS - clz(b);
    for(i = n_bits - 2; i >= 0; i--) {
        ret |= bf_mul(r, r, r, prec, flags);
        if ((b >> i) & 1)
            ret |= bf_mul(r, r, a, prec, flags);
    }
    return ret;
}

static int bf_pow_ui_ui(bf_t *r, limb_t a1, limb_t b,
                        limb_t prec, bf_flags_t flags)
{
    bf_t a;
    int ret;

#ifdef USE_BF_DEC
    if (a1 == 10 && b <= LIMB_DIGITS) {
        /* use precomputed powers. We do not round at this point
           because we expect the caller to do it */
        ret = bf_set_ui(r, mp_pow_dec[b]);
    } else
#endif
    {
        bf_init(r->ctx, &a);
        ret = bf_set_ui(&a, a1);
        ret |= bf_pow_ui(r, &a, b, prec, flags);
        bf_delete(&a);
    }
    return ret;
}

/* convert to integer (infinite precision) */
int bf_rint(bf_t *r, int rnd_mode)
{
    return bf_round(r, 0, rnd_mode | BF_FLAG_RADPNT_PREC);
}

/* logical operations */
#define BF_LOGIC_OR  0
#define BF_LOGIC_XOR 1
#define BF_LOGIC_AND 2

static inline limb_t bf_logic_op1(limb_t a, limb_t b, int op)
{
    switch(op) {
    case BF_LOGIC_OR:
        return a | b;
    case BF_LOGIC_XOR:
        return a ^ b;
    default:
    case BF_LOGIC_AND:
        return a & b;
    }
}
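
/* Worked example for bf_logic_op() below (illustrative only): negative
   integers are handled as if in two's complement with infinite sign
   extension. Adding 1 to a negative operand and XORing its bits with
   an all-ones mask yields exactly the two's complement bit pattern,
   so for instance bf_logic_and() of -6 (...11111010) and 3 (0011)
   gives 2, and bf_logic_or() of -6 and 3 gives -5 (...11111011). */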
static int bf_pow_ui_ui(bf_t *r, limb_t a1, limb_t b,
                        limb_t prec, bf_flags_t flags)
{
    bf_t a;
    int ret;

#ifdef USE_BF_DEC
    if (a1 == 10 && b <= LIMB_DIGITS) {
        /* use precomputed powers. We do not round at this point
           because we expect the caller to do it */
        ret = bf_set_ui(r, mp_pow_dec[b]);
    } else
#endif
    {
        bf_init(r->ctx, &a);
        ret = bf_set_ui(&a, a1);
        ret |= bf_pow_ui(r, &a, b, prec, flags);
        bf_delete(&a);
    }
    return ret;
}

/* convert to integer (infinite precision) */
int bf_rint(bf_t *r, int rnd_mode)
{
    return bf_round(r, 0, rnd_mode | BF_FLAG_RADPNT_PREC);
}

/* logical operations */
#define BF_LOGIC_OR  0
#define BF_LOGIC_XOR 1
#define BF_LOGIC_AND 2

static inline limb_t bf_logic_op1(limb_t a, limb_t b, int op)
{
    switch(op) {
    case BF_LOGIC_OR:
        return a | b;
    case BF_LOGIC_XOR:
        return a ^ b;
    default:
    case BF_LOGIC_AND:
        return a & b;
    }
}

static int bf_logic_op(bf_t *r, const bf_t *a1, const bf_t *b1, int op)
{
    bf_t b1_s, a1_s, *a, *b;
    limb_t a_sign, b_sign, r_sign;
    slimb_t l, i, a_bit_offset, b_bit_offset;
    limb_t v1, v2, v1_mask, v2_mask, r_mask;
    int ret;

    assert(r != a1 && r != b1);

    if (a1->expn <= 0)
        a_sign = 0; /* minus zero is treated as positive */
    else
        a_sign = a1->sign;

    if (b1->expn <= 0)
        b_sign = 0; /* minus zero is treated as positive */
    else
        b_sign = b1->sign;

    if (a_sign) {
        a = &a1_s;
        bf_init(r->ctx, a);
        if (bf_add_si(a, a1, 1, BF_PREC_INF, BF_RNDZ)) {
            b = NULL;
            goto fail;
        }
    } else {
        a = (bf_t *)a1;
    }

    if (b_sign) {
        b = &b1_s;
        bf_init(r->ctx, b);
        if (bf_add_si(b, b1, 1, BF_PREC_INF, BF_RNDZ))
            goto fail;
    } else {
        b = (bf_t *)b1;
    }

    r_sign = bf_logic_op1(a_sign, b_sign, op);
    if (op == BF_LOGIC_AND && r_sign == 0) {
        /* no need to compute extra zeros for the AND operation */
        if (a_sign == 0 && b_sign == 0)
            l = bf_min(a->expn, b->expn);
        else if (a_sign == 0)
            l = a->expn;
        else
            l = b->expn;
    } else {
        l = bf_max(a->expn, b->expn);
    }
    /* Note: a or b can be zero */
    l = (bf_max(l, 1) + LIMB_BITS - 1) / LIMB_BITS;
    if (bf_resize(r, l))
        goto fail;
    a_bit_offset = a->len * LIMB_BITS - a->expn;
    b_bit_offset = b->len * LIMB_BITS - b->expn;
    v1_mask = -a_sign;
    v2_mask = -b_sign;
    r_mask = -r_sign;
    for(i = 0; i < l; i++) {
        v1 = get_bits(a->tab, a->len, a_bit_offset + i * LIMB_BITS) ^ v1_mask;
        v2 = get_bits(b->tab, b->len, b_bit_offset + i * LIMB_BITS) ^ v2_mask;
        r->tab[i] = bf_logic_op1(v1, v2, op) ^ r_mask;
    }
    r->expn = l * LIMB_BITS;
    r->sign = r_sign;
    bf_normalize_and_round(r, BF_PREC_INF, BF_RNDZ); /* cannot fail */
    if (r_sign) {
        if (bf_add_si(r, r, -1, BF_PREC_INF, BF_RNDZ))
            goto fail;
    }
    ret = 0;
 done:
    if (a == &a1_s)
        bf_delete(a);
    if (b == &b1_s)
        bf_delete(b);
    return ret;
 fail:
    bf_set_nan(r);
    ret = BF_ST_MEM_ERROR;
    goto done;
}

/* 'a' and 'b' must be integers. Return 0 or BF_ST_MEM_ERROR. */
int bf_logic_or(bf_t *r, const bf_t *a, const bf_t *b)
{
    return bf_logic_op(r, a, b, BF_LOGIC_OR);
}

/* 'a' and 'b' must be integers. Return 0 or BF_ST_MEM_ERROR. */
int bf_logic_xor(bf_t *r, const bf_t *a, const bf_t *b)
{
    return bf_logic_op(r, a, b, BF_LOGIC_XOR);
}

/* 'a' and 'b' must be integers. Return 0 or BF_ST_MEM_ERROR. */
int bf_logic_and(bf_t *r, const bf_t *a, const bf_t *b)
{
    return bf_logic_op(r, a, b, BF_LOGIC_AND);
}
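/* Editor's note (illustrative, not part of the original source):
   bf_logic_op() emulates two's complement on sign-magnitude numbers:
   for a negative operand x it works on x + 1 and complements the
   extracted limbs (~(|x| - 1) is the two's complement pattern of x),
   then undoes the same transform on a negative result. A minimal
   usage sketch, assuming a bf_context_t *ctx is available: */
#if 0
{
    bf_t a, b, r;
    bf_init(ctx, &a);
    bf_init(ctx, &b);
    bf_init(ctx, &r);
    bf_set_si(&a, -6);          /* ...11111010 in two's complement */
    bf_set_si(&b, 3);           /* ...00000011 */
    bf_logic_and(&r, &a, &b);   /* r = 2  (...00000010) */
    bf_logic_or(&r, &a, &b);    /* r = -5 (...11111011) */
    bf_logic_xor(&r, &a, &b);   /* r = -7 (...11111001) */
    bf_delete(&a);
    bf_delete(&b);
    bf_delete(&r);
}
#endif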
/* conversion between fixed size types */

typedef union {
    double d;
    uint64_t u;
} Float64Union;

int bf_get_float64(const bf_t *a, double *pres, bf_rnd_t rnd_mode)
{
    Float64Union u;
    int e, ret;
    uint64_t m;

    ret = 0;
    if (a->expn == BF_EXP_NAN) {
        u.u = 0x7ff8000000000000; /* quiet nan */
    } else {
        bf_t b_s, *b = &b_s;

        bf_init(a->ctx, b);
        bf_set(b, a);
        if (bf_is_finite(b)) {
            ret = bf_round(b, 53, rnd_mode | BF_FLAG_SUBNORMAL | bf_set_exp_bits(11));
        }
        if (b->expn == BF_EXP_INF) {
            e = (1 << 11) - 1;
            m = 0;
        } else if (b->expn == BF_EXP_ZERO) {
            e = 0;
            m = 0;
        } else {
            e = b->expn + 1023 - 1;
#if LIMB_BITS == 32
            if (b->len == 2) {
                m = ((uint64_t)b->tab[1] << 32) | b->tab[0];
            } else {
                m = ((uint64_t)b->tab[0] << 32);
            }
#else
            m = b->tab[0];
#endif
            if (e <= 0) {
                /* subnormal */
                m = m >> (12 - e);
                e = 0;
            } else {
                m = (m << 1) >> 12;
            }
        }
        u.u = m | ((uint64_t)e << 52) | ((uint64_t)b->sign << 63);
        bf_delete(b);
    }
    *pres = u.d;
    return ret;
}

int bf_set_float64(bf_t *a, double d)
{
    Float64Union u;
    uint64_t m;
    int shift, e, sgn;

    u.d = d;
    sgn = u.u >> 63;
    e = (u.u >> 52) & ((1 << 11) - 1);
    m = u.u & (((uint64_t)1 << 52) - 1);
    if (e == ((1 << 11) - 1)) {
        if (m != 0) {
            bf_set_nan(a);
        } else {
            bf_set_inf(a, sgn);
        }
    } else if (e == 0) {
        if (m == 0) {
            bf_set_zero(a, sgn);
        } else {
            /* subnormal number */
            m <<= 12;
            shift = clz64(m);
            m <<= shift;
            e = -shift;
            goto norm;
        }
    } else {
        m = (m << 11) | ((uint64_t)1 << 63);
    norm:
        a->expn = e - 1023 + 1;
#if LIMB_BITS == 32
        if (bf_resize(a, 2))
            goto fail;
        a->tab[0] = m;
        a->tab[1] = m >> 32;
#else
        if (bf_resize(a, 1))
            goto fail;
        a->tab[0] = m;
#endif
        a->sign = sgn;
    }
    return 0;
 fail:
    bf_set_nan(a);
    return BF_ST_MEM_ERROR;
}
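/* Editor's note (illustrative, not part of the original source):
   round-tripping a double through bf_t is lossless because a binary64
   mantissa fits in one 64-bit limb (or two 32-bit limbs). A minimal
   sketch, assuming a bf_context_t *ctx: */
#if 0
{
    bf_t a;
    double d;
    bf_init(ctx, &a);
    bf_set_float64(&a, 0.1);  /* stores the exact binary64 value, i.e.
                                 0.1000000000000000055511151231257827... */
    bf_get_float64(&a, &d, BF_RNDN); /* d == 0.1 again, status is exact */
    bf_delete(&a);
}
#endif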
/* The rounding mode is always BF_RNDZ. Return BF_ST_INVALID_OP if there
   is an overflow and 0 otherwise. */
int bf_get_int32(int *pres, const bf_t *a, int flags)
{
    uint32_t v;
    int ret;
    if (a->expn >= BF_EXP_INF) {
        ret = BF_ST_INVALID_OP;
        if (flags & BF_GET_INT_MOD) {
            v = 0;
        } else if (a->expn == BF_EXP_INF) {
            v = (uint32_t)INT32_MAX + a->sign;
        } else {
            v = INT32_MAX;
        }
    } else if (a->expn <= 0) {
        v = 0;
        ret = 0;
    } else if (a->expn <= 31) {
        v = a->tab[a->len - 1] >> (LIMB_BITS - a->expn);
        if (a->sign)
            v = -v;
        ret = 0;
    } else if (!(flags & BF_GET_INT_MOD)) {
        ret = BF_ST_INVALID_OP;
        if (a->sign) {
            v = (uint32_t)INT32_MAX + 1;
            if (a->expn == 32 &&
                (a->tab[a->len - 1] >> (LIMB_BITS - 32)) == v) {
                ret = 0;
            }
        } else {
            v = INT32_MAX;
        }
    } else {
        v = get_bits(a->tab, a->len, a->len * LIMB_BITS - a->expn);
        if (a->sign)
            v = -v;
        ret = 0;
    }
    *pres = v;
    return ret;
}

/* The rounding mode is always BF_RNDZ. Return BF_ST_INVALID_OP if there
   is an overflow and 0 otherwise. */
int bf_get_int64(int64_t *pres, const bf_t *a, int flags)
{
    uint64_t v;
    int ret;
    if (a->expn >= BF_EXP_INF) {
        ret = BF_ST_INVALID_OP;
        if (flags & BF_GET_INT_MOD) {
            v = 0;
        } else if (a->expn == BF_EXP_INF) {
            v = (uint64_t)INT64_MAX + a->sign;
        } else {
            v = INT64_MAX;
        }
    } else if (a->expn <= 0) {
        v = 0;
        ret = 0;
    } else if (a->expn <= 63) {
#if LIMB_BITS == 32
        if (a->expn <= 32)
            v = a->tab[a->len - 1] >> (LIMB_BITS - a->expn);
        else
            v = (((uint64_t)a->tab[a->len - 1] << 32) |
                 get_limbz(a, a->len - 2)) >> (64 - a->expn);
#else
        v = a->tab[a->len - 1] >> (LIMB_BITS - a->expn);
#endif
        if (a->sign)
            v = -v;
        ret = 0;
    } else if (!(flags & BF_GET_INT_MOD)) {
        ret = BF_ST_INVALID_OP;
        if (a->sign) {
            uint64_t v1;
            v = (uint64_t)INT64_MAX + 1;
            if (a->expn == 64) {
                v1 = a->tab[a->len - 1];
#if LIMB_BITS == 32
                v1 = (v1 << 32) | get_limbz(a, a->len - 2);
#endif
                if (v1 == v)
                    ret = 0;
            }
        } else {
            v = INT64_MAX;
        }
    } else {
        slimb_t bit_pos = a->len * LIMB_BITS - a->expn;
        v = get_bits(a->tab, a->len, bit_pos);
#if LIMB_BITS == 32
        v |= (uint64_t)get_bits(a->tab, a->len, bit_pos + 32) << 32;
#endif
        if (a->sign)
            v = -v;
        ret = 0;
    }
    *pres = v;
    return ret;
}

/* The rounding mode is always BF_RNDZ. Return BF_ST_INVALID_OP if there
   is an overflow and 0 otherwise. */
int bf_get_uint64(uint64_t *pres, const bf_t *a)
{
    uint64_t v;
    int ret;
    if (a->expn == BF_EXP_NAN) {
        goto overflow;
    } else if (a->expn <= 0) {
        v = 0;
        ret = 0;
    } else if (a->sign) {
        v = 0;
        ret = BF_ST_INVALID_OP;
    } else if (a->expn <= 64) {
#if LIMB_BITS == 32
        if (a->expn <= 32)
            v = a->tab[a->len - 1] >> (LIMB_BITS - a->expn);
        else
            v = (((uint64_t)a->tab[a->len - 1] << 32) |
                 get_limbz(a, a->len - 2)) >> (64 - a->expn);
#else
        v = a->tab[a->len - 1] >> (LIMB_BITS - a->expn);
#endif
        ret = 0;
    } else {
    overflow:
        v = UINT64_MAX;
        ret = BF_ST_INVALID_OP;
    }
    *pres = v;
    return ret;
}
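/* Editor's note (illustrative, not part of the original source):
   BF_GET_INT_MOD selects wrap-around instead of saturation on
   overflow. E.g. for a = 2^32 + 5 (expn = 33):

       bf_get_int32(&i, a, 0)              -> i = INT32_MAX,
                                              status = BF_ST_INVALID_OP
       bf_get_int32(&i, a, BF_GET_INT_MOD) -> i = 5 (the low 32 bits),
                                              status = 0

   i.e. reduction modulo 2^32, as needed for ToInt32()-style
   conversions. */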
/* base conversion from radix */

static const uint8_t digits_per_limb_table[BF_RADIX_MAX - 1] = {
#if LIMB_BITS == 32
32,20,16,13,12,11,10,10, 9, 9, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
#else
64,40,32,27,24,22,21,20,19,18,17,17,16,16,16,15,15,15,14,14,14,14,13,13,13,13,13,13,13,12,12,12,12,12,12,
#endif
};

static limb_t get_limb_radix(int radix)
{
    int i, k;
    limb_t radixl;

    k = digits_per_limb_table[radix - 2];
    radixl = radix;
    for(i = 1; i < k; i++)
        radixl *= radix;
    return radixl;
}

/* return != 0 if error */
static int bf_integer_from_radix_rec(bf_t *r, const limb_t *tab,
                                     limb_t n, int level, limb_t n0,
                                     limb_t radix, bf_t *pow_tab)
{
    int ret;
    if (n == 1) {
        ret = bf_set_ui(r, tab[0]);
    } else {
        bf_t T_s, *T = &T_s, *B;
        limb_t n1, n2;

        n2 = (((n0 * 2) >> (level + 1)) + 1) / 2;
        n1 = n - n2;
        //        printf("level=%d n0=%ld n1=%ld n2=%ld\n", level, n0, n1, n2);
        B = &pow_tab[level];
        if (B->len == 0) {
            ret = bf_pow_ui_ui(B, radix, n2, BF_PREC_INF, BF_RNDZ);
            if (ret)
                return ret;
        }
        ret = bf_integer_from_radix_rec(r, tab + n2, n1, level + 1, n0,
                                        radix, pow_tab);
        if (ret)
            return ret;
        ret = bf_mul(r, r, B, BF_PREC_INF, BF_RNDZ);
        if (ret)
            return ret;
        bf_init(r->ctx, T);
        ret = bf_integer_from_radix_rec(T, tab, n2, level + 1, n0,
                                        radix, pow_tab);
        if (!ret)
            ret = bf_add(r, r, T, BF_PREC_INF, BF_RNDZ);
        bf_delete(T);
    }
    return ret;
    //    bf_print_str(" r=", r);
}

/* return 0 if OK, != 0 if memory error */
static int bf_integer_from_radix(bf_t *r, const limb_t *tab,
                                 limb_t n, limb_t radix)
{
    bf_context_t *s = r->ctx;
    int pow_tab_len, i, ret;
    limb_t radixl;
    bf_t *pow_tab;

    radixl = get_limb_radix(radix);
    pow_tab_len = ceil_log2(n) + 2; /* XXX: check */
    pow_tab = bf_malloc(s, sizeof(pow_tab[0]) * pow_tab_len);
    if (!pow_tab)
        return -1;
    for(i = 0; i < pow_tab_len; i++)
        bf_init(r->ctx, &pow_tab[i]);
    ret = bf_integer_from_radix_rec(r, tab, n, 0, n, radixl, pow_tab);
    for(i = 0; i < pow_tab_len; i++) {
        bf_delete(&pow_tab[i]);
    }
    bf_free(s, pow_tab);
    return ret;
}
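/* Editor's note (illustrative, not part of the original source):
   bf_integer_from_radix_rec() evaluates the digit-limb array with a
   balanced divide and conquer split:

       value(tab[0..n)) = value(tab[n2..n)) * radix^n2 + value(tab[0..n2))

   ('tab' is little endian: tab[0] holds the least significant limb),
   so only O(log n) big multiplications are needed and the radix^n2
   powers are shared through 'pow_tab'. A plain-integer model of the
   same recursion, assuming everything fits in a uint64_t: */
#if 0
static uint64_t from_radix_model(const uint64_t *tab, size_t n, uint64_t radixl)
{
    size_t n2, n1, i;
    uint64_t pow;
    if (n == 1)
        return tab[0];
    n2 = n / 2; /* the real code balances the split via 'n0' and 'level' */
    n1 = n - n2;
    for(pow = 1, i = 0; i < n2; i++)
        pow *= radixl;
    return from_radix_model(tab + n2, n1, radixl) * pow +
        from_radix_model(tab, n2, radixl);
}
#endif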
/* compute and round T * radix^expn. */
int bf_mul_pow_radix(bf_t *r, const bf_t *T, limb_t radix,
                     slimb_t expn, limb_t prec, bf_flags_t flags)
{
    int ret, expn_sign, overflow;
    slimb_t e, extra_bits, prec1, ziv_extra_bits;
    bf_t B_s, *B = &B_s;

    if (T->len == 0) {
        return bf_set(r, T);
    } else if (expn == 0) {
        ret = bf_set(r, T);
        ret |= bf_round(r, prec, flags);
        return ret;
    }

    e = expn;
    expn_sign = 0;
    if (e < 0) {
        e = -e;
        expn_sign = 1;
    }
    bf_init(r->ctx, B);
    if (prec == BF_PREC_INF) {
        /* infinite precision: only used if the result is known to be exact */
        ret = bf_pow_ui_ui(B, radix, e, BF_PREC_INF, BF_RNDN);
        if (expn_sign) {
            ret |= bf_div(r, T, B, T->len * LIMB_BITS, BF_RNDN);
        } else {
            ret |= bf_mul(r, T, B, BF_PREC_INF, BF_RNDN);
        }
    } else {
        ziv_extra_bits = 16;
        for(;;) {
            prec1 = prec + ziv_extra_bits;
            /* XXX: correct overflow/underflow handling */
            /* XXX: rigorous error analysis needed */
            extra_bits = ceil_log2(e) * 2 + 1;
            ret = bf_pow_ui_ui(B, radix, e, prec1 + extra_bits, BF_RNDN | BF_FLAG_EXT_EXP);
            overflow = !bf_is_finite(B);
            /* XXX: if bf_pow_ui_ui returns an exact result, can stop
               after the next operation */
            if (expn_sign)
                ret |= bf_div(r, T, B, prec1 + extra_bits, BF_RNDN | BF_FLAG_EXT_EXP);
            else
                ret |= bf_mul(r, T, B, prec1 + extra_bits, BF_RNDN | BF_FLAG_EXT_EXP);
            if (ret & BF_ST_MEM_ERROR)
                break;
            if ((ret & BF_ST_INEXACT) &&
                !bf_can_round(r, prec, flags & BF_RND_MASK, prec1) &&
                !overflow) {
                /* add more precision and retry */
                ziv_extra_bits = ziv_extra_bits + (ziv_extra_bits / 2);
            } else {
                /* XXX: need to use __bf_round() to pass the inexact
                   flag for the subnormal case */
                ret = bf_round(r, prec, flags) | (ret & BF_ST_INEXACT);
                break;
            }
        }
    }
    bf_delete(B);
    return ret;
}

static inline int to_digit(int c)
{
    if (c >= '0' && c <= '9')
        return c - '0';
    else if (c >= 'A' && c <= 'Z')
        return c - 'A' + 10;
    else if (c >= 'a' && c <= 'z')
        return c - 'a' + 10;
    else
        return 36;
}

/* add a limb at 'pos' and decrement pos. New space is created if
   needed. Return 0 if OK, -1 if memory error */
static int bf_add_limb(bf_t *a, slimb_t *ppos, limb_t v)
{
    slimb_t pos;
    pos = *ppos;
    if (unlikely(pos < 0)) {
        limb_t new_size, d, *new_tab;
        new_size = bf_max(a->len + 1, a->len * 3 / 2);
        new_tab = bf_realloc(a->ctx, a->tab, sizeof(limb_t) * new_size);
        if (!new_tab)
            return -1;
        a->tab = new_tab;
        d = new_size - a->len;
        memmove(a->tab + d, a->tab, a->len * sizeof(limb_t));
        a->len = new_size;
        pos += d;
    }
    a->tab[pos--] = v;
    *ppos = pos;
    return 0;
}

static int bf_tolower(int c)
{
    if (c >= 'A' && c <= 'Z')
        c = c - 'A' + 'a';
    return c;
}

static int strcasestart(const char *str, const char *val, const char **ptr)
{
    const char *p, *q;
    p = str;
    q = val;
    while (*q != '\0') {
        if (bf_tolower(*p) != *q)
            return 0;
        p++;
        q++;
    }
    if (ptr)
        *ptr = p;
    return 1;
}
static int bf_atof_internal(bf_t *r, slimb_t *pexponent,
                            const char *str, const char **pnext, int radix,
                            limb_t prec, bf_flags_t flags, BOOL is_dec)
{
    const char *p, *p_start;
    int is_neg, radix_bits, exp_is_neg, ret, digits_per_limb, shift;
    limb_t cur_limb;
    slimb_t pos, expn, int_len, digit_count;
    BOOL has_decpt, is_bin_exp;
    bf_t a_s, *a;

    *pexponent = 0;
    p = str;
    if (!(flags & BF_ATOF_NO_NAN_INF) && radix <= 16 &&
        strcasestart(p, "nan", &p)) {
        bf_set_nan(r);
        ret = 0;
        goto done;
    }
    is_neg = 0;

    if (p[0] == '+') {
        p++;
        p_start = p;
    } else if (p[0] == '-') {
        is_neg = 1;
        p++;
        p_start = p;
    } else {
        p_start = p;
    }
    if (p[0] == '0') {
        if ((p[1] == 'x' || p[1] == 'X') &&
            (radix == 0 || radix == 16) &&
            !(flags & BF_ATOF_NO_HEX)) {
            radix = 16;
            p += 2;
        } else if ((p[1] == 'o' || p[1] == 'O') &&
                   radix == 0 && (flags & BF_ATOF_BIN_OCT)) {
            p += 2;
            radix = 8;
        } else if ((p[1] == 'b' || p[1] == 'B') &&
                   radix == 0 && (flags & BF_ATOF_BIN_OCT)) {
            p += 2;
            radix = 2;
        } else {
            goto no_prefix;
        }
        /* there must be a digit after the prefix */
        if (to_digit((uint8_t)*p) >= radix) {
            bf_set_nan(r);
            ret = 0;
            goto done;
        }
    no_prefix: ;
    } else {
        if (!(flags & BF_ATOF_NO_NAN_INF) && radix <= 16 &&
            strcasestart(p, "inf", &p)) {
            bf_set_inf(r, is_neg);
            ret = 0;
            goto done;
        }
    }

    if (radix == 0)
        radix = 10;
    if (is_dec) {
        assert(radix == 10);
        radix_bits = 0;
        a = r;
    } else if ((radix & (radix - 1)) != 0) {
        radix_bits = 0; /* base is not a power of two */
        a = &a_s;
        bf_init(r->ctx, a);
    } else {
        radix_bits = ceil_log2(radix);
        a = r;
    }

    /* skip leading zeros */
    /* XXX: could also skip zeros after the decimal point */
    while (*p == '0')
        p++;

    if (radix_bits) {
        shift = digits_per_limb = LIMB_BITS;
    } else {
        radix_bits = 0;
        shift = digits_per_limb = digits_per_limb_table[radix - 2];
    }
    cur_limb = 0;
    bf_resize(a, 1);
    pos = 0;
    has_decpt = FALSE;
    int_len = digit_count = 0;
    for(;;) {
        limb_t c;
        if (*p == '.' && (p > p_start || to_digit(p[1]) < radix)) {
            if (has_decpt)
                break;
            has_decpt = TRUE;
            int_len = digit_count;
            p++;
        }
        c = to_digit(*p);
        if (c >= radix)
            break;
        digit_count++;
        p++;
        if (radix_bits) {
            shift -= radix_bits;
            if (shift <= 0) {
                cur_limb |= c >> (-shift);
                if (bf_add_limb(a, &pos, cur_limb))
                    goto mem_error;
                if (shift < 0)
                    cur_limb = c << (LIMB_BITS + shift);
                else
                    cur_limb = 0;
                shift += LIMB_BITS;
            } else {
                cur_limb |= c << shift;
            }
        } else {
            cur_limb = cur_limb * radix + c;
            shift--;
            if (shift == 0) {
                if (bf_add_limb(a, &pos, cur_limb))
                    goto mem_error;
                shift = digits_per_limb;
                cur_limb = 0;
            }
        }
    }
    if (!has_decpt)
        int_len = digit_count;

    /* add the last limb and pad with zeros */
    if (shift != digits_per_limb) {
        if (radix_bits == 0) {
            while (shift != 0) {
                cur_limb *= radix;
                shift--;
            }
        }
        if (bf_add_limb(a, &pos, cur_limb)) {
        mem_error:
            ret = BF_ST_MEM_ERROR;
            if (!radix_bits)
                bf_delete(a);
            bf_set_nan(r);
            goto done;
        }
    }

    /* reset the next limbs to zero (we prefer to reallocate in the
       renormalization) */
    memset(a->tab, 0, (pos + 1) * sizeof(limb_t));

    if (p == p_start) {
        ret = 0;
        if (!radix_bits)
            bf_delete(a);
        bf_set_nan(r);
        goto done;
    }

    /* parse the exponent, if any */
    expn = 0;
    is_bin_exp = FALSE;
    if (((radix == 10 && (*p == 'e' || *p == 'E')) ||
         (radix != 10 && (*p == '@' ||
                          (radix_bits && (*p == 'p' || *p == 'P'))))) &&
        p > p_start) {
        is_bin_exp = (*p == 'p' || *p == 'P');
        p++;
        exp_is_neg = 0;
        if (*p == '+') {
            p++;
        } else if (*p == '-') {
            exp_is_neg = 1;
            p++;
        }
        for(;;) {
            int c;
            c = to_digit(*p);
            if (c >= 10)
                break;
            if (unlikely(expn > ((BF_RAW_EXP_MAX - 2 - 9) / 10))) {
                /* exponent overflow */
                if (exp_is_neg) {
                    bf_set_zero(r, is_neg);
                    ret = BF_ST_UNDERFLOW | BF_ST_INEXACT;
                } else {
                    bf_set_inf(r, is_neg);
                    ret = BF_ST_OVERFLOW | BF_ST_INEXACT;
                }
                goto done;
            }
            p++;
            expn = expn * 10 + c;
        }
        if (exp_is_neg)
            expn = -expn;
    }
    if (is_dec) {
        a->expn = expn + int_len;
        a->sign = is_neg;
        ret = bfdec_normalize_and_round((bfdec_t *)a, prec, flags);
    } else if (radix_bits) {
        /* XXX: may overflow */
        if (!is_bin_exp)
            expn *= radix_bits;
        a->expn = expn + (int_len * radix_bits);
        a->sign = is_neg;
        ret = bf_normalize_and_round(a, prec, flags);
    } else {
        limb_t l;
        pos++;
        l = a->len - pos; /* number of limbs */
        if (l == 0) {
            bf_set_zero(r, is_neg);
            ret = 0;
        } else {
            bf_t T_s, *T = &T_s;

            expn -= l * digits_per_limb - int_len;
            bf_init(r->ctx, T);
            if (bf_integer_from_radix(T, a->tab + pos, l, radix)) {
                bf_set_nan(r);
                ret = BF_ST_MEM_ERROR;
            } else {
                T->sign = is_neg;
                if (flags & BF_ATOF_EXPONENT) {
                    /* return the exponent */
                    *pexponent = expn;
                    ret = bf_set(r, T);
                } else {
                    ret = bf_mul_pow_radix(r, T, radix, expn, prec, flags);
                }
            }
            bf_delete(T);
        }
        bf_delete(a);
    }
 done:
    if (pnext)
        *pnext = p;
    return ret;
}

/*
   Return (status, n, exp). 'status' is the floating point status. 'n'
   is the parsed number.

   If (flags & BF_ATOF_EXPONENT) and if the radix is not a power of
   two, the parsed number is equal to r * radix^(*pexponent).
   Otherwise *pexponent = 0.
*/
int bf_atof2(bf_t *r, slimb_t *pexponent,
             const char *str, const char **pnext, int radix,
             limb_t prec, bf_flags_t flags)
{
    return bf_atof_internal(r, pexponent, str, pnext, radix, prec, flags,
                            FALSE);
}

int bf_atof(bf_t *r, const char *str, const char **pnext, int radix,
            limb_t prec, bf_flags_t flags)
{
    slimb_t dummy_exp;
    return bf_atof_internal(r, &dummy_exp, str, pnext, radix, prec, flags, FALSE);
}
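/* Editor's note (illustrative, not part of the original source):
   a minimal bf_atof() usage sketch, assuming a bf_context_t *ctx: */
#if 0
{
    bf_t r;
    const char *next;
    int status;
    bf_init(ctx, &r);
    /* decimal: r = 314.159 rounded to 53 bits, 'next' points after "e2" */
    status = bf_atof(&r, "3.14159e2", &next, 10, 53, BF_RNDN);
    /* hexadecimal float with binary exponent: 0x1.8 * 2^1 = 3 exactly */
    status = bf_atof(&r, "0x1.8p+1", &next, 0, 53, BF_RNDN);
    bf_delete(&r);
}
#endif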
/* base conversion to radix */

#if LIMB_BITS == 64
#define RADIXL_10 UINT64_C(10000000000000000000)
#else
#define RADIXL_10 UINT64_C(1000000000)
#endif

static const uint32_t inv_log2_radix[BF_RADIX_MAX - 1][LIMB_BITS / 32 + 1] = {
#if LIMB_BITS == 32
    { 0x80000000, 0x00000000,},
    { 0x50c24e60, 0xd4d4f4a7,},
    { 0x40000000, 0x00000000,},
    { 0x372068d2, 0x0a1ee5ca,},
    { 0x3184648d, 0xb8153e7a,},
    { 0x2d983275, 0x9d5369c4,},
    { 0x2aaaaaaa, 0xaaaaaaab,},
    { 0x28612730, 0x6a6a7a54,},
    { 0x268826a1, 0x3ef3fde6,},
    { 0x25001383, 0xbac8a744,},
    { 0x23b46706, 0x82c0c709,},
    { 0x229729f1, 0xb2c83ded,},
    { 0x219e7ffd, 0xa5ad572b,},
    { 0x20c33b88, 0xda7c29ab,},
    { 0x20000000, 0x00000000,},
    { 0x1f50b57e, 0xac5884b3,},
    { 0x1eb22cc6, 0x8aa6e26f,},
    { 0x1e21e118, 0x0c5daab2,},
    { 0x1d9dcd21, 0x439834e4,},
    { 0x1d244c78, 0x367a0d65,},
    { 0x1cb40589, 0xac173e0c,},
    { 0x1c4bd95b, 0xa8d72b0d,},
    { 0x1bead768, 0x98f8ce4c,},
    { 0x1b903469, 0x050f72e5,},
    { 0x1b3b433f, 0x2eb06f15,},
    { 0x1aeb6f75, 0x9c46fc38,},
    { 0x1aa038eb, 0x0e3bfd17,},
    { 0x1a593062, 0xb38d8c56,},
    { 0x1a15f4c3, 0x2b95a2e6,},
    { 0x19d630dc, 0xcc7ddef9,},
    { 0x19999999, 0x9999999a,},
    { 0x195fec80, 0x8a609431,},
    { 0x1928ee7b, 0x0b4f22f9,},
    { 0x18f46acf, 0x8c06e318,},
    { 0x18c23246, 0xdc0a9f3d,},
#else
    { 0x80000000, 0x00000000, 0x00000000,},
    { 0x50c24e60, 0xd4d4f4a7, 0x021f57bc,},
    { 0x40000000, 0x00000000, 0x00000000,},
    { 0x372068d2, 0x0a1ee5ca, 0x19ea911b,},
    { 0x3184648d, 0xb8153e7a, 0x7fc2d2e1,},
    { 0x2d983275, 0x9d5369c4, 0x4dec1661,},
    { 0x2aaaaaaa, 0xaaaaaaaa, 0xaaaaaaab,},
    { 0x28612730, 0x6a6a7a53, 0x810fabde,},
    { 0x268826a1, 0x3ef3fde6, 0x23e2566b,},
    { 0x25001383, 0xbac8a744, 0x385a3349,},
    { 0x23b46706, 0x82c0c709, 0x3f891718,},
    { 0x229729f1, 0xb2c83ded, 0x15fba800,},
    { 0x219e7ffd, 0xa5ad572a, 0xe169744b,},
    { 0x20c33b88, 0xda7c29aa, 0x9bddee52,},
    { 0x20000000, 0x00000000, 0x00000000,},
    { 0x1f50b57e, 0xac5884b3, 0x70e28eee,},
    { 0x1eb22cc6, 0x8aa6e26f, 0x06d1a2a2,},
    { 0x1e21e118, 0x0c5daab1, 0x81b4f4bf,},
    { 0x1d9dcd21, 0x439834e3, 0x81667575,},
    { 0x1d244c78, 0x367a0d64, 0xc8204d6d,},
    { 0x1cb40589, 0xac173e0c, 0x3b7b16ba,},
    { 0x1c4bd95b, 0xa8d72b0d, 0x5879f25a,},
    { 0x1bead768, 0x98f8ce4c, 0x66cc2858,},
    { 0x1b903469, 0x050f72e5, 0x0cf5488e,},
    { 0x1b3b433f, 0x2eb06f14, 0x8c89719c,},
    { 0x1aeb6f75, 0x9c46fc37, 0xab5fc7e9,},
    { 0x1aa038eb, 0x0e3bfd17, 0x1bd62080,},
    { 0x1a593062, 0xb38d8c56, 0x7998ab45,},
    { 0x1a15f4c3, 0x2b95a2e6, 0x46aed6a0,},
    { 0x19d630dc, 0xcc7ddef9, 0x5aadd61b,},
    { 0x19999999, 0x99999999, 0x9999999a,},
    { 0x195fec80, 0x8a609430, 0xe1106014,},
    { 0x1928ee7b, 0x0b4f22f9, 0x5f69791d,},
    { 0x18f46acf, 0x8c06e318, 0x4d2aeb2c,},
    { 0x18c23246, 0xdc0a9f3d, 0x3fe16970,},
#endif
};
static const limb_t log2_radix[BF_RADIX_MAX - 1] = {
#if LIMB_BITS == 32
    0x20000000,
    0x32b80347,
    0x40000000,
    0x4a4d3c26,
    0x52b80347,
    0x59d5d9fd,
    0x60000000,
    0x6570068e,
    0x6a4d3c26,
    0x6eb3a9f0,
    0x72b80347,
    0x766a008e,
    0x79d5d9fd,
    0x7d053f6d,
    0x80000000,
    0x82cc7edf,
    0x8570068e,
    0x87ef05ae,
    0x8a4d3c26,
    0x8c8ddd45,
    0x8eb3a9f0,
    0x90c10501,
    0x92b80347,
    0x949a784c,
    0x966a008e,
    0x982809d6,
    0x99d5d9fd,
    0x9b74948f,
    0x9d053f6d,
    0x9e88c6b3,
    0xa0000000,
    0xa16bad37,
    0xa2cc7edf,
    0xa4231623,
    0xa570068e,
#else
    0x2000000000000000,
    0x32b803473f7ad0f4,
    0x4000000000000000,
    0x4a4d3c25e68dc57f,
    0x52b803473f7ad0f4,
    0x59d5d9fd5010b366,
    0x6000000000000000,
    0x6570068e7ef5a1e8,
    0x6a4d3c25e68dc57f,
    0x6eb3a9f01975077f,
    0x72b803473f7ad0f4,
    0x766a008e4788cbcd,
    0x79d5d9fd5010b366,
    0x7d053f6d26089673,
    0x8000000000000000,
    0x82cc7edf592262d0,
    0x8570068e7ef5a1e8,
    0x87ef05ae409a0289,
    0x8a4d3c25e68dc57f,
    0x8c8ddd448f8b845a,
    0x8eb3a9f01975077f,
    0x90c10500d63aa659,
    0x92b803473f7ad0f4,
    0x949a784bcd1b8afe,
    0x966a008e4788cbcd,
    0x982809d5be7072dc,
    0x99d5d9fd5010b366,
    0x9b74948f5532da4b,
    0x9d053f6d26089673,
    0x9e88c6b3626a72aa,
    0xa000000000000000,
    0xa16bad3758efd873,
    0xa2cc7edf592262d0,
    0xa4231623369e78e6,
    0xa570068e7ef5a1e8,
#endif
};

/* compute floor(a*b) or ceil(a*b) with b = log2(radix) or
   b = 1/log2(radix). For is_inv = 0, strict accuracy is not guaranteed
   when radix is not a power of two. */
slimb_t bf_mul_log2_radix(slimb_t a1, unsigned int radix, int is_inv,
                          int is_ceil1)
{
    int is_neg;
    limb_t a;
    BOOL is_ceil;

    is_ceil = is_ceil1;
    a = a1;
    if (a1 < 0) {
        a = -a;
        is_neg = 1;
    } else {
        is_neg = 0;
    }
    is_ceil ^= is_neg;
    if ((radix & (radix - 1)) == 0) {
        int radix_bits;
        /* radix is a power of two */
        radix_bits = ceil_log2(radix);
        if (is_inv) {
            if (is_ceil)
                a += radix_bits - 1;
            a = a / radix_bits;
        } else {
            a = a * radix_bits;
        }
    } else {
        const uint32_t *tab;
        limb_t b0, b1;
        dlimb_t t;

        if (is_inv) {
            tab = inv_log2_radix[radix - 2];
#if LIMB_BITS == 32
            b1 = tab[0];
            b0 = tab[1];
#else
            b1 = ((limb_t)tab[0] << 32) | tab[1];
            b0 = (limb_t)tab[2] << 32;
#endif
            t = (dlimb_t)b0 * (dlimb_t)a;
            t = (dlimb_t)b1 * (dlimb_t)a + (t >> LIMB_BITS);
            a = t >> (LIMB_BITS - 1);
        } else {
            b0 = log2_radix[radix - 2];
            t = (dlimb_t)b0 * (dlimb_t)a;
            a = t >> (LIMB_BITS - 3);
        }
        /* a = floor(result) and 'result' cannot be an integer */
        a += is_ceil;
    }
    if (is_neg)
        a = -a;
    return a;
}
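/* Editor's note (illustrative, not part of the original source):
   the tables above hold log2(radix) in fixed point with LIMB_BITS - 3
   fractional bits, and 1/log2(radix) with LIMB_BITS - 1 fractional
   bits in the top limb plus a 32-bit tail, so the multiply-and-shift
   above computes floor/ceil without any floating point. E.g. the
   number of decimal digits needed to hold a 53 bit mantissa:

       bf_mul_log2_radix(53, 10, TRUE, TRUE) = ceil(53 * log10(2)) = 16

   which is how bf_ftoa_internal() derives 'n_digits' from 'prec'. */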
/* 'n' is the number of output limbs */
static int bf_integer_to_radix_rec(bf_t *pow_tab,
                                   limb_t *out, const bf_t *a, limb_t n,
                                   int level, limb_t n0, limb_t radixl,
                                   unsigned int radixl_bits)
{
    limb_t n1, n2, q_prec;
    int ret;

    assert(n >= 1);
    if (n == 1) {
        out[0] = get_bits(a->tab, a->len, a->len * LIMB_BITS - a->expn);
    } else if (n == 2) {
        dlimb_t t;
        slimb_t pos;
        pos = a->len * LIMB_BITS - a->expn;
        t = ((dlimb_t)get_bits(a->tab, a->len, pos + LIMB_BITS) << LIMB_BITS) |
            get_bits(a->tab, a->len, pos);
        if (likely(radixl == RADIXL_10)) {
            /* use division by a constant when possible */
            out[0] = t % RADIXL_10;
            out[1] = t / RADIXL_10;
        } else {
            out[0] = t % radixl;
            out[1] = t / radixl;
        }
    } else {
        bf_t Q, R, *B, *B_inv;
        int q_add;
        bf_init(a->ctx, &Q);
        bf_init(a->ctx, &R);
        n2 = (((n0 * 2) >> (level + 1)) + 1) / 2;
        n1 = n - n2;
        B = &pow_tab[2 * level];
        B_inv = &pow_tab[2 * level + 1];
        ret = 0;
        if (B->len == 0) {
            /* compute BASE^n2 */
            ret |= bf_pow_ui_ui(B, radixl, n2, BF_PREC_INF, BF_RNDZ);
            /* we use enough bits for the maximum possible 'n1' value,
               i.e. n2 + 1 */
            ret |= bf_set_ui(&R, 1);
            ret |= bf_div(B_inv, &R, B, (n2 + 1) * radixl_bits + 2, BF_RNDN);
        }
        //        printf("%d: n1=% " PRId64 " n2=%" PRId64 "\n", level, n1, n2);
        q_prec = n1 * radixl_bits;
        ret |= bf_mul(&Q, a, B_inv, q_prec, BF_RNDN);
        ret |= bf_rint(&Q, BF_RNDZ);

        ret |= bf_mul(&R, &Q, B, BF_PREC_INF, BF_RNDZ);
        ret |= bf_sub(&R, a, &R, BF_PREC_INF, BF_RNDZ);

        if (ret & BF_ST_MEM_ERROR)
            goto fail;
        /* adjust if necessary */
        q_add = 0;
        while (R.sign && R.len != 0) {
            if (bf_add(&R, &R, B, BF_PREC_INF, BF_RNDZ))
                goto fail;
            q_add--;
        }
        while (bf_cmpu(&R, B) >= 0) {
            if (bf_sub(&R, &R, B, BF_PREC_INF, BF_RNDZ))
                goto fail;
            q_add++;
        }
        if (q_add != 0) {
            if (bf_add_si(&Q, &Q, q_add, BF_PREC_INF, BF_RNDZ))
                goto fail;
        }
        if (bf_integer_to_radix_rec(pow_tab, out + n2, &Q, n1, level + 1, n0,
                                    radixl, radixl_bits))
            goto fail;
        if (bf_integer_to_radix_rec(pow_tab, out, &R, n2, level + 1, n0,
                                    radixl, radixl_bits)) {
        fail:
            bf_delete(&Q);
            bf_delete(&R);
            return -1;
        }
        bf_delete(&Q);
        bf_delete(&R);
    }
    return 0;
}

/* return 0 if OK, != 0 if memory error */
static int bf_integer_to_radix(bf_t *r, const bf_t *a, limb_t radixl)
{
    bf_context_t *s = r->ctx;
    limb_t r_len;
    bf_t *pow_tab;
    int i, pow_tab_len, ret;

    r_len = r->len;
    pow_tab_len = (ceil_log2(r_len) + 2) * 2; /* XXX: check */
    pow_tab = bf_malloc(s, sizeof(pow_tab[0]) * pow_tab_len);
    if (!pow_tab)
        return -1;
    for(i = 0; i < pow_tab_len; i++)
        bf_init(r->ctx, &pow_tab[i]);

    ret = bf_integer_to_radix_rec(pow_tab, r->tab, a, r_len, 0, r_len, radixl,
                                  ceil_log2(radixl));

    for(i = 0; i < pow_tab_len; i++) {
        bf_delete(&pow_tab[i]);
    }
    bf_free(s, pow_tab);
    return ret;
}
/* a must be >= 0. 'P' is the number of digits wanted in radix
   'radix'. 'r' is the mantissa represented as an integer. *pE
   contains the exponent. Return != 0 if memory error. */
static int bf_convert_to_radix(bf_t *r, slimb_t *pE,
                               const bf_t *a, int radix,
                               limb_t P, bf_rnd_t rnd_mode,
                               BOOL is_fixed_exponent)
{
    slimb_t E, e, prec, extra_bits, ziv_extra_bits, prec0;
    bf_t B_s, *B = &B_s;
    int e_sign, ret, res;

    if (a->len == 0) {
        /* zero case */
        *pE = 0;
        return bf_set(r, a);
    }

    if (is_fixed_exponent) {
        E = *pE;
    } else {
        /* compute the new exponent */
        E = 1 + bf_mul_log2_radix(a->expn - 1, radix, TRUE, FALSE);
    }
    //    bf_print_str("a", a);
    //    printf("E=%ld P=%ld radix=%d\n", E, P, radix);

    for(;;) {
        e = P - E;
        e_sign = 0;
        if (e < 0) {
            e = -e;
            e_sign = 1;
        }
        /* Note: precision for log2(radix) is not critical here */
        prec0 = bf_mul_log2_radix(P, radix, FALSE, TRUE);
        ziv_extra_bits = 16;
        for(;;) {
            prec = prec0 + ziv_extra_bits;
            /* XXX: rigorous error analysis needed */
            extra_bits = ceil_log2(e) * 2 + 1;
            ret = bf_pow_ui_ui(r, radix, e, prec + extra_bits,
                               BF_RNDN | BF_FLAG_EXT_EXP);
            if (!e_sign)
                ret |= bf_mul(r, r, a, prec + extra_bits,
                              BF_RNDN | BF_FLAG_EXT_EXP);
            else
                ret |= bf_div(r, a, r, prec + extra_bits,
                              BF_RNDN | BF_FLAG_EXT_EXP);
            if (ret & BF_ST_MEM_ERROR)
                return BF_ST_MEM_ERROR;
            /* if the result is not exact, check that it can be safely
               rounded to an integer */
            if ((ret & BF_ST_INEXACT) &&
                !bf_can_round(r, r->expn, rnd_mode, prec)) {
                /* add more precision and retry */
                ziv_extra_bits = ziv_extra_bits + (ziv_extra_bits / 2);
                continue;
            } else {
                ret = bf_rint(r, rnd_mode);
                if (ret & BF_ST_MEM_ERROR)
                    return BF_ST_MEM_ERROR;
                break;
            }
        }
        if (is_fixed_exponent)
            break;
        /* check that the result is < B^P */
        /* XXX: do a fast approximate test first ? */
        bf_init(r->ctx, B);
        ret = bf_pow_ui_ui(B, radix, P, BF_PREC_INF, BF_RNDZ);
        if (ret) {
            bf_delete(B);
            return ret;
        }
        res = bf_cmpu(r, B);
        bf_delete(B);
        if (res < 0)
            break;
        /* try a larger exponent */
        E++;
    }
    *pE = E;
    return 0;
}

static void limb_to_a(char *buf, limb_t n, unsigned int radix, int len)
{
    int digit, i;

    if (radix == 10) {
        /* specific case with constant divisor */
        for(i = len - 1; i >= 0; i--) {
            digit = (limb_t)n % 10;
            n = (limb_t)n / 10;
            buf[i] = digit + '0';
        }
    } else {
        for(i = len - 1; i >= 0; i--) {
            digit = (limb_t)n % radix;
            n = (limb_t)n / radix;
            if (digit < 10)
                digit += '0';
            else
                digit += 'a' - 10;
            buf[i] = digit;
        }
    }
}

/* for power of 2 radixes */
static void limb_to_a2(char *buf, limb_t n, unsigned int radix_bits, int len)
{
    int digit, i;
    unsigned int mask;

    mask = (1 << radix_bits) - 1;
    for(i = len - 1; i >= 0; i--) {
        digit = n & mask;
        n >>= radix_bits;
        if (digit < 10)
            digit += '0';
        else
            digit += 'a' - 10;
        buf[i] = digit;
    }
}

/* 'a' must be an integer if is_dec = FALSE or if the radix is not
   a power of two. A dot is added before the 'dot_pos' digit. dot_pos
   = n_digits does not display the dot. 0 <= dot_pos <= n_digits.
   n_digits >= 1. */
static void output_digits(DynBuf *s, const bf_t *a1, int radix, limb_t n_digits,
                          limb_t dot_pos, BOOL is_dec)
{
    limb_t i, v, l;
    slimb_t pos, pos_incr;
    int digits_per_limb, buf_pos, radix_bits, first_buf_pos;
    char buf[65];
    bf_t a_s, *a;

    if (is_dec) {
        digits_per_limb = LIMB_DIGITS;
        a = (bf_t *)a1;
        radix_bits = 0;
        pos = a->len;
        pos_incr = 1;
        first_buf_pos = 0;
    } else if ((radix & (radix - 1)) == 0) {
        a = (bf_t *)a1;
        radix_bits = ceil_log2(radix);
        digits_per_limb = LIMB_BITS / radix_bits;
        pos_incr = digits_per_limb * radix_bits;
        /* digits are aligned relative to the radix point */
        pos = a->len * LIMB_BITS + smod(-a->expn, radix_bits);
        first_buf_pos = 0;
    } else {
        limb_t n, radixl;

        digits_per_limb = digits_per_limb_table[radix - 2];
        radixl = get_limb_radix(radix);
        a = &a_s;
        bf_init(a1->ctx, a);
        n = (n_digits + digits_per_limb - 1) / digits_per_limb;
        if (bf_resize(a, n)) {
            dbuf_set_error(s);
            goto done;
        }
        if (bf_integer_to_radix(a, a1, radixl)) {
            dbuf_set_error(s);
            goto done;
        }
        radix_bits = 0;
        pos = n;
        pos_incr = 1;
        first_buf_pos = pos * digits_per_limb - n_digits;
    }
    buf_pos = digits_per_limb;
    i = 0;
    while (i < n_digits) {
        if (buf_pos == digits_per_limb) {
            pos -= pos_incr;
            if (radix_bits == 0) {
                v = get_limbz(a, pos);
                limb_to_a(buf, v, radix, digits_per_limb);
            } else {
                v = get_bits(a->tab, a->len, pos);
                limb_to_a2(buf, v, radix_bits, digits_per_limb);
            }
            buf_pos = first_buf_pos;
            first_buf_pos = 0;
        }
        if (i < dot_pos) {
            l = dot_pos;
        } else {
            if (i == dot_pos)
                dbuf_putc(s, '.');
            l = n_digits;
        }
        l = bf_min(digits_per_limb - buf_pos, l - i);
        dbuf_put(s, (uint8_t *)(buf + buf_pos), l);
        buf_pos += l;
        i += l;
    }
 done:
    if (a != a1)
        bf_delete(a);
}

static void *bf_dbuf_realloc(void *opaque, void *ptr, size_t size)
{
    bf_context_t *s = opaque;
    return bf_realloc(s, ptr, size);
}

/* return the length in bytes. A trailing '\0' is added */
static char *bf_ftoa_internal(size_t *plen, const bf_t *a2, int radix,
                              limb_t prec, bf_flags_t flags, BOOL is_dec)
{
    bf_context_t *ctx = a2->ctx;
    DynBuf s_s, *s = &s_s;
    int radix_bits;

    //    bf_print_str("ftoa", a2);
    //    printf("radix=%d\n", radix);
    dbuf_init2(s, ctx, bf_dbuf_realloc);
    if (a2->expn == BF_EXP_NAN) {
        dbuf_putstr(s, "NaN");
    } else {
        if (a2->sign)
            dbuf_putc(s, '-');
        if (a2->expn == BF_EXP_INF) {
            if (flags & BF_FTOA_JS_QUIRKS)
                dbuf_putstr(s, "Infinity");
            else
                dbuf_putstr(s, "Inf");
        } else {
            int fmt, ret;
            slimb_t n_digits, n, i, n_max, n1;
            bf_t a1_s, *a1 = &a1_s;

            if ((radix & (radix - 1)) != 0)
                radix_bits = 0;
            else
                radix_bits = ceil_log2(radix);

            fmt = flags & BF_FTOA_FORMAT_MASK;
            bf_init(ctx, a1);
            if (fmt == BF_FTOA_FORMAT_FRAC) {
                if (is_dec || radix_bits != 0) {
                    if (bf_set(a1, a2))
                        goto fail1;
#ifdef USE_BF_DEC
                    if (is_dec) {
                        if (bfdec_round((bfdec_t *)a1, prec, (flags & BF_RND_MASK) | BF_FLAG_RADPNT_PREC) & BF_ST_MEM_ERROR)
                            goto fail1;
                        n = a1->expn;
                    } else
#endif
                    {
                        if (bf_round(a1, prec * radix_bits, (flags & BF_RND_MASK) | BF_FLAG_RADPNT_PREC) & BF_ST_MEM_ERROR)
                            goto fail1;
                        n = ceil_div(a1->expn, radix_bits);
                    }
                    if (flags & BF_FTOA_ADD_PREFIX) {
                        if (radix == 16)
                            dbuf_putstr(s, "0x");
                        else if (radix == 8)
                            dbuf_putstr(s, "0o");
                        else if (radix == 2)
                            dbuf_putstr(s, "0b");
                    }
                    if (a1->expn == BF_EXP_ZERO) {
                        dbuf_putstr(s, "0");
                        if (prec > 0) {
                            dbuf_putstr(s, ".");
                            for(i = 0; i < prec; i++) {
                                dbuf_putc(s, '0');
                            }
                        }
                    } else {
                        n_digits = prec + n;
                        if (n <= 0) {
                            /* 0.x */
                            dbuf_putstr(s, "0.");
                            for(i = 0; i < -n; i++) {
                                dbuf_putc(s, '0');
                            }
                            if (n_digits > 0) {
                                output_digits(s, a1, radix, n_digits, n_digits, is_dec);
                            }
                        } else {
                            output_digits(s, a1, radix, n_digits, n, is_dec);
                        }
                    }
                } else {
                    size_t pos, start;
                    bf_t a_s, *a = &a_s;

                    /* make a positive number */
                    a->tab = a2->tab;
                    a->len = a2->len;
                    a->expn = a2->expn;
                    a->sign = 0;

                    /* one more digit for the rounding */
                    n = 1 + bf_mul_log2_radix(bf_max(a->expn, 0), radix, TRUE, TRUE);
                    n_digits = n + prec;
                    n1 = n;
                    if (bf_convert_to_radix(a1, &n1, a, radix, n_digits,
                                            flags & BF_RND_MASK, TRUE))
                        goto fail1;
                    start = s->size;
                    output_digits(s, a1, radix, n_digits, n, is_dec);
                    /* remove leading zeros because we allocated one more digit */
                    pos = start;
                    while ((pos + 1) < s->size && s->buf[pos] == '0' &&
                           s->buf[pos + 1] != '.')
                        pos++;
                    if (pos > start) {
                        memmove(s->buf + start, s->buf + pos, s->size - pos);
                        s->size -= (pos - start);
                    }
                }
            } else {
#ifdef USE_BF_DEC
                if (is_dec) {
                    if (bf_set(a1, a2))
                        goto fail1;
                    if (fmt == BF_FTOA_FORMAT_FIXED) {
                        n_digits = prec;
                        n_max = n_digits;
                        if (bfdec_round((bfdec_t *)a1, prec, (flags & BF_RND_MASK)) & BF_ST_MEM_ERROR)
                            goto fail1;
                    } else {
                        /* prec is ignored */
                        prec = n_digits = a1->len * LIMB_DIGITS;
                        /* remove the trailing zero digits */
                        while (n_digits > 1 &&
                               get_digit(a1->tab, a1->len, prec - n_digits) == 0) {
                            n_digits--;
                        }
                        n_max = n_digits + 4;
                    }
                    n = a1->expn;
                } else
#endif
                if (radix_bits != 0) {
                    if (bf_set(a1, a2))
                        goto fail1;
                    if (fmt == BF_FTOA_FORMAT_FIXED) {
                        slimb_t prec_bits;
                        n_digits = prec;
                        n_max = n_digits;
                        /* align to the radix point */
                        prec_bits = prec * radix_bits -
                            smod(-a1->expn, radix_bits);
                        if (bf_round(a1, prec_bits,
                                     (flags & BF_RND_MASK)) & BF_ST_MEM_ERROR)
                            goto fail1;
                    } else {
                        limb_t digit_mask;
                        slimb_t pos;
                        /* position of the digit before the most
                           significant digit in bits */
                        pos = a1->len * LIMB_BITS +
                            smod(-a1->expn, radix_bits);
                        n_digits = ceil_div(pos, radix_bits);
                        /* remove the trailing zero digits */
                        digit_mask = ((limb_t)1 << radix_bits) - 1;
                        while (n_digits > 1 &&
                               (get_bits(a1->tab, a1->len, pos - n_digits * radix_bits) & digit_mask) == 0) {
                            n_digits--;
                        }
                        n_max = n_digits + 4;
                    }
                    n = ceil_div(a1->expn, radix_bits);
                } else {
                    bf_t a_s, *a = &a_s;

                    /* make a positive number */
                    a->tab = a2->tab;
                    a->len = a2->len;
                    a->expn = a2->expn;
                    a->sign = 0;

                    if (fmt == BF_FTOA_FORMAT_FIXED) {
                        n_digits = prec;
                        n_max = n_digits;
                    } else {
                        slimb_t n_digits_max, n_digits_min;

                        assert(prec != BF_PREC_INF);
                        n_digits = 1 + bf_mul_log2_radix(prec, radix, TRUE, TRUE);
                        /* max number of digits for non exponential
                           notation. The rationale is to have the same rule
                           as JS, i.e. n_max = 21 for a 64 bit float in base 10. */
                        n_max = n_digits + 4;
                        if (fmt == BF_FTOA_FORMAT_FREE_MIN) {
                            bf_t b_s, *b = &b_s;

                            /* find the minimum number of digits by
                               dichotomy. */
                            /* XXX: inefficient */
                            n_digits_max = n_digits;
                            n_digits_min = 1;
                            bf_init(ctx, b);
                            while (n_digits_min < n_digits_max) {
                                n_digits = (n_digits_min + n_digits_max) / 2;
                                if (bf_convert_to_radix(a1, &n, a, radix, n_digits,
                                                        flags & BF_RND_MASK, FALSE)) {
                                    bf_delete(b);
                                    goto fail1;
                                }
                                /* convert back to a number and compare */
                                ret = bf_mul_pow_radix(b, a1, radix, n - n_digits,
                                                       prec,
                                                       (flags & ~BF_RND_MASK) |
                                                       BF_RNDN);
                                if (ret & BF_ST_MEM_ERROR) {
                                    bf_delete(b);
                                    goto fail1;
                                }
                                if (bf_cmpu(b, a) == 0) {
                                    n_digits_max = n_digits;
                                } else {
                                    n_digits_min = n_digits + 1;
                                }
                            }
                            bf_delete(b);
                            n_digits = n_digits_max;
                        }
                    }
                    if (bf_convert_to_radix(a1, &n, a, radix, n_digits,
                                            flags & BF_RND_MASK, FALSE)) {
                    fail1:
                        bf_delete(a1);
                        goto fail;
                    }
                }
                if (a1->expn == BF_EXP_ZERO &&
                    fmt != BF_FTOA_FORMAT_FIXED &&
                    !(flags & BF_FTOA_FORCE_EXP)) {
                    /* just output zero */
                    dbuf_putstr(s, "0");
                } else {
                    if (flags & BF_FTOA_ADD_PREFIX) {
                        if (radix == 16)
                            dbuf_putstr(s, "0x");
                        else if (radix == 8)
                            dbuf_putstr(s, "0o");
                        else if (radix == 2)
                            dbuf_putstr(s, "0b");
                    }
                    if (a1->expn == BF_EXP_ZERO)
                        n = 1;
                    if ((flags & BF_FTOA_FORCE_EXP) ||
                        n <= -6 || n > n_max) {
                        const char *fmt;
                        /* exponential notation */
                        output_digits(s, a1, radix, n_digits, 1, is_dec);
                        if (radix_bits != 0 && radix <= 16) {
                            if (flags & BF_FTOA_JS_QUIRKS)
                                fmt = "p%+" PRId_LIMB;
                            else
                                fmt = "p%" PRId_LIMB;
                            dbuf_printf(s, fmt, (n - 1) * radix_bits);
                        } else {
                            if (flags & BF_FTOA_JS_QUIRKS)
                                fmt = "%c%+" PRId_LIMB;
                            else
                                fmt = "%c%" PRId_LIMB;
                            dbuf_printf(s, fmt,
                                        radix <= 10 ? 'e' : '@', n - 1);
                        }
                    } else if (n <= 0) {
                        /* 0.x */
                        dbuf_putstr(s, "0.");
                        for(i = 0; i < -n; i++) {
                            dbuf_putc(s, '0');
                        }
                        output_digits(s, a1, radix, n_digits, n_digits, is_dec);
                    } else {
                        if (n_digits <= n) {
                            /* no dot */
                            output_digits(s, a1, radix, n_digits, n_digits, is_dec);
                            for(i = 0; i < (n - n_digits); i++)
                                dbuf_putc(s, '0');
                        } else {
                            output_digits(s, a1, radix, n_digits, n, is_dec);
                        }
                    }
                }
            }
            bf_delete(a1);
        }
    }
    dbuf_putc(s, '\0');
    if (dbuf_error(s))
        goto fail;
    if (plen)
        *plen = s->size - 1;
    return (char *)s->buf;
 fail:
    bf_free(ctx, s->buf);
    if (plen)
        *plen = 0;
    return NULL;
}

char *bf_ftoa(size_t *plen, const bf_t *a, int radix, limb_t prec,
              bf_flags_t flags)
{
    return bf_ftoa_internal(plen, a, radix, prec, flags, FALSE);
}
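/* Editor's note (illustrative, not part of the original source):
   BF_FTOA_FORMAT_FREE_MIN searches for the shortest digit string that
   converts back to the same number at precision 'prec'. A minimal
   sketch, assuming a bf_context_t *ctx: */
#if 0
{
    bf_t x;
    size_t len;
    char *str;
    bf_init(ctx, &x);
    bf_set_float64(&x, 0.1);
    str = bf_ftoa(&len, &x, 10, 53, BF_FTOA_FORMAT_FREE_MIN | BF_RNDN);
    /* str == "0.1": the shortest decimal that rounds back to the same
       53-bit value; this is how QuickJS prints JS numbers */
    bf_free(ctx, str);
    bf_delete(&x);
}
#endif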
/***************************************************************/
/* transcendental functions */

/* Note: the algorithm is from MPFR */
static void bf_const_log2_rec(bf_t *T, bf_t *P, bf_t *Q, limb_t n1,
                              limb_t n2, BOOL need_P)
{
    bf_context_t *s = T->ctx;
    if ((n2 - n1) == 1) {
        if (n1 == 0) {
            bf_set_ui(P, 3);
        } else {
            bf_set_ui(P, n1);
            P->sign = 1;
        }
        bf_set_ui(Q, 2 * n1 + 1);
        Q->expn += 2;
        bf_set(T, P);
    } else {
        limb_t m;
        bf_t T1_s, *T1 = &T1_s;
        bf_t P1_s, *P1 = &P1_s;
        bf_t Q1_s, *Q1 = &Q1_s;

        m = n1 + ((n2 - n1) >> 1);
        bf_const_log2_rec(T, P, Q, n1, m, TRUE);
        bf_init(s, T1);
        bf_init(s, P1);
        bf_init(s, Q1);
        bf_const_log2_rec(T1, P1, Q1, m, n2, need_P);
        bf_mul(T, T, Q1, BF_PREC_INF, BF_RNDZ);
        bf_mul(T1, T1, P, BF_PREC_INF, BF_RNDZ);
        bf_add(T, T, T1, BF_PREC_INF, BF_RNDZ);
        if (need_P)
            bf_mul(P, P, P1, BF_PREC_INF, BF_RNDZ);
        bf_mul(Q, Q, Q1, BF_PREC_INF, BF_RNDZ);
        bf_delete(T1);
        bf_delete(P1);
        bf_delete(Q1);
    }
}

/* compute log(2) with faithful rounding at precision 'prec' */
static void bf_const_log2_internal(bf_t *T, limb_t prec)
{
    limb_t w, N;
    bf_t P_s, *P = &P_s;
    bf_t Q_s, *Q = &Q_s;

    w = prec + 15;
    N = w / 3 + 1;
    bf_init(T->ctx, P);
    bf_init(T->ctx, Q);
    bf_const_log2_rec(T, P, Q, 0, N, FALSE);
    bf_div(T, T, Q, prec, BF_RNDN);
    bf_delete(P);
    bf_delete(Q);
}

/* PI constant */

#define CHUD_A 13591409
#define CHUD_B 545140134
#define CHUD_C 640320
#define CHUD_BITS_PER_TERM 47

static void chud_bs(bf_t *P, bf_t *Q, bf_t *G, int64_t a, int64_t b, int need_g,
                    limb_t prec)
{
    bf_context_t *s = P->ctx;
    int64_t c;

    if (a == (b - 1)) {
        bf_t T0, T1;

        bf_init(s, &T0);
        bf_init(s, &T1);
        bf_set_ui(G, 2 * b - 1);
        bf_mul_ui(G, G, 6 * b - 1, prec, BF_RNDN);
        bf_mul_ui(G, G, 6 * b - 5, prec, BF_RNDN);
        bf_set_ui(&T0, CHUD_B);
        bf_mul_ui(&T0, &T0, b, prec, BF_RNDN);
        bf_set_ui(&T1, CHUD_A);
        bf_add(&T0, &T0, &T1, prec, BF_RNDN);
        bf_mul(P, G, &T0, prec, BF_RNDN);
        P->sign = b & 1;

        bf_set_ui(Q, b);
        bf_mul_ui(Q, Q, b, prec, BF_RNDN);
        bf_mul_ui(Q, Q, b, prec, BF_RNDN);
        bf_mul_ui(Q, Q, (uint64_t)CHUD_C * CHUD_C * CHUD_C / 24, prec, BF_RNDN);
        bf_delete(&T0);
        bf_delete(&T1);
    } else {
        bf_t P2, Q2, G2;

        bf_init(s, &P2);
        bf_init(s, &Q2);
        bf_init(s, &G2);

        c = (a + b) / 2;
        chud_bs(P, Q, G, a, c, 1, prec);
        chud_bs(&P2, &Q2, &G2, c, b, need_g, prec);

        /* Q = Q1 * Q2 */
        /* G = G1 * G2 */
        /* P = P1 * Q2 + P2 * G1 */
        bf_mul(&P2, &P2, G, prec, BF_RNDN);
        if (!need_g)
            bf_set_ui(G, 0);
        bf_mul(P, P, &Q2, prec, BF_RNDN);
        bf_add(P, P, &P2, prec, BF_RNDN);
        bf_delete(&P2);

        bf_mul(Q, Q, &Q2, prec, BF_RNDN);
        bf_delete(&Q2);
        if (need_g)
            bf_mul(G, G, &G2, prec, BF_RNDN);
        bf_delete(&G2);
    }
}

/* compute Pi with faithful rounding at precision 'prec' using the
   Chudnovsky formula */
static void bf_const_pi_internal(bf_t *Q, limb_t prec)
{
    bf_context_t *s = Q->ctx;
    int64_t n, prec1;
    bf_t P, G;

    /* number of series terms */
    n = prec / CHUD_BITS_PER_TERM + 1;
    /* XXX: precision analysis */
    prec1 = prec + 32;

    bf_init(s, &P);
    bf_init(s, &G);

    chud_bs(&P, Q, &G, 0, n, 0, BF_PREC_INF);

    bf_mul_ui(&G, Q, CHUD_A, prec1, BF_RNDN);
    bf_add(&P, &G, &P, prec1, BF_RNDN);
    bf_div(Q, Q, &P, prec1, BF_RNDF);

    bf_set_ui(&P, CHUD_C);
    bf_sqrt(&G, &P, prec1, BF_RNDF);
    bf_mul_ui(&G, &G, (uint64_t)CHUD_C / 12, prec1, BF_RNDF);
    bf_mul(Q, Q, &G, prec, BF_RNDN);
    bf_delete(&P);
    bf_delete(&G);
}

static int bf_const_get(bf_t *T, limb_t prec, bf_flags_t flags,
                        BFConstCache *c,
                        void (*func)(bf_t *res, limb_t prec), int sign)
{
    limb_t ziv_extra_bits, prec1;

    ziv_extra_bits = 32;
    for(;;) {
        prec1 = prec + ziv_extra_bits;
        if (c->prec < prec1) {
            if (c->val.len == 0)
                bf_init(T->ctx, &c->val);
            func(&c->val, prec1);
            c->prec = prec1;
        } else {
            prec1 = c->prec;
        }
        bf_set(T, &c->val);
        T->sign = sign;
        if (!bf_can_round(T, prec, flags & BF_RND_MASK, prec1)) {
            /* add more precision and retry */
            ziv_extra_bits = ziv_extra_bits + (ziv_extra_bits / 2);
        } else {
            break;
        }
    }
    return bf_round(T, prec, flags);
}

static void bf_const_free(BFConstCache *c)
{
    bf_delete(&c->val);
    memset(c, 0, sizeof(*c));
}

int bf_const_log2(bf_t *T, limb_t prec, bf_flags_t flags)
{
    bf_context_t *s = T->ctx;
    return bf_const_get(T, prec, flags, &s->log2_cache, bf_const_log2_internal, 0);
}

/* return rounded pi * (1 - 2 * sign) */
static int bf_const_pi_signed(bf_t *T, int sign, limb_t prec, bf_flags_t flags)
{
    bf_context_t *s = T->ctx;
    return bf_const_get(T, prec, flags, &s->pi_cache, bf_const_pi_internal,
                        sign);
}

int bf_const_pi(bf_t *T, limb_t prec, bf_flags_t flags)
{
    return bf_const_pi_signed(T, 0, prec, flags);
}

void bf_clear_cache(bf_context_t *s)
{
#ifdef USE_FFT_MUL
    fft_clear_cache(s);
#endif
    bf_const_free(&s->log2_cache);
    bf_const_free(&s->pi_cache);
}
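/* Editor's note (illustrative, not part of the original source):
   the constants are cached per context, so repeated calls only pay
   for a bf_set() unless more precision is requested. A sketch
   computing 100 decimal digits of pi, assuming a bf_context_t *ctx: */
#if 0
{
    bf_t pi;
    char *str;
    bf_init(ctx, &pi);
    /* ~3.33 bits per decimal digit, plus a small margin */
    bf_const_pi(&pi, 340, BF_RNDN);
    str = bf_ftoa(NULL, &pi, 10, 100, BF_FTOA_FORMAT_FIXED | BF_RNDN);
    /* "3.14159265358979323846..." (100 significant digits) */
    bf_free(ctx, str);
    bf_delete(&pi);
    bf_clear_cache(ctx); /* frees the cached pi/log2 values */
}
#endif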
/* ZivFunc should compute the result 'r' with faithful rounding at
   precision 'prec'. For efficiency purposes, the final bf_round()
   does not need to be done in the function. */
typedef int ZivFunc(bf_t *r, const bf_t *a, limb_t prec, void *opaque);

static int bf_ziv_rounding(bf_t *r, const bf_t *a,
                           limb_t prec, bf_flags_t flags,
                           ZivFunc *f, void *opaque)
{
    int rnd_mode, ret;
    slimb_t prec1, ziv_extra_bits;

    rnd_mode = flags & BF_RND_MASK;
    if (rnd_mode == BF_RNDF) {
        /* no need to iterate */
        f(r, a, prec, opaque);
        ret = 0;
    } else {
        ziv_extra_bits = 32;
        for(;;) {
            prec1 = prec + ziv_extra_bits;
            ret = f(r, a, prec1, opaque);
            if (ret & (BF_ST_OVERFLOW | BF_ST_UNDERFLOW | BF_ST_MEM_ERROR)) {
                /* overflow or underflow should never happen because
                   it would indicate that the rounding cannot be done
                   correctly, but we do not catch all the cases */
                return ret;
            }
            /* if the result is exact, we can stop */
            if (!(ret & BF_ST_INEXACT)) {
                ret = 0;
                break;
            }
            if (bf_can_round(r, prec, rnd_mode, prec1)) {
                ret = BF_ST_INEXACT;
                break;
            }
            ziv_extra_bits = ziv_extra_bits * 2;
            //            printf("ziv_extra_bits=%" PRId64 "\n", (int64_t)ziv_extra_bits);
        }
    }
    if (r->len == 0)
        return ret;
    else
        return __bf_round(r, prec, flags, r->len, ret);
}
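/* Editor's note (illustrative, not part of the original source):
   bf_ziv_rounding() implements Ziv's strategy: evaluate with guard
   bits, stop when bf_can_round() proves the rounding is safe, and
   double the guard bits otherwise. A hypothetical ZivFunc sketch
   (my_cube_internal is not part of the library): */
#if 0
static int my_cube_internal(bf_t *r, const bf_t *a, limb_t prec, void *opaque)
{
    /* faithful x^3 at 'prec': two roundings, so keep a few guard bits */
    int ret = bf_mul(r, a, a, prec + 8, BF_RNDN);
    ret |= bf_mul(r, r, a, prec + 8, BF_RNDN);
    return ret; /* BF_ST_INEXACT is set when the result was rounded */
}

/* ... then: bf_ziv_rounding(r, a, prec, flags, my_cube_internal, NULL); */
#endif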
/* add (1 - 2*e_sign) * 2^e */
static int bf_add_epsilon(bf_t *r, const bf_t *a, slimb_t e, int e_sign,
                          limb_t prec, int flags)
{
    bf_t T_s, *T = &T_s;
    int ret;
    /* small argument case: result = 1 + epsilon * sign(x) */
    bf_init(a->ctx, T);
    bf_set_ui(T, 1);
    T->sign = e_sign;
    T->expn += e;
    ret = bf_add(r, r, T, prec, flags);
    bf_delete(T);
    return ret;
}

/* Compute the exponential using faithful rounding at precision 'prec'.
   Note: the algorithm is from MPFR */
static int bf_exp_internal(bf_t *r, const bf_t *a, limb_t prec, void *opaque)
{
    bf_context_t *s = r->ctx;
    bf_t T_s, *T = &T_s;
    slimb_t n, K, l, i, prec1;

    assert(r != a);

    /* argument reduction:
       T = a - n*log(2) with 0 <= T < log(2) and n integer.
    */
    bf_init(s, T);
    if (a->expn <= -1) {
        /* 0 <= abs(a) <= 0.5 */
        if (a->sign)
            n = -1;
        else
            n = 0;
    } else {
        bf_const_log2(T, LIMB_BITS, BF_RNDZ);
        bf_div(T, a, T, LIMB_BITS, BF_RNDD);
        bf_get_limb(&n, T, 0);
    }

    K = bf_isqrt((prec + 1) / 2);
    l = (prec - 1) / K + 1;
    /* XXX: precision analysis ? */
    prec1 = prec + (K + 2 * l + 18) + K + 8;
    if (a->expn > 0)
        prec1 += a->expn;
    //    printf("n=%ld K=%ld prec1=%ld\n", n, K, prec1);

    bf_const_log2(T, prec1, BF_RNDF);
    bf_mul_si(T, T, n, prec1, BF_RNDN);
    bf_sub(T, a, T, prec1, BF_RNDN);

    /* reduce the range of T */
    bf_mul_2exp(T, -K, BF_PREC_INF, BF_RNDZ);

    /* Taylor expansion around zero :
       1 + x + x^2/2 + ... + x^n/n!
       = (1 + x * (1 + x/2 * (1 + ... (x/n))))
    */
    {
        bf_t U_s, *U = &U_s;

        bf_init(s, U);
        bf_set_ui(r, 1);
        for(i = l; i >= 1; i--) {
            bf_set_ui(U, i);
            bf_div(U, T, U, prec1, BF_RNDN);
            bf_mul(r, r, U, prec1, BF_RNDN);
            bf_add_si(r, r, 1, prec1, BF_RNDN);
        }
        bf_delete(U);
    }
    bf_delete(T);

    /* undo the range reduction */
    for(i = 0; i < K; i++) {
        bf_mul(r, r, r, prec1, BF_RNDN | BF_FLAG_EXT_EXP);
    }

    /* undo the argument reduction */
    bf_mul_2exp(r, n, BF_PREC_INF, BF_RNDZ | BF_FLAG_EXT_EXP);

    return BF_ST_INEXACT;
}

/* crude overflow and underflow tests for exp(a). a_low <= a <= a_high */
static int check_exp_underflow_overflow(bf_context_t *s, bf_t *r,
                                        const bf_t *a_low, const bf_t *a_high,
                                        limb_t prec, bf_flags_t flags)
{
    bf_t T_s, *T = &T_s;
    bf_t log2_s, *log2 = &log2_s;
    slimb_t e_min, e_max;

    if (a_high->expn <= 0)
        return 0;

    e_max = (limb_t)1 << (bf_get_exp_bits(flags) - 1);
    e_min = -e_max + 3;
    if (flags & BF_FLAG_SUBNORMAL)
        e_min -= (prec - 1);

    bf_init(s, T);
    bf_init(s, log2);
    bf_const_log2(log2, LIMB_BITS, BF_RNDU);
    bf_mul_ui(T, log2, e_max, LIMB_BITS, BF_RNDU);
    /* a_low > e_max * log(2) implies exp(a) > e_max */
    if (bf_cmp_lt(T, a_low) > 0) {
        /* overflow */
        bf_delete(T);
        bf_delete(log2);
        return bf_set_overflow(r, 0, prec, flags);
    }
    /* a_high < (e_min - 2) * log(2) implies exp(a) < (e_min - 2) */
    bf_const_log2(log2, LIMB_BITS, BF_RNDD);
    bf_mul_si(T, log2, e_min - 2, LIMB_BITS, BF_RNDD);
    if (bf_cmp_lt(a_high, T)) {
        int rnd_mode = flags & BF_RND_MASK;

        /* underflow */
        bf_delete(T);
        bf_delete(log2);
        if (rnd_mode == BF_RNDU) {
            /* set the smallest value */
            bf_set_ui(r, 1);
            r->expn = e_min;
        } else {
            bf_set_zero(r, 0);
        }
        return BF_ST_UNDERFLOW | BF_ST_INEXACT;
    }
    bf_delete(log2);
    bf_delete(T);
    return 0;
}

int bf_exp(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags)
{
    bf_context_t *s = r->ctx;
    int ret;
    assert(r != a);
    if (a->len == 0) {
        if (a->expn == BF_EXP_NAN) {
            bf_set_nan(r);
        } else if (a->expn == BF_EXP_INF) {
            if (a->sign)
                bf_set_zero(r, 0);
            else
                bf_set_inf(r, 0);
        } else {
            bf_set_ui(r, 1);
        }
        return 0;
    }

    ret = check_exp_underflow_overflow(s, r, a, a, prec, flags);
    if (ret)
        return ret;
    if (a->expn < 0 && (-a->expn) >= (prec + 2)) {
        /* small argument case: result = 1 + epsilon * sign(x) */
        bf_set_ui(r, 1);
        return bf_add_epsilon(r, r, -(prec + 2), a->sign, prec, flags);
    }

    return bf_ziv_rounding(r, a, prec, flags, bf_exp_internal, NULL);
}
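/* Editor's note (illustrative, not part of the original source):
   worked example of the reductions in bf_exp_internal() for a = 10:

       n = floor(10 / log(2)) = floor(14.42) = 14
       T = 10 - 14 * log(2) ~ 0.2959, with 0 <= T < log(2)

   T is then divided by 2^K, the Taylor series of exp is summed, the K
   squarings restore exp(T), and exp(a) = exp(T) * 2^n finishes the
   job since a = T + n*log(2). */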
static int bf_log_internal(bf_t *r, const bf_t *a, limb_t prec, void *opaque)
{
    bf_context_t *s = r->ctx;
    bf_t T_s, *T = &T_s;
    bf_t U_s, *U = &U_s;
    bf_t V_s, *V = &V_s;
    slimb_t n, prec1, l, i, K;

    assert(r != a);

    bf_init(s, T);
    /* argument reduction 1 */
    /* T=a*2^n with 2/3 <= T <= 4/3 */
    {
        bf_t U_s, *U = &U_s;
        bf_set(T, a);
        n = T->expn;
        T->expn = 0;
        /* U= ~ 2/3 */
        bf_init(s, U);
        bf_set_ui(U, 0xaaaaaaaa);
        U->expn = 0;
        if (bf_cmp_lt(T, U)) {
            T->expn++;
            n--;
        }
        bf_delete(U);
    }
    //    printf("n=%ld\n", n);
    //    bf_print_str("T", T);

    /* XXX: precision analysis */
    /* number of iterations for argument reduction 2 */
    K = bf_isqrt((prec + 1) / 2);
    /* order of Taylor expansion */
    l = prec / (2 * K) + 1;
    /* precision of the intermediate computations */
    prec1 = prec + K + 2 * l + 32;

    bf_init(s, U);
    bf_init(s, V);

    /* Note: cancellation occurs here, so we use more precision (XXX:
       reduce the precision by computing the exact cancellation) */
    bf_add_si(T, T, -1, BF_PREC_INF, BF_RNDN);

    /* argument reduction 2 */
    for(i = 0; i < K; i++) {
        /* T = T / (1 + sqrt(1 + T)) */
        bf_add_si(U, T, 1, prec1, BF_RNDN);
        bf_sqrt(V, U, prec1, BF_RNDF);
        bf_add_si(U, V, 1, prec1, BF_RNDN);
        bf_div(T, T, U, prec1, BF_RNDN);
    }

    {
        bf_t Y_s, *Y = &Y_s;
        bf_t Y2_s, *Y2 = &Y2_s;
        bf_init(s, Y);
        bf_init(s, Y2);

        /* compute ln(1+x) = ln((1+y)/(1-y)) with y=x/(2+x)
           = y + y^3/3 + ... + y^(2*l + 1) / (2*l+1)
           with Y=Y^2
           = y*(1+Y/3+Y^2/5+...) = y*(1+Y*(1/3+Y*(1/5 + ...)))
        */
        bf_add_si(Y, T, 2, prec1, BF_RNDN);
        bf_div(Y, T, Y, prec1, BF_RNDN);

        bf_mul(Y2, Y, Y, prec1, BF_RNDN);
        bf_set_ui(r, 0);
        for(i = l; i >= 1; i--) {
            bf_set_ui(U, 1);
            bf_set_ui(V, 2 * i + 1);
            bf_div(U, U, V, prec1, BF_RNDN);
            bf_add(r, r, U, prec1, BF_RNDN);
            bf_mul(r, r, Y2, prec1, BF_RNDN);
        }
        bf_add_si(r, r, 1, prec1, BF_RNDN);
        bf_mul(r, r, Y, prec1, BF_RNDN);
        bf_delete(Y);
        bf_delete(Y2);
    }
    bf_delete(V);
    bf_delete(U);

    /* multiplication by 2 for the Taylor expansion and undo the
       argument reduction 2 */
    bf_mul_2exp(r, K + 1, BF_PREC_INF, BF_RNDZ);

    /* undo the argument reduction 1 */
    bf_const_log2(T, prec1, BF_RNDF);
    bf_mul_si(T, T, n, prec1, BF_RNDN);
    bf_add(r, r, T, prec1, BF_RNDN);

    bf_delete(T);
    return BF_ST_INEXACT;
}

int bf_log(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags)
{
    bf_context_t *s = r->ctx;
    bf_t T_s, *T = &T_s;

    assert(r != a);
    if (a->len == 0) {
        if (a->expn == BF_EXP_NAN) {
            bf_set_nan(r);
            return 0;
        } else if (a->expn == BF_EXP_INF) {
            if (a->sign) {
                bf_set_nan(r);
                return BF_ST_INVALID_OP;
            } else {
                bf_set_inf(r, 0);
                return 0;
            }
        } else {
            bf_set_inf(r, 1);
            return 0;
        }
    }
    if (a->sign) {
        bf_set_nan(r);
        return BF_ST_INVALID_OP;
    }
    bf_init(s, T);
    bf_set_ui(T, 1);
    if (bf_cmp_eq(a, T)) {
        bf_set_zero(r, 0);
        bf_delete(T);
        return 0;
    }
    bf_delete(T);

    return bf_ziv_rounding(r, a, prec, flags, bf_log_internal, NULL);
}
4614 /* x and y finite, x > 0, y integer and y fits on one limb */
4615 static int bf_pow_int(bf_t *r, const bf_t *x, limb_t prec, void *opaque)
4616 {
4617 bf_context_t *s = r->ctx;
4618 const bf_t *y = opaque;
4619 bf_t T_s, *T = &T_s;
4620 limb_t prec1;
4621 int ret;
4622 slimb_t y1;
4623
4624 bf_get_limb(&y1, y, 0);
4625 if (y1 < 0)
4626 y1 = -y1;
4627 /* XXX: proof for the added precision */
4628 prec1 = prec + ceil_log2(y1) * 2 + 8;
4629 ret = bf_pow_ui(r, x, y1, prec1, BF_RNDN | BF_FLAG_EXT_EXP);
4630 if (y->sign) {
4631 bf_init(s, T);
4632 bf_set_ui(T, 1);
4633 ret |= bf_div(r, T, r, prec1, BF_RNDN | BF_FLAG_EXT_EXP);
4634 bf_delete(T);
4635 }
4636 return ret;
4637 }
4638
4639 /* x must be a finite non zero float. Return TRUE if there is a
4640 floating point number r such that x=r^(2^n) and return this floating
4641 point number 'r'. Otherwise return FALSE and r is undefined. */
4642 static BOOL check_exact_power2n(bf_t *r, const bf_t *x, slimb_t n)
4643 {
4644 bf_context_t *s = r->ctx;
4645 bf_t T_s, *T = &T_s;
4646 slimb_t e, i, er;
4647 limb_t v;
4648
4649 /* x = m*2^e with m odd integer */
4650 e = bf_get_exp_min(x);
4651 /* fast check on the exponent */
4652 if (n > (LIMB_BITS - 1)) {
4653 if (e != 0)
4654 return FALSE;
4655 er = 0;
4656 } else {
4657 if ((e & (((limb_t)1 << n) - 1)) != 0)
4658 return FALSE;
4659 er = e >> n;
4660 }
4661 /* every odd perfect square is 1 modulo 8 */
4662 v = get_bits(x->tab, x->len, x->len * LIMB_BITS - x->expn + e);
4663 if ((v & 7) != 1)
4664 return FALSE;
4665
4666 bf_init(s, T);
4667 bf_set(T, x);
4668 T->expn -= e;
4669 for(i = 0; i < n; i++) {
4670 if (i != 0)
4671 bf_set(T, r);
4672 if (bf_sqrtrem(r, NULL, T) != 0) {
4673 bf_delete(T); return FALSE; /* also free T on the early exit */ }
4674 }
4675 bf_delete(T); r->expn += er;
4676 return TRUE;
4677 }
4678
4679 /* prec = BF_PREC_INF is accepted for x and y integers and y >= 0 */
4680 int bf_pow(bf_t *r, const bf_t *x, const bf_t *y, limb_t prec, bf_flags_t flags)
4681 {
4682 bf_context_t *s = r->ctx;
4683 bf_t T_s, *T = &T_s;
4684 bf_t ytmp_s;
4685 BOOL y_is_int, y_is_odd;
4686 int r_sign, ret, rnd_mode;
4687 slimb_t y_emin;
4688
4689 if (x->len == 0 || y->len == 0) {
4690 if (y->expn == BF_EXP_ZERO) {
4691 /* pow(x, 0) = 1 */
4692 bf_set_ui(r, 1);
4693 } else if (x->expn == BF_EXP_NAN) {
4694 bf_set_nan(r);
4695 } else {
4696 int cmp_x_abs_1;
4697 bf_set_ui(r, 1);
4698 cmp_x_abs_1 = bf_cmpu(x, r);
4699 if (cmp_x_abs_1 == 0 && (flags & BF_POW_JS_QUIRKS) &&
4700 (y->expn >= BF_EXP_INF)) {
4701 bf_set_nan(r);
4702 } else if (cmp_x_abs_1 == 0 &&
4703 (!x->sign || y->expn != BF_EXP_NAN)) {
4704 /* pow(1, y) = 1 even if y = NaN */
4705 /* pow(-1, +/-inf) = 1 */
4706 } else if (y->expn == BF_EXP_NAN) {
4707 bf_set_nan(r);
4708 } else if (y->expn == BF_EXP_INF) {
4709 if (y->sign == (cmp_x_abs_1 > 0)) {
4710 bf_set_zero(r, 0);
4711 } else {
4712 bf_set_inf(r, 0);
4713 }
4714 } else {
4715 y_emin = bf_get_exp_min(y);
4716 y_is_odd = (y_emin == 0);
4717 if (y->sign == (x->expn == BF_EXP_ZERO)) {
4718 bf_set_inf(r, y_is_odd & x->sign);
4719 if (y->sign) {
4720 /* pow(0, y) with y < 0 */
4721 return BF_ST_DIVIDE_ZERO;
4722 }
4723 } else {
4724 bf_set_zero(r, y_is_odd & x->sign);
4725 }
4726 }
4727 }
4728 return 0;
4729 }
4730 bf_init(s, T);
4731 bf_set(T, x);
4732 y_emin = bf_get_exp_min(y);
4733 y_is_int = (y_emin >= 0);
4734 rnd_mode = flags & BF_RND_MASK;
4735 if (x->sign) {
4736 if (!y_is_int) {
4737 bf_set_nan(r);
4738 bf_delete(T);
4739 return BF_ST_INVALID_OP;
4740 }
4741 y_is_odd = (y_emin == 0);
4742 r_sign = y_is_odd;
4743 /* change the directed rounding mode if the sign of the result
4744 is changed */
4745 if (r_sign && (rnd_mode == BF_RNDD || rnd_mode == BF_RNDU))
4746 flags ^= 1;
4747 bf_neg(T);
4748 } else {
4749 r_sign = 0;
4750 }
4751
4752 bf_set_ui(r, 1);
4753 if (bf_cmp_eq(T, r)) {
4754 /* abs(x) = 1: nothing more to do */
4755 ret = 0;
4756 } else {
4757 /* check the overflow/underflow cases */
4758 {
4759 bf_t al_s, *al = &al_s;
4760 bf_t ah_s, *ah = &ah_s;
4761 limb_t precl = LIMB_BITS;
4762
4763 bf_init(s, al);
4764 bf_init(s, ah);
4765 /* compute bounds of log(abs(x)) * y with a low precision */
4766 /* XXX: compute bf_log() once */
4767 /* XXX: add a fast test before this slow test */
4768 bf_log(al, T, precl, BF_RNDD);
4769 bf_log(ah, T, precl, BF_RNDU);
4770 bf_mul(al, al, y, precl, BF_RNDD ^ y->sign);
4771 bf_mul(ah, ah, y, precl, BF_RNDU ^ y->sign);
4772 ret = check_exp_underflow_overflow(s, r, al, ah, prec, flags);
4773 bf_delete(al);
4774 bf_delete(ah);
4775 if (ret)
4776 goto done;
4777 }
4778
4779 if (y_is_int) {
4780 slimb_t T_bits, e;
4781 int_pow:
4782 T_bits = T->expn - bf_get_exp_min(T);
4783 if (T_bits == 1) {
4784 /* pow(2^b, y) = 2^(b*y) */
4785 bf_mul_si(T, y, T->expn - 1, LIMB_BITS, BF_RNDZ);
4786 bf_get_limb(&e, T, 0);
4787 bf_set_ui(r, 1);
4788 ret = bf_mul_2exp(r, e, prec, flags);
4789 } else if (prec == BF_PREC_INF) {
4790 slimb_t y1;
4791 /* specific case for infinite precision (integer case) */
4792 bf_get_limb(&y1, y, 0);
4793 assert(!y->sign);
4794 /* x must be an integer, so abs(x) >= 2 */
4795 if (y1 >= ((slimb_t)1 << BF_EXP_BITS_MAX)) {
4796 bf_delete(T);
4797 return bf_set_overflow(r, 0, BF_PREC_INF, flags);
4798 }
4799 ret = bf_pow_ui(r, T, y1, BF_PREC_INF, BF_RNDZ);
4800 } else {
4801 if (y->expn <= 31) {
4802 /* small enough power: use exponentiation in all cases */
4803 } else if (y->sign) {
4804 /* cannot be exact */
4805 goto general_case;
4806 } else {
4807 if (rnd_mode == BF_RNDF)
4808 goto general_case; /* no need to track exact results */
4809 /* see if the result has a chance to be exact:
4810 if x=a*2^b (a odd), x^y=a^y*2^(b*y)
4811 x^y needs a precision of at least floor_log2(a)*y bits
4812 */
4813 bf_mul_si(r, y, T_bits - 1, LIMB_BITS, BF_RNDZ);
4814 bf_get_limb(&e, r, 0);
4815 if (prec < e)
4816 goto general_case;
4817 }
4818 ret = bf_ziv_rounding(r, T, prec, flags, bf_pow_int, (void *)y);
4819 }
4820 } else {
4821 if (rnd_mode != BF_RNDF) {
4822 bf_t *y1;
4823 if (y_emin < 0 && check_exact_power2n(r, T, -y_emin)) {
4824 /* the problem reduces to an integer power */
4825 #if 0
4826 printf("\nn=%" PRId64 "\n", -(int64_t)y_emin);
4827 bf_print_str("T", T);
4828 bf_print_str("r", r);
4829 #endif
4830 bf_set(T, r);
4831 y1 = &ytmp_s;
4832 y1->tab = y->tab;
4833 y1->len = y->len;
4834 y1->sign = y->sign;
4835 y1->expn = y->expn - y_emin;
4836 y = y1;
4837 goto int_pow;
4838 }
4839 }
4840 general_case:
4841 ret = bf_ziv_rounding(r, T, prec, flags, bf_pow_generic, (void *)y);
4842 }
4843 }
4844 done:
4845 bf_delete(T);
4846 r->sign = r_sign;
4847 return ret;
4848 }
4849
4850 /* compute sqrt(-2*x-x^2) to get |sin(x)| from cos(x) - 1.
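   Derivation: with c = cos(u) - 1, sin(u)^2 = 1 - (1 + c)^2 = -2*c - c^2,
   so sqrt(-2*c - c^2) = |sin(u)|.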
*/ 4851 static void bf_sqrt_sin(bf_t *r, const bf_t *x, limb_t prec1) 4852 { 4853 bf_context_t *s = r->ctx; 4854 bf_t T_s, *T = &T_s; 4855 bf_init(s, T); 4856 bf_set(T, x); 4857 bf_mul(r, T, T, prec1, BF_RNDN); 4858 bf_mul_2exp(T, 1, BF_PREC_INF, BF_RNDZ); 4859 bf_add(T, T, r, prec1, BF_RNDN); 4860 bf_neg(T); 4861 bf_sqrt(r, T, prec1, BF_RNDF); 4862 bf_delete(T); 4863 } 4864 4865 static int bf_sincos(bf_t *s, bf_t *c, const bf_t *a, limb_t prec) 4866 { 4867 bf_context_t *s1 = a->ctx; 4868 bf_t T_s, *T = &T_s; 4869 bf_t U_s, *U = &U_s; 4870 bf_t r_s, *r = &r_s; 4871 slimb_t K, prec1, i, l, mod, prec2; 4872 int is_neg; 4873 4874 assert(c != a && s != a); 4875 4876 bf_init(s1, T); 4877 bf_init(s1, U); 4878 bf_init(s1, r); 4879 4880 /* XXX: precision analysis */ 4881 K = bf_isqrt(prec / 2); 4882 l = prec / (2 * K) + 1; 4883 prec1 = prec + 2 * K + l + 8; 4884 4885 /* after the modulo reduction, -pi/4 <= T <= pi/4 */ 4886 if (a->expn <= -1) { 4887 /* abs(a) <= 0.25: no modulo reduction needed */ 4888 bf_set(T, a); 4889 mod = 0; 4890 } else { 4891 slimb_t cancel; 4892 cancel = 0; 4893 for(;;) { 4894 prec2 = prec1 + a->expn + cancel; 4895 bf_const_pi(U, prec2, BF_RNDF); 4896 bf_mul_2exp(U, -1, BF_PREC_INF, BF_RNDZ); 4897 bf_remquo(&mod, T, a, U, prec2, BF_RNDN, BF_RNDN); 4898 // printf("T.expn=%ld prec2=%ld\n", T->expn, prec2); 4899 if (mod == 0 || (T->expn != BF_EXP_ZERO && 4900 (T->expn + prec2) >= (prec1 - 1))) 4901 break; 4902 /* increase the number of bits until the precision is good enough */ 4903 cancel = bf_max(-T->expn, (cancel + 1) * 3 / 2); 4904 } 4905 mod &= 3; 4906 } 4907 4908 is_neg = T->sign; 4909 4910 /* compute cosm1(x) = cos(x) - 1 */ 4911 bf_mul(T, T, T, prec1, BF_RNDN); 4912 bf_mul_2exp(T, -2 * K, BF_PREC_INF, BF_RNDZ); 4913 4914 /* Taylor expansion: 4915 -x^2/2 + x^4/4! - x^6/6! + ... 4916 */ 4917 bf_set_ui(r, 1); 4918 for(i = l ; i >= 1; i--) { 4919 bf_set_ui(U, 2 * i - 1); 4920 bf_mul_ui(U, U, 2 * i, BF_PREC_INF, BF_RNDZ); 4921 bf_div(U, T, U, prec1, BF_RNDN); 4922 bf_mul(r, r, U, prec1, BF_RNDN); 4923 bf_neg(r); 4924 if (i != 1) 4925 bf_add_si(r, r, 1, prec1, BF_RNDN); 4926 } 4927 bf_delete(U); 4928 4929 /* undo argument reduction: 4930 cosm1(2*x)= 2*(2*cosm1(x)+cosm1(x)^2) 4931 */ 4932 for(i = 0; i < K; i++) { 4933 bf_mul(T, r, r, prec1, BF_RNDN); 4934 bf_mul_2exp(r, 1, BF_PREC_INF, BF_RNDZ); 4935 bf_add(r, r, T, prec1, BF_RNDN); 4936 bf_mul_2exp(r, 1, BF_PREC_INF, BF_RNDZ); 4937 } 4938 bf_delete(T); 4939 4940 if (c) { 4941 if ((mod & 1) == 0) { 4942 bf_add_si(c, r, 1, prec1, BF_RNDN); 4943 } else { 4944 bf_sqrt_sin(c, r, prec1); 4945 c->sign = is_neg ^ 1; 4946 } 4947 c->sign ^= mod >> 1; 4948 } 4949 if (s) { 4950 if ((mod & 1) == 0) { 4951 bf_sqrt_sin(s, r, prec1); 4952 s->sign = is_neg; 4953 } else { 4954 bf_add_si(s, r, 1, prec1, BF_RNDN); 4955 } 4956 s->sign ^= mod >> 1; 4957 } 4958 bf_delete(r); 4959 return BF_ST_INEXACT; 4960 } 4961 4962 static int bf_cos_internal(bf_t *r, const bf_t *a, limb_t prec, void *opaque) 4963 { 4964 return bf_sincos(NULL, r, a, prec); 4965 } 4966 4967 int bf_cos(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags) 4968 { 4969 if (a->len == 0) { 4970 if (a->expn == BF_EXP_NAN) { 4971 bf_set_nan(r); 4972 return 0; 4973 } else if (a->expn == BF_EXP_INF) { 4974 bf_set_nan(r); 4975 return BF_ST_INVALID_OP; 4976 } else { 4977 bf_set_ui(r, 1); 4978 return 0; 4979 } 4980 } 4981 4982 /* small argument case: result = 1+r(x) with r(x) = -x^2/2 + 4983 O(X^4). We assume r(x) < 2^(2*EXP(x) - 1). 
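   (this holds because |x| < 2^EXP(x), hence x^2/2 < 2^(2*EXP(x) - 1))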
*/ 4984 if (a->expn < 0) { 4985 slimb_t e; 4986 e = 2 * a->expn - 1; 4987 if (e < -(prec + 2)) { 4988 bf_set_ui(r, 1); 4989 return bf_add_epsilon(r, r, e, 1, prec, flags); 4990 } 4991 } 4992 4993 return bf_ziv_rounding(r, a, prec, flags, bf_cos_internal, NULL); 4994 } 4995 4996 static int bf_sin_internal(bf_t *r, const bf_t *a, limb_t prec, void *opaque) 4997 { 4998 return bf_sincos(r, NULL, a, prec); 4999 } 5000 5001 int bf_sin(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags) 5002 { 5003 if (a->len == 0) { 5004 if (a->expn == BF_EXP_NAN) { 5005 bf_set_nan(r); 5006 return 0; 5007 } else if (a->expn == BF_EXP_INF) { 5008 bf_set_nan(r); 5009 return BF_ST_INVALID_OP; 5010 } else { 5011 bf_set_zero(r, a->sign); 5012 return 0; 5013 } 5014 } 5015 5016 /* small argument case: result = x+r(x) with r(x) = -x^3/6 + 5017 O(X^5). We assume r(x) < 2^(3*EXP(x) - 2). */ 5018 if (a->expn < 0) { 5019 slimb_t e; 5020 e = sat_add(2 * a->expn, a->expn - 2); 5021 if (e < a->expn - bf_max(prec + 2, a->len * LIMB_BITS + 2)) { 5022 bf_set(r, a); 5023 return bf_add_epsilon(r, r, e, 1 - a->sign, prec, flags); 5024 } 5025 } 5026 5027 return bf_ziv_rounding(r, a, prec, flags, bf_sin_internal, NULL); 5028 } 5029 5030 static int bf_tan_internal(bf_t *r, const bf_t *a, limb_t prec, void *opaque) 5031 { 5032 bf_context_t *s = r->ctx; 5033 bf_t T_s, *T = &T_s; 5034 limb_t prec1; 5035 5036 /* XXX: precision analysis */ 5037 prec1 = prec + 8; 5038 bf_init(s, T); 5039 bf_sincos(r, T, a, prec1); 5040 bf_div(r, r, T, prec1, BF_RNDF); 5041 bf_delete(T); 5042 return BF_ST_INEXACT; 5043 } 5044 5045 int bf_tan(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags) 5046 { 5047 assert(r != a); 5048 if (a->len == 0) { 5049 if (a->expn == BF_EXP_NAN) { 5050 bf_set_nan(r); 5051 return 0; 5052 } else if (a->expn == BF_EXP_INF) { 5053 bf_set_nan(r); 5054 return BF_ST_INVALID_OP; 5055 } else { 5056 bf_set_zero(r, a->sign); 5057 return 0; 5058 } 5059 } 5060 5061 /* small argument case: result = x+r(x) with r(x) = x^3/3 + 5062 O(X^5). We assume r(x) < 2^(3*EXP(x) - 1). 
*/
5063 if (a->expn < 0) {
5064 slimb_t e;
5065 e = sat_add(2 * a->expn, a->expn - 1);
5066 if (e < a->expn - bf_max(prec + 2, a->len * LIMB_BITS + 2)) {
5067 bf_set(r, a);
5068 return bf_add_epsilon(r, r, e, a->sign, prec, flags);
5069 }
5070 }
5071
5072 return bf_ziv_rounding(r, a, prec, flags, bf_tan_internal, NULL);
5073 }
5074
5075 /* if add_pi2 is true, add pi/2 to the result (used for acos(x) to
5076 avoid cancellation) */
5077 static int bf_atan_internal(bf_t *r, const bf_t *a, limb_t prec,
5078 void *opaque)
5079 {
5080 bf_context_t *s = r->ctx;
5081 BOOL add_pi2 = (BOOL)(intptr_t)opaque;
5082 bf_t T_s, *T = &T_s;
5083 bf_t U_s, *U = &U_s;
5084 bf_t V_s, *V = &V_s;
5085 bf_t X2_s, *X2 = &X2_s;
5086 int cmp_1;
5087 slimb_t prec1, i, K, l;
5088
5089 /* XXX: precision analysis */
5090 K = bf_isqrt((prec + 1) / 2);
5091 l = prec / (2 * K) + 1;
5092 prec1 = prec + K + 2 * l + 32;
5093 // printf("prec=%d K=%d l=%d prec1=%d\n", (int)prec, (int)K, (int)l, (int)prec1);
5094
5095 bf_init(s, T);
5096 cmp_1 = (a->expn >= 1); /* a >= 1 */
5097 if (cmp_1) {
5098 bf_set_ui(T, 1);
5099 bf_div(T, T, a, prec1, BF_RNDN);
5100 } else {
5101 bf_set(T, a);
5102 }
5103
5104 /* abs(T) <= 1 */
5105
5106 /* argument reduction */
5107
5108 bf_init(s, U);
5109 bf_init(s, V);
5110 bf_init(s, X2);
5111 for(i = 0; i < K; i++) {
5112 /* T = T / (1 + sqrt(1 + T^2)) */
5113 bf_mul(U, T, T, prec1, BF_RNDN);
5114 bf_add_si(U, U, 1, prec1, BF_RNDN);
5115 bf_sqrt(V, U, prec1, BF_RNDN);
5116 bf_add_si(V, V, 1, prec1, BF_RNDN);
5117 bf_div(T, T, V, prec1, BF_RNDN);
5118 }
5119
5120 /* Taylor series:
5121 x - x^3/3 + ... + (-1)^l * x^(2*l+1) / (2*l+1)
5122 */
5123 bf_mul(X2, T, T, prec1, BF_RNDN);
5124 bf_set_ui(r, 0);
5125 for(i = l; i >= 1; i--) {
5126 bf_set_si(U, 1);
5127 bf_set_ui(V, 2 * i + 1);
5128 bf_div(U, U, V, prec1, BF_RNDN);
5129 bf_neg(r);
5130 bf_add(r, r, U, prec1, BF_RNDN);
5131 bf_mul(r, r, X2, prec1, BF_RNDN);
5132 }
5133 bf_neg(r);
5134 bf_add_si(r, r, 1, prec1, BF_RNDN);
5135 bf_mul(r, r, T, prec1, BF_RNDN);
5136
5137 /* undo the argument reduction */
5138 bf_mul_2exp(r, K, BF_PREC_INF, BF_RNDZ);
5139
5140 bf_delete(U);
5141 bf_delete(V);
5142 bf_delete(X2);
5143
5144 i = add_pi2;
5145 if (cmp_1 > 0) {
5146 /* undo the inversion: r = sign(a)*PI/2 - r */
5147 bf_neg(r);
5148 i += 1 - 2 * a->sign;
5149 }
5150 /* add i*(pi/2) with -1 <= i <= 2 */
5151 if (i != 0) {
5152 bf_const_pi(T, prec1, BF_RNDF);
5153 if (i != 2)
5154 bf_mul_2exp(T, -1, BF_PREC_INF, BF_RNDZ);
5155 T->sign = (i < 0);
5156 bf_add(r, T, r, prec1, BF_RNDN);
5157 }
5158
5159 bf_delete(T);
5160 return BF_ST_INEXACT;
5161 }
5162
5163 int bf_atan(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags)
5164 {
5165 bf_context_t *s = r->ctx;
5166 bf_t T_s, *T = &T_s;
5167 int res;
5168
5169 if (a->len == 0) {
5170 if (a->expn == BF_EXP_NAN) {
5171 bf_set_nan(r);
5172 return 0;
5173 } else if (a->expn == BF_EXP_INF) {
5174 /* -PI/2 or PI/2 */
5175 bf_const_pi_signed(r, a->sign, prec, flags);
5176 bf_mul_2exp(r, -1, BF_PREC_INF, BF_RNDZ);
5177 return BF_ST_INEXACT;
5178 } else {
5179 bf_set_zero(r, a->sign);
5180 return 0;
5181 }
5182 }
5183
5184 bf_init(s, T);
5185 bf_set_ui(T, 1);
5186 res = bf_cmpu(a, T);
5187 bf_delete(T);
5188 if (res == 0) {
5189 /* short cut: abs(a) == 1 -> +/-pi/4 */
5190 bf_const_pi_signed(r, a->sign, prec, flags);
5191 bf_mul_2exp(r, -2, BF_PREC_INF, BF_RNDZ);
5192 return BF_ST_INEXACT;
5193 }
5194
5195 /* small argument case: result = x+r(x) with r(x) = -x^3/3 +
5196 O(X^5). We assume r(x) < 2^(3*EXP(x) - 1).
*/
5197 if (a->expn < 0) {
5198 slimb_t e;
5199 e = sat_add(2 * a->expn, a->expn - 1);
5200 if (e < a->expn - bf_max(prec + 2, a->len * LIMB_BITS + 2)) {
5201 bf_set(r, a);
5202 return bf_add_epsilon(r, r, e, 1 - a->sign, prec, flags);
5203 }
5204 }
5205
5206 return bf_ziv_rounding(r, a, prec, flags, bf_atan_internal, (void *)FALSE);
5207 }
5208
5209 static int bf_atan2_internal(bf_t *r, const bf_t *y, limb_t prec, void *opaque)
5210 {
5211 bf_context_t *s = r->ctx;
5212 const bf_t *x = opaque;
5213 bf_t T_s, *T = &T_s;
5214 limb_t prec1;
5215 int ret;
5216
5217 if (y->expn == BF_EXP_NAN || x->expn == BF_EXP_NAN) {
5218 bf_set_nan(r);
5219 return 0;
5220 }
5221
5222 /* compute atan(y/x) assuming inf/inf = 1 and 0/0 = 0 */
5223 bf_init(s, T);
5224 prec1 = prec + 32;
5225 if (y->expn == BF_EXP_INF && x->expn == BF_EXP_INF) {
5226 bf_set_ui(T, 1);
5227 T->sign = y->sign ^ x->sign;
5228 } else if (y->expn == BF_EXP_ZERO && x->expn == BF_EXP_ZERO) {
5229 bf_set_zero(T, y->sign ^ x->sign);
5230 } else {
5231 bf_div(T, y, x, prec1, BF_RNDF);
5232 }
5233 ret = bf_atan(r, T, prec1, BF_RNDF);
5234
5235 if (x->sign) {
5236 /* if x < 0 (it includes -0), return sign(y)*pi + atan(y/x) */
5237 bf_const_pi(T, prec1, BF_RNDF);
5238 T->sign = y->sign;
5239 bf_add(r, r, T, prec1, BF_RNDN);
5240 ret |= BF_ST_INEXACT;
5241 }
5242
5243 bf_delete(T);
5244 return ret;
5245 }
5246
5247 int bf_atan2(bf_t *r, const bf_t *y, const bf_t *x,
5248 limb_t prec, bf_flags_t flags)
5249 {
5250 return bf_ziv_rounding(r, y, prec, flags, bf_atan2_internal, (void *)x);
5251 }
5252
5253 static int bf_asin_internal(bf_t *r, const bf_t *a, limb_t prec, void *opaque)
5254 {
5255 bf_context_t *s = r->ctx;
5256 BOOL is_acos = (BOOL)(intptr_t)opaque;
5257 bf_t T_s, *T = &T_s;
5258 limb_t prec1, prec2;
5259
5260 /* asin(x) = atan(x/sqrt(1-x^2))
5261 acos(x) = pi/2 - asin(x) */
5262 prec1 = prec + 8;
5263 /* increase the precision in x^2 to compensate for the cancellation in
5264 (1-x^2) if x is close to 1 */
5265 /* XXX: use less precision when possible */
5266 if (a->expn >= 0)
5267 prec2 = BF_PREC_INF;
5268 else
5269 prec2 = prec1;
5270 bf_init(s, T);
5271 bf_mul(T, a, a, prec2, BF_RNDN);
5272 bf_neg(T);
5273 bf_add_si(T, T, 1, prec2, BF_RNDN);
5274
5275 bf_sqrt(r, T, prec1, BF_RNDN);
5276 bf_div(T, a, r, prec1, BF_RNDN);
5277 if (is_acos)
5278 bf_neg(T);
5279 bf_atan_internal(r, T, prec1, (void *)(intptr_t)is_acos);
5280 bf_delete(T);
5281 return BF_ST_INEXACT;
5282 }
5283
5284 int bf_asin(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags)
5285 {
5286 bf_context_t *s = r->ctx;
5287 bf_t T_s, *T = &T_s;
5288 int res;
5289
5290 if (a->len == 0) {
5291 if (a->expn == BF_EXP_NAN) {
5292 bf_set_nan(r);
5293 return 0;
5294 } else if (a->expn == BF_EXP_INF) {
5295 bf_set_nan(r);
5296 return BF_ST_INVALID_OP;
5297 } else {
5298 bf_set_zero(r, a->sign);
5299 return 0;
5300 }
5301 }
5302 bf_init(s, T);
5303 bf_set_ui(T, 1);
5304 res = bf_cmpu(a, T);
5305 bf_delete(T);
5306 if (res > 0) {
5307 bf_set_nan(r);
5308 return BF_ST_INVALID_OP;
5309 }
5310
5311 /* small argument case: result = x+r(x) with r(x) = x^3/6 +
5312 O(X^5). We assume r(x) < 2^(3*EXP(x) - 2).
*/ 5313 if (a->expn < 0) { 5314 slimb_t e; 5315 e = sat_add(2 * a->expn, a->expn - 2); 5316 if (e < a->expn - bf_max(prec + 2, a->len * LIMB_BITS + 2)) { 5317 bf_set(r, a); 5318 return bf_add_epsilon(r, r, e, a->sign, prec, flags); 5319 } 5320 } 5321 5322 return bf_ziv_rounding(r, a, prec, flags, bf_asin_internal, (void *)FALSE); 5323 } 5324 5325 int bf_acos(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags) 5326 { 5327 bf_context_t *s = r->ctx; 5328 bf_t T_s, *T = &T_s; 5329 int res; 5330 5331 if (a->len == 0) { 5332 if (a->expn == BF_EXP_NAN) { 5333 bf_set_nan(r); 5334 return 0; 5335 } else if (a->expn == BF_EXP_INF) { 5336 bf_set_nan(r); 5337 return BF_ST_INVALID_OP; 5338 } else { 5339 bf_const_pi(r, prec, flags); 5340 bf_mul_2exp(r, -1, BF_PREC_INF, BF_RNDZ); 5341 return BF_ST_INEXACT; 5342 } 5343 } 5344 bf_init(s, T); 5345 bf_set_ui(T, 1); 5346 res = bf_cmpu(a, T); 5347 bf_delete(T); 5348 if (res > 0) { 5349 bf_set_nan(r); 5350 return BF_ST_INVALID_OP; 5351 } else if (res == 0 && a->sign == 0) { 5352 bf_set_zero(r, 0); 5353 return 0; 5354 } 5355 5356 return bf_ziv_rounding(r, a, prec, flags, bf_asin_internal, (void *)TRUE); 5357 } 5358 5359 /***************************************************************/ 5360 /* decimal floating point numbers */ 5361 5362 #ifdef USE_BF_DEC 5363 5364 #define adddq(r1, r0, a1, a0) \ 5365 do { \ 5366 limb_t __t = r0; \ 5367 r0 += (a0); \ 5368 r1 += (a1) + (r0 < __t); \ 5369 } while (0) 5370 5371 #define subdq(r1, r0, a1, a0) \ 5372 do { \ 5373 limb_t __t = r0; \ 5374 r0 -= (a0); \ 5375 r1 -= (a1) + (r0 > __t); \ 5376 } while (0) 5377 5378 #if LIMB_BITS == 64 5379 5380 /* Note: we assume __int128 is available */ 5381 #define muldq(r1, r0, a, b) \ 5382 do { \ 5383 unsigned __int128 __t; \ 5384 __t = (unsigned __int128)(a) * (unsigned __int128)(b); \ 5385 r0 = __t; \ 5386 r1 = __t >> 64; \ 5387 } while (0) 5388 5389 #define divdq(q, r, a1, a0, b) \ 5390 do { \ 5391 unsigned __int128 __t; \ 5392 limb_t __b = (b); \ 5393 __t = ((unsigned __int128)(a1) << 64) | (a0); \ 5394 q = __t / __b; \ 5395 r = __t % __b; \ 5396 } while (0) 5397 5398 #else 5399 5400 #define muldq(r1, r0, a, b) \ 5401 do { \ 5402 uint64_t __t; \ 5403 __t = (uint64_t)(a) * (uint64_t)(b); \ 5404 r0 = __t; \ 5405 r1 = __t >> 32; \ 5406 } while (0) 5407 5408 #define divdq(q, r, a1, a0, b) \ 5409 do { \ 5410 uint64_t __t; \ 5411 limb_t __b = (b); \ 5412 __t = ((uint64_t)(a1) << 32) | (a0); \ 5413 q = __t / __b; \ 5414 r = __t % __b; \ 5415 } while (0) 5416 5417 #endif /* LIMB_BITS != 64 */ 5418 5419 #if LIMB_DIGITS == 19 5420 5421 /* WARNING: hardcoded for b = 1e19. It is assumed that: 5422 0 <= a1 < 2^63 */ 5423 #define divdq_base(q, r, a1, a0)\ 5424 do {\ 5425 uint64_t __a0, __a1, __t0, __t1, __b = BF_DEC_BASE; \ 5426 __a0 = a0;\ 5427 __a1 = a1;\ 5428 __t0 = __a1;\ 5429 __t0 = shld(__t0, __a0, 1);\ 5430 muldq(q, __t1, __t0, UINT64_C(17014118346046923173)); \ 5431 muldq(__t1, __t0, q, __b);\ 5432 subdq(__a1, __a0, __t1, __t0);\ 5433 subdq(__a1, __a0, 1, __b * 2); \ 5434 __t0 = (slimb_t)__a1 >> 1; \ 5435 q += 2 + __t0;\ 5436 adddq(__a1, __a0, 0, __b & __t0);\ 5437 q += __a1; \ 5438 __a0 += __b & __a1; \ 5439 r = __a0;\ 5440 } while(0) 5441 5442 #elif LIMB_DIGITS == 9 5443 5444 /* WARNING: hardcoded for b = 1e9. 
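   The multiplier 2305843009 used below is floor(2^61 / 10^9): q is first
   computed as ((a / 2^29) * 2305843009) / 2^32, an under-estimate of
   a / 10^9 which the final compare-and-subtract step corrects.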
It is assumed that:
5445 0 <= a1 < 2^29 */
5446 #define divdq_base(q, r, a1, a0)\
5447 do {\
5448 uint32_t __t0, __t1, __b = BF_DEC_BASE; \
5449 __t0 = a1;\
5450 __t1 = a0;\
5451 __t0 = (__t0 << 3) | (__t1 >> (32 - 3)); \
5452 muldq(q, __t1, __t0, 2305843009U);\
5453 r = a0 - q * __b;\
5454 __t1 = (r >= __b);\
5455 q += __t1;\
5456 if (__t1)\
5457 r -= __b;\
5458 } while(0)
5459
5460 #endif
5461
5462 /* fast integer division by a fixed constant */
5463
5464 typedef struct FastDivData {
5465 limb_t m1; /* multiplier */
5466 int8_t shift1;
5467 int8_t shift2;
5468 } FastDivData;
5469
5470 /* From "Division by Invariant Integers using Multiplication" by
5471 Torbjörn Granlund and Peter L. Montgomery */
5472 /* d must be != 0 */
5473 static inline __maybe_unused void fast_udiv_init(FastDivData *s, limb_t d)
5474 {
5475 int l;
5476 limb_t q, r, m1;
5477 if (d == 1)
5478 l = 0;
5479 else
5480 l = 64 - clz64(d - 1);
5481 divdq(q, r, ((limb_t)1 << l) - d, 0, d);
5482 (void)r;
5483 m1 = q + 1;
5484 // printf("d=%lu l=%d m1=0x%016lx\n", d, l, m1);
5485 s->m1 = m1;
5486 s->shift1 = l;
5487 if (s->shift1 > 1)
5488 s->shift1 = 1;
5489 s->shift2 = l - 1;
5490 if (s->shift2 < 0)
5491 s->shift2 = 0;
5492 }
5493
5494 static inline limb_t fast_udiv(limb_t a, const FastDivData *s)
5495 {
5496 limb_t t0, t1;
5497 muldq(t1, t0, s->m1, a);
5498 t0 = (a - t1) >> s->shift1;
5499 return (t1 + t0) >> s->shift2;
5500 }
5501
5502 /* contains 10^i */
5503 const limb_t mp_pow_dec[LIMB_DIGITS + 1] = {
5504 1U,
5505 10U,
5506 100U,
5507 1000U,
5508 10000U,
5509 100000U,
5510 1000000U,
5511 10000000U,
5512 100000000U,
5513 1000000000U,
5514 #if LIMB_BITS == 64
5515 10000000000U,
5516 100000000000U,
5517 1000000000000U,
5518 10000000000000U,
5519 100000000000000U,
5520 1000000000000000U,
5521 10000000000000000U,
5522 100000000000000000U,
5523 1000000000000000000U,
5524 10000000000000000000U,
5525 #endif
5526 };
5527
5528 /* precomputed from fast_udiv_init(10^i) */
5529 static const FastDivData mp_pow_div[LIMB_DIGITS + 1] = {
5530 #if LIMB_BITS == 32
5531 { 0x00000001, 0, 0 },
5532 { 0x9999999a, 1, 3 },
5533 { 0x47ae147b, 1, 6 },
5534 { 0x0624dd30, 1, 9 },
5535 { 0xa36e2eb2, 1, 13 },
5536 { 0x4f8b588f, 1, 16 },
5537 { 0x0c6f7a0c, 1, 19 },
5538 { 0xad7f29ac, 1, 23 },
5539 { 0x5798ee24, 1, 26 },
5540 { 0x12e0be83, 1, 29 },
5541 #else
5542 { 0x0000000000000001, 0, 0 },
5543 { 0x999999999999999a, 1, 3 },
5544 { 0x47ae147ae147ae15, 1, 6 },
5545 { 0x0624dd2f1a9fbe77, 1, 9 },
5546 { 0xa36e2eb1c432ca58, 1, 13 },
5547 { 0x4f8b588e368f0847, 1, 16 },
5548 { 0x0c6f7a0b5ed8d36c, 1, 19 },
5549 { 0xad7f29abcaf48579, 1, 23 },
5550 { 0x5798ee2308c39dfa, 1, 26 },
5551 { 0x12e0be826d694b2f, 1, 29 },
5552 { 0xb7cdfd9d7bdbab7e, 1, 33 },
5553 { 0x5fd7fe17964955fe, 1, 36 },
5554 { 0x19799812dea11198, 1, 39 },
5555 { 0xc25c268497681c27, 1, 43 },
5556 { 0x6849b86a12b9b01f, 1, 46 },
5557 { 0x203af9ee756159b3, 1, 49 },
5558 { 0xcd2b297d889bc2b7, 1, 53 },
5559 { 0x70ef54646d496893, 1, 56 },
5560 { 0x2725dd1d243aba0f, 1, 59 },
5561 { 0xd83c94fb6d2ac34d, 1, 63 },
5562 #endif
5563 };
5564
5565 /* divide by 10^shift with 0 <= shift <= LIMB_DIGITS */
5566 static inline limb_t fast_shr_dec(limb_t a, int shift)
5567 {
5568 return fast_udiv(a, &mp_pow_div[shift]);
5569 }
5570
5571 /* division and remainder by 10^shift */
5572 #define fast_shr_rem_dec(q, r, a, shift) q = fast_shr_dec(a, shift), r = a - q * mp_pow_dec[shift]
5573
5574 limb_t mp_add_dec(limb_t *res, const limb_t *op1, const limb_t *op2,
5575 mp_size_t n, limb_t carry)
5576 {
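/* Carry propagation trick used by the decimal additions below: the
   digit is computed modulo 2^LIMB_BITS as a = v + op2[i] + k - base.
   Since op2[i] + k - base <= 0, a <= v exactly when the true sum
   reached base (carry out); otherwise the subtraction wraps far above
   base and base is added back. */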
5577 limb_t base = BF_DEC_BASE;
5578 mp_size_t i;
5579 limb_t k, a, v;
5580
5581 k=carry;
5582 for(i=0;i<n;i++) {
5583 /* XXX: reuse the trick in add_mod */
5584 v = op1[i];
5585 a = v + op2[i] + k - base;
5586 k = a <= v;
5587 if (!k)
5588 a += base;
5589 res[i]=a;
5590 }
5591 return k;
5592 }
5593
5594 limb_t mp_add_ui_dec(limb_t *tab, limb_t b, mp_size_t n)
5595 {
5596 limb_t base = BF_DEC_BASE;
5597 mp_size_t i;
5598 limb_t k, a, v;
5599
5600 k=b;
5601 for(i=0;i<n;i++) {
5602 v = tab[i];
5603 a = v + k - base;
5604 k = a <= v;
5605 if (!k)
5606 a += base;
5607 tab[i] = a;
5608 if (k == 0)
5609 break;
5610 }
5611 return k;
5612 }
5613
5614 limb_t mp_sub_dec(limb_t *res, const limb_t *op1, const limb_t *op2,
5615 mp_size_t n, limb_t carry)
5616 {
5617 limb_t base = BF_DEC_BASE;
5618 mp_size_t i;
5619 limb_t k, v, a;
5620
5621 k=carry;
5622 for(i=0;i<n;i++) {
5623 v = op1[i];
5624 a = v - op2[i] - k;
5625 k = a > v;
5626 if (k)
5627 a += base;
5628 res[i] = a;
5629 }
5630 return k;
5631 }
5632
5633 limb_t mp_sub_ui_dec(limb_t *tab, limb_t b, mp_size_t n)
5634 {
5635 limb_t base = BF_DEC_BASE;
5636 mp_size_t i;
5637 limb_t k, v, a;
5638
5639 k=b;
5640 for(i=0;i<n;i++) {
5641 v = tab[i];
5642 a = v - k;
5643 k = a > v;
5644 if (k)
5645 a += base;
5646 tab[i]=a;
5647 if (k == 0)
5648 break;
5649 }
5650 return k;
5651 }
5652
5653 /* taba[] = taba[] * b + l. 0 <= b, l <= base - 1. Return the high carry */
5654 limb_t mp_mul1_dec(limb_t *tabr, const limb_t *taba, mp_size_t n,
5655 limb_t b, limb_t l)
5656 {
5657 mp_size_t i;
5658 limb_t t0, t1, r;
5659
5660 for(i = 0; i < n; i++) {
5661 muldq(t1, t0, taba[i], b);
5662 adddq(t1, t0, 0, l);
5663 divdq_base(l, r, t1, t0);
5664 tabr[i] = r;
5665 }
5666 return l;
5667 }
5668
5669 /* tabr[] += taba[] * b. 0 <= b <= base - 1. Return the value to add
5670 to the high word */
5671 limb_t mp_add_mul1_dec(limb_t *tabr, const limb_t *taba, mp_size_t n,
5672 limb_t b)
5673 {
5674 mp_size_t i;
5675 limb_t l, t0, t1, r;
5676
5677 l = 0;
5678 for(i = 0; i < n; i++) {
5679 muldq(t1, t0, taba[i], b);
5680 adddq(t1, t0, 0, l);
5681 adddq(t1, t0, 0, tabr[i]);
5682 divdq_base(l, r, t1, t0);
5683 tabr[i] = r;
5684 }
5685 return l;
5686 }
5687
5688 /* tabr[] -= taba[] * b. 0 <= b <= base - 1. Return the value to
5689 subtract from the high word. */
5690 limb_t mp_sub_mul1_dec(limb_t *tabr, const limb_t *taba, mp_size_t n,
5691 limb_t b)
5692 {
5693 limb_t base = BF_DEC_BASE;
5694 mp_size_t i;
5695 limb_t l, t0, t1, r, a, v, c;
5696
5697 /* XXX: optimize */
5698 l = 0;
5699 for(i = 0; i < n; i++) {
5700 muldq(t1, t0, taba[i], b);
5701 adddq(t1, t0, 0, l);
5702 divdq_base(l, r, t1, t0);
5703 v = tabr[i];
5704 a = v - r;
5705 c = a > v;
5706 if (c)
5707 a += base;
5708 /* never bigger than base because r = 0 when l = base - 1 */
5709 l += c;
5710 tabr[i] = a;
5711 }
5712 return l;
5713 }
5714
5715 /* size of the result: op1_size + op2_size. */
5716 void mp_mul_basecase_dec(limb_t *result,
5717 const limb_t *op1, mp_size_t op1_size,
5718 const limb_t *op2, mp_size_t op2_size)
5719 {
5720 mp_size_t i;
5721 limb_t r;
5722
5723 result[op1_size] = mp_mul1_dec(result, op1, op1_size, op2[0], 0);
5724
5725 for(i=1;i<op2_size;i++) {
5726 r = mp_add_mul1_dec(result + i, op1, op1_size, op2[i]);
5727 result[i + op1_size] = r;
5728 }
5729 }
5730
5731 /* taba[] = (taba[] + r*base^na) / b. 0 <= b < base. 0 <= r <
5732 b. Return the remainder.
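   Three code paths are used below: a dedicated shift-based path for
   b = 2 (the base is even), a path based on a precomputed reciprocal
   (udiv1norm) when at least UDIV1NORM_THRESHOLD limbs are processed,
   and a plain double-limb division otherwise.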
*/
5733 limb_t mp_div1_dec(limb_t *tabr, const limb_t *taba, mp_size_t na,
5734 limb_t b, limb_t r)
5735 {
5736 limb_t base = BF_DEC_BASE;
5737 mp_size_t i;
5738 limb_t t0, t1, q;
5739 int shift;
5740
5741 #if (BF_DEC_BASE % 2) == 0
5742 if (b == 2) {
5743 limb_t base_div2;
5744 /* Note: only works if base is even */
5745 base_div2 = base >> 1;
5746 if (r)
5747 r = base_div2;
5748 for(i = na - 1; i >= 0; i--) {
5749 t0 = taba[i];
5750 tabr[i] = (t0 >> 1) + r;
5751 r = 0;
5752 if (t0 & 1)
5753 r = base_div2;
5754 }
5755 if (r)
5756 r = 1;
5757 } else
5758 #endif
5759 if (na >= UDIV1NORM_THRESHOLD) {
5760 shift = clz(b);
5761 if (shift == 0) {
5762 /* normalized case: b >= 2^(LIMB_BITS-1) */
5763 limb_t b_inv;
5764 b_inv = udiv1norm_init(b);
5765 for(i = na - 1; i >= 0; i--) {
5766 muldq(t1, t0, r, base);
5767 adddq(t1, t0, 0, taba[i]);
5768 q = udiv1norm(&r, t1, t0, b, b_inv);
5769 tabr[i] = q;
5770 }
5771 } else {
5772 limb_t b_inv;
5773 b <<= shift;
5774 b_inv = udiv1norm_init(b);
5775 for(i = na - 1; i >= 0; i--) {
5776 muldq(t1, t0, r, base);
5777 adddq(t1, t0, 0, taba[i]);
5778 t1 = (t1 << shift) | (t0 >> (LIMB_BITS - shift));
5779 t0 <<= shift;
5780 q = udiv1norm(&r, t1, t0, b, b_inv);
5781 r >>= shift;
5782 tabr[i] = q;
5783 }
5784 }
5785 } else {
5786 for(i = na - 1; i >= 0; i--) {
5787 muldq(t1, t0, r, base);
5788 adddq(t1, t0, 0, taba[i]);
5789 divdq(q, r, t1, t0, b);
5790 tabr[i] = q;
5791 }
5792 }
5793 return r;
5794 }
5795
5796 static __maybe_unused void mp_print_str_dec(const char *str,
5797 const limb_t *tab, slimb_t n)
5798 {
5799 slimb_t i;
5800 printf("%s=", str);
5801 for(i = n - 1; i >= 0; i--) {
5802 if (i != n - 1)
5803 printf("_");
5804 printf("%0*" PRIu_LIMB, LIMB_DIGITS, tab[i]);
5805 }
5806 printf("\n");
5807 }
5808
5809 static __maybe_unused void mp_print_str_h_dec(const char *str,
5810 const limb_t *tab, slimb_t n,
5811 limb_t high)
5812 {
5813 slimb_t i;
5814 printf("%s=", str);
5815 printf("%0*" PRIu_LIMB, LIMB_DIGITS, high);
5816 for(i = n - 1; i >= 0; i--) {
5817 printf("_");
5818 printf("%0*" PRIu_LIMB, LIMB_DIGITS, tab[i]);
5819 }
5820 printf("\n");
5821 }
5822
5823 //#define DEBUG_DIV_SLOW
5824
5825 #define DIV_STATIC_ALLOC_LEN 16
5826
5827 /* return q = a / b and r = a % b.
5828
5829 taba[na] must be allocated if tabb1[nb - 1] < B / 2. tabb1[nb - 1]
5830 must be != zero. na must be >= nb. 's' can be NULL if tabb1[nb - 1]
5831 >= B / 2.
5832
5833 The remainder is returned in taba and contains nb limbs. tabq
5834 contains na - nb + 1 limbs. No overlap is permitted.
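   The divisor is first normalized: when its top limb is < B / 2, both
   operands are scaled by mult = base / (r + 1) so that the top limb
   becomes >= B / 2. Each quotient limb is then estimated from the two
   top limbs of the partial remainder and fixed up by add-back steps
   when the estimate is too large.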
5835 5836 Running time of the standard method: (na - nb + 1) * nb 5837 Return 0 if OK, -1 if memory alloc error 5838 */ 5839 /* XXX: optimize */ 5840 static int mp_div_dec(bf_context_t *s, limb_t *tabq, 5841 limb_t *taba, mp_size_t na, 5842 const limb_t *tabb1, mp_size_t nb) 5843 { 5844 limb_t base = BF_DEC_BASE; 5845 limb_t r, mult, t0, t1, a, c, q, v, *tabb; 5846 mp_size_t i, j; 5847 limb_t static_tabb[DIV_STATIC_ALLOC_LEN]; 5848 5849 #ifdef DEBUG_DIV_SLOW 5850 mp_print_str_dec("a", taba, na); 5851 mp_print_str_dec("b", tabb1, nb); 5852 #endif 5853 5854 /* normalize tabb */ 5855 r = tabb1[nb - 1]; 5856 assert(r != 0); 5857 i = na - nb; 5858 if (r >= BF_DEC_BASE / 2) { 5859 mult = 1; 5860 tabb = (limb_t *)tabb1; 5861 q = 1; 5862 for(j = nb - 1; j >= 0; j--) { 5863 if (taba[i + j] != tabb[j]) { 5864 if (taba[i + j] < tabb[j]) 5865 q = 0; 5866 break; 5867 } 5868 } 5869 tabq[i] = q; 5870 if (q) { 5871 mp_sub_dec(taba + i, taba + i, tabb, nb, 0); 5872 } 5873 i--; 5874 } else { 5875 mult = base / (r + 1); 5876 if (likely(nb <= DIV_STATIC_ALLOC_LEN)) { 5877 tabb = static_tabb; 5878 } else { 5879 tabb = bf_malloc(s, sizeof(limb_t) * nb); 5880 if (!tabb) 5881 return -1; 5882 } 5883 mp_mul1_dec(tabb, tabb1, nb, mult, 0); 5884 taba[na] = mp_mul1_dec(taba, taba, na, mult, 0); 5885 } 5886 5887 #ifdef DEBUG_DIV_SLOW 5888 printf("mult=" FMT_LIMB "\n", mult); 5889 mp_print_str_dec("a_norm", taba, na + 1); 5890 mp_print_str_dec("b_norm", tabb, nb); 5891 #endif 5892 5893 for(; i >= 0; i--) { 5894 if (unlikely(taba[i + nb] >= tabb[nb - 1])) { 5895 /* XXX: check if it is really possible */ 5896 q = base - 1; 5897 } else { 5898 muldq(t1, t0, taba[i + nb], base); 5899 adddq(t1, t0, 0, taba[i + nb - 1]); 5900 divdq(q, r, t1, t0, tabb[nb - 1]); 5901 } 5902 // printf("i=%d q1=%ld\n", i, q); 5903 5904 r = mp_sub_mul1_dec(taba + i, tabb, nb, q); 5905 // mp_dump("r1", taba + i, nb, bd); 5906 // printf("r2=%ld\n", r); 5907 5908 v = taba[i + nb]; 5909 a = v - r; 5910 c = a > v; 5911 if (c) 5912 a += base; 5913 taba[i + nb] = a; 5914 5915 if (c != 0) { 5916 /* negative result */ 5917 for(;;) { 5918 q--; 5919 c = mp_add_dec(taba + i, taba + i, tabb, nb, 0); 5920 /* propagate carry and test if positive result */ 5921 if (c != 0) { 5922 if (++taba[i + nb] == base) { 5923 break; 5924 } 5925 } 5926 } 5927 } 5928 tabq[i] = q; 5929 } 5930 5931 #ifdef DEBUG_DIV_SLOW 5932 mp_print_str_dec("q", tabq, na - nb + 1); 5933 mp_print_str_dec("r", taba, nb); 5934 #endif 5935 5936 /* remove the normalization */ 5937 if (mult != 1) { 5938 mp_div1_dec(taba, taba, nb, mult, 0); 5939 if (unlikely(tabb != static_tabb)) 5940 bf_free(s, tabb); 5941 } 5942 return 0; 5943 } 5944 5945 /* divide by 10^shift */ 5946 static limb_t mp_shr_dec(limb_t *tab_r, const limb_t *tab, mp_size_t n, 5947 limb_t shift, limb_t high) 5948 { 5949 mp_size_t i; 5950 limb_t l, a, q, r; 5951 5952 assert(shift >= 1 && shift < LIMB_DIGITS); 5953 l = high; 5954 for(i = n - 1; i >= 0; i--) { 5955 a = tab[i]; 5956 fast_shr_rem_dec(q, r, a, shift); 5957 tab_r[i] = q + l * mp_pow_dec[LIMB_DIGITS - shift]; 5958 l = r; 5959 } 5960 return l; 5961 } 5962 5963 /* multiply by 10^shift */ 5964 static limb_t mp_shl_dec(limb_t *tab_r, const limb_t *tab, mp_size_t n, 5965 limb_t shift, limb_t low) 5966 { 5967 mp_size_t i; 5968 limb_t l, a, q, r; 5969 5970 assert(shift >= 1 && shift < LIMB_DIGITS); 5971 l = low; 5972 for(i = 0; i < n; i++) { 5973 a = tab[i]; 5974 fast_shr_rem_dec(q, r, a, LIMB_DIGITS - shift); 5975 tab_r[i] = r * mp_pow_dec[shift] + l; 5976 l = q; 5977 } 5978 return l; 
5979 }
5980
5981 static limb_t mp_sqrtrem2_dec(limb_t *tabs, limb_t *taba)
5982 {
5983 int k;
5984 dlimb_t a, b, r;
5985 limb_t taba1[2], s, r0, r1;
5986
5987 /* convert to binary and normalize */
5988 a = (dlimb_t)taba[1] * BF_DEC_BASE + taba[0];
5989 k = clz(a >> LIMB_BITS) & ~1;
5990 b = a << k;
5991 taba1[0] = b;
5992 taba1[1] = b >> LIMB_BITS;
5993 mp_sqrtrem2(&s, taba1);
5994 s >>= (k >> 1);
5995 /* convert the remainder back to decimal */
5996 r = a - (dlimb_t)s * (dlimb_t)s;
5997 divdq_base(r1, r0, r >> LIMB_BITS, r);
5998 taba[0] = r0;
5999 tabs[0] = s;
6000 return r1;
6001 }
6002
6003 //#define DEBUG_SQRTREM_DEC
6004
6005 /* tmp_buf must contain (n / 2 + 1 limbs) */
6006 static limb_t mp_sqrtrem_rec_dec(limb_t *tabs, limb_t *taba, limb_t n,
6007 limb_t *tmp_buf)
6008 {
6009 limb_t l, h, rh, ql, qh, c, i;
6010
6011 if (n == 1)
6012 return mp_sqrtrem2_dec(tabs, taba);
6013 #ifdef DEBUG_SQRTREM_DEC
6014 mp_print_str_dec("a", taba, 2 * n);
6015 #endif
6016 l = n / 2;
6017 h = n - l;
6018 qh = mp_sqrtrem_rec_dec(tabs + l, taba + 2 * l, h, tmp_buf);
6019 #ifdef DEBUG_SQRTREM_DEC
6020 mp_print_str_dec("s1", tabs + l, h);
6021 mp_print_str_h_dec("r1", taba + 2 * l, h, qh);
6022 mp_print_str_h_dec("r2", taba + l, n, qh);
6023 #endif
6024
6025 /* the remainder is in taba + 2 * l. Its high bit is in qh */
6026 if (qh) {
6027 mp_sub_dec(taba + 2 * l, taba + 2 * l, tabs + l, h, 0);
6028 }
6029 /* instead of dividing by 2*s, divide by s (which is normalized)
6030 and update q and r */
6031 mp_div_dec(NULL, tmp_buf, taba + l, n, tabs + l, h);
6032 qh += tmp_buf[l];
6033 for(i = 0; i < l; i++)
6034 tabs[i] = tmp_buf[i];
6035 ql = mp_div1_dec(tabs, tabs, l, 2, qh & 1);
6036 qh = qh >> 1; /* 0 or 1 */
6037 if (ql)
6038 rh = mp_add_dec(taba + l, taba + l, tabs + l, h, 0);
6039 else
6040 rh = 0;
6041 #ifdef DEBUG_SQRTREM_DEC
6042 mp_print_str_h_dec("q", tabs, l, qh);
6043 mp_print_str_h_dec("u", taba + l, h, rh);
6044 #endif
6045
6046 mp_add_ui_dec(tabs + l, qh, h);
6047 #ifdef DEBUG_SQRTREM_DEC
6048 mp_print_str_dec("s2", tabs, n);
6049 #endif
6050
6051 /* q = qh, tabs[l - 1 ... 0], r = taba[n - 1 ... l] */
6052 /* subtract q^2. if qh = 1 then q = B^l, so we can take shortcuts */
6053 if (qh) {
6054 c = qh;
6055 } else {
6056 mp_mul_basecase_dec(taba + n, tabs, l, tabs, l);
6057 c = mp_sub_dec(taba, taba, taba + n, 2 * l, 0);
6058 }
6059 rh -= mp_sub_ui_dec(taba + 2 * l, c, n - 2 * l);
6060 if ((slimb_t)rh < 0) {
6061 mp_sub_ui_dec(tabs, 1, n);
6062 rh += mp_add_mul1_dec(taba, tabs, n, 2);
6063 rh += mp_add_ui_dec(taba, 1, n);
6064 }
6065 return rh;
6066 }
6067
6068 /* 'taba' has 2*n limbs with n >= 1 and taba[2*n-1] >= B/4. Return (s,
6069 r) with s=floor(sqrt(a)) and r=a-s^2. 0 <= r <= 2 * s. tabs has n
6070 limbs. r is returned in the lower n limbs of taba; its high limb is
6071 stored in taba[n]. Return 0 if OK, -1 in case of memory alloc error.
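   The recursion uses the classical divide-and-conquer square root: the
   root s1 of the upper half is computed first, the remainder is divided
   by the normalized s1 (instead of 2*s1, with the quotient halved
   afterwards) to get the lower limbs of the root, and a final step
   subtracts q^2 and decrements the root by one if the remainder went
   negative.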
*/ 6072 int mp_sqrtrem_dec(bf_context_t *s, limb_t *tabs, limb_t *taba, limb_t n) 6073 { 6074 limb_t tmp_buf1[8]; 6075 limb_t *tmp_buf; 6076 mp_size_t n2; 6077 n2 = n / 2 + 1; 6078 if (n2 <= countof(tmp_buf1)) { 6079 tmp_buf = tmp_buf1; 6080 } else { 6081 tmp_buf = bf_malloc(s, sizeof(limb_t) * n2); 6082 if (!tmp_buf) 6083 return -1; 6084 } 6085 taba[n] = mp_sqrtrem_rec_dec(tabs, taba, n, tmp_buf); 6086 if (tmp_buf != tmp_buf1) 6087 bf_free(s, tmp_buf); 6088 return 0; 6089 } 6090 6091 /* return the number of leading zero digits, from 0 to LIMB_DIGITS */ 6092 static int clz_dec(limb_t a) 6093 { 6094 if (a == 0) 6095 return LIMB_DIGITS; 6096 switch(LIMB_BITS - 1 - clz(a)) { 6097 case 0: /* 1-1 */ 6098 return LIMB_DIGITS - 1; 6099 case 1: /* 2-3 */ 6100 return LIMB_DIGITS - 1; 6101 case 2: /* 4-7 */ 6102 return LIMB_DIGITS - 1; 6103 case 3: /* 8-15 */ 6104 if (a < 10) 6105 return LIMB_DIGITS - 1; 6106 else 6107 return LIMB_DIGITS - 2; 6108 case 4: /* 16-31 */ 6109 return LIMB_DIGITS - 2; 6110 case 5: /* 32-63 */ 6111 return LIMB_DIGITS - 2; 6112 case 6: /* 64-127 */ 6113 if (a < 100) 6114 return LIMB_DIGITS - 2; 6115 else 6116 return LIMB_DIGITS - 3; 6117 case 7: /* 128-255 */ 6118 return LIMB_DIGITS - 3; 6119 case 8: /* 256-511 */ 6120 return LIMB_DIGITS - 3; 6121 case 9: /* 512-1023 */ 6122 if (a < 1000) 6123 return LIMB_DIGITS - 3; 6124 else 6125 return LIMB_DIGITS - 4; 6126 case 10: /* 1024-2047 */ 6127 return LIMB_DIGITS - 4; 6128 case 11: /* 2048-4095 */ 6129 return LIMB_DIGITS - 4; 6130 case 12: /* 4096-8191 */ 6131 return LIMB_DIGITS - 4; 6132 case 13: /* 8192-16383 */ 6133 if (a < 10000) 6134 return LIMB_DIGITS - 4; 6135 else 6136 return LIMB_DIGITS - 5; 6137 case 14: /* 16384-32767 */ 6138 return LIMB_DIGITS - 5; 6139 case 15: /* 32768-65535 */ 6140 return LIMB_DIGITS - 5; 6141 case 16: /* 65536-131071 */ 6142 if (a < 100000) 6143 return LIMB_DIGITS - 5; 6144 else 6145 return LIMB_DIGITS - 6; 6146 case 17: /* 131072-262143 */ 6147 return LIMB_DIGITS - 6; 6148 case 18: /* 262144-524287 */ 6149 return LIMB_DIGITS - 6; 6150 case 19: /* 524288-1048575 */ 6151 if (a < 1000000) 6152 return LIMB_DIGITS - 6; 6153 else 6154 return LIMB_DIGITS - 7; 6155 case 20: /* 1048576-2097151 */ 6156 return LIMB_DIGITS - 7; 6157 case 21: /* 2097152-4194303 */ 6158 return LIMB_DIGITS - 7; 6159 case 22: /* 4194304-8388607 */ 6160 return LIMB_DIGITS - 7; 6161 case 23: /* 8388608-16777215 */ 6162 if (a < 10000000) 6163 return LIMB_DIGITS - 7; 6164 else 6165 return LIMB_DIGITS - 8; 6166 case 24: /* 16777216-33554431 */ 6167 return LIMB_DIGITS - 8; 6168 case 25: /* 33554432-67108863 */ 6169 return LIMB_DIGITS - 8; 6170 case 26: /* 67108864-134217727 */ 6171 if (a < 100000000) 6172 return LIMB_DIGITS - 8; 6173 else 6174 return LIMB_DIGITS - 9; 6175 #if LIMB_BITS == 64 6176 case 27: /* 134217728-268435455 */ 6177 return LIMB_DIGITS - 9; 6178 case 28: /* 268435456-536870911 */ 6179 return LIMB_DIGITS - 9; 6180 case 29: /* 536870912-1073741823 */ 6181 if (a < 1000000000) 6182 return LIMB_DIGITS - 9; 6183 else 6184 return LIMB_DIGITS - 10; 6185 case 30: /* 1073741824-2147483647 */ 6186 return LIMB_DIGITS - 10; 6187 case 31: /* 2147483648-4294967295 */ 6188 return LIMB_DIGITS - 10; 6189 case 32: /* 4294967296-8589934591 */ 6190 return LIMB_DIGITS - 10; 6191 case 33: /* 8589934592-17179869183 */ 6192 if (a < 10000000000) 6193 return LIMB_DIGITS - 10; 6194 else 6195 return LIMB_DIGITS - 11; 6196 case 34: /* 17179869184-34359738367 */ 6197 return LIMB_DIGITS - 11; 6198 case 35: /* 34359738368-68719476735 */ 6199 return 
LIMB_DIGITS - 11; 6200 case 36: /* 68719476736-137438953471 */ 6201 if (a < 100000000000) 6202 return LIMB_DIGITS - 11; 6203 else 6204 return LIMB_DIGITS - 12; 6205 case 37: /* 137438953472-274877906943 */ 6206 return LIMB_DIGITS - 12; 6207 case 38: /* 274877906944-549755813887 */ 6208 return LIMB_DIGITS - 12; 6209 case 39: /* 549755813888-1099511627775 */ 6210 if (a < 1000000000000) 6211 return LIMB_DIGITS - 12; 6212 else 6213 return LIMB_DIGITS - 13; 6214 case 40: /* 1099511627776-2199023255551 */ 6215 return LIMB_DIGITS - 13; 6216 case 41: /* 2199023255552-4398046511103 */ 6217 return LIMB_DIGITS - 13; 6218 case 42: /* 4398046511104-8796093022207 */ 6219 return LIMB_DIGITS - 13; 6220 case 43: /* 8796093022208-17592186044415 */ 6221 if (a < 10000000000000) 6222 return LIMB_DIGITS - 13; 6223 else 6224 return LIMB_DIGITS - 14; 6225 case 44: /* 17592186044416-35184372088831 */ 6226 return LIMB_DIGITS - 14; 6227 case 45: /* 35184372088832-70368744177663 */ 6228 return LIMB_DIGITS - 14; 6229 case 46: /* 70368744177664-140737488355327 */ 6230 if (a < 100000000000000) 6231 return LIMB_DIGITS - 14; 6232 else 6233 return LIMB_DIGITS - 15; 6234 case 47: /* 140737488355328-281474976710655 */ 6235 return LIMB_DIGITS - 15; 6236 case 48: /* 281474976710656-562949953421311 */ 6237 return LIMB_DIGITS - 15; 6238 case 49: /* 562949953421312-1125899906842623 */ 6239 if (a < 1000000000000000) 6240 return LIMB_DIGITS - 15; 6241 else 6242 return LIMB_DIGITS - 16; 6243 case 50: /* 1125899906842624-2251799813685247 */ 6244 return LIMB_DIGITS - 16; 6245 case 51: /* 2251799813685248-4503599627370495 */ 6246 return LIMB_DIGITS - 16; 6247 case 52: /* 4503599627370496-9007199254740991 */ 6248 return LIMB_DIGITS - 16; 6249 case 53: /* 9007199254740992-18014398509481983 */ 6250 if (a < 10000000000000000) 6251 return LIMB_DIGITS - 16; 6252 else 6253 return LIMB_DIGITS - 17; 6254 case 54: /* 18014398509481984-36028797018963967 */ 6255 return LIMB_DIGITS - 17; 6256 case 55: /* 36028797018963968-72057594037927935 */ 6257 return LIMB_DIGITS - 17; 6258 case 56: /* 72057594037927936-144115188075855871 */ 6259 if (a < 100000000000000000) 6260 return LIMB_DIGITS - 17; 6261 else 6262 return LIMB_DIGITS - 18; 6263 case 57: /* 144115188075855872-288230376151711743 */ 6264 return LIMB_DIGITS - 18; 6265 case 58: /* 288230376151711744-576460752303423487 */ 6266 return LIMB_DIGITS - 18; 6267 case 59: /* 576460752303423488-1152921504606846975 */ 6268 if (a < 1000000000000000000) 6269 return LIMB_DIGITS - 18; 6270 else 6271 return LIMB_DIGITS - 19; 6272 #endif 6273 default: 6274 return 0; 6275 } 6276 } 6277 6278 /* for debugging */ 6279 void bfdec_print_str(const char *str, const bfdec_t *a) 6280 { 6281 slimb_t i; 6282 printf("%s=", str); 6283 6284 if (a->expn == BF_EXP_NAN) { 6285 printf("NaN"); 6286 } else { 6287 if (a->sign) 6288 putchar('-'); 6289 if (a->expn == BF_EXP_ZERO) { 6290 putchar('0'); 6291 } else if (a->expn == BF_EXP_INF) { 6292 printf("Inf"); 6293 } else { 6294 printf("0."); 6295 for(i = a->len - 1; i >= 0; i--) 6296 printf("%0*" PRIu_LIMB, LIMB_DIGITS, a->tab[i]); 6297 printf("e%" PRId_LIMB, a->expn); 6298 } 6299 } 6300 printf("\n"); 6301 } 6302 6303 /* return != 0 if one digit between 0 and bit_pos inclusive is not zero. 
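   (bit_pos is a decimal digit position counted from the least
   significant digit of the mantissa, despite its name)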
*/
6304 static inline limb_t scan_digit_nz(const bfdec_t *r, slimb_t bit_pos)
6305 {
6306 slimb_t pos;
6307 limb_t v, q;
6308 int shift;
6309
6310 if (bit_pos < 0)
6311 return 0;
6312 pos = (limb_t)bit_pos / LIMB_DIGITS;
6313 shift = (limb_t)bit_pos % LIMB_DIGITS;
6314 fast_shr_rem_dec(q, v, r->tab[pos], shift + 1);
6315 (void)q;
6316 if (v != 0)
6317 return 1;
6318 pos--;
6319 while (pos >= 0) {
6320 if (r->tab[pos] != 0)
6321 return 1;
6322 pos--;
6323 }
6324 return 0;
6325 }
6326
6327 static limb_t get_digit(const limb_t *tab, limb_t len, slimb_t pos)
6328 {
6329 slimb_t i;
6330 int shift;
6331 i = floor_div(pos, LIMB_DIGITS);
6332 if (i < 0 || i >= len)
6333 return 0;
6334 shift = pos - i * LIMB_DIGITS;
6335 return fast_shr_dec(tab[i], shift) % 10;
6336 }
6337
6338 #if 0
6339 static limb_t get_digits(const limb_t *tab, limb_t len, slimb_t pos)
6340 {
6341 limb_t a0, a1;
6342 int shift;
6343 slimb_t i;
6344
6345 i = floor_div(pos, LIMB_DIGITS);
6346 shift = pos - i * LIMB_DIGITS;
6347 if (i >= 0 && i < len)
6348 a0 = tab[i];
6349 else
6350 a0 = 0;
6351 if (shift == 0) {
6352 return a0;
6353 } else {
6354 i++;
6355 if (i >= 0 && i < len)
6356 a1 = tab[i];
6357 else
6358 a1 = 0;
6359 return fast_shr_dec(a0, shift) +
6360 fast_urem(a1, &mp_pow_div[LIMB_DIGITS - shift]) *
6361 mp_pow_dec[shift];
6362 }
6363 }
6364 #endif
6365
6366 /* return the addend for rounding. Note that prec can be <= 0 for bfdec_rint() */
6367 static int bfdec_get_rnd_add(int *pret, const bfdec_t *r, limb_t l,
6368 slimb_t prec, int rnd_mode)
6369 {
6370 int add_one, inexact;
6371 limb_t digit1, digit0;
6372
6373 // bfdec_print_str("get_rnd_add", r);
6374 if (rnd_mode == BF_RNDF) {
6375 digit0 = 1; /* faithful rounding does not honor the INEXACT flag */
6376 } else {
6377 /* starting limb for digit 'prec + 1' */
6378 digit0 = scan_digit_nz(r, l * LIMB_DIGITS - 1 - bf_max(0, prec + 1));
6379 }
6380
6381 /* get the digit at 'prec' */
6382 digit1 = get_digit(r->tab, l, l * LIMB_DIGITS - 1 - prec);
6383 inexact = (digit1 | digit0) != 0;
6384
6385 add_one = 0;
6386 switch(rnd_mode) {
6387 case BF_RNDZ:
6388 break;
6389 case BF_RNDN:
6390 if (digit1 == 5) {
6391 if (digit0) {
6392 add_one = 1;
6393 } else {
6394 /* round to even */
6395 add_one =
6396 get_digit(r->tab, l, l * LIMB_DIGITS - 1 - (prec - 1)) & 1;
6397 }
6398 } else if (digit1 > 5) {
6399 add_one = 1;
6400 }
6401 break;
6402 case BF_RNDD:
6403 case BF_RNDU:
6404 if (r->sign == (rnd_mode == BF_RNDD))
6405 add_one = inexact;
6406 break;
6407 case BF_RNDNA:
6408 case BF_RNDF:
6409 add_one = (digit1 >= 5);
6410 break;
6411 case BF_RNDA:
6412 add_one = inexact;
6413 break;
6414 default:
6415 abort();
6416 }
6417
6418 if (inexact)
6419 *pret |= BF_ST_INEXACT;
6420 return add_one;
6421 }
6422
6423 /* round to prec1 digits assuming 'r' is non zero and finite. 'r' is
6424 assumed to have length 'l' (1 <= l <= r->len). prec1 can be
6425 BF_PREC_INF. BF_FLAG_SUBNORMAL is not supported. Cannot fail with
6426 BF_ST_MEM_ERROR.
6427 */
6428 static int __bfdec_round(bfdec_t *r, limb_t prec1, bf_flags_t flags, limb_t l)
6429 {
6430 int shift, add_one, rnd_mode, ret;
6431 slimb_t i, bit_pos, pos, e_min, e_max, e_range, prec;
6432
6433 /* XXX: align to IEEE 754 2008 for decimal numbers ?
*/
6434 e_range = (limb_t)1 << (bf_get_exp_bits(flags) - 1);
6435 e_min = -e_range + 3;
6436 e_max = e_range;
6437
6438 if (flags & BF_FLAG_RADPNT_PREC) {
6439 /* 'prec' is the precision after the decimal point */
6440 if (prec1 != BF_PREC_INF)
6441 prec = r->expn + prec1;
6442 else
6443 prec = prec1;
6444 } else if (unlikely(r->expn < e_min) && (flags & BF_FLAG_SUBNORMAL)) {
6445 /* restrict the precision in case of potentially subnormal
6446 result */
6447 assert(prec1 != BF_PREC_INF);
6448 prec = prec1 - (e_min - r->expn);
6449 } else {
6450 prec = prec1;
6451 }
6452
6453 /* round to prec digits */
6454 rnd_mode = flags & BF_RND_MASK;
6455 ret = 0;
6456 add_one = bfdec_get_rnd_add(&ret, r, l, prec, rnd_mode);
6457
6458 if (prec <= 0) {
6459 if (add_one) {
6460 bfdec_resize(r, 1); /* cannot fail because r is non zero */
6461 r->tab[0] = BF_DEC_BASE / 10;
6462 r->expn += 1 - prec;
6463 ret |= BF_ST_UNDERFLOW | BF_ST_INEXACT;
6464 return ret;
6465 } else {
6466 goto underflow;
6467 }
6468 } else if (add_one) {
6469 limb_t carry;
6470
6471 /* add one starting at digit 'prec - 1' */
6472 bit_pos = l * LIMB_DIGITS - 1 - (prec - 1);
6473 pos = bit_pos / LIMB_DIGITS;
6474 carry = mp_pow_dec[bit_pos % LIMB_DIGITS];
6475 carry = mp_add_ui_dec(r->tab + pos, carry, l - pos);
6476 if (carry) {
6477 /* shift right by one digit */
6478 mp_shr_dec(r->tab + pos, r->tab + pos, l - pos, 1, 1);
6479 r->expn++;
6480 }
6481 }
6482
6483 /* check underflow */
6484 if (unlikely(r->expn < e_min)) {
6485 if (flags & BF_FLAG_SUBNORMAL) {
6486 /* if inexact, also set the underflow flag */
6487 if (ret & BF_ST_INEXACT)
6488 ret |= BF_ST_UNDERFLOW;
6489 } else {
6490 underflow:
6491 bfdec_set_zero(r, r->sign);
6492 ret |= BF_ST_UNDERFLOW | BF_ST_INEXACT;
6493 return ret;
6494 }
6495 }
6496
6497 /* check overflow */
6498 if (unlikely(r->expn > e_max)) {
6499 bfdec_set_inf(r, r->sign);
6500 ret |= BF_ST_OVERFLOW | BF_ST_INEXACT;
6501 return ret;
6502 }
6503
6504 /* keep the digits starting at 'prec - 1' */
6505 bit_pos = l * LIMB_DIGITS - 1 - (prec - 1);
6506 i = floor_div(bit_pos, LIMB_DIGITS);
6507 if (i >= 0) {
6508 shift = smod(bit_pos, LIMB_DIGITS);
6509 if (shift != 0) {
6510 r->tab[i] = fast_shr_dec(r->tab[i], shift) *
6511 mp_pow_dec[shift];
6512 }
6513 } else {
6514 i = 0;
6515 }
6516 /* remove trailing zeros */
6517 while (r->tab[i] == 0)
6518 i++;
6519 if (i > 0) {
6520 l -= i;
6521 memmove(r->tab, r->tab + i, l * sizeof(limb_t));
6522 }
6523 bfdec_resize(r, l); /* cannot fail */
6524 return ret;
6525 }
6526
6527 /* Cannot fail with BF_ST_MEM_ERROR. */
6528 int bfdec_round(bfdec_t *r, limb_t prec, bf_flags_t flags)
6529 {
6530 if (r->len == 0)
6531 return 0;
6532 return __bfdec_round(r, prec, flags, r->len);
6533 }
6534
6535 /* 'r' must be a finite number. Cannot fail with BF_ST_MEM_ERROR.
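   Normalization removes the most significant zero limbs and shifts the
   digits left so that the leading digit of the top limb is non zero,
   adjusting expn accordingly; the result is then rounded to prec1
   digits.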
*/
6536 int bfdec_normalize_and_round(bfdec_t *r, limb_t prec1, bf_flags_t flags)
6537 {
6538 limb_t l, v;
6539 int shift, ret;
6540
6541 // bfdec_print_str("bf_renorm", r);
6542 l = r->len;
6543 while (l > 0 && r->tab[l - 1] == 0)
6544 l--;
6545 if (l == 0) {
6546 /* zero */
6547 r->expn = BF_EXP_ZERO;
6548 bfdec_resize(r, 0); /* cannot fail */
6549 ret = 0;
6550 } else {
6551 r->expn -= (r->len - l) * LIMB_DIGITS;
6552 /* shift to have the MSB set to '1' */
6553 v = r->tab[l - 1];
6554 shift = clz_dec(v);
6555 if (shift != 0) {
6556 mp_shl_dec(r->tab, r->tab, l, shift, 0);
6557 r->expn -= shift;
6558 }
6559 ret = __bfdec_round(r, prec1, flags, l);
6560 }
6561 // bf_print_str("r_final", r);
6562 return ret;
6563 }
6564
6565 int bfdec_set_ui(bfdec_t *r, uint64_t v)
6566 {
6567 #if LIMB_BITS == 32
6568 if (v >= (uint64_t)BF_DEC_BASE * BF_DEC_BASE) { /* force a 64 bit product */
6569 if (bfdec_resize(r, 3))
6570 goto fail;
6571 r->tab[0] = v % BF_DEC_BASE;
6572 v /= BF_DEC_BASE;
6573 r->tab[1] = v % BF_DEC_BASE;
6574 r->tab[2] = v / BF_DEC_BASE;
6575 r->expn = 3 * LIMB_DIGITS;
6576 } else
6577 #endif
6578 if (v >= BF_DEC_BASE) {
6579 if (bfdec_resize(r, 2))
6580 goto fail;
6581 r->tab[0] = v % BF_DEC_BASE;
6582 r->tab[1] = v / BF_DEC_BASE;
6583 r->expn = 2 * LIMB_DIGITS;
6584 } else {
6585 if (bfdec_resize(r, 1))
6586 goto fail;
6587 r->tab[0] = v;
6588 r->expn = LIMB_DIGITS;
6589 }
6590 r->sign = 0;
6591 return bfdec_normalize_and_round(r, BF_PREC_INF, 0);
6592 fail:
6593 bfdec_set_nan(r);
6594 return BF_ST_MEM_ERROR;
6595 }
6596
6597 int bfdec_set_si(bfdec_t *r, int64_t v)
6598 {
6599 int ret;
6600 if (v < 0) {
6601 ret = bfdec_set_ui(r, -(uint64_t)v); /* avoids overflow for v = INT64_MIN */
6602 r->sign = 1;
6603 } else {
6604 ret = bfdec_set_ui(r, v);
6605 }
6606 return ret;
6607 }
6608
6609 static int bfdec_add_internal(bfdec_t *r, const bfdec_t *a, const bfdec_t *b, limb_t prec, bf_flags_t flags, int b_neg)
6610 {
6611 bf_context_t *s = r->ctx;
6612 int is_sub, cmp_res, a_sign, b_sign, ret;
6613
6614 a_sign = a->sign;
6615 b_sign = b->sign ^ b_neg;
6616 is_sub = a_sign ^ b_sign;
6617 cmp_res = bfdec_cmpu(a, b);
6618 if (cmp_res < 0) {
6619 const bfdec_t *tmp;
6620 tmp = a;
6621 a = b;
6622 b = tmp;
6623 a_sign = b_sign; /* b_sign is never used later */
6624 }
6625 /* abs(a) >= abs(b) */
6626 if (cmp_res == 0 && is_sub && a->expn < BF_EXP_INF) {
6627 /* zero result */
6628 bfdec_set_zero(r, (flags & BF_RND_MASK) == BF_RNDD);
6629 ret = 0;
6630 } else if (a->len == 0 || b->len == 0) {
6631 ret = 0;
6632 if (a->expn >= BF_EXP_INF) {
6633 if (a->expn == BF_EXP_NAN) {
6634 /* at least one operand is NaN */
6635 bfdec_set_nan(r);
6636 ret = 0;
6637 } else if (b->expn == BF_EXP_INF && is_sub) {
6638 /* infinities with different signs */
6639 bfdec_set_nan(r);
6640 ret = BF_ST_INVALID_OP;
6641 } else {
6642 bfdec_set_inf(r, a_sign);
6643 }
6644 } else {
6645 /* at least one zero and not subtract */
6646 if (bfdec_set(r, a))
6647 return BF_ST_MEM_ERROR;
6648 r->sign = a_sign;
6649 goto renorm;
6650 }
6651 } else {
6652 slimb_t d, a_offset, b_offset, i, r_len;
6653 limb_t carry;
6654 limb_t *b1_tab;
6655 int b_shift;
6656 mp_size_t b1_len;
6657
6658 d = a->expn - b->expn;
6659
6660 /* XXX: not efficient in time and memory if the precision is
6661 not infinite */
6662 r_len = bf_max(a->len, b->len + (d + LIMB_DIGITS - 1) / LIMB_DIGITS);
6663 if (bfdec_resize(r, r_len))
6664 goto fail;
6665 r->sign = a_sign;
6666 r->expn = a->expn;
6667
6668 a_offset = r_len - a->len;
6669 for(i = 0; i < a_offset; i++)
6670 r->tab[i] = 0;
6671 for(i = 0; i < a->len; i++)
6672 r->tab[a_offset + i] =
a->tab[i]; 6673 6674 b_shift = d % LIMB_DIGITS; 6675 if (b_shift == 0) { 6676 b1_len = b->len; 6677 b1_tab = (limb_t *)b->tab; 6678 } else { 6679 b1_len = b->len + 1; 6680 b1_tab = bf_malloc(s, sizeof(limb_t) * b1_len); 6681 if (!b1_tab) 6682 goto fail; 6683 b1_tab[0] = mp_shr_dec(b1_tab + 1, b->tab, b->len, b_shift, 0) * 6684 mp_pow_dec[LIMB_DIGITS - b_shift]; 6685 } 6686 b_offset = r_len - (b->len + (d + LIMB_DIGITS - 1) / LIMB_DIGITS); 6687 6688 if (is_sub) { 6689 carry = mp_sub_dec(r->tab + b_offset, r->tab + b_offset, 6690 b1_tab, b1_len, 0); 6691 if (carry != 0) { 6692 carry = mp_sub_ui_dec(r->tab + b_offset + b1_len, carry, 6693 r_len - (b_offset + b1_len)); 6694 assert(carry == 0); 6695 } 6696 } else { 6697 carry = mp_add_dec(r->tab + b_offset, r->tab + b_offset, 6698 b1_tab, b1_len, 0); 6699 if (carry != 0) { 6700 carry = mp_add_ui_dec(r->tab + b_offset + b1_len, carry, 6701 r_len - (b_offset + b1_len)); 6702 } 6703 if (carry != 0) { 6704 if (bfdec_resize(r, r_len + 1)) { 6705 if (b_shift != 0) 6706 bf_free(s, b1_tab); 6707 goto fail; 6708 } 6709 r->tab[r_len] = 1; 6710 r->expn += LIMB_DIGITS; 6711 } 6712 } 6713 if (b_shift != 0) 6714 bf_free(s, b1_tab); 6715 renorm: 6716 ret = bfdec_normalize_and_round(r, prec, flags); 6717 } 6718 return ret; 6719 fail: 6720 bfdec_set_nan(r); 6721 return BF_ST_MEM_ERROR; 6722 } 6723 6724 static int __bfdec_add(bfdec_t *r, const bfdec_t *a, const bfdec_t *b, limb_t prec, 6725 bf_flags_t flags) 6726 { 6727 return bfdec_add_internal(r, a, b, prec, flags, 0); 6728 } 6729 6730 static int __bfdec_sub(bfdec_t *r, const bfdec_t *a, const bfdec_t *b, limb_t prec, 6731 bf_flags_t flags) 6732 { 6733 return bfdec_add_internal(r, a, b, prec, flags, 1); 6734 } 6735 6736 int bfdec_add(bfdec_t *r, const bfdec_t *a, const bfdec_t *b, limb_t prec, 6737 bf_flags_t flags) 6738 { 6739 return bf_op2((bf_t *)r, (bf_t *)a, (bf_t *)b, prec, flags, 6740 (bf_op2_func_t *)__bfdec_add); 6741 } 6742 6743 int bfdec_sub(bfdec_t *r, const bfdec_t *a, const bfdec_t *b, limb_t prec, 6744 bf_flags_t flags) 6745 { 6746 return bf_op2((bf_t *)r, (bf_t *)a, (bf_t *)b, prec, flags, 6747 (bf_op2_func_t *)__bfdec_sub); 6748 } 6749 6750 int bfdec_mul(bfdec_t *r, const bfdec_t *a, const bfdec_t *b, limb_t prec, 6751 bf_flags_t flags) 6752 { 6753 int ret, r_sign; 6754 6755 if (a->len < b->len) { 6756 const bfdec_t *tmp = a; 6757 a = b; 6758 b = tmp; 6759 } 6760 r_sign = a->sign ^ b->sign; 6761 /* here b->len <= a->len */ 6762 if (b->len == 0) { 6763 if (a->expn == BF_EXP_NAN || b->expn == BF_EXP_NAN) { 6764 bfdec_set_nan(r); 6765 ret = 0; 6766 } else if (a->expn == BF_EXP_INF || b->expn == BF_EXP_INF) { 6767 if ((a->expn == BF_EXP_INF && b->expn == BF_EXP_ZERO) || 6768 (a->expn == BF_EXP_ZERO && b->expn == BF_EXP_INF)) { 6769 bfdec_set_nan(r); 6770 ret = BF_ST_INVALID_OP; 6771 } else { 6772 bfdec_set_inf(r, r_sign); 6773 ret = 0; 6774 } 6775 } else { 6776 bfdec_set_zero(r, r_sign); 6777 ret = 0; 6778 } 6779 } else { 6780 bfdec_t tmp, *r1 = NULL; 6781 limb_t a_len, b_len; 6782 limb_t *a_tab, *b_tab; 6783 6784 a_len = a->len; 6785 b_len = b->len; 6786 a_tab = a->tab; 6787 b_tab = b->tab; 6788 6789 if (r == a || r == b) { 6790 bfdec_init(r->ctx, &tmp); 6791 r1 = r; 6792 r = &tmp; 6793 } 6794 if (bfdec_resize(r, a_len + b_len)) { 6795 bfdec_set_nan(r); 6796 ret = BF_ST_MEM_ERROR; 6797 goto done; 6798 } 6799 mp_mul_basecase_dec(r->tab, a_tab, a_len, b_tab, b_len); 6800 r->sign = r_sign; 6801 r->expn = a->expn + b->expn; 6802 ret = bfdec_normalize_and_round(r, prec, flags); 6803 done: 6804 if (r 
== &tmp) 6805 bfdec_move(r1, &tmp); 6806 } 6807 return ret; 6808 } 6809 6810 int bfdec_mul_si(bfdec_t *r, const bfdec_t *a, int64_t b1, limb_t prec, 6811 bf_flags_t flags) 6812 { 6813 bfdec_t b; 6814 int ret; 6815 bfdec_init(r->ctx, &b); 6816 ret = bfdec_set_si(&b, b1); 6817 ret |= bfdec_mul(r, a, &b, prec, flags); 6818 bfdec_delete(&b); 6819 return ret; 6820 } 6821 6822 int bfdec_add_si(bfdec_t *r, const bfdec_t *a, int64_t b1, limb_t prec, 6823 bf_flags_t flags) 6824 { 6825 bfdec_t b; 6826 int ret; 6827 6828 bfdec_init(r->ctx, &b); 6829 ret = bfdec_set_si(&b, b1); 6830 ret |= bfdec_add(r, a, &b, prec, flags); 6831 bfdec_delete(&b); 6832 return ret; 6833 } 6834 6835 static int __bfdec_div(bfdec_t *r, const bfdec_t *a, const bfdec_t *b, 6836 limb_t prec, bf_flags_t flags) 6837 { 6838 int ret, r_sign; 6839 limb_t n, nb, precl; 6840 6841 r_sign = a->sign ^ b->sign; 6842 if (a->expn >= BF_EXP_INF || b->expn >= BF_EXP_INF) { 6843 if (a->expn == BF_EXP_NAN || b->expn == BF_EXP_NAN) { 6844 bfdec_set_nan(r); 6845 return 0; 6846 } else if (a->expn == BF_EXP_INF && b->expn == BF_EXP_INF) { 6847 bfdec_set_nan(r); 6848 return BF_ST_INVALID_OP; 6849 } else if (a->expn == BF_EXP_INF) { 6850 bfdec_set_inf(r, r_sign); 6851 return 0; 6852 } else { 6853 bfdec_set_zero(r, r_sign); 6854 return 0; 6855 } 6856 } else if (a->expn == BF_EXP_ZERO) { 6857 if (b->expn == BF_EXP_ZERO) { 6858 bfdec_set_nan(r); 6859 return BF_ST_INVALID_OP; 6860 } else { 6861 bfdec_set_zero(r, r_sign); 6862 return 0; 6863 } 6864 } else if (b->expn == BF_EXP_ZERO) { 6865 bfdec_set_inf(r, r_sign); 6866 return BF_ST_DIVIDE_ZERO; 6867 } 6868 6869 nb = b->len; 6870 if (prec == BF_PREC_INF) { 6871 /* infinite precision: return BF_ST_INVALID_OP if not an exact 6872 result */ 6873 /* XXX: check */ 6874 precl = nb + 1; 6875 } else if (flags & BF_FLAG_RADPNT_PREC) { 6876 /* number of digits after the decimal point */ 6877 /* XXX: check (2 extra digits for rounding + 2 digits) */ 6878 precl = (bf_max(a->expn - b->expn, 0) + 2 + 6879 prec + 2 + LIMB_DIGITS - 1) / LIMB_DIGITS; 6880 } else { 6881 /* number of limbs of the quotient (2 extra digits for rounding) */ 6882 precl = (prec + 2 + LIMB_DIGITS - 1) / LIMB_DIGITS; 6883 } 6884 n = bf_max(a->len, precl); 6885 6886 { 6887 limb_t *taba, na, i; 6888 slimb_t d; 6889 6890 na = n + nb; 6891 taba = bf_malloc(r->ctx, (na + 1) * sizeof(limb_t)); 6892 if (!taba) 6893 goto fail; 6894 d = na - a->len; 6895 memset(taba, 0, d * sizeof(limb_t)); 6896 memcpy(taba + d, a->tab, a->len * sizeof(limb_t)); 6897 if (bfdec_resize(r, n + 1)) 6898 goto fail1; 6899 if (mp_div_dec(r->ctx, r->tab, taba, na, b->tab, nb)) { 6900 fail1: 6901 bf_free(r->ctx, taba); 6902 goto fail; 6903 } 6904 /* see if non zero remainder */ 6905 for(i = 0; i < nb; i++) { 6906 if (taba[i] != 0) 6907 break; 6908 } 6909 bf_free(r->ctx, taba); 6910 if (i != nb) { 6911 if (prec == BF_PREC_INF) { 6912 bfdec_set_nan(r); 6913 return BF_ST_INVALID_OP; 6914 } else { 6915 r->tab[0] |= 1; 6916 } 6917 } 6918 r->expn = a->expn - b->expn + LIMB_DIGITS; 6919 r->sign = r_sign; 6920 ret = bfdec_normalize_and_round(r, prec, flags); 6921 } 6922 return ret; 6923 fail: 6924 bfdec_set_nan(r); 6925 return BF_ST_MEM_ERROR; 6926 } 6927 6928 int bfdec_div(bfdec_t *r, const bfdec_t *a, const bfdec_t *b, limb_t prec, 6929 bf_flags_t flags) 6930 { 6931 return bf_op2((bf_t *)r, (bf_t *)a, (bf_t *)b, prec, flags, 6932 (bf_op2_func_t *)__bfdec_div); 6933 } 6934 6935 /* a and b must be finite numbers with a >= 0 and b > 0. 
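The quotient is computed with BF_RNDZ, which coincides with floor() here since both operands are nonnegative.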
'q' is the 6936 integer defined as floor(a/b) and r = a - q * b. */ 6937 static void bfdec_tdivremu(bf_context_t *s, bfdec_t *q, bfdec_t *r, 6938 const bfdec_t *a, const bfdec_t *b) 6939 { 6940 if (bfdec_cmpu(a, b) < 0) { 6941 bfdec_set_ui(q, 0); 6942 bfdec_set(r, a); 6943 } else { 6944 bfdec_div(q, a, b, 0, BF_RNDZ | BF_FLAG_RADPNT_PREC); 6945 bfdec_mul(r, q, b, BF_PREC_INF, BF_RNDZ); 6946 bfdec_sub(r, a, r, BF_PREC_INF, BF_RNDZ); 6947 } 6948 } 6949 6950 /* division and remainder. 6951 6952 rnd_mode is the rounding mode for the quotient. The additional 6953 rounding mode BF_DIVREM_EUCLIDIAN is supported. 6954 6955 'q' is an integer. 'r' is rounded with prec and flags (prec can be 6956 BF_PREC_INF). 6957 */ 6958 int bfdec_divrem(bfdec_t *q, bfdec_t *r, const bfdec_t *a, const bfdec_t *b, 6959 limb_t prec, bf_flags_t flags, int rnd_mode) 6960 { 6961 bf_context_t *s = q->ctx; 6962 bfdec_t a1_s, *a1 = &a1_s; 6963 bfdec_t b1_s, *b1 = &b1_s; 6964 bfdec_t r1_s, *r1 = &r1_s; 6965 int q_sign, res; 6966 BOOL is_ceil, is_rndn; 6967 6968 assert(q != a && q != b); 6969 assert(r != a && r != b); 6970 assert(q != r); 6971 6972 if (a->len == 0 || b->len == 0) { 6973 bfdec_set_zero(q, 0); 6974 if (a->expn == BF_EXP_NAN || b->expn == BF_EXP_NAN) { 6975 bfdec_set_nan(r); 6976 return 0; 6977 } else if (a->expn == BF_EXP_INF || b->expn == BF_EXP_ZERO) { 6978 bfdec_set_nan(r); 6979 return BF_ST_INVALID_OP; 6980 } else { 6981 bfdec_set(r, a); 6982 return bfdec_round(r, prec, flags); 6983 } 6984 } 6985 6986 q_sign = a->sign ^ b->sign; 6987 is_rndn = (rnd_mode == BF_RNDN || rnd_mode == BF_RNDNA); 6988 switch(rnd_mode) { 6989 default: 6990 case BF_RNDZ: 6991 case BF_RNDN: 6992 case BF_RNDNA: 6993 is_ceil = FALSE; 6994 break; 6995 case BF_RNDD: 6996 is_ceil = q_sign; 6997 break; 6998 case BF_RNDU: 6999 is_ceil = q_sign ^ 1; 7000 break; 7001 case BF_RNDA: 7002 is_ceil = TRUE; 7003 break; 7004 case BF_DIVREM_EUCLIDIAN: 7005 is_ceil = a->sign; 7006 break; 7007 } 7008 7009 a1->expn = a->expn; 7010 a1->tab = a->tab; 7011 a1->len = a->len; 7012 a1->sign = 0; 7013 7014 b1->expn = b->expn; 7015 b1->tab = b->tab; 7016 b1->len = b->len; 7017 b1->sign = 0; 7018 7019 // bfdec_print_str("a1", a1); 7020 // bfdec_print_str("b1", b1); 7021 /* XXX: could improve to avoid having a large 'q' */ 7022 bfdec_tdivremu(s, q, r, a1, b1); 7023 if (bfdec_is_nan(q) || bfdec_is_nan(r)) 7024 goto fail; 7025 // bfdec_print_str("q", q); 7026 // bfdec_print_str("r", r); 7027 7028 if (r->len != 0) { 7029 if (is_rndn) { 7030 bfdec_init(s, r1); 7031 if (bfdec_set(r1, r)) 7032 goto fail; 7033 if (bfdec_mul_si(r1, r1, 2, BF_PREC_INF, BF_RNDZ)) { 7034 bfdec_delete(r1); 7035 goto fail; 7036 } 7037 res = bfdec_cmpu(r1, b); 7038 bfdec_delete(r1); 7039 if (res > 0 || 7040 (res == 0 && 7041 (rnd_mode == BF_RNDNA || 7042 (get_digit(q->tab, q->len, q->len * LIMB_DIGITS - q->expn) & 1) != 0))) { 7043 goto do_sub_r; 7044 } 7045 } else if (is_ceil) { 7046 do_sub_r: 7047 res = bfdec_add_si(q, q, 1, BF_PREC_INF, BF_RNDZ); 7048 res |= bfdec_sub(r, r, b1, BF_PREC_INF, BF_RNDZ); 7049 if (res & BF_ST_MEM_ERROR) 7050 goto fail; 7051 } 7052 } 7053 7054 r->sign ^= a->sign; 7055 q->sign = q_sign; 7056 return bfdec_round(r, prec, flags); 7057 fail: 7058 bfdec_set_nan(q); 7059 bfdec_set_nan(r); 7060 return BF_ST_MEM_ERROR; 7061 } 7062 7063 int bfdec_rem(bfdec_t *r, const bfdec_t *a, const bfdec_t *b, limb_t prec, 7064 bf_flags_t flags, int rnd_mode) 7065 { 7066 bfdec_t q_s, *q = &q_s; 7067 int ret; 7068 7069 bfdec_init(r->ctx, q); 7070 ret = bfdec_divrem(q, r, a, b, prec, 
flags, rnd_mode); 7071 bfdec_delete(q); 7072 return ret; 7073 } 7074 7075 /* convert to integer (infinite precision) */ 7076 int bfdec_rint(bfdec_t *r, int rnd_mode) 7077 { 7078 return bfdec_round(r, 0, rnd_mode | BF_FLAG_RADPNT_PREC); 7079 } 7080 7081 int bfdec_sqrt(bfdec_t *r, const bfdec_t *a, limb_t prec, bf_flags_t flags) 7082 { 7083 bf_context_t *s = a->ctx; 7084 int ret, k; 7085 limb_t *a1, v; 7086 slimb_t n, n1, prec1; 7087 limb_t res; 7088 7089 assert(r != a); 7090 7091 if (a->len == 0) { 7092 if (a->expn == BF_EXP_NAN) { 7093 bfdec_set_nan(r); 7094 } else if (a->expn == BF_EXP_INF && a->sign) { 7095 goto invalid_op; 7096 } else { 7097 bfdec_set(r, a); 7098 } 7099 ret = 0; 7100 } else if (a->sign || prec == BF_PREC_INF) { 7101 invalid_op: 7102 bfdec_set_nan(r); 7103 ret = BF_ST_INVALID_OP; 7104 } else { 7105 if (flags & BF_FLAG_RADPNT_PREC) { 7106 prec1 = bf_max(floor_div(a->expn + 1, 2) + prec, 1); 7107 } else { 7108 prec1 = prec; 7109 } 7110 /* convert the mantissa to an integer with at least 2 * 7111 prec + 4 digits */ 7112 n = (2 * (prec1 + 2) + 2 * LIMB_DIGITS - 1) / (2 * LIMB_DIGITS); 7113 if (bfdec_resize(r, n)) 7114 goto fail; 7115 a1 = bf_malloc(s, sizeof(limb_t) * 2 * n); 7116 if (!a1) 7117 goto fail; 7118 n1 = bf_min(2 * n, a->len); 7119 memset(a1, 0, (2 * n - n1) * sizeof(limb_t)); 7120 memcpy(a1 + 2 * n - n1, a->tab + a->len - n1, n1 * sizeof(limb_t)); 7121 if (a->expn & 1) { 7122 res = mp_shr_dec(a1, a1, 2 * n, 1, 0); 7123 } else { 7124 res = 0; 7125 } 7126 /* normalize so that a1 >= B^(2*n)/4. Not needed for n = 1 7127 because mp_sqrtrem2_dec already does it */ 7128 k = 0; 7129 if (n > 1) { 7130 v = a1[2 * n - 1]; 7131 while (v < BF_DEC_BASE / 4) { 7132 k++; 7133 v *= 4; 7134 } 7135 if (k != 0) 7136 mp_mul1_dec(a1, a1, 2 * n, 1 << (2 * k), 0); 7137 } 7138 if (mp_sqrtrem_dec(s, r->tab, a1, n)) { 7139 bf_free(s, a1); 7140 goto fail; 7141 } 7142 if (k != 0) 7143 mp_div1_dec(r->tab, r->tab, n, 1 << k, 0); 7144 if (!res) { 7145 res = mp_scan_nz(a1, n + 1); 7146 } 7147 bf_free(s, a1); 7148 if (!res) { 7149 res = mp_scan_nz(a->tab, a->len - n1); 7150 } 7151 if (res != 0) 7152 r->tab[0] |= 1; 7153 r->sign = 0; 7154 r->expn = (a->expn + 1) >> 1; 7155 ret = bfdec_round(r, prec, flags); 7156 } 7157 return ret; 7158 fail: 7159 bfdec_set_nan(r); 7160 return BF_ST_MEM_ERROR; 7161 } 7162 7163 /* The rounding mode is always BF_RNDZ. Return BF_ST_OVERFLOW if there 7164 is an overflow and 0 otherwise. No memory error is possible. */ 7165 int bfdec_get_int32(int *pres, const bfdec_t *a) 7166 { 7167 uint32_t v; 7168 int ret; 7169 if (a->expn >= BF_EXP_INF) { 7170 ret = 0; 7171 if (a->expn == BF_EXP_INF) { 7172 v = (uint32_t)INT32_MAX + a->sign; 7173 /* XXX: return overflow ? 
*/ 7174 } else { 7175 v = INT32_MAX; 7176 } 7177 } else if (a->expn <= 0) { 7178 v = 0; 7179 ret = 0; 7180 } else if (a->expn <= 9) { 7181 v = fast_shr_dec(a->tab[a->len - 1], LIMB_DIGITS - a->expn); 7182 if (a->sign) 7183 v = -v; 7184 ret = 0; 7185 } else if (a->expn == 10) { 7186 uint64_t v1; 7187 uint32_t v_max; 7188 #if LIMB_BITS == 64 7189 v1 = fast_shr_dec(a->tab[a->len - 1], LIMB_DIGITS - a->expn); 7190 #else 7191 v1 = (uint64_t)a->tab[a->len - 1] * 10 + 7192 get_digit(a->tab, a->len, (a->len - 1) * LIMB_DIGITS - 1); 7193 #endif 7194 v_max = (uint32_t)INT32_MAX + a->sign; 7195 if (v1 > v_max) { 7196 v = v_max; 7197 ret = BF_ST_OVERFLOW; 7198 } else { 7199 v = v1; 7200 if (a->sign) 7201 v = -v; 7202 ret = 0; 7203 } 7204 } else { 7205 v = (uint32_t)INT32_MAX + a->sign; 7206 ret = BF_ST_OVERFLOW; 7207 } 7208 *pres = v; 7209 return ret; 7210 } 7211 7212 /* power to an integer with infinite precision */ 7213 int bfdec_pow_ui(bfdec_t *r, const bfdec_t *a, limb_t b) 7214 { 7215 int ret, n_bits, i; 7216 7217 assert(r != a); 7218 if (b == 0) 7219 return bfdec_set_ui(r, 1); 7220 ret = bfdec_set(r, a); 7221 n_bits = LIMB_BITS - clz(b); 7222 for(i = n_bits - 2; i >= 0; i--) { 7223 ret |= bfdec_mul(r, r, r, BF_PREC_INF, BF_RNDZ); 7224 if ((b >> i) & 1) 7225 ret |= bfdec_mul(r, r, a, BF_PREC_INF, BF_RNDZ); 7226 } 7227 return ret; 7228 } 7229 7230 char *bfdec_ftoa(size_t *plen, const bfdec_t *a, limb_t prec, bf_flags_t flags) 7231 { 7232 return bf_ftoa_internal(plen, (const bf_t *)a, 10, prec, flags, TRUE); 7233 } 7234 7235 int bfdec_atof(bfdec_t *r, const char *str, const char **pnext, 7236 limb_t prec, bf_flags_t flags) 7237 { 7238 slimb_t dummy_exp; 7239 return bf_atof_internal((bf_t *)r, &dummy_exp, str, pnext, 10, prec, 7240 flags, TRUE); 7241 } 7242 7243 #endif /* USE_BF_DEC */ 7244 7245 #ifdef USE_FFT_MUL 7246 /***************************************************************/ 7247 /* Integer multiplication with FFT */ 7248 7249 /* or LIMB_BITS at bit position 'pos' in tab */ 7250 static inline void put_bits(limb_t *tab, limb_t len, slimb_t pos, limb_t val) 7251 { 7252 limb_t i; 7253 int p; 7254 7255 i = pos >> LIMB_LOG2_BITS; 7256 p = pos & (LIMB_BITS - 1); 7257 if (i < len) 7258 tab[i] |= val << p; 7259 if (p != 0) { 7260 i++; 7261 if (i < len) { 7262 tab[i] |= val >> (LIMB_BITS - p); 7263 } 7264 } 7265 } 7266 7267 #if defined(__AVX2__) 7268 7269 typedef double NTTLimb; 7270 7271 /* we must have: modulo >= 1 << NTT_MOD_LOG2_MIN */ 7272 #define NTT_MOD_LOG2_MIN 50 7273 #define NTT_MOD_LOG2_MAX 51 7274 #define NB_MODS 5 7275 #define NTT_PROOT_2EXP 39 7276 static const int ntt_int_bits[NB_MODS] = { 254, 203, 152, 101, 50, }; 7277 7278 static const limb_t ntt_mods[NB_MODS] = { 0x00073a8000000001, 0x0007858000000001, 0x0007a38000000001, 0x0007a68000000001, 0x0007fd8000000001, 7279 }; 7280 7281 static const limb_t ntt_proot[2][NB_MODS] = { 7282 { 0x00056198d44332c8, 0x0002eb5d640aad39, 0x00047e31eaa35fd0, 0x0005271ac118a150, 0x00075e0ce8442bd5, }, 7283 { 0x000461169761bcc5, 0x0002dac3cb2da688, 0x0004abc97751e3bf, 0x000656778fc8c485, 0x0000dc6469c269fa, }, 7284 }; 7285 7286 static const limb_t ntt_mods_cr[NB_MODS * (NB_MODS - 1) / 2] = { 7287 0x00020e4da740da8e, 0x0004c3dc09c09c1d, 0x000063bd097b4271, 0x000799d8f18f18fd, 7288 0x0005384222222264, 0x000572b07c1f07fe, 0x00035cd08888889a, 7289 0x00066015555557e3, 0x000725960b60b623, 7290 0x0002fc1fa1d6ce12, 7291 }; 7292 7293 #else 7294 7295 typedef limb_t NTTLimb; 7296 7297 #if LIMB_BITS == 64 7298 7299 #define NTT_MOD_LOG2_MIN 61 7300 #define 
NTT_MOD_LOG2_MAX 62 7301 #define NB_MODS 5 7302 #define NTT_PROOT_2EXP 51 7303 static const int ntt_int_bits[NB_MODS] = { 307, 246, 185, 123, 61, }; 7304 7305 static const limb_t ntt_mods[NB_MODS] = { 0x28d8000000000001, 0x2a88000000000001, 0x2ed8000000000001, 0x3508000000000001, 0x3aa8000000000001, 7306 }; 7307 7308 static const limb_t ntt_proot[2][NB_MODS] = { 7309 { 0x1b8ea61034a2bea7, 0x21a9762de58206fb, 0x02ca782f0756a8ea, 0x278384537a3e50a1, 0x106e13fee74ce0ab, }, 7310 { 0x233513af133e13b8, 0x1d13140d1c6f75f1, 0x12cde57f97e3eeda, 0x0d6149e23cbe654f, 0x36cd204f522a1379, }, 7311 }; 7312 7313 static const limb_t ntt_mods_cr[NB_MODS * (NB_MODS - 1) / 2] = { 7314 0x08a9ed097b425eea, 0x18a44aaaaaaaaab3, 0x2493f57f57f57f5d, 0x126b8d0649a7f8d4, 7315 0x09d80ed7303b5ccc, 0x25b8bcf3cf3cf3d5, 0x2ce6ce63398ce638, 7316 0x0e31fad40a57eb59, 0x02a3529fd4a7f52f, 7317 0x3a5493e93e93e94a, 7318 }; 7319 7320 #elif LIMB_BITS == 32 7321 7322 /* we must have: modulo >= 1 << NTT_MOD_LOG2_MIN */ 7323 #define NTT_MOD_LOG2_MIN 29 7324 #define NTT_MOD_LOG2_MAX 30 7325 #define NB_MODS 5 7326 #define NTT_PROOT_2EXP 20 7327 static const int ntt_int_bits[NB_MODS] = { 148, 119, 89, 59, 29, }; 7328 7329 static const limb_t ntt_mods[NB_MODS] = { 0x0000000032b00001, 0x0000000033700001, 0x0000000036d00001, 0x0000000037300001, 0x000000003e500001, 7330 }; 7331 7332 static const limb_t ntt_proot[2][NB_MODS] = { 7333 { 0x0000000032525f31, 0x0000000005eb3b37, 0x00000000246eda9f, 0x0000000035f25901, 0x00000000022f5768, }, 7334 { 0x00000000051eba1a, 0x00000000107be10e, 0x000000001cd574e0, 0x00000000053806e6, 0x000000002cd6bf98, }, 7335 }; 7336 7337 static const limb_t ntt_mods_cr[NB_MODS * (NB_MODS - 1) / 2] = { 7338 0x000000000449559a, 0x000000001eba6ca9, 0x000000002ec18e46, 0x000000000860160b, 7339 0x000000000d321307, 0x000000000bf51120, 0x000000000f662938, 7340 0x000000000932ab3e, 0x000000002f40eef8, 7341 0x000000002e760905, 7342 }; 7343 7344 #endif /* LIMB_BITS */ 7345 7346 #endif /* !AVX2 */ 7347 7348 #if defined(__AVX2__) 7349 #define NTT_TRIG_K_MAX 18 7350 #else 7351 #define NTT_TRIG_K_MAX 19 7352 #endif 7353 7354 typedef struct BFNTTState { 7355 bf_context_t *ctx; 7356 7357 /* used for mul_mod_fast() */ 7358 limb_t ntt_mods_div[NB_MODS]; 7359 7360 limb_t ntt_proot_pow[NB_MODS][2][NTT_PROOT_2EXP + 1]; 7361 limb_t ntt_proot_pow_inv[NB_MODS][2][NTT_PROOT_2EXP + 1]; 7362 NTTLimb *ntt_trig[NB_MODS][2][NTT_TRIG_K_MAX + 1]; 7363 /* 1/2^n mod m */ 7364 limb_t ntt_len_inv[NB_MODS][NTT_PROOT_2EXP + 1][2]; 7365 #if defined(__AVX2__) 7366 __m256d ntt_mods_cr_vec[NB_MODS * (NB_MODS - 1) / 2]; 7367 __m256d ntt_mods_vec[NB_MODS]; 7368 __m256d ntt_mods_inv_vec[NB_MODS]; 7369 #else 7370 limb_t ntt_mods_cr_inv[NB_MODS * (NB_MODS - 1) / 2]; 7371 #endif 7372 } BFNTTState; 7373 7374 static NTTLimb *get_trig(BFNTTState *s, int k, int inverse, int m_idx); 7375 7376 /* add modulo with up to (LIMB_BITS-1) bit modulo */ 7377 static inline limb_t add_mod(limb_t a, limb_t b, limb_t m) 7378 { 7379 limb_t r; 7380 r = a + b; 7381 if (r >= m) 7382 r -= m; 7383 return r; 7384 } 7385 7386 /* sub modulo with up to LIMB_BITS bit modulo */ 7387 static inline limb_t sub_mod(limb_t a, limb_t b, limb_t m) 7388 { 7389 limb_t r; 7390 r = a - b; 7391 if (r > a) 7392 r += m; 7393 return r; 7394 } 7395 7396 /* return (r0+r1*B) mod m 7397 precondition: 0 <= r0+r1*B < 2^(64+NTT_MOD_LOG2_MIN) 7398 */ 7399 static inline limb_t mod_fast(dlimb_t r, 7400 limb_t m, limb_t m_inv) 7401 { 7402 limb_t a1, q, t0, r1, r0; 7403 7404 a1 = r >> NTT_MOD_LOG2_MIN; 7405 7406 q = 
((dlimb_t)a1 * m_inv) >> LIMB_BITS; 7407 r = r - (dlimb_t)q * m - m * 2; 7408 r1 = r >> LIMB_BITS; 7409 t0 = (slimb_t)r1 >> 1; 7410 r += m & t0; 7411 r0 = r; 7412 r1 = r >> LIMB_BITS; 7413 r0 += m & r1; 7414 return r0; 7415 } 7416 7417 /* faster version using precomputed modulo inverse. 7418 precondition: 0 <= a * b < 2^(64+NTT_MOD_LOG2_MIN) */ 7419 static inline limb_t mul_mod_fast(limb_t a, limb_t b, 7420 limb_t m, limb_t m_inv) 7421 { 7422 dlimb_t r; 7423 r = (dlimb_t)a * (dlimb_t)b; 7424 return mod_fast(r, m, m_inv); 7425 } 7426 7427 static inline limb_t init_mul_mod_fast(limb_t m) 7428 { 7429 dlimb_t t; 7430 assert(m < (limb_t)1 << NTT_MOD_LOG2_MAX); 7431 assert(m >= (limb_t)1 << NTT_MOD_LOG2_MIN); 7432 t = (dlimb_t)1 << (LIMB_BITS + NTT_MOD_LOG2_MIN); 7433 return t / m; 7434 } 7435 7436 /* Faster version used when the multiplier is constant. 0 <= a < 2^64, 7437 0 <= b < m. */ 7438 static inline limb_t mul_mod_fast2(limb_t a, limb_t b, 7439 limb_t m, limb_t b_inv) 7440 { 7441 limb_t r, q; 7442 7443 q = ((dlimb_t)a * (dlimb_t)b_inv) >> LIMB_BITS; 7444 r = a * b - q * m; 7445 if (r >= m) 7446 r -= m; 7447 return r; 7448 } 7449 7450 /* Faster version used when the multiplier is constant. 0 <= a < 2^64, 7451 0 <= b < m. Let r = a * b mod m. The return value is 'r' or 'r + 7452 m'. */ 7453 static inline limb_t mul_mod_fast3(limb_t a, limb_t b, 7454 limb_t m, limb_t b_inv) 7455 { 7456 limb_t r, q; 7457 7458 q = ((dlimb_t)a * (dlimb_t)b_inv) >> LIMB_BITS; 7459 r = a * b - q * m; 7460 return r; 7461 } 7462 7463 static inline limb_t init_mul_mod_fast2(limb_t b, limb_t m) 7464 { 7465 return ((dlimb_t)b << LIMB_BITS) / m; 7466 } 7467 7468 #ifdef __AVX2__ 7469 7470 static inline limb_t ntt_limb_to_int(NTTLimb a, limb_t m) 7471 { 7472 slimb_t v; 7473 v = a; 7474 if (v < 0) 7475 v += m; 7476 if (v >= m) 7477 v -= m; 7478 return v; 7479 } 7480 7481 static inline NTTLimb int_to_ntt_limb(limb_t a, limb_t m) 7482 { 7483 return (slimb_t)a; 7484 } 7485 7486 static inline NTTLimb int_to_ntt_limb2(limb_t a, limb_t m) 7487 { 7488 if (a >= (m / 2)) 7489 a -= m; 7490 return (slimb_t)a; 7491 } 7492 7493 /* return r + m if r < 0 otherwise r. */ 7494 static inline __m256d ntt_mod1(__m256d r, __m256d m) 7495 { 7496 return _mm256_blendv_pd(r, r + m, r); 7497 } 7498 7499 /* input: abs(r) < 2 * m. 
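When r is negative the blend below first adds 2*m, and m is then subtracted in both cases, so the range (-2*m, 2*m) is mapped into (-m, m).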
Output: abs(r) < m */ 7500 static inline __m256d ntt_mod(__m256d r, __m256d mf, __m256d m2f) 7501 { 7502 return _mm256_blendv_pd(r, r + m2f, r) - mf; 7503 } 7504 7505 /* input: abs(a*b) < 2 * m^2, output: abs(r) < m */ 7506 static inline __m256d ntt_mul_mod(__m256d a, __m256d b, __m256d mf, 7507 __m256d m_inv) 7508 { 7509 __m256d r, q, ab1, ab0, qm0, qm1; 7510 ab1 = a * b; 7511 q = _mm256_round_pd(ab1 * m_inv, 0); /* round to nearest */ 7512 qm1 = q * mf; 7513 qm0 = _mm256_fmsub_pd(q, mf, qm1); /* low part */ 7514 ab0 = _mm256_fmsub_pd(a, b, ab1); /* low part */ 7515 r = (ab1 - qm1) + (ab0 - qm0); 7516 return r; 7517 } 7518 7519 static void *bf_aligned_malloc(bf_context_t *s, size_t size, size_t align) 7520 { 7521 void *ptr; 7522 void **ptr1; 7523 ptr = bf_malloc(s, size + sizeof(void *) + align - 1); 7524 if (!ptr) 7525 return NULL; 7526 ptr1 = (void **)(((uintptr_t)ptr + sizeof(void *) + align - 1) & 7527 ~(align - 1)); 7528 ptr1[-1] = ptr; 7529 return ptr1; 7530 } 7531 7532 static void bf_aligned_free(bf_context_t *s, void *ptr) 7533 { 7534 if (!ptr) 7535 return; 7536 bf_free(s, ((void **)ptr)[-1]); 7537 } 7538 7539 static void *ntt_malloc(BFNTTState *s, size_t size) 7540 { 7541 return bf_aligned_malloc(s->ctx, size, 64); 7542 } 7543 7544 static void ntt_free(BFNTTState *s, void *ptr) 7545 { 7546 bf_aligned_free(s->ctx, ptr); 7547 } 7548 7549 static no_inline int ntt_fft(BFNTTState *s, 7550 NTTLimb *out_buf, NTTLimb *in_buf, 7551 NTTLimb *tmp_buf, int fft_len_log2, 7552 int inverse, int m_idx) 7553 { 7554 limb_t nb_blocks, fft_per_block, p, k, n, stride_in, i, j; 7555 NTTLimb *tab_in, *tab_out, *tmp, *trig; 7556 __m256d m_inv, mf, m2f, c, a0, a1, b0, b1; 7557 limb_t m; 7558 int l; 7559 7560 m = ntt_mods[m_idx]; 7561 7562 m_inv = _mm256_set1_pd(1.0 / (double)m); 7563 mf = _mm256_set1_pd(m); 7564 m2f = _mm256_set1_pd(m * 2); 7565 7566 n = (limb_t)1 << fft_len_log2; 7567 assert(n >= 8); 7568 stride_in = n / 2; 7569 7570 tab_in = in_buf; 7571 tab_out = tmp_buf; 7572 trig = get_trig(s, fft_len_log2, inverse, m_idx); 7573 if (!trig) 7574 return -1; 7575 p = 0; 7576 for(k = 0; k < stride_in; k += 4) { 7577 a0 = _mm256_load_pd(&tab_in[k]); 7578 a1 = _mm256_load_pd(&tab_in[k + stride_in]); 7579 c = _mm256_load_pd(trig); 7580 trig += 4; 7581 b0 = ntt_mod(a0 + a1, mf, m2f); 7582 b1 = ntt_mul_mod(a0 - a1, c, mf, m_inv); 7583 a0 = _mm256_permute2f128_pd(b0, b1, 0x20); 7584 a1 = _mm256_permute2f128_pd(b0, b1, 0x31); 7585 a0 = _mm256_permute4x64_pd(a0, 0xd8); 7586 a1 = _mm256_permute4x64_pd(a1, 0xd8); 7587 _mm256_store_pd(&tab_out[p], a0); 7588 _mm256_store_pd(&tab_out[p + 4], a1); 7589 p += 2 * 4; 7590 } 7591 tmp = tab_in; 7592 tab_in = tab_out; 7593 tab_out = tmp; 7594 7595 trig = get_trig(s, fft_len_log2 - 1, inverse, m_idx); 7596 if (!trig) 7597 return -1; 7598 p = 0; 7599 for(k = 0; k < stride_in; k += 4) { 7600 a0 = _mm256_load_pd(&tab_in[k]); 7601 a1 = _mm256_load_pd(&tab_in[k + stride_in]); 7602 c = _mm256_setr_pd(trig[0], trig[0], trig[1], trig[1]); 7603 trig += 2; 7604 b0 = ntt_mod(a0 + a1, mf, m2f); 7605 b1 = ntt_mul_mod(a0 - a1, c, mf, m_inv); 7606 a0 = _mm256_permute2f128_pd(b0, b1, 0x20); 7607 a1 = _mm256_permute2f128_pd(b0, b1, 0x31); 7608 _mm256_store_pd(&tab_out[p], a0); 7609 _mm256_store_pd(&tab_out[p + 4], a1); 7610 p += 2 * 4; 7611 } 7612 tmp = tab_in; 7613 tab_in = tab_out; 7614 tab_out = tmp; 7615 7616 nb_blocks = n / 4; 7617 fft_per_block = 4; 7618 7619 l = fft_len_log2 - 2; 7620 while (nb_blocks != 2) { 7621 nb_blocks >>= 1; 7622 p = 0; 7623 k = 0; 7624 trig = get_trig(s, l, 
inverse, m_idx); 7625 if (!trig) 7626 return -1; 7627 for(i = 0; i < nb_blocks; i++) { 7628 c = _mm256_set1_pd(trig[0]); 7629 trig++; 7630 for(j = 0; j < fft_per_block; j += 4) { 7631 a0 = _mm256_load_pd(&tab_in[k + j]); 7632 a1 = _mm256_load_pd(&tab_in[k + j + stride_in]); 7633 b0 = ntt_mod(a0 + a1, mf, m2f); 7634 b1 = ntt_mul_mod(a0 - a1, c, mf, m_inv); 7635 _mm256_store_pd(&tab_out[p + j], b0); 7636 _mm256_store_pd(&tab_out[p + j + fft_per_block], b1); 7637 } 7638 k += fft_per_block; 7639 p += 2 * fft_per_block; 7640 } 7641 fft_per_block <<= 1; 7642 l--; 7643 tmp = tab_in; 7644 tab_in = tab_out; 7645 tab_out = tmp; 7646 } 7647 7648 tab_out = out_buf; 7649 for(k = 0; k < stride_in; k += 4) { 7650 a0 = _mm256_load_pd(&tab_in[k]); 7651 a1 = _mm256_load_pd(&tab_in[k + stride_in]); 7652 b0 = ntt_mod(a0 + a1, mf, m2f); 7653 b1 = ntt_mod(a0 - a1, mf, m2f); 7654 _mm256_store_pd(&tab_out[k], b0); 7655 _mm256_store_pd(&tab_out[k + stride_in], b1); 7656 } 7657 return 0; 7658 } 7659 7660 static void ntt_vec_mul(BFNTTState *s, 7661 NTTLimb *tab1, NTTLimb *tab2, limb_t fft_len_log2, 7662 int k_tot, int m_idx) 7663 { 7664 limb_t i, c_inv, n, m; 7665 __m256d m_inv, mf, a, b, c; 7666 7667 m = ntt_mods[m_idx]; 7668 c_inv = s->ntt_len_inv[m_idx][k_tot][0]; 7669 m_inv = _mm256_set1_pd(1.0 / (double)m); 7670 mf = _mm256_set1_pd(m); 7671 c = _mm256_set1_pd(int_to_ntt_limb(c_inv, m)); 7672 n = (limb_t)1 << fft_len_log2; 7673 for(i = 0; i < n; i += 4) { 7674 a = _mm256_load_pd(&tab1[i]); 7675 b = _mm256_load_pd(&tab2[i]); 7676 a = ntt_mul_mod(a, b, mf, m_inv); 7677 a = ntt_mul_mod(a, c, mf, m_inv); 7678 _mm256_store_pd(&tab1[i], a); 7679 } 7680 } 7681 7682 static no_inline void mul_trig(NTTLimb *buf, 7683 limb_t n, limb_t c1, limb_t m, limb_t m_inv1) 7684 { 7685 limb_t i, c2, c3, c4; 7686 __m256d c, c_mul, a0, mf, m_inv; 7687 assert(n >= 2); 7688 7689 mf = _mm256_set1_pd(m); 7690 m_inv = _mm256_set1_pd(1.0 / (double)m); 7691 7692 c2 = mul_mod_fast(c1, c1, m, m_inv1); 7693 c3 = mul_mod_fast(c2, c1, m, m_inv1); 7694 c4 = mul_mod_fast(c2, c2, m, m_inv1); 7695 c = _mm256_setr_pd(1, int_to_ntt_limb(c1, m), 7696 int_to_ntt_limb(c2, m), int_to_ntt_limb(c3, m)); 7697 c_mul = _mm256_set1_pd(int_to_ntt_limb(c4, m)); 7698 for(i = 0; i < n; i += 4) { 7699 a0 = _mm256_load_pd(&buf[i]); 7700 a0 = ntt_mul_mod(a0, c, mf, m_inv); 7701 _mm256_store_pd(&buf[i], a0); 7702 c = ntt_mul_mod(c, c_mul, mf, m_inv); 7703 } 7704 } 7705 7706 #else 7707 7708 static void *ntt_malloc(BFNTTState *s, size_t size) 7709 { 7710 return bf_malloc(s->ctx, size); 7711 } 7712 7713 static void ntt_free(BFNTTState *s, void *ptr) 7714 { 7715 bf_free(s->ctx, ptr); 7716 } 7717 7718 static inline limb_t ntt_limb_to_int(NTTLimb a, limb_t m) 7719 { 7720 if (a >= m) 7721 a -= m; 7722 return a; 7723 } 7724 7725 static inline NTTLimb int_to_ntt_limb(slimb_t a, limb_t m) 7726 { 7727 return a; 7728 } 7729 7730 static no_inline int ntt_fft(BFNTTState *s, NTTLimb *out_buf, NTTLimb *in_buf, 7731 NTTLimb *tmp_buf, int fft_len_log2, 7732 int inverse, int m_idx) 7733 { 7734 limb_t nb_blocks, fft_per_block, p, k, n, stride_in, i, j, m, m2; 7735 NTTLimb *tab_in, *tab_out, *tmp, a0, a1, b0, b1, c, *trig, c_inv; 7736 int l; 7737 7738 m = ntt_mods[m_idx]; 7739 m2 = 2 * m; 7740 n = (limb_t)1 << fft_len_log2; 7741 nb_blocks = n; 7742 fft_per_block = 1; 7743 stride_in = n / 2; 7744 tab_in = in_buf; 7745 tab_out = tmp_buf; 7746 l = fft_len_log2; 7747 while (nb_blocks != 2) { 7748 nb_blocks >>= 1; 7749 p = 0; 7750 k = 0; 7751 trig = get_trig(s, l, inverse, m_idx); 7752 if (!trig) 
7753 return -1; 7754 for(i = 0; i < nb_blocks; i++) { 7755 c = trig[0]; 7756 c_inv = trig[1]; 7757 trig += 2; 7758 for(j = 0; j < fft_per_block; j++) { 7759 a0 = tab_in[k + j]; 7760 a1 = tab_in[k + j + stride_in]; 7761 b0 = add_mod(a0, a1, m2); 7762 b1 = a0 - a1 + m2; 7763 b1 = mul_mod_fast3(b1, c, m, c_inv); 7764 tab_out[p + j] = b0; 7765 tab_out[p + j + fft_per_block] = b1; 7766 } 7767 k += fft_per_block; 7768 p += 2 * fft_per_block; 7769 } 7770 fft_per_block <<= 1; 7771 l--; 7772 tmp = tab_in; 7773 tab_in = tab_out; 7774 tab_out = tmp; 7775 } 7776 /* no twiddle in last step */ 7777 tab_out = out_buf; 7778 for(k = 0; k < stride_in; k++) { 7779 a0 = tab_in[k]; 7780 a1 = tab_in[k + stride_in]; 7781 b0 = add_mod(a0, a1, m2); 7782 b1 = sub_mod(a0, a1, m2); 7783 tab_out[k] = b0; 7784 tab_out[k + stride_in] = b1; 7785 } 7786 return 0; 7787 } 7788 7789 static void ntt_vec_mul(BFNTTState *s, 7790 NTTLimb *tab1, NTTLimb *tab2, int fft_len_log2, 7791 int k_tot, int m_idx) 7792 { 7793 limb_t i, norm, norm_inv, a, n, m, m_inv; 7794 7795 m = ntt_mods[m_idx]; 7796 m_inv = s->ntt_mods_div[m_idx]; 7797 norm = s->ntt_len_inv[m_idx][k_tot][0]; 7798 norm_inv = s->ntt_len_inv[m_idx][k_tot][1]; 7799 n = (limb_t)1 << fft_len_log2; 7800 for(i = 0; i < n; i++) { 7801 a = tab1[i]; 7802 /* need to reduce the range so that the product is < 7803 2^(LIMB_BITS+NTT_MOD_LOG2_MIN) */ 7804 if (a >= m) 7805 a -= m; 7806 a = mul_mod_fast(a, tab2[i], m, m_inv); 7807 a = mul_mod_fast3(a, norm, m, norm_inv); 7808 tab1[i] = a; 7809 } 7810 } 7811 7812 static no_inline void mul_trig(NTTLimb *buf, 7813 limb_t n, limb_t c_mul, limb_t m, limb_t m_inv) 7814 { 7815 limb_t i, c0, c_mul_inv; 7816 7817 c0 = 1; 7818 c_mul_inv = init_mul_mod_fast2(c_mul, m); 7819 for(i = 0; i < n; i++) { 7820 buf[i] = mul_mod_fast(buf[i], c0, m, m_inv); 7821 c0 = mul_mod_fast2(c0, c_mul, m, c_mul_inv); 7822 } 7823 } 7824 7825 #endif /* !AVX2 */ 7826 7827 static no_inline NTTLimb *get_trig(BFNTTState *s, 7828 int k, int inverse, int m_idx) 7829 { 7830 NTTLimb *tab; 7831 limb_t i, n2, c, c_mul, m, c_mul_inv; 7832 7833 if (k > NTT_TRIG_K_MAX) 7834 return NULL; 7835 7836 tab = s->ntt_trig[m_idx][inverse][k]; 7837 if (tab) 7838 return tab; 7839 n2 = (limb_t)1 << (k - 1); 7840 m = ntt_mods[m_idx]; 7841 #ifdef __AVX2__ 7842 tab = ntt_malloc(s, sizeof(NTTLimb) * n2); 7843 #else 7844 tab = ntt_malloc(s, sizeof(NTTLimb) * n2 * 2); 7845 #endif 7846 if (!tab) 7847 return NULL; 7848 c = 1; 7849 c_mul = s->ntt_proot_pow[m_idx][inverse][k]; 7850 c_mul_inv = s->ntt_proot_pow_inv[m_idx][inverse][k]; 7851 for(i = 0; i < n2; i++) { 7852 #ifdef __AVX2__ 7853 tab[i] = int_to_ntt_limb2(c, m); 7854 #else 7855 tab[2 * i] = int_to_ntt_limb(c, m); 7856 tab[2 * i + 1] = init_mul_mod_fast2(c, m); 7857 #endif 7858 c = mul_mod_fast2(c, c_mul, m, c_mul_inv); 7859 } 7860 s->ntt_trig[m_idx][inverse][k] = tab; 7861 return tab; 7862 } 7863 7864 void fft_clear_cache(bf_context_t *s1) 7865 { 7866 int m_idx, inverse, k; 7867 BFNTTState *s = s1->ntt_state; 7868 if (s) { 7869 for(m_idx = 0; m_idx < NB_MODS; m_idx++) { 7870 for(inverse = 0; inverse < 2; inverse++) { 7871 for(k = 0; k < NTT_TRIG_K_MAX + 1; k++) { 7872 if (s->ntt_trig[m_idx][inverse][k]) { 7873 ntt_free(s, s->ntt_trig[m_idx][inverse][k]); 7874 s->ntt_trig[m_idx][inverse][k] = NULL; 7875 } 7876 } 7877 } 7878 } 7879 #if defined(__AVX2__) 7880 bf_aligned_free(s1, s); 7881 #else 7882 bf_free(s1, s); 7883 #endif 7884 s1->ntt_state = NULL; 7885 } 7886 } 7887 7888 #define STRIP_LEN 16 7889 7890 /* dst = buf1, src = buf2 */ 7891 static 
int ntt_fft_partial(BFNTTState *s, NTTLimb *buf1, 7892 int k1, int k2, limb_t n1, limb_t n2, int inverse, 7893 limb_t m_idx) 7894 { 7895 limb_t i, j, c_mul, c0, m, m_inv, strip_len, l; 7896 NTTLimb *buf2, *buf3; 7897 7898 buf2 = NULL; 7899 buf3 = ntt_malloc(s, sizeof(NTTLimb) * n1); 7900 if (!buf3) 7901 goto fail; 7902 if (k2 == 0) { 7903 if (ntt_fft(s, buf1, buf1, buf3, k1, inverse, m_idx)) 7904 goto fail; 7905 } else { 7906 strip_len = STRIP_LEN; 7907 buf2 = ntt_malloc(s, sizeof(NTTLimb) * n1 * strip_len); 7908 if (!buf2) 7909 goto fail; 7910 m = ntt_mods[m_idx]; 7911 m_inv = s->ntt_mods_div[m_idx]; 7912 c0 = s->ntt_proot_pow[m_idx][inverse][k1 + k2]; 7913 c_mul = 1; 7914 assert((n2 % strip_len) == 0); 7915 for(j = 0; j < n2; j += strip_len) { 7916 for(i = 0; i < n1; i++) { 7917 for(l = 0; l < strip_len; l++) { 7918 buf2[i + l * n1] = buf1[i * n2 + (j + l)]; 7919 } 7920 } 7921 for(l = 0; l < strip_len; l++) { 7922 if (inverse) 7923 mul_trig(buf2 + l * n1, n1, c_mul, m, m_inv); 7924 if (ntt_fft(s, buf2 + l * n1, buf2 + l * n1, buf3, k1, inverse, m_idx)) 7925 goto fail; 7926 if (!inverse) 7927 mul_trig(buf2 + l * n1, n1, c_mul, m, m_inv); 7928 c_mul = mul_mod_fast(c_mul, c0, m, m_inv); 7929 } 7930 7931 for(i = 0; i < n1; i++) { 7932 for(l = 0; l < strip_len; l++) { 7933 buf1[i * n2 + (j + l)] = buf2[i + l *n1]; 7934 } 7935 } 7936 } 7937 ntt_free(s, buf2); 7938 } 7939 ntt_free(s, buf3); 7940 return 0; 7941 fail: 7942 ntt_free(s, buf2); 7943 ntt_free(s, buf3); 7944 return -1; 7945 } 7946 7947 7948 /* dst = buf1, src = buf2, tmp = buf3 */ 7949 static int ntt_conv(BFNTTState *s, NTTLimb *buf1, NTTLimb *buf2, 7950 int k, int k_tot, limb_t m_idx) 7951 { 7952 limb_t n1, n2, i; 7953 int k1, k2; 7954 7955 if (k <= NTT_TRIG_K_MAX) { 7956 k1 = k; 7957 } else { 7958 /* recursive split of the FFT */ 7959 k1 = bf_min(k / 2, NTT_TRIG_K_MAX); 7960 } 7961 k2 = k - k1; 7962 n1 = (limb_t)1 << k1; 7963 n2 = (limb_t)1 << k2; 7964 7965 if (ntt_fft_partial(s, buf1, k1, k2, n1, n2, 0, m_idx)) 7966 return -1; 7967 if (ntt_fft_partial(s, buf2, k1, k2, n1, n2, 0, m_idx)) 7968 return -1; 7969 if (k2 == 0) { 7970 ntt_vec_mul(s, buf1, buf2, k, k_tot, m_idx); 7971 } else { 7972 for(i = 0; i < n1; i++) { 7973 ntt_conv(s, buf1 + i * n2, buf2 + i * n2, k2, k_tot, m_idx); 7974 } 7975 } 7976 if (ntt_fft_partial(s, buf1, k1, k2, n1, n2, 1, m_idx)) 7977 return -1; 7978 return 0; 7979 } 7980 7981 7982 static no_inline void limb_to_ntt(BFNTTState *s, 7983 NTTLimb *tabr, limb_t fft_len, 7984 const limb_t *taba, limb_t a_len, int dpl, 7985 int first_m_idx, int nb_mods) 7986 { 7987 slimb_t i, n; 7988 dlimb_t a, b; 7989 int j, shift; 7990 limb_t base_mask1, a0, a1, a2, r, m, m_inv; 7991 7992 #if 0 7993 for(i = 0; i < a_len; i++) { 7994 printf("%" PRId64 ": " FMT_LIMB "\n", 7995 (int64_t)i, taba[i]); 7996 } 7997 #endif 7998 memset(tabr, 0, sizeof(NTTLimb) * fft_len * nb_mods); 7999 shift = dpl & (LIMB_BITS - 1); 8000 if (shift == 0) 8001 base_mask1 = -1; 8002 else 8003 base_mask1 = ((limb_t)1 << shift) - 1; 8004 n = bf_min(fft_len, (a_len * LIMB_BITS + dpl - 1) / dpl); 8005 for(i = 0; i < n; i++) { 8006 a0 = get_bits(taba, a_len, i * dpl); 8007 if (dpl <= LIMB_BITS) { 8008 a0 &= base_mask1; 8009 a = a0; 8010 } else { 8011 a1 = get_bits(taba, a_len, i * dpl + LIMB_BITS); 8012 if (dpl <= (LIMB_BITS + NTT_MOD_LOG2_MIN)) { 8013 a = a0 | ((dlimb_t)(a1 & base_mask1) << LIMB_BITS); 8014 } else { 8015 if (dpl > 2 * LIMB_BITS) { 8016 a2 = get_bits(taba, a_len, i * dpl + LIMB_BITS * 2) & 8017 base_mask1; 8018 } else { 8019 a1 &= base_mask1; 
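/* here dpl <= 2 * LIMB_BITS: the chunk fits in a0 and a1, no third limb is needed */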
8020 a2 = 0; 8021 } 8022 // printf("a=0x%016lx%016lx%016lx\n", a2, a1, a0); 8023 a = (a0 >> (LIMB_BITS - NTT_MOD_LOG2_MAX + NTT_MOD_LOG2_MIN)) | 8024 ((dlimb_t)a1 << (NTT_MOD_LOG2_MAX - NTT_MOD_LOG2_MIN)) | 8025 ((dlimb_t)a2 << (LIMB_BITS + NTT_MOD_LOG2_MAX - NTT_MOD_LOG2_MIN)); 8026 a0 &= ((limb_t)1 << (LIMB_BITS - NTT_MOD_LOG2_MAX + NTT_MOD_LOG2_MIN)) - 1; 8027 } 8028 } 8029 for(j = 0; j < nb_mods; j++) { 8030 m = ntt_mods[first_m_idx + j]; 8031 m_inv = s->ntt_mods_div[first_m_idx + j]; 8032 r = mod_fast(a, m, m_inv); 8033 if (dpl > (LIMB_BITS + NTT_MOD_LOG2_MIN)) { 8034 b = ((dlimb_t)r << (LIMB_BITS - NTT_MOD_LOG2_MAX + NTT_MOD_LOG2_MIN)) | a0; 8035 r = mod_fast(b, m, m_inv); 8036 } 8037 tabr[i + j * fft_len] = int_to_ntt_limb(r, m); 8038 } 8039 } 8040 } 8041 8042 #if defined(__AVX2__) 8043 8044 #define VEC_LEN 4 8045 8046 typedef union { 8047 __m256d v; 8048 double d[4]; 8049 } VecUnion; 8050 8051 static no_inline void ntt_to_limb(BFNTTState *s, limb_t *tabr, limb_t r_len, 8052 const NTTLimb *buf, int fft_len_log2, int dpl, 8053 int nb_mods) 8054 { 8055 const limb_t *mods = ntt_mods + NB_MODS - nb_mods; 8056 const __m256d *mods_cr_vec, *mf, *m_inv; 8057 VecUnion y[NB_MODS]; 8058 limb_t u[NB_MODS], carry[NB_MODS], fft_len, base_mask1, r; 8059 slimb_t i, len, pos; 8060 int j, k, l, shift, n_limb1, p; 8061 dlimb_t t; 8062 8063 j = NB_MODS * (NB_MODS - 1) / 2 - nb_mods * (nb_mods - 1) / 2; 8064 mods_cr_vec = s->ntt_mods_cr_vec + j; 8065 mf = s->ntt_mods_vec + NB_MODS - nb_mods; 8066 m_inv = s->ntt_mods_inv_vec + NB_MODS - nb_mods; 8067 8068 shift = dpl & (LIMB_BITS - 1); 8069 if (shift == 0) 8070 base_mask1 = -1; 8071 else 8072 base_mask1 = ((limb_t)1 << shift) - 1; 8073 n_limb1 = ((unsigned)dpl - 1) / LIMB_BITS; 8074 for(j = 0; j < NB_MODS; j++) 8075 carry[j] = 0; 8076 for(j = 0; j < NB_MODS; j++) 8077 u[j] = 0; /* avoid warnings */ 8078 memset(tabr, 0, sizeof(limb_t) * r_len); 8079 fft_len = (limb_t)1 << fft_len_log2; 8080 len = bf_min(fft_len, (r_len * LIMB_BITS + dpl - 1) / dpl); 8081 len = (len + VEC_LEN - 1) & ~(VEC_LEN - 1); 8082 i = 0; 8083 while (i < len) { 8084 for(j = 0; j < nb_mods; j++) 8085 y[j].v = *(__m256d *)&buf[i + fft_len * j]; 8086 8087 /* Chinese remainder to get mixed radix representation */ 8088 l = 0; 8089 for(j = 0; j < nb_mods - 1; j++) { 8090 y[j].v = ntt_mod1(y[j].v, mf[j]); 8091 for(k = j + 1; k < nb_mods; k++) { 8092 y[k].v = ntt_mul_mod(y[k].v - y[j].v, 8093 mods_cr_vec[l], mf[k], m_inv[k]); 8094 l++; 8095 } 8096 } 8097 y[j].v = ntt_mod1(y[j].v, mf[j]); 8098 8099 for(p = 0; p < VEC_LEN; p++) { 8100 /* back to normal representation */ 8101 u[0] = (int64_t)y[nb_mods - 1].d[p]; 8102 l = 1; 8103 for(j = nb_mods - 2; j >= 1; j--) { 8104 r = (int64_t)y[j].d[p]; 8105 for(k = 0; k < l; k++) { 8106 t = (dlimb_t)u[k] * mods[j] + r; 8107 r = t >> LIMB_BITS; 8108 u[k] = t; 8109 } 8110 u[l] = r; 8111 l++; 8112 } 8113 /* XXX: for nb_mods = 5, l should be 4 */ 8114 8115 /* last step adds the carry */ 8116 r = (int64_t)y[0].d[p]; 8117 for(k = 0; k < l; k++) { 8118 t = (dlimb_t)u[k] * mods[j] + r + carry[k]; 8119 r = t >> LIMB_BITS; 8120 u[k] = t; 8121 } 8122 u[l] = r + carry[l]; 8123 8124 #if 0 8125 printf("%" PRId64 ": ", i); 8126 for(j = nb_mods - 1; j >= 0; j--) { 8127 printf(" %019" PRIu64, u[j]); 8128 } 8129 printf("\n"); 8130 #endif 8131 8132 /* write the digits */ 8133 pos = i * dpl; 8134 for(j = 0; j < n_limb1; j++) { 8135 put_bits(tabr, r_len, pos, u[j]); 8136 pos += LIMB_BITS; 8137 } 8138 put_bits(tabr, r_len, pos, u[n_limb1] & base_mask1); 8139 /* shift by dpl 
digits and set the carry */ 8140 if (shift == 0) { 8141 for(j = n_limb1 + 1; j < nb_mods; j++) 8142 carry[j - (n_limb1 + 1)] = u[j]; 8143 } else { 8144 for(j = n_limb1; j < nb_mods - 1; j++) { 8145 carry[j - n_limb1] = (u[j] >> shift) | 8146 (u[j + 1] << (LIMB_BITS - shift)); 8147 } 8148 carry[nb_mods - 1 - n_limb1] = u[nb_mods - 1] >> shift; 8149 } 8150 i++; 8151 } 8152 } 8153 } 8154 #else 8155 static no_inline void ntt_to_limb(BFNTTState *s, limb_t *tabr, limb_t r_len, 8156 const NTTLimb *buf, int fft_len_log2, int dpl, 8157 int nb_mods) 8158 { 8159 const limb_t *mods = ntt_mods + NB_MODS - nb_mods; 8160 const limb_t *mods_cr, *mods_cr_inv; 8161 limb_t y[NB_MODS], u[NB_MODS], carry[NB_MODS], fft_len, base_mask1, r; 8162 slimb_t i, len, pos; 8163 int j, k, l, shift, n_limb1; 8164 dlimb_t t; 8165 8166 j = NB_MODS * (NB_MODS - 1) / 2 - nb_mods * (nb_mods - 1) / 2; 8167 mods_cr = ntt_mods_cr + j; 8168 mods_cr_inv = s->ntt_mods_cr_inv + j; 8169 8170 shift = dpl & (LIMB_BITS - 1); 8171 if (shift == 0) 8172 base_mask1 = -1; 8173 else 8174 base_mask1 = ((limb_t)1 << shift) - 1; 8175 n_limb1 = ((unsigned)dpl - 1) / LIMB_BITS; 8176 for(j = 0; j < NB_MODS; j++) 8177 carry[j] = 0; 8178 for(j = 0; j < NB_MODS; j++) 8179 u[j] = 0; /* avoid warnings */ 8180 memset(tabr, 0, sizeof(limb_t) * r_len); 8181 fft_len = (limb_t)1 << fft_len_log2; 8182 len = bf_min(fft_len, (r_len * LIMB_BITS + dpl - 1) / dpl); 8183 for(i = 0; i < len; i++) { 8184 for(j = 0; j < nb_mods; j++) { 8185 y[j] = ntt_limb_to_int(buf[i + fft_len * j], mods[j]); 8186 } 8187 8188 /* Chinese remainder to get mixed radix representation */ 8189 l = 0; 8190 for(j = 0; j < nb_mods - 1; j++) { 8191 for(k = j + 1; k < nb_mods; k++) { 8192 limb_t m; 8193 m = mods[k]; 8194 /* Note: there is no overflow in the sub_mod() because 8195 the modulos are sorted by increasing order */ 8196 y[k] = mul_mod_fast2(y[k] - y[j] + m, 8197 mods_cr[l], m, mods_cr_inv[l]); 8198 l++; 8199 } 8200 } 8201 8202 /* back to normal representation */ 8203 u[0] = y[nb_mods - 1]; 8204 l = 1; 8205 for(j = nb_mods - 2; j >= 1; j--) { 8206 r = y[j]; 8207 for(k = 0; k < l; k++) { 8208 t = (dlimb_t)u[k] * mods[j] + r; 8209 r = t >> LIMB_BITS; 8210 u[k] = t; 8211 } 8212 u[l] = r; 8213 l++; 8214 } 8215 8216 /* last step adds the carry */ 8217 r = y[0]; 8218 for(k = 0; k < l; k++) { 8219 t = (dlimb_t)u[k] * mods[j] + r + carry[k]; 8220 r = t >> LIMB_BITS; 8221 u[k] = t; 8222 } 8223 u[l] = r + carry[l]; 8224 8225 #if 0 8226 printf("%" PRId64 ": ", (int64_t)i); 8227 for(j = nb_mods - 1; j >= 0; j--) { 8228 printf(" " FMT_LIMB, u[j]); 8229 } 8230 printf("\n"); 8231 #endif 8232 8233 /* write the digits */ 8234 pos = i * dpl; 8235 for(j = 0; j < n_limb1; j++) { 8236 put_bits(tabr, r_len, pos, u[j]); 8237 pos += LIMB_BITS; 8238 } 8239 put_bits(tabr, r_len, pos, u[n_limb1] & base_mask1); 8240 /* shift by dpl digits and set the carry */ 8241 if (shift == 0) { 8242 for(j = n_limb1 + 1; j < nb_mods; j++) 8243 carry[j - (n_limb1 + 1)] = u[j]; 8244 } else { 8245 for(j = n_limb1; j < nb_mods - 1; j++) { 8246 carry[j - n_limb1] = (u[j] >> shift) | 8247 (u[j + 1] << (LIMB_BITS - shift)); 8248 } 8249 carry[nb_mods - 1 - n_limb1] = u[nb_mods - 1] >> shift; 8250 } 8251 } 8252 } 8253 #endif 8254 8255 static int ntt_static_init(bf_context_t *s1) 8256 { 8257 BFNTTState *s; 8258 int inverse, i, j, k, l; 8259 limb_t c, c_inv, c_inv2, m, m_inv; 8260 8261 if (s1->ntt_state) 8262 return 0; 8263 #if defined(__AVX2__) 8264 s = bf_aligned_malloc(s1, sizeof(*s), 64); 8265 #else 8266 s = bf_malloc(s1, 
sizeof(*s)); 8267 #endif 8268 if (!s) 8269 return -1; 8270 memset(s, 0, sizeof(*s)); 8271 s1->ntt_state = s; 8272 s->ctx = s1; 8273 8274 for(j = 0; j < NB_MODS; j++) { 8275 m = ntt_mods[j]; 8276 m_inv = init_mul_mod_fast(m); 8277 s->ntt_mods_div[j] = m_inv; 8278 #if defined(__AVX2__) 8279 s->ntt_mods_vec[j] = _mm256_set1_pd(m); 8280 s->ntt_mods_inv_vec[j] = _mm256_set1_pd(1.0 / (double)m); 8281 #endif 8282 c_inv2 = (m + 1) / 2; /* 1/2 */ 8283 c_inv = 1; 8284 for(i = 0; i <= NTT_PROOT_2EXP; i++) { 8285 s->ntt_len_inv[j][i][0] = c_inv; 8286 s->ntt_len_inv[j][i][1] = init_mul_mod_fast2(c_inv, m); 8287 c_inv = mul_mod_fast(c_inv, c_inv2, m, m_inv); 8288 } 8289 8290 for(inverse = 0; inverse < 2; inverse++) { 8291 c = ntt_proot[inverse][j]; 8292 for(i = 0; i < NTT_PROOT_2EXP; i++) { 8293 s->ntt_proot_pow[j][inverse][NTT_PROOT_2EXP - i] = c; 8294 s->ntt_proot_pow_inv[j][inverse][NTT_PROOT_2EXP - i] = 8295 init_mul_mod_fast2(c, m); 8296 c = mul_mod_fast(c, c, m, m_inv); 8297 } 8298 } 8299 } 8300 8301 l = 0; 8302 for(j = 0; j < NB_MODS - 1; j++) { 8303 for(k = j + 1; k < NB_MODS; k++) { 8304 #if defined(__AVX2__) 8305 s->ntt_mods_cr_vec[l] = _mm256_set1_pd(int_to_ntt_limb2(ntt_mods_cr[l], 8306 ntt_mods[k])); 8307 #else 8308 s->ntt_mods_cr_inv[l] = init_mul_mod_fast2(ntt_mods_cr[l], 8309 ntt_mods[k]); 8310 #endif 8311 l++; 8312 } 8313 } 8314 return 0; 8315 } 8316 8317 int bf_get_fft_size(int *pdpl, int *pnb_mods, limb_t len) 8318 { 8319 int dpl, fft_len_log2, n_bits, nb_mods, dpl_found, fft_len_log2_found; 8320 int int_bits, nb_mods_found; 8321 limb_t cost, min_cost; 8322 8323 min_cost = -1; 8324 dpl_found = 0; 8325 nb_mods_found = 4; 8326 fft_len_log2_found = 0; 8327 for(nb_mods = 3; nb_mods <= NB_MODS; nb_mods++) { 8328 int_bits = ntt_int_bits[NB_MODS - nb_mods]; 8329 dpl = bf_min((int_bits - 4) / 2, 8330 2 * LIMB_BITS + 2 * NTT_MOD_LOG2_MIN - NTT_MOD_LOG2_MAX); 8331 for(;;) { 8332 fft_len_log2 = ceil_log2((len * LIMB_BITS + dpl - 1) / dpl); 8333 if (fft_len_log2 > NTT_PROOT_2EXP) 8334 goto next; 8335 n_bits = fft_len_log2 + 2 * dpl; 8336 if (n_bits <= int_bits) { 8337 cost = ((limb_t)(fft_len_log2 + 1) << fft_len_log2) * nb_mods; 8338 // printf("n=%d dpl=%d: cost=%" PRId64 "\n", nb_mods, dpl, (int64_t)cost); 8339 if (cost < min_cost) { 8340 min_cost = cost; 8341 dpl_found = dpl; 8342 nb_mods_found = nb_mods; 8343 fft_len_log2_found = fft_len_log2; 8344 } 8345 break; 8346 } 8347 dpl--; 8348 if (dpl == 0) 8349 break; 8350 } 8351 next: ; 8352 } 8353 if (!dpl_found) 8354 abort(); 8355 /* limit dpl if possible to reduce fixed cost of limb/NTT conversion */ 8356 if (dpl_found > (LIMB_BITS + NTT_MOD_LOG2_MIN) && 8357 ((limb_t)(LIMB_BITS + NTT_MOD_LOG2_MIN) << fft_len_log2_found) >= 8358 len * LIMB_BITS) { 8359 dpl_found = LIMB_BITS + NTT_MOD_LOG2_MIN; 8360 } 8361 *pnb_mods = nb_mods_found; 8362 *pdpl = dpl_found; 8363 return fft_len_log2_found; 8364 } 8365 8366 /* return 0 if OK, -1 if memory error */ 8367 static no_inline int fft_mul(bf_context_t *s1, 8368 bf_t *res, limb_t *a_tab, limb_t a_len, 8369 limb_t *b_tab, limb_t b_len, int mul_flags) 8370 { 8371 BFNTTState *s; 8372 int dpl, fft_len_log2, j, nb_mods, reduced_mem; 8373 slimb_t len, fft_len; 8374 NTTLimb *buf1, *buf2, *ptr; 8375 #if defined(USE_MUL_CHECK) 8376 limb_t ha, hb, hr, h_ref; 8377 #endif 8378 8379 if (ntt_static_init(s1)) 8380 return -1; 8381 s = s1->ntt_state; 8382 8383 /* find the optimal number of digits per limb (dpl) */ 8384 len = a_len + b_len; 8385 fft_len_log2 = bf_get_fft_size(&dpl, &nb_mods, len); 8386 fft_len = (uint64_t)1 
<< fft_len_log2; 8387 // printf("len=%" PRId64 " fft_len_log2=%d dpl=%d\n", len, fft_len_log2, dpl); 8388 #if defined(USE_MUL_CHECK) 8389 ha = mp_mod1(a_tab, a_len, BF_CHKSUM_MOD, 0); 8390 hb = mp_mod1(b_tab, b_len, BF_CHKSUM_MOD, 0); 8391 #endif 8392 if ((mul_flags & (FFT_MUL_R_OVERLAP_A | FFT_MUL_R_OVERLAP_B)) == 0) { 8393 if (!(mul_flags & FFT_MUL_R_NORESIZE)) 8394 bf_resize(res, 0); 8395 } else if (mul_flags & FFT_MUL_R_OVERLAP_B) { 8396 limb_t *tmp_tab, tmp_len; 8397 /* it is better to free 'b' first */ 8398 tmp_tab = a_tab; 8399 a_tab = b_tab; 8400 b_tab = tmp_tab; 8401 tmp_len = a_len; 8402 a_len = b_len; 8403 b_len = tmp_len; 8404 } 8405 buf1 = ntt_malloc(s, sizeof(NTTLimb) * fft_len * nb_mods); 8406 if (!buf1) 8407 return -1; 8408 limb_to_ntt(s, buf1, fft_len, a_tab, a_len, dpl, 8409 NB_MODS - nb_mods, nb_mods); 8410 if ((mul_flags & (FFT_MUL_R_OVERLAP_A | FFT_MUL_R_OVERLAP_B)) == 8411 FFT_MUL_R_OVERLAP_A) { 8412 if (!(mul_flags & FFT_MUL_R_NORESIZE)) 8413 bf_resize(res, 0); 8414 } 8415 reduced_mem = (fft_len_log2 >= 14); 8416 if (!reduced_mem) { 8417 buf2 = ntt_malloc(s, sizeof(NTTLimb) * fft_len * nb_mods); 8418 if (!buf2) 8419 goto fail; 8420 limb_to_ntt(s, buf2, fft_len, b_tab, b_len, dpl, 8421 NB_MODS - nb_mods, nb_mods); 8422 if (!(mul_flags & FFT_MUL_R_NORESIZE)) 8423 bf_resize(res, 0); /* in case res == b */ 8424 } else { 8425 buf2 = ntt_malloc(s, sizeof(NTTLimb) * fft_len); 8426 if (!buf2) 8427 goto fail; 8428 } 8429 for(j = 0; j < nb_mods; j++) { 8430 if (reduced_mem) { 8431 limb_to_ntt(s, buf2, fft_len, b_tab, b_len, dpl, 8432 NB_MODS - nb_mods + j, 1); 8433 ptr = buf2; 8434 } else { 8435 ptr = buf2 + fft_len * j; 8436 } 8437 if (ntt_conv(s, buf1 + fft_len * j, ptr, 8438 fft_len_log2, fft_len_log2, j + NB_MODS - nb_mods)) 8439 goto fail; 8440 } 8441 if (!(mul_flags & FFT_MUL_R_NORESIZE)) 8442 bf_resize(res, 0); /* in case res == b and reduced mem */ 8443 ntt_free(s, buf2); 8444 buf2 = NULL; 8445 if (!(mul_flags & FFT_MUL_R_NORESIZE)) { 8446 if (bf_resize(res, len)) 8447 goto fail; 8448 } 8449 ntt_to_limb(s, res->tab, len, buf1, fft_len_log2, dpl, nb_mods); 8450 ntt_free(s, buf1); 8451 #if defined(USE_MUL_CHECK) 8452 hr = mp_mod1(res->tab, len, BF_CHKSUM_MOD, 0); 8453 h_ref = mul_mod(ha, hb, BF_CHKSUM_MOD); 8454 if (hr != h_ref) { 8455 printf("ntt_mul_error: len=%" PRId_LIMB " fft_len_log2=%d dpl=%d nb_mods=%d\n", 8456 len, fft_len_log2, dpl, nb_mods); 8457 // printf("ha=0x" FMT_LIMB" hb=0x" FMT_LIMB " hr=0x" FMT_LIMB " expected=0x" FMT_LIMB "\n", ha, hb, hr, h_ref); 8458 exit(1); 8459 } 8460 #endif 8461 return 0; 8462 fail: 8463 ntt_free(s, buf1); 8464 ntt_free(s, buf2); 8465 return -1; 8466 } 8467 8468 #else /* USE_FFT_MUL */ 8469 8470 int bf_get_fft_size(int *pdpl, int *pnb_mods, limb_t len) 8471 { 8472 return 0; 8473 } 8474 8475 #endif /* !USE_FFT_MUL */
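/* Illustrative usage sketch (not part of the library): Euclidean
   division of decimal numbers with bfdec_divrem(). 'app_realloc' and
   'divrem_example' are placeholder names chosen for this example; the
   block is kept inside '#if 0' so that it does not affect the build. */
#if 0
static void *app_realloc(void *opaque, void *ptr, size_t size)
{
    /* trivial allocator callback based on the C library */
    return realloc(ptr, size);
}

static void divrem_example(void)
{
    bf_context_t ctx;
    bfdec_t a, b, q, r;

    bf_context_init(&ctx, app_realloc, NULL);
    bfdec_init(&ctx, &a);
    bfdec_init(&ctx, &b);
    bfdec_init(&ctx, &q);
    bfdec_init(&ctx, &r);

    bfdec_set_si(&a, 7);
    bfdec_set_si(&b, -3);
    /* Euclidean division keeps the remainder nonnegative:
       here q = -2 and r = 1, so that a == q * b + r */
    bfdec_divrem(&q, &r, &a, &b, BF_PREC_INF, BF_RNDZ, BF_DIVREM_EUCLIDIAN);

    bfdec_delete(&a);
    bfdec_delete(&b);
    bfdec_delete(&q);
    bfdec_delete(&r);
    bf_context_end(&ctx);
}
#endif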