src/libcore/num/dec2flt/algorithm.rs

   1 // Copyright 2015 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10
  11 //! The various algorithms from the paper.
  12
  13 use prelude::v1::*;
  14 use cmp::min;
  15 use cmp::Ordering::{Less, Equal, Greater};
  16 use num::diy_float::Fp;
  17 use num::dec2flt::table;
  18 use num::dec2flt::rawfp::{self, Unpacked, RawFloat, fp_to_float, next_float, prev_float};
  19 use num::dec2flt::num::{self, Big};
  20
  21 /// Number of significand bits in Fp
  22 const P: u32 = 64;
  23
  24 // We simply store the best approximation for *all* exponents, so the variable "h" and the
  25 // associated conditions can be omitted. This trades performance for a couple kilobytes of space.
  26
  27 fn power_of_ten(e: i16) -> Fp {
  28     assert!(e >= table::MIN_E);
  29     let i = e - table::MIN_E;
  30     let sig = table::POWERS.0[i as usize];
  31     let exp = table::POWERS.1[i as usize];
  32     Fp { f: sig, e: exp }
  33 }
  34
  35 // In most architectures, floating point operations have an explicit bit size, therefore the
  36 // precision of the computation is determined on a per-operation basis.
  37 #[cfg(any(not(target_arch="x86"), target_feature="sse2"))]
  38 mod fpu_precision {
  39     pub fn set_precision<T>() { }
  40 }
  41
  42 // On x86, the x87 FPU is used for float operations if the SSE/SSE2 extensions are not available.
  43 // The x87 FPU operates with 80 bits of precision by default, which means that operations will
  44 // round to 80 bits causing double rounding to happen when values are eventually represented as
  45 // 32/64 bit float values. To overcome this, the FPU control word can be set so that the
  46 // computations are performed in the desired precision.
  47 #[cfg(all(target_arch="x86", not(target_feature="sse2")))]
  48 mod fpu_precision {
  49     use mem::size_of;
  50     use ops::Drop;
  51
  52     /// A structure used to preserve the original value of the FPU control word, so that it can be
  53     /// restored when the structure is dropped.
  54     ///
  55     /// The x87 FPU is a 16-bits register whose fields are as follows:
  56     ///
  57     /// | 12-15 | 10-11 | 8-9 | 6-7 |  5 |  4 |  3 |  2 |  1 |  0 |
  58     /// |------:|------:|----:|----:|---:|---:|---:|---:|---:|---:|
  59     /// |       | RC    | PC  |     | PM | UM | OM | ZM | DM | IM |
  60     ///
  61     /// The documentation for all of the fields is available in the IA-32 Architectures Software
  62     /// Developer's Manual (Volume 1).
  63     ///
  64     /// The only field which is relevant for the following code is PC, Precision Control. This
  65     /// field determines the precision of the operations performed by the  FPU. It can be set to:
  66     ///  - 0b00, single precision i.e. 32-bits
  67     ///  - 0b10, double precision i.e. 64-bits
  68     ///  - 0b11, double extended precision i.e. 80-bits (default state)
  69     /// The 0b01 value is reserved and should not be used.
  70     pub struct FPUControlWord(u16);
  71
  72     fn set_cw(cw: u16) {
  73         unsafe { asm!("fldcw $0" :: "m" (cw) :: "volatile") }
  74     }
  75
  76     /// Set the precision field of the FPU to `T` and return a `FPUControlWord`
  77     pub fn set_precision<T>() -> FPUControlWord {
  78         let cw = 0u16;
  79
  80         // Compute the value for the Precision Control field that is appropriate for `T`.
  81         let cw_precision = match size_of::<T>() {
  82             4 => 0x0000, // 32 bits
  83             8 => 0x0200, // 64 bits
  84             _ => 0x0300, // default, 80 bits
  85         };
  86
  87         // Get the original value of the control word to restore it later, when the
  88         // `FPUControlWord` structure is dropped
  89         unsafe { asm!("fnstcw $0" : "=*m" (&cw) ::: "volatile") }
  90
  91         // Set the control word to the desired precision. This is achieved by masking away the old
  92         // precision (bits 8 and 9, 0x300) and replacing it with the precision flag computed above.
  93         set_cw((cw & 0xFCFF) | cw_precision);
  94
  95         FPUControlWord(cw)
  96     }
  97
  98     impl Drop for FPUControlWord {
  99         fn drop(&mut self) {
 100             set_cw(self.0)
 101         }
 102     }
 103 }
 104
 105 /// The fast path of Bellerophon using machine-sized integers and floats.
 106 ///
 107 /// This is extracted into a separate function so that it can be attempted before constructing
 108 /// a bignum.
 109 pub fn fast_path<T: RawFloat>(integral: &[u8], fractional: &[u8], e: i64) -> Option<T> {
 110     let num_digits = integral.len() + fractional.len();
 111     // log_10(f64::max_sig) ~ 15.95. We compare the exact value to max_sig near the end,
 112     // this is just a quick, cheap rejection (and also frees the rest of the code from
 113     // worrying about underflow).
 114     if num_digits > 16 {
 115         return None;
 116     }
 117     if e.abs() >= T::ceil_log5_of_max_sig() as i64 {
 118         return None;
 119     }
 120     let f = num::from_str_unchecked(integral.iter().chain(fractional.iter()));
 121     if f > T::max_sig() {
 122         return None;
 123     }
 124
 125     // The fast path crucially depends on arithmetic being rounded to the correct number of bits
 126     // without any intermediate rounding. On x86 (without SSE or SSE2) this requires the precision
 127     // of the x87 FPU stack to be changed so that it directly rounds to 64/32 bit.
 128     // The `set_precision` function takes care of setting the precision on architectures which
 129     // require setting it by changing the global state (like the control word of the x87 FPU).
 130     let _cw = fpu_precision::set_precision::<T>();
 131
 132     // The case e < 0 cannot be folded into the other branch. Negative powers result in
 133     // a repeating fractional part in binary, which are rounded, which causes real
 134     // (and occasionally quite significant!) errors in the final result.
 135     if e >= 0 {
 136         Some(T::from_int(f) * T::short_fast_pow10(e as usize))
 137     } else {
 138         Some(T::from_int(f) / T::short_fast_pow10(e.abs() as usize))
 139     }
 140 }
 141
 142 /// Algorithm Bellerophon is trivial code justified by non-trivial numeric analysis.
 143 ///
 144 /// It rounds ``f`` to a float with 64 bit significand and multiplies it by the best approximation
 145 /// of `10^e` (in the same floating point format). This is often enough to get the correct result.
 146 /// However, when the result is close to halfway between two adjecent (ordinary) floats, the
 147 /// compound rounding error from multiplying two approximation means the result may be off by a
 148 /// few bits. When this happens, the iterative Algorithm R fixes things up.
 149 ///
 150 /// The hand-wavy "close to halfway" is made precise by the numeric analysis in the paper.
 151 /// In the words of Clinger:
 152 ///
 153 /// > Slop, expressed in units of the least significant bit, is an inclusive bound for the error
 154 /// > accumulated during the floating point calculation of the approximation to f * 10^e. (Slop is
 155 /// > not a bound for the true error, but bounds the difference between the approximation z and
 156 /// > the best possible approximation that uses p bits of significand.)
 157 pub fn bellerophon<T: RawFloat>(f: &Big, e: i16) -> T {
 158     let slop;
 159     if f <= &Big::from_u64(T::max_sig()) {
 160         // The cases abs(e) < log5(2^N) are in fast_path()
 161         slop = if e >= 0 { 0 } else { 3 };
 162     } else {
 163         slop = if e >= 0 { 1 } else { 4 };
 164     }
 165     let z = rawfp::big_to_fp(f).mul(&power_of_ten(e)).normalize();
 166     let exp_p_n = 1 << (P - T::sig_bits() as u32);
 167     let lowbits: i64 = (z.f % exp_p_n) as i64;
 168     // Is the slop large enough to make a difference when
 169     // rounding to n bits?
 170     if (lowbits - exp_p_n as i64 / 2).abs() <= slop {
 171         algorithm_r(f, e, fp_to_float(z))
 172     } else {
 173         fp_to_float(z)
 174     }
 175 }
 176
 177 /// An iterative algorithm that improves a floating point approximation of `f * 10^e`.
 178 ///
 179 /// Each iteration gets one unit in the last place closer, which of course takes terribly long to
 180 /// converge if `z0` is even mildly off. Luckily, when used as fallback for Bellerophon, the
 181 /// starting approximation is off by at most one ULP.
 182 fn algorithm_r<T: RawFloat>(f: &Big, e: i16, z0: T) -> T {
 183     let mut z = z0;
 184     loop {
 185         let raw = z.unpack();
 186         let (m, k) = (raw.sig, raw.k);
 187         let mut x = f.clone();
 188         let mut y = Big::from_u64(m);
 189
 190         // Find positive integers `x`, `y` such that `x / y` is exactly `(f * 10^e) / (m * 2^k)`.
 191         // This not only avoids dealing with the signs of `e` and `k`, we also eliminate the
 192         // power of two common to `10^e` and `2^k` to make the numbers smaller.
 193         make_ratio(&mut x, &mut y, e, k);
 194
 195         let m_digits = [(m & 0xFF_FF_FF_FF) as u32, (m >> 32) as u32];
 196         // This is written a bit awkwardly because our bignums don't support
 197         // negative numbers, so we use the absolute value + sign information.
 198         // The multiplication with m_digits can't overflow. If `x` or `y` are large enough that
 199         // we need to worry about overflow, then they are also large enough that `make_ratio` has
 200         // reduced the fraction by a factor of 2^64 or more.
 201         let (d2, d_negative) = if x >= y {
 202             // Don't need x any more, save a clone().
 203             x.sub(&y).mul_pow2(1).mul_digits(&m_digits);
 204             (x, false)
 205         } else {
 206             // Still need y - make a copy.
 207             let mut y = y.clone();
 208             y.sub(&x).mul_pow2(1).mul_digits(&m_digits);
 209             (y, true)
 210         };
 211
 212         if d2 < y {
 213             let mut d2_double = d2;
 214             d2_double.mul_pow2(1);
 215             if m == T::min_sig() && d_negative && d2_double > y {
 216                 z = prev_float(z);
 217             } else {
 218                 return z;
 219             }
 220         } else if d2 == y {
 221             if m % 2 == 0 {
 222                 if m == T::min_sig() && d_negative {
 223                     z = prev_float(z);
 224                 } else {
 225                     return z;
 226                 }
 227             } else if d_negative {
 228                 z = prev_float(z);
 229             } else {
 230                 z = next_float(z);
 231             }
 232         } else if d_negative {
 233             z = prev_float(z);
 234         } else {
 235             z = next_float(z);
 236         }
 237     }
 238 }
 239
 240 /// Given `x = f` and `y = m` where `f` represent input decimal digits as usual and `m` is the
 241 /// significand of a floating point approximation, make the ratio `x / y` equal to
 242 /// `(f * 10^e) / (m * 2^k)`, possibly reduced by a power of two both have in common.
 243 fn make_ratio(x: &mut Big, y: &mut Big, e: i16, k: i16) {
 244     let (e_abs, k_abs) = (e.abs() as usize, k.abs() as usize);
 245     if e >= 0 {
 246         if k >= 0 {
 247             // x = f * 10^e, y = m * 2^k, except that we reduce the fraction by some power of two.
 248             let common = min(e_abs, k_abs);
 249             x.mul_pow5(e_abs).mul_pow2(e_abs - common);
 250             y.mul_pow2(k_abs - common);
 251         } else {
 252             // x = f * 10^e * 2^abs(k), y = m
 253             // This can't overflow because it requires positive `e` and negative `k`, which can
 254             // only happen for values extremely close to 1, which means that `e` and `k` will be
 255             // comparatively tiny.
 256             x.mul_pow5(e_abs).mul_pow2(e_abs + k_abs);
 257         }
 258     } else {
 259         if k >= 0 {
 260             // x = f, y = m * 10^abs(e) * 2^k
 261             // This can't overflow either, see above.
 262             y.mul_pow5(e_abs).mul_pow2(k_abs + e_abs);
 263         } else {
 264             // x = f * 2^abs(k), y = m * 10^abs(e), again reducing by a common power of two.
 265             let common = min(e_abs, k_abs);
 266             x.mul_pow2(k_abs - common);
 267             y.mul_pow5(e_abs).mul_pow2(e_abs - common);
 268         }
 269     }
 270 }
 271
 272 /// Conceptually, Algorithm M is the simplest way to convert a decimal to a float.
 273 ///
 274 /// We form a ratio that is equal to `f * 10^e`, then throwing in powers of two until it gives
 275 /// a valid float significand. The binary exponent `k` is the number of times we multiplied
 276 /// numerator or denominator by two, i.e., at all times `f * 10^e` equals `(u / v) * 2^k`.
 277 /// When we have found out significand, we only need to round by inspecting the remainder of the
 278 /// division, which is done in helper functions further below.
 279 ///
 280 /// This algorithm is super slow, even with the optimization described in `quick_start()`.
 281 /// However, it's the simplest of the algorithms to adapt for overflow, underflow, and subnormal
 282 /// results. This implementation takes over when Bellerophon and Algorithm R are overwhelmed.
 283 /// Detecting underflow and overflow is easy: The ratio still isn't an in-range significand,
 284 /// yet the minimum/maximum exponent has been reached. In the case of overflow, we simply return
 285 /// infinity.
 286 ///
 287 /// Handling underflow and subnormals is trickier. One big problem is that, with the minimum
 288 /// exponent, the ratio might still be too large for a significand. See underflow() for details.
 289 pub fn algorithm_m<T: RawFloat>(f: &Big, e: i16) -> T {
 290     let mut u;
 291     let mut v;
 292     let e_abs = e.abs() as usize;
 293     let mut k = 0;
 294     if e < 0 {
 295         u = f.clone();
 296         v = Big::from_small(1);
 297         v.mul_pow5(e_abs).mul_pow2(e_abs);
 298     } else {
 299         // FIXME possible optimization: generalize big_to_fp so that we can do the equivalent of
 300         // fp_to_float(big_to_fp(u)) here, only without the double rounding.
 301         u = f.clone();
 302         u.mul_pow5(e_abs).mul_pow2(e_abs);
 303         v = Big::from_small(1);
 304     }
 305     quick_start::<T>(&mut u, &mut v, &mut k);
 306     let mut rem = Big::from_small(0);
 307     let mut x = Big::from_small(0);
 308     let min_sig = Big::from_u64(T::min_sig());
 309     let max_sig = Big::from_u64(T::max_sig());
 310     loop {
 311         u.div_rem(&v, &mut x, &mut rem);
 312         if k == T::min_exp_int() {
 313             // We have to stop at the minimum exponent, if we wait until `k < T::min_exp_int()`,
 314             // then we'd be off by a factor of two. Unfortunately this means we have to special-
 315             // case normal numbers with the minimum exponent.
 316             // FIXME find a more elegant formulation, but run the `tiny-pow10` test to make sure
 317             // that it's actually correct!
 318             if x >= min_sig && x <= max_sig {
 319                 break;
 320             }
 321             return underflow(x, v, rem);
 322         }
 323         if k > T::max_exp_int() {
 324             return T::infinity2();
 325         }
 326         if x < min_sig {
 327             u.mul_pow2(1);
 328             k -= 1;
 329         } else if x > max_sig {
 330             v.mul_pow2(1);
 331             k += 1;
 332         } else {
 333             break;
 334         }
 335     }
 336     let q = num::to_u64(&x);
 337     let z = rawfp::encode_normal(Unpacked::new(q, k));
 338     round_by_remainder(v, rem, q, z)
 339 }
 340
 341 /// Skip over most AlgorithmM iterations by checking the bit length.
 342 fn quick_start<T: RawFloat>(u: &mut Big, v: &mut Big, k: &mut i16) {
 343     // The bit length is an estimate of the base two logarithm, and log(u / v) = log(u) - log(v).
 344     // The estimate is off by at most 1, but always an under-estimate, so the error on log(u)
 345     // and log(v) are of the same sign and cancel out (if both are large). Therefore the error
 346     // for log(u / v) is at most one as well.
 347     // The target ratio is one where u/v is in an in-range significand. Thus our termination
 348     // condition is log2(u / v) being the significand bits, plus/minus one.
 349     // FIXME Looking at the second bit could improve the estimate and avoid some more divisions.
 350     let target_ratio = T::sig_bits() as i16;
 351     let log2_u = u.bit_length() as i16;
 352     let log2_v = v.bit_length() as i16;
 353     let mut u_shift: i16 = 0;
 354     let mut v_shift: i16 = 0;
 355     assert!(*k == 0);
 356     loop {
 357         if *k == T::min_exp_int() {
 358             // Underflow or subnormal. Leave it to the main function.
 359             break;
 360         }
 361         if *k == T::max_exp_int() {
 362             // Overflow. Leave it to the main function.
 363             break;
 364         }
 365         let log2_ratio = (log2_u + u_shift) - (log2_v + v_shift);
 366         if log2_ratio < target_ratio - 1 {
 367             u_shift += 1;
 368             *k -= 1;
 369         } else if log2_ratio > target_ratio + 1 {
 370             v_shift += 1;
 371             *k += 1;
 372         } else {
 373             break;
 374         }
 375     }
 376     u.mul_pow2(u_shift as usize);
 377     v.mul_pow2(v_shift as usize);
 378 }
 379
 380 fn underflow<T: RawFloat>(x: Big, v: Big, rem: Big) -> T {
 381     if x < Big::from_u64(T::min_sig()) {
 382         let q = num::to_u64(&x);
 383         let z = rawfp::encode_subnormal(q);
 384         return round_by_remainder(v, rem, q, z);
 385     }
 386     // Ratio isn't an in-range significand with the minimum exponent, so we need to round off
 387     // excess bits and adjust the exponent accordingly. The real value now looks like this:
 388     //
 389     //        x        lsb
 390     // /--------------\/
 391     // 1010101010101010.10101010101010 * 2^k
 392     // \-----/\-------/ \------------/
 393     //    q     trunc.    (represented by rem)
 394     //
 395     // Therefore, when the rounded-off bits are != 0.5 ULP, they decide the rounding
 396     // on their own. When they are equal and the remainder is non-zero, the value still
 397     // needs to be rounded up. Only when the rounded off bits are 1/2 and the remainer
 398     // is zero, we have a half-to-even situation.
 399     let bits = x.bit_length();
 400     let lsb = bits - T::sig_bits() as usize;
 401     let q = num::get_bits(&x, lsb, bits);
 402     let k = T::min_exp_int() + lsb as i16;
 403     let z = rawfp::encode_normal(Unpacked::new(q, k));
 404     let q_even = q % 2 == 0;
 405     match num::compare_with_half_ulp(&x, lsb) {
 406         Greater => next_float(z),
 407         Less => z,
 408         Equal if rem.is_zero() && q_even => z,
 409         Equal => next_float(z),
 410     }
 411 }
 412
 413 /// Ordinary round-to-even, obfuscated by having to round based on the remainder of a division.
 414 fn round_by_remainder<T: RawFloat>(v: Big, r: Big, q: u64, z: T) -> T {
 415     let mut v_minus_r = v;
 416     v_minus_r.sub(&r);
 417     if r < v_minus_r {
 418         z
 419     } else if r > v_minus_r {
 420         next_float(z)
 421     } else if q % 2 == 0 {
 422         z
 423     } else {
 424         next_float(z)
 425     }
 426 }