src/libcore/num/dec2flt/rawfp.rs

   1 // Copyright 2015 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10
  11 //! Bit fiddling on positive IEEE 754 floats. Negative numbers aren't and needn't be handled.
  12 //! Normal floating point numbers have a canonical representation as (frac, exp) such that the
  13 //! value is 2^exp * (1 + sum(frac[N-i] / 2^i)) where N is the number of bits. Subnormals are
  14 //! slightly different and weird, but the same principle applies.
  15 //!
  16 //! Here, however, we represent them as (sig, k) with f positive, such that the value is f * 2^e.
  17 //! Besides making the "hidden bit" explicit, this changes the exponent by the so-called
  18 //! mantissa shift.
  19 //!
  20 //! Put another way, normally floats are written as (1) but here they are written as (2):
  21 //!
  22 //! 1. `1.101100...11 * 2^m`
  23 //! 2. `1101100...11 * 2^n`
  24 //!
  25 //! We call (1) the **fractional representation** and (2) the **integral representation**.
  26 //!
  27 //! Many functions in this module only handle normal numbers. The dec2flt routines conservatively
  28 //! take the universally-correct slow path (Algorithm M) for very small and very large numbers.
  29 //! That algorithm needs only next_float() which does handle subnormals and zeros.
  30 use prelude::v1::*;
  31 use u32;
  32 use cmp::Ordering::{Less, Equal, Greater};
  33 use ops::{Mul, Div, Neg};
  34 use fmt::{Debug, LowerExp};
  35 use mem::transmute;
  36 use num::diy_float::Fp;
  37 use num::FpCategory::{Infinite, Zero, Subnormal, Normal, Nan};
  38 use num::Float;
  39 use num::dec2flt::num::{self, Big};
  40 use num::dec2flt::table;
  41
  42 #[derive(Copy, Clone, Debug)]
  43 pub struct Unpacked {
  44     pub sig: u64,
  45     pub k: i16,
  46 }
  47
  48 impl Unpacked {
  49     pub fn new(sig: u64, k: i16) -> Self {
  50         Unpacked { sig: sig, k: k }
  51     }
  52 }
  53
  54 /// A helper trait to avoid duplicating basically all the conversion code for `f32` and `f64`.
  55 ///
  56 /// See the parent module's doc comment for why this is necessary.
  57 ///
  58 /// Should **never ever** be implemented for other types or be used outside the dec2flt module.
  59 /// Inherits from `Float` because there is some overlap, but all the reused methods are trivial.
  60 /// The "methods" (pseudo-constants) with default implementation should not be overriden.
  61 pub trait RawFloat : Float + Copy + Debug + LowerExp
  62                     + Mul<Output=Self> + Div<Output=Self> + Neg<Output=Self>
  63 {
  64     // suffix of "2" because Float::infinity is deprecated
  65     #[allow(deprecated)]
  66     fn infinity2() -> Self {
  67         Float::infinity()
  68     }
  69
  70     // suffix of "2" because Float::nan is deprecated
  71     #[allow(deprecated)]
  72     fn nan2() -> Self {
  73         Float::nan()
  74     }
  75
  76     // suffix of "2" because Float::zero is deprecated
  77     fn zero2() -> Self;
  78
  79     // suffix of "2" because Float::integer_decode is deprecated
  80     #[allow(deprecated)]
  81     fn integer_decode2(self) -> (u64, i16, i8) {
  82         Float::integer_decode(self)
  83     }
  84
  85     /// Get the raw binary representation of the float.
  86     fn transmute(self) -> u64;
  87
  88     /// Transmute the raw binary representation into a float.
  89     fn from_bits(bits: u64) -> Self;
  90
  91     /// Decode the float.
  92     fn unpack(self) -> Unpacked;
  93
  94     /// Cast from a small integer that can be represented exactly.  Panic if the integer can't be
  95     /// represented, the other code in this module makes sure to never let that happen.
  96     fn from_int(x: u64) -> Self;
  97
  98     /// Get the value 10^e from a pre-computed table. Panics for e >= ceil_log5_of_max_sig().
  99     fn short_fast_pow10(e: usize) -> Self;
 100
 101     // FIXME Everything that follows should be associated constants, but taking the value of an
 102     // associated constant from a type parameter does not work (yet?)
 103     // A possible workaround is having a `FloatInfo` struct for all the constants, but so far
 104     // the methods aren't painful enough to rewrite.
 105
 106     /// What the name says. It's easier to hard code than juggling intrinsics and
 107     /// hoping LLVM constant folds it.
 108     fn ceil_log5_of_max_sig() -> i16;
 109
 110     // A conservative bound on the decimal digits of inputs that can't produce overflow or zero or
 111     /// subnormals. Probably the decimal exponent of the maximum normal value, hence the name.
 112     fn max_normal_digits() -> usize;
 113
 114     /// When the most significant decimal digit has a place value greater than this, the number
 115     /// is certainly rounded to infinity.
 116     fn inf_cutoff() -> i64;
 117
 118     /// When the most significant decimal digit has a place value less than this, the number
 119     /// is certainly rounded to zero.
 120     fn zero_cutoff() -> i64;
 121
 122     /// The number of bits in the exponent.
 123     fn exp_bits() -> u8;
 124
 125     /// The number of bits in the singificand, *including* the hidden bit.
 126     fn sig_bits() -> u8;
 127
 128     /// The number of bits in the singificand, *excluding* the hidden bit.
 129     fn explicit_sig_bits() -> u8 {
 130         Self::sig_bits() - 1
 131     }
 132
 133     /// The maximum legal exponent in fractional representation.
 134     fn max_exp() -> i16 {
 135         (1 << (Self::exp_bits() - 1)) - 1
 136     }
 137
 138     /// The minimum legal exponent in fractional representation, excluding subnormals.
 139     fn min_exp() -> i16 {
 140         -Self::max_exp() + 1
 141     }
 142
 143     /// `MAX_EXP` for integral representation, i.e., with the shift applied.
 144     fn max_exp_int() -> i16 {
 145         Self::max_exp() - (Self::sig_bits() as i16 - 1)
 146     }
 147
 148     /// `MAX_EXP` encoded (i.e., with offset bias)
 149     fn max_encoded_exp() -> i16 {
 150         (1 << Self::exp_bits()) - 1
 151     }
 152
 153     /// `MIN_EXP` for integral representation, i.e., with the shift applied.
 154     fn min_exp_int() -> i16 {
 155         Self::min_exp() - (Self::sig_bits() as i16 - 1)
 156     }
 157
 158     /// The maximum normalized singificand in integral representation.
 159     fn max_sig() -> u64 {
 160         (1 << Self::sig_bits()) - 1
 161     }
 162
 163     /// The minimal normalized significand in integral representation.
 164     fn min_sig() -> u64 {
 165         1 << (Self::sig_bits() - 1)
 166     }
 167 }
 168
 169 impl RawFloat for f32 {
 170     fn zero2() -> Self {
 171         0.0
 172     }
 173
 174     fn sig_bits() -> u8 {
 175         24
 176     }
 177
 178     fn exp_bits() -> u8 {
 179         8
 180     }
 181
 182     fn ceil_log5_of_max_sig() -> i16 {
 183         11
 184     }
 185
 186     fn transmute(self) -> u64 {
 187         let bits: u32 = unsafe { transmute(self) };
 188         bits as u64
 189     }
 190
 191     fn from_bits(bits: u64) -> f32 {
 192         assert!(bits < u32::MAX as u64, "f32::from_bits: too many bits");
 193         unsafe { transmute(bits as u32) }
 194     }
 195
 196     fn unpack(self) -> Unpacked {
 197         let (sig, exp, _sig) = self.integer_decode2();
 198         Unpacked::new(sig, exp)
 199     }
 200
 201     fn from_int(x: u64) -> f32 {
 202         // rkruppe is uncertain whether `as` rounds correctly on all platforms.
 203         debug_assert!(x as f32 == fp_to_float(Fp { f: x, e: 0 }));
 204         x as f32
 205     }
 206
 207     fn short_fast_pow10(e: usize) -> Self {
 208         table::F32_SHORT_POWERS[e]
 209     }
 210
 211     fn max_normal_digits() -> usize {
 212         35
 213     }
 214
 215     fn inf_cutoff() -> i64 {
 216         40
 217     }
 218
 219     fn zero_cutoff() -> i64 {
 220         -48
 221     }
 222 }
 223
 224
 225 impl RawFloat for f64 {
 226     fn zero2() -> Self {
 227         0.0
 228     }
 229
 230     fn sig_bits() -> u8 {
 231         53
 232     }
 233
 234     fn exp_bits() -> u8 {
 235         11
 236     }
 237
 238     fn ceil_log5_of_max_sig() -> i16 {
 239         23
 240     }
 241
 242     fn transmute(self) -> u64 {
 243         let bits: u64 = unsafe { transmute(self) };
 244         bits
 245     }
 246
 247     fn from_bits(bits: u64) -> f64 {
 248         unsafe { transmute(bits) }
 249     }
 250
 251     fn unpack(self) -> Unpacked {
 252         let (sig, exp, _sig) = self.integer_decode2();
 253         Unpacked::new(sig, exp)
 254     }
 255
 256     fn from_int(x: u64) -> f64 {
 257         // rkruppe is uncertain whether `as` rounds correctly on all platforms.
 258         debug_assert!(x as f64 == fp_to_float(Fp { f: x, e: 0 }));
 259         x as f64
 260     }
 261
 262     fn short_fast_pow10(e: usize) -> Self {
 263         table::F64_SHORT_POWERS[e]
 264     }
 265
 266     fn max_normal_digits() -> usize {
 267         305
 268     }
 269
 270     fn inf_cutoff() -> i64 {
 271         310
 272     }
 273
 274     fn zero_cutoff() -> i64 {
 275         -326
 276     }
 277
 278 }
 279
 280 /// Convert an Fp to the closest f64. Only handles number that fit into a normalized f64.
 281 pub fn fp_to_float<T: RawFloat>(x: Fp) -> T {
 282     let x = x.normalize();
 283     // x.f is 64 bit, so x.e has a mantissa shift of 63
 284     let e = x.e + 63;
 285     if e > T::max_exp() {
 286         panic!("fp_to_float: exponent {} too large", e)
 287     }  else if e > T::min_exp() {
 288         encode_normal(round_normal::<T>(x))
 289     } else {
 290         panic!("fp_to_float: exponent {} too small", e)
 291     }
 292 }
 293
 294 /// Round the 64-bit significand to 53 bit with half-to-even. Does not handle exponent overflow.
 295 pub fn round_normal<T: RawFloat>(x: Fp) -> Unpacked {
 296     let excess = 64 - T::sig_bits() as i16;
 297     let half: u64 = 1 << (excess - 1);
 298     let (q, rem) = (x.f >> excess, x.f & ((1 << excess) - 1));
 299     assert_eq!(q << excess | rem, x.f);
 300     // Adjust mantissa shift
 301     let k = x.e + excess;
 302     if rem < half {
 303         Unpacked::new(q, k)
 304     } else if rem == half && (q % 2) == 0 {
 305         Unpacked::new(q, k)
 306     } else if q == T::max_sig() {
 307         Unpacked::new(T::min_sig(), k + 1)
 308     } else {
 309         Unpacked::new(q + 1, k)
 310     }
 311 }
 312
 313 /// Inverse of `RawFloat::unpack()` for normalized numbers.
 314 /// Panics if the significand or exponent are not valid for normalized numbers.
 315 pub fn encode_normal<T: RawFloat>(x: Unpacked) -> T {
 316     debug_assert!(T::min_sig() <= x.sig && x.sig <= T::max_sig(),
 317         "encode_normal: significand not normalized");
 318     // Remove the hidden bit
 319     let sig_enc = x.sig & !(1 << T::explicit_sig_bits());
 320     // Adjust the exponent for exponent bias and mantissa shift
 321     let k_enc = x.k + T::max_exp() + T::explicit_sig_bits() as i16;
 322     debug_assert!(k_enc != 0 && k_enc < T::max_encoded_exp(),
 323         "encode_normal: exponent out of range");
 324     // Leave sign bit at 0 ("+"), our numbers are all positive
 325     let bits = (k_enc as u64) << T::explicit_sig_bits() | sig_enc;
 326     T::from_bits(bits)
 327 }
 328
 329 /// Construct the subnormal. A mantissa of 0 is allowed and constructs zero.
 330 pub fn encode_subnormal<T: RawFloat>(significand: u64) -> T {
 331     assert!(significand < T::min_sig(), "encode_subnormal: not actually subnormal");
 332     // Encoded exponent is 0, the sign bit is 0, so we just have to reinterpret the bits.
 333     T::from_bits(significand)
 334 }
 335
 336 /// Approximate a bignum with an Fp. Rounds within 0.5 ULP with half-to-even.
 337 pub fn big_to_fp(f: &Big) -> Fp {
 338     let end = f.bit_length();
 339     assert!(end != 0, "big_to_fp: unexpectedly, input is zero");
 340     let start = end.saturating_sub(64);
 341     let leading = num::get_bits(f, start, end);
 342     // We cut off all bits prior to the index `start`, i.e., we effectively right-shift by
 343     // an amount of `start`, so this is also the exponent we need.
 344     let e = start as i16;
 345     let rounded_down = Fp { f: leading, e: e }.normalize();
 346     // Round (half-to-even) depending on the truncated bits.
 347     match num::compare_with_half_ulp(f, start) {
 348         Less => rounded_down,
 349         Equal if leading % 2 == 0 => rounded_down,
 350         Equal | Greater => match leading.checked_add(1) {
 351             Some(f) => Fp { f: f, e: e }.normalize(),
 352             None => Fp { f: 1 << 63, e: e + 1 },
 353         }
 354     }
 355 }
 356
 357 /// Find the largest floating point number strictly smaller than the argument.
 358 /// Does not handle subnormals, zero, or exponent underflow.
 359 pub fn prev_float<T: RawFloat>(x: T) -> T {
 360     match x.classify() {
 361         Infinite => panic!("prev_float: argument is infinite"),
 362         Nan => panic!("prev_float: argument is NaN"),
 363         Subnormal => panic!("prev_float: argument is subnormal"),
 364         Zero => panic!("prev_float: argument is zero"),
 365         Normal => {
 366             let Unpacked { sig, k } = x.unpack();
 367             if sig == T::min_sig() {
 368                 encode_normal(Unpacked::new(T::max_sig(), k - 1))
 369             } else {
 370                 encode_normal(Unpacked::new(sig - 1, k))
 371             }
 372         }
 373     }
 374 }
 375
 376 // Find the smallest floating point number strictly larger than the argument.
 377 // This operation is saturating, i.e. next_float(inf) == inf.
 378 // Unlike most code in this module, this function does handle zero, subnormals, and infinities.
 379 // However, like all other code here, it does not deal with NaN and negative numbers.
 380 pub fn next_float<T: RawFloat>(x: T) -> T {
 381     match x.classify() {
 382         Nan => panic!("next_float: argument is NaN"),
 383         Infinite => T::infinity2(),
 384         // This seems too good to be true, but it works.
 385         // 0.0 is encoded as the all-zero word. Subnormals are 0x000m...m where m is the mantissa.
 386         // In particular, the smallest subnormal is 0x0...01 and the largest is 0x000F...F.
 387         // The smallest normal number is 0x0010...0, so this corner case works as well.
 388         // If the increment overflows the mantissa, the carry bit increments the exponent as we
 389         // want, and the mantissa bits become zero. Because of the hidden bit convention, this
 390         // too is exactly what we want!
 391         // Finally, f64::MAX + 1 = 7eff...f + 1 = 7ff0...0 = f64::INFINITY.
 392         Zero | Subnormal | Normal => {
 393             let bits: u64 = x.transmute();
 394             T::from_bits(bits + 1)
 395         }
 396     }
 397 }