1 use crate::{Category, ExpInt, IEK_INF, IEK_NAN, IEK_ZERO};
2 use crate::{Float, FloatConvert, ParseError, Round, Status, StatusAnd};
4 use core::cmp::{self, Ordering};
5 use core::fmt::{self, Write};
6 use core::marker::PhantomData;
9 use smallvec::{smallvec, SmallVec};
12 pub struct IeeeFloat<S> {
13 /// Absolute significand value (including the integer bit).
16 /// The signed unbiased exponent of the value.
19 /// What kind of floating point number this is.
22 /// Sign bit of the number.
25 marker: PhantomData<S>,
28 /// Fundamental unit of big integer arithmetic, but also
29 /// large to store the largest significands by itself.
31 const LIMB_BITS: usize = 128;
32 fn limbs_for_bits(bits: usize) -> usize {
33 (bits + LIMB_BITS - 1) / LIMB_BITS
36 /// Enum that represents what fraction of the LSB truncated bits of an fp number
39 /// This essentially combines the roles of guard and sticky bits.
41 #[derive(Copy, Clone, PartialEq, Eq, Debug)]
43 // Example of truncated bits:
44 ExactlyZero, // 000000
45 LessThanHalf, // 0xxxxx x's not all zero
46 ExactlyHalf, // 100000
47 MoreThanHalf, // 1xxxxx x's not all zero
50 /// Represents floating point arithmetic semantics.
51 pub trait Semantics: Sized {
52 /// Total number of bits in the in-memory format.
55 /// Number of bits in the significand. This includes the integer bit.
56 const PRECISION: usize;
58 /// The largest E such that 2<sup>E</sup> is representable; this matches the
59 /// definition of IEEE 754.
60 const MAX_EXP: ExpInt;
62 /// The smallest E such that 2<sup>E</sup> is a normalized number; this
63 /// matches the definition of IEEE 754.
64 const MIN_EXP: ExpInt = -Self::MAX_EXP + 1;
66 /// The significand bit that marks NaN as quiet.
67 const QNAN_BIT: usize = Self::PRECISION - 2;
69 /// The significand bitpattern to mark a NaN as quiet.
70 /// NOTE: for X87DoubleExtended we need to set two bits instead of 2.
71 const QNAN_SIGNIFICAND: Limb = 1 << Self::QNAN_BIT;
73 fn from_bits(bits: u128) -> IeeeFloat<Self> {
74 assert!(Self::BITS > Self::PRECISION);
76 let sign = bits & (1 << (Self::BITS - 1));
77 let exponent = (bits & !sign) >> (Self::PRECISION - 1);
78 let mut r = IeeeFloat {
79 sig: [bits & ((1 << (Self::PRECISION - 1)) - 1)],
80 // Convert the exponent from its bias representation to a signed integer.
81 exp: (exponent as ExpInt) - Self::MAX_EXP,
82 category: Category::Zero,
87 if r.exp == Self::MIN_EXP - 1 && r.sig == [0] {
88 // Exponent, significand meaningless.
89 r.category = Category::Zero;
90 } else if r.exp == Self::MAX_EXP + 1 && r.sig == [0] {
91 // Exponent, significand meaningless.
92 r.category = Category::Infinity;
93 } else if r.exp == Self::MAX_EXP + 1 && r.sig != [0] {
94 // Sign, exponent, significand meaningless.
95 r.category = Category::NaN;
97 r.category = Category::Normal;
98 if r.exp == Self::MIN_EXP - 1 {
100 r.exp = Self::MIN_EXP;
103 sig::set_bit(&mut r.sig, Self::PRECISION - 1);
110 fn to_bits(x: IeeeFloat<Self>) -> u128 {
111 assert!(Self::BITS > Self::PRECISION);
113 // Split integer bit from significand.
114 let integer_bit = sig::get_bit(&x.sig, Self::PRECISION - 1);
115 let mut significand = x.sig[0] & ((1 << (Self::PRECISION - 1)) - 1);
116 let exponent = match x.category {
117 Category::Normal => {
118 if x.exp == Self::MIN_EXP && !integer_bit {
126 // FIXME(eddyb) Maybe we should guarantee an invariant instead?
130 Category::Infinity => {
131 // FIXME(eddyb) Maybe we should guarantee an invariant instead?
135 Category::NaN => Self::MAX_EXP + 1,
138 // Convert the exponent from a signed integer to its bias representation.
139 let exponent = (exponent + Self::MAX_EXP) as u128;
141 ((x.sign as u128) << (Self::BITS - 1)) | (exponent << (Self::PRECISION - 1)) | significand
145 impl<S> Copy for IeeeFloat<S> {}
146 impl<S> Clone for IeeeFloat<S> {
147 fn clone(&self) -> Self {
152 macro_rules! ieee_semantics {
153 ($($name:ident = $sem:ident($bits:tt : $exp_bits:tt)),*) => {
155 $(pub type $name = IeeeFloat<$sem>;)*
156 $(impl Semantics for $sem {
157 const BITS: usize = $bits;
158 const PRECISION: usize = ($bits - 1 - $exp_bits) + 1;
159 const MAX_EXP: ExpInt = (1 << ($exp_bits - 1)) - 1;
166 Single = SingleS(32:8),
167 Double = DoubleS(64:11),
171 pub struct X87DoubleExtendedS;
172 pub type X87DoubleExtended = IeeeFloat<X87DoubleExtendedS>;
173 impl Semantics for X87DoubleExtendedS {
174 const BITS: usize = 80;
175 const PRECISION: usize = 64;
176 const MAX_EXP: ExpInt = (1 << (15 - 1)) - 1;
178 /// For x87 extended precision, we want to make a NaN, not a
179 /// pseudo-NaN. Maybe we should expose the ability to make
181 const QNAN_SIGNIFICAND: Limb = 0b11 << Self::QNAN_BIT;
183 /// Integer bit is explicit in this format. Intel hardware (387 and later)
184 /// does not support these bit patterns:
185 /// exponent = all 1's, integer bit 0, significand 0 ("pseudoinfinity")
186 /// exponent = all 1's, integer bit 0, significand nonzero ("pseudoNaN")
187 /// exponent = 0, integer bit 1 ("pseudodenormal")
188 /// exponent != 0 nor all 1's, integer bit 0 ("unnormal")
189 /// At the moment, the first two are treated as NaNs, the second two as Normal.
190 fn from_bits(bits: u128) -> IeeeFloat<Self> {
191 let sign = bits & (1 << (Self::BITS - 1));
192 let exponent = (bits & !sign) >> Self::PRECISION;
193 let mut r = IeeeFloat {
194 sig: [bits & ((1 << (Self::PRECISION - 1)) - 1)],
195 // Convert the exponent from its bias representation to a signed integer.
196 exp: (exponent as ExpInt) - Self::MAX_EXP,
197 category: Category::Zero,
202 if r.exp == Self::MIN_EXP - 1 && r.sig == [0] {
203 // Exponent, significand meaningless.
204 r.category = Category::Zero;
205 } else if r.exp == Self::MAX_EXP + 1 && r.sig == [1 << (Self::PRECISION - 1)] {
206 // Exponent, significand meaningless.
207 r.category = Category::Infinity;
208 } else if r.exp == Self::MAX_EXP + 1 && r.sig != [1 << (Self::PRECISION - 1)] {
209 // Sign, exponent, significand meaningless.
210 r.category = Category::NaN;
212 r.category = Category::Normal;
213 if r.exp == Self::MIN_EXP - 1 {
215 r.exp = Self::MIN_EXP;
222 fn to_bits(x: IeeeFloat<Self>) -> u128 {
223 // Get integer bit from significand.
224 let integer_bit = sig::get_bit(&x.sig, Self::PRECISION - 1);
225 let mut significand = x.sig[0] & ((1 << Self::PRECISION) - 1);
226 let exponent = match x.category {
227 Category::Normal => {
228 if x.exp == Self::MIN_EXP && !integer_bit {
236 // FIXME(eddyb) Maybe we should guarantee an invariant instead?
240 Category::Infinity => {
241 // FIXME(eddyb) Maybe we should guarantee an invariant instead?
242 significand = 1 << (Self::PRECISION - 1);
245 Category::NaN => Self::MAX_EXP + 1,
248 // Convert the exponent from a signed integer to its bias representation.
249 let exponent = (exponent + Self::MAX_EXP) as u128;
251 ((x.sign as u128) << (Self::BITS - 1)) | (exponent << Self::PRECISION) | significand
255 float_common_impls!(IeeeFloat<S>);
257 impl<S: Semantics> PartialEq for IeeeFloat<S> {
258 fn eq(&self, rhs: &Self) -> bool {
259 self.partial_cmp(rhs) == Some(Ordering::Equal)
263 impl<S: Semantics> PartialOrd for IeeeFloat<S> {
264 fn partial_cmp(&self, rhs: &Self) -> Option<Ordering> {
265 match (self.category, rhs.category) {
266 (Category::NaN, _) | (_, Category::NaN) => None,
268 (Category::Infinity, Category::Infinity) => Some((!self.sign).cmp(&(!rhs.sign))),
270 (Category::Zero, Category::Zero) => Some(Ordering::Equal),
272 (Category::Infinity, _) | (Category::Normal, Category::Zero) => {
273 Some((!self.sign).cmp(&self.sign))
276 (_, Category::Infinity) | (Category::Zero, Category::Normal) => {
277 Some(rhs.sign.cmp(&(!rhs.sign)))
280 (Category::Normal, Category::Normal) => {
281 // Two normal numbers. Do they have the same sign?
282 Some((!self.sign).cmp(&(!rhs.sign)).then_with(|| {
283 // Compare absolute values; invert result if negative.
284 let result = self.cmp_abs_normal(*rhs);
286 if self.sign { result.reverse() } else { result }
293 impl<S> Neg for IeeeFloat<S> {
295 fn neg(mut self) -> Self {
296 self.sign = !self.sign;
301 /// Prints this value as a decimal string.
303 /// \param precision The maximum number of digits of
304 /// precision to output. If there are fewer digits available,
305 /// zero padding will not be used unless the value is
306 /// integral and small enough to be expressed in
307 /// precision digits. 0 means to use the natural
308 /// precision of the number.
309 /// \param width The maximum number of zeros to
310 /// consider inserting before falling back to scientific
311 /// notation. 0 means to always use scientific notation.
313 /// \param alternate Indicate whether to remove the trailing zero in
314 /// fraction part or not. Also setting this parameter to true forces
315 /// producing of output more similar to default printf behavior.
316 /// Specifically the lower e is used as exponent delimiter and exponent
317 /// always contains no less than two digits.
319 /// Number precision width Result
320 /// ------ --------- ----- ------
321 /// 1.01E+4 5 2 10100
322 /// 1.01E+4 4 2 1.01E+4
323 /// 1.01E+4 5 1 1.01E+4
324 /// 1.01E-2 5 2 0.0101
325 /// 1.01E-2 4 2 0.0101
326 /// 1.01E-2 4 1 1.01E-2
327 impl<S: Semantics> fmt::Display for IeeeFloat<S> {
328 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
329 let width = f.width().unwrap_or(3);
330 let alternate = f.alternate();
332 match self.category {
333 Category::Infinity => {
335 return f.write_str("-Inf");
337 return f.write_str("+Inf");
341 Category::NaN => return f.write_str("NaN"),
351 if let Some(n) = f.precision() {
356 f.write_str("e+00")?;
358 f.write_str("0.0E+0")?;
366 Category::Normal => {}
373 // We use enough digits so the number can be round-tripped back to an
374 // APFloat. The formula comes from "How to Print Floating-Point Numbers
375 // Accurately" by Steele and White.
376 // FIXME: Using a formula based purely on the precision is conservative;
377 // we can print fewer digits depending on the actual value being printed.
379 // precision = 2 + floor(S::PRECISION / lg_2(10))
380 let precision = f.precision().unwrap_or(2 + S::PRECISION * 59 / 196);
382 // Decompose the number into an APInt and an exponent.
383 let mut exp = self.exp - (S::PRECISION as ExpInt - 1);
384 let mut sig = vec![self.sig[0]];
386 // Ignore trailing binary zeros.
387 let trailing_zeros = sig[0].trailing_zeros();
388 let _: Loss = sig::shift_right(&mut sig, &mut exp, trailing_zeros as usize);
390 // Change the exponent from 2^e to 10^e.
395 let shift = exp as usize;
396 sig.resize(limbs_for_bits(S::PRECISION + shift), 0);
397 sig::shift_left(&mut sig, &mut exp, shift);
400 let mut texp = -exp as usize;
402 // We transform this using the identity:
403 // (N)(2^-e) == (N)(5^e)(10^-e)
405 // Multiply significand by 5^e.
406 // N * 5^0101 == N * 5^(1*1) * 5^(0*2) * 5^(1*4) * 5^(0*8)
407 let mut sig_scratch = vec![];
409 let mut p5_scratch = vec![];
414 p5_scratch.resize(p5.len() * 2, 0);
416 sig::mul(&mut p5_scratch, &mut 0, &p5, &p5, p5.len() * 2 * LIMB_BITS);
417 while p5_scratch.last() == Some(&0) {
420 mem::swap(&mut p5, &mut p5_scratch);
423 sig_scratch.resize(sig.len() + p5.len(), 0);
424 let _: Loss = sig::mul(
429 (sig.len() + p5.len()) * LIMB_BITS,
431 while sig_scratch.last() == Some(&0) {
434 mem::swap(&mut sig, &mut sig_scratch);
441 let mut buffer = vec![];
443 // Ignore digits from the significand until it is no more
444 // precise than is required for the desired precision.
445 // 196/59 is a very slight overestimate of lg_2(10).
446 let required = (precision * 196 + 58) / 59;
447 let mut discard_digits = sig::omsb(&sig).saturating_sub(required) * 59 / 196;
448 let mut in_trail = true;
449 while !sig.is_empty() {
450 // Perform short division by 10 to extract the rightmost digit.
455 // Use 64-bit division and remainder, with 32-bit chunks from sig.
456 sig::each_chunk(&mut sig, 32, |chunk| {
457 let chunk = chunk as u32;
458 let combined = ((rem as u64) << 32) | (chunk as u64);
459 rem = (combined % 10) as u8;
460 (combined / 10) as u32 as Limb
463 // Reduce the significand to avoid wasting time dividing 0's.
464 while sig.last() == Some(&0) {
470 // Ignore digits we don't need.
471 if discard_digits > 0 {
477 // Drop trailing zeros.
478 if in_trail && digit == 0 {
482 buffer.push(b'0' + digit);
486 assert!(!buffer.is_empty(), "no characters in buffer!");
488 // Drop down to precision.
489 // FIXME: don't do more precise calculations above than are required.
490 if buffer.len() > precision {
491 // The most significant figures are the last ones in the buffer.
492 let mut first_sig = buffer.len() - precision;
495 // FIXME: this probably shouldn't use 'round half up'.
497 // Rounding down is just a truncation, except we also want to drop
498 // trailing zeros from the new result.
499 if buffer[first_sig - 1] < b'5' {
500 while first_sig < buffer.len() && buffer[first_sig] == b'0' {
504 // Rounding up requires a decimal add-with-carry. If we continue
505 // the carry, the newly-introduced zeros will just be truncated.
506 for x in &mut buffer[first_sig..] {
516 exp += first_sig as ExpInt;
517 buffer.drain(..first_sig);
519 // If we carried through, we have exactly one digit of precision.
520 if buffer.is_empty() {
525 let digits = buffer.len();
527 // Check whether we should use scientific notation.
528 let scientific = if width == 0 {
533 // But we shouldn't make the number look more precise than it is.
534 exp as usize > width || digits + exp as usize > precision
536 // Power of the most significant digit.
537 let msd = exp + (digits - 1) as ExpInt;
544 -msd as usize > width
548 // Scientific formatting is pretty straightforward.
550 exp += digits as ExpInt - 1;
552 f.write_char(buffer[digits - 1] as char)?;
554 let truncate_zero = !alternate;
555 if digits == 1 && truncate_zero {
558 for &d in buffer[..digits - 1].iter().rev() {
559 f.write_char(d as char)?;
562 // Fill with zeros up to precision.
563 if !truncate_zero && precision > digits - 1 {
564 for _ in 0..=precision - digits {
568 // For alternate we use lower 'e'.
569 f.write_char(if alternate { 'e' } else { 'E' })?;
571 // Exponent always at least two digits if we do not truncate zeros.
573 write!(f, "{:+}", exp)?;
575 write!(f, "{:+03}", exp)?;
581 // Non-scientific, positive exponents.
583 for &d in buffer.iter().rev() {
584 f.write_char(d as char)?;
592 // Non-scientific, negative exponents.
593 let unit_place = -exp as usize;
594 if unit_place < digits {
595 for &d in buffer[unit_place..].iter().rev() {
596 f.write_char(d as char)?;
599 for &d in buffer[..unit_place].iter().rev() {
600 f.write_char(d as char)?;
604 for _ in digits..unit_place {
607 for &d in buffer.iter().rev() {
608 f.write_char(d as char)?;
616 impl<S: Semantics> fmt::Debug for IeeeFloat<S> {
617 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
620 "{}({:?} | {}{:?} * 2^{})",
623 if self.sign { "-" } else { "+" },
630 impl<S: Semantics> Float for IeeeFloat<S> {
631 const BITS: usize = S::BITS;
632 const PRECISION: usize = S::PRECISION;
633 const MAX_EXP: ExpInt = S::MAX_EXP;
634 const MIN_EXP: ExpInt = S::MIN_EXP;
636 const ZERO: Self = IeeeFloat {
639 category: Category::Zero,
644 const INFINITY: Self = IeeeFloat {
647 category: Category::Infinity,
652 // FIXME(eddyb) remove when qnan becomes const fn.
653 const NAN: Self = IeeeFloat {
654 sig: [S::QNAN_SIGNIFICAND],
656 category: Category::NaN,
661 fn qnan(payload: Option<u128>) -> Self {
663 sig: [S::QNAN_SIGNIFICAND
664 | payload.map_or(0, |payload| {
665 // Zero out the excess bits of the significand.
666 payload & ((1 << S::QNAN_BIT) - 1)
669 category: Category::NaN,
675 fn snan(payload: Option<u128>) -> Self {
676 let mut snan = Self::qnan(payload);
678 // We always have to clear the QNaN bit to make it an SNaN.
679 sig::clear_bit(&mut snan.sig, S::QNAN_BIT);
681 // If there are no bits set in the payload, we have to set
682 // *something* to make it a NaN instead of an infinity;
683 // conventionally, this is the next bit down from the QNaN bit.
684 if snan.sig[0] & !S::QNAN_SIGNIFICAND == 0 {
685 sig::set_bit(&mut snan.sig, S::QNAN_BIT - 1);
691 fn largest() -> Self {
692 // We want (in interchange format):
694 // significand = 1..1
696 sig: [(1 << S::PRECISION) - 1],
698 category: Category::Normal,
704 // We want (in interchange format):
706 // significand = 0..01
707 const SMALLEST: Self = IeeeFloat {
710 category: Category::Normal,
715 fn smallest_normalized() -> Self {
716 // We want (in interchange format):
718 // significand = 10..0
720 sig: [1 << (S::PRECISION - 1)],
722 category: Category::Normal,
728 fn add_r(mut self, rhs: Self, round: Round) -> StatusAnd<Self> {
729 let status = match (self.category, rhs.category) {
730 (Category::Infinity, Category::Infinity) => {
731 // Differently signed infinities can only be validly
733 if self.sign != rhs.sign {
741 // Sign may depend on rounding mode; handled below.
742 (_, Category::Zero) | (Category::NaN, _) | (Category::Infinity, Category::Normal) => {
746 (Category::Zero, _) | (_, Category::NaN | Category::Infinity) => {
751 // This return code means it was not a simple case.
752 (Category::Normal, Category::Normal) => {
753 let loss = sig::add_or_sub(
762 self = unpack!(status=, self.normalize(round, loss));
764 // Can only be zero if we lost no fraction.
765 assert!(self.category != Category::Zero || loss == Loss::ExactlyZero);
771 // If two numbers add (exactly) to zero, IEEE 754 decrees it is a
772 // positive zero unless rounding to minus infinity, except that
773 // adding two like-signed zeroes gives that zero.
774 if self.category == Category::Zero
775 && (rhs.category != Category::Zero || self.sign != rhs.sign)
777 self.sign = round == Round::TowardNegative;
783 fn mul_r(mut self, rhs: Self, round: Round) -> StatusAnd<Self> {
784 self.sign ^= rhs.sign;
786 match (self.category, rhs.category) {
787 (Category::NaN, _) => {
792 (_, Category::NaN) => {
794 self.category = Category::NaN;
799 (Category::Zero, Category::Infinity) | (Category::Infinity, Category::Zero) => {
800 Status::INVALID_OP.and(Self::NAN)
803 (_, Category::Infinity) | (Category::Infinity, _) => {
804 self.category = Category::Infinity;
808 (Category::Zero, _) | (_, Category::Zero) => {
809 self.category = Category::Zero;
813 (Category::Normal, Category::Normal) => {
815 let mut wide_sig = [0; 2];
817 sig::mul(&mut wide_sig, &mut self.exp, &self.sig, &rhs.sig, S::PRECISION);
818 self.sig = [wide_sig[0]];
820 self = unpack!(status=, self.normalize(round, loss));
821 if loss != Loss::ExactlyZero {
822 status |= Status::INEXACT;
829 fn mul_add_r(mut self, multiplicand: Self, addend: Self, round: Round) -> StatusAnd<Self> {
830 // If and only if all arguments are normal do we need to do an
831 // extended-precision calculation.
832 if !self.is_finite_non_zero() || !multiplicand.is_finite_non_zero() || !addend.is_finite() {
834 self = unpack!(status=, self.mul_r(multiplicand, round));
836 // FS can only be Status::OK or Status::INVALID_OP. There is no more work
837 // to do in the latter case. The IEEE-754R standard says it is
838 // implementation-defined in this case whether, if ADDEND is a
839 // quiet NaN, we raise invalid op; this implementation does so.
841 // If we need to do the addition we can do so with normal
843 if status == Status::OK {
844 self = unpack!(status=, self.add_r(addend, round));
846 return status.and(self);
849 // Post-multiplication sign, before addition.
850 self.sign ^= multiplicand.sign;
852 // Allocate space for twice as many bits as the original significand, plus one
853 // extra bit for the addition to overflow into.
854 assert!(limbs_for_bits(S::PRECISION * 2 + 1) <= 2);
855 let mut wide_sig = sig::widening_mul(self.sig[0], multiplicand.sig[0]);
857 let mut loss = Loss::ExactlyZero;
858 let mut omsb = sig::omsb(&wide_sig);
859 self.exp += multiplicand.exp;
861 // Assume the operands involved in the multiplication are single-precision
862 // FP, and the two multiplicants are:
863 // lhs = a23 . a22 ... a0 * 2^e1
864 // rhs = b23 . b22 ... b0 * 2^e2
865 // the result of multiplication is:
866 // lhs = c48 c47 c46 . c45 ... c0 * 2^(e1+e2)
867 // Note that there are three significant bits at the left-hand side of the
868 // radix point: two for the multiplication, and an overflow bit for the
869 // addition (that will always be zero at this point). Move the radix point
870 // toward left by two bits, and adjust exponent accordingly.
873 if addend.is_non_zero() {
874 // Normalize our MSB to one below the top bit to allow for overflow.
875 let ext_precision = 2 * S::PRECISION + 1;
876 if omsb != ext_precision - 1 {
877 assert!(ext_precision > omsb);
878 sig::shift_left(&mut wide_sig, &mut self.exp, (ext_precision - 1) - omsb);
881 // The intermediate result of the multiplication has "2 * S::PRECISION"
882 // significant bit; adjust the addend to be consistent with mul result.
883 let mut ext_addend_sig = [addend.sig[0], 0];
885 // Extend the addend significand to ext_precision - 1. This guarantees
886 // that the high bit of the significand is zero (same as wide_sig),
887 // so the addition will overflow (if it does overflow at all) into the top bit.
888 sig::shift_left(&mut ext_addend_sig, &mut 0, ext_precision - 1 - S::PRECISION);
889 loss = sig::add_or_sub(
898 omsb = sig::omsb(&wide_sig);
901 // Convert the result having "2 * S::PRECISION" significant-bits back to the one
902 // having "S::PRECISION" significant-bits. First, move the radix point from
903 // position "2*S::PRECISION - 1" to "S::PRECISION - 1". The exponent need to be
904 // adjusted by "2*S::PRECISION - 1" - "S::PRECISION - 1" = "S::PRECISION".
905 self.exp -= S::PRECISION as ExpInt + 1;
907 // In case MSB resides at the left-hand side of radix point, shift the
908 // mantissa right by some amount to make sure the MSB reside right before
909 // the radix point (i.e., "MSB . rest-significant-bits").
910 if omsb > S::PRECISION {
911 let bits = omsb - S::PRECISION;
912 loss = sig::shift_right(&mut wide_sig, &mut self.exp, bits).combine(loss);
915 self.sig[0] = wide_sig[0];
918 self = unpack!(status=, self.normalize(round, loss));
919 if loss != Loss::ExactlyZero {
920 status |= Status::INEXACT;
923 // If two numbers add (exactly) to zero, IEEE 754 decrees it is a
924 // positive zero unless rounding to minus infinity, except that
925 // adding two like-signed zeroes gives that zero.
926 if self.category == Category::Zero
927 && !status.intersects(Status::UNDERFLOW)
928 && self.sign != addend.sign
930 self.sign = round == Round::TowardNegative;
936 fn div_r(mut self, rhs: Self, round: Round) -> StatusAnd<Self> {
937 self.sign ^= rhs.sign;
939 match (self.category, rhs.category) {
940 (Category::NaN, _) => {
945 (_, Category::NaN) => {
946 self.category = Category::NaN;
952 (Category::Infinity, Category::Infinity) | (Category::Zero, Category::Zero) => {
953 Status::INVALID_OP.and(Self::NAN)
956 (Category::Infinity | Category::Zero, _) => Status::OK.and(self),
958 (Category::Normal, Category::Infinity) => {
959 self.category = Category::Zero;
963 (Category::Normal, Category::Zero) => {
964 self.category = Category::Infinity;
965 Status::DIV_BY_ZERO.and(self)
968 (Category::Normal, Category::Normal) => {
970 let dividend = self.sig[0];
979 self = unpack!(status=, self.normalize(round, loss));
980 if loss != Loss::ExactlyZero {
981 status |= Status::INEXACT;
988 fn c_fmod(mut self, rhs: Self) -> StatusAnd<Self> {
989 match (self.category, rhs.category) {
991 | (Category::Zero, Category::Infinity | Category::Normal)
992 | (Category::Normal, Category::Infinity) => Status::OK.and(self),
994 (_, Category::NaN) => {
996 self.category = Category::NaN;
1001 (Category::Infinity, _) | (_, Category::Zero) => Status::INVALID_OP.and(Self::NAN),
1003 (Category::Normal, Category::Normal) => {
1004 while self.is_finite_non_zero()
1005 && rhs.is_finite_non_zero()
1006 && self.cmp_abs_normal(rhs) != Ordering::Less
1008 let mut v = rhs.scalbn(self.ilogb() - rhs.ilogb());
1009 if self.cmp_abs_normal(v) == Ordering::Less {
1015 self = unpack!(status=, self - v);
1016 assert_eq!(status, Status::OK);
1018 Status::OK.and(self)
1023 fn round_to_integral(self, round: Round) -> StatusAnd<Self> {
1024 // If the exponent is large enough, we know that this value is already
1025 // integral, and the arithmetic below would potentially cause it to saturate
1026 // to +/-Inf. Bail out early instead.
1027 if self.is_finite_non_zero() && self.exp + 1 >= S::PRECISION as ExpInt {
1028 return Status::OK.and(self);
1031 // The algorithm here is quite simple: we add 2^(p-1), where p is the
1032 // precision of our format, and then subtract it back off again. The choice
1033 // of rounding modes for the addition/subtraction determines the rounding mode
1034 // for our integral rounding as well.
1035 // NOTE: When the input value is negative, we do subtraction followed by
1036 // addition instead.
1037 assert!(S::PRECISION <= 128);
1039 let magic_const = unpack!(status=, Self::from_u128(1 << (S::PRECISION - 1)));
1040 let magic_const = magic_const.copy_sign(self);
1042 if status != Status::OK {
1043 return status.and(self);
1047 r = unpack!(status=, r.add_r(magic_const, round));
1048 if status != Status::OK && status != Status::INEXACT {
1049 return status.and(self);
1052 // Restore the input sign to handle 0.0/-0.0 cases correctly.
1053 r.sub_r(magic_const, round).map(|r| r.copy_sign(self))
1056 fn next_up(mut self) -> StatusAnd<Self> {
1057 // Compute nextUp(x), handling each float category separately.
1058 match self.category {
1059 Category::Infinity => {
1061 // nextUp(-inf) = -largest
1062 Status::OK.and(-Self::largest())
1064 // nextUp(+inf) = +inf
1065 Status::OK.and(self)
1069 // IEEE-754R 2008 6.2 Par 2: nextUp(sNaN) = qNaN. Set Invalid flag.
1070 // IEEE-754R 2008 6.2: nextUp(qNaN) = qNaN. Must be identity so we do not
1071 // change the payload.
1072 if self.is_signaling() {
1073 // For consistency, propagate the sign of the sNaN to the qNaN.
1074 Status::INVALID_OP.and(Self::NAN.copy_sign(self))
1076 Status::OK.and(self)
1080 // nextUp(pm 0) = +smallest
1081 Status::OK.and(Self::SMALLEST)
1083 Category::Normal => {
1084 // nextUp(-smallest) = -0
1085 if self.is_smallest() && self.sign {
1086 return Status::OK.and(-Self::ZERO);
1089 // nextUp(largest) == INFINITY
1090 if self.is_largest() && !self.sign {
1091 return Status::OK.and(Self::INFINITY);
1094 // Excluding the integral bit. This allows us to test for binade boundaries.
1095 let sig_mask = (1 << (S::PRECISION - 1)) - 1;
1097 // nextUp(normal) == normal + inc.
1099 // If we are negative, we need to decrement the significand.
1101 // We only cross a binade boundary that requires adjusting the exponent
1103 // 1. exponent != S::MIN_EXP. This implies we are not in the
1104 // smallest binade or are dealing with denormals.
1105 // 2. Our significand excluding the integral bit is all zeros.
1106 let crossing_binade_boundary =
1107 self.exp != S::MIN_EXP && self.sig[0] & sig_mask == 0;
1109 // Decrement the significand.
1111 // We always do this since:
1112 // 1. If we are dealing with a non-binade decrement, by definition we
1113 // just decrement the significand.
1114 // 2. If we are dealing with a normal -> normal binade decrement, since
1115 // we have an explicit integral bit the fact that all bits but the
1116 // integral bit are zero implies that subtracting one will yield a
1117 // significand with 0 integral bit and 1 in all other spots. Thus we
1118 // must just adjust the exponent and set the integral bit to 1.
1119 // 3. If we are dealing with a normal -> denormal binade decrement,
1120 // since we set the integral bit to 0 when we represent denormals, we
1121 // just decrement the significand.
1122 sig::decrement(&mut self.sig);
1124 if crossing_binade_boundary {
1125 // Our result is a normal number. Do the following:
1126 // 1. Set the integral bit to 1.
1127 // 2. Decrement the exponent.
1128 sig::set_bit(&mut self.sig, S::PRECISION - 1);
1132 // If we are positive, we need to increment the significand.
1134 // We only cross a binade boundary that requires adjusting the exponent if
1135 // the input is not a denormal and all of said input's significand bits
1136 // are set. If all of said conditions are true: clear the significand, set
1137 // the integral bit to 1, and increment the exponent. If we have a
1138 // denormal always increment since moving denormals and the numbers in the
1139 // smallest normal binade have the same exponent in our representation.
1140 let crossing_binade_boundary =
1141 !self.is_denormal() && self.sig[0] & sig_mask == sig_mask;
1143 if crossing_binade_boundary {
1145 sig::set_bit(&mut self.sig, S::PRECISION - 1);
1149 "We can not increment an exponent beyond the MAX_EXP \
1150 allowed by the given floating point semantics."
1154 sig::increment(&mut self.sig);
1157 Status::OK.and(self)
1162 fn from_bits(input: u128) -> Self {
1163 // Dispatch to semantics.
1167 fn from_u128_r(input: u128, round: Round) -> StatusAnd<Self> {
1170 exp: S::PRECISION as ExpInt - 1,
1171 category: Category::Normal,
1173 marker: PhantomData,
1175 .normalize(round, Loss::ExactlyZero)
1178 fn from_str_r(mut s: &str, mut round: Round) -> Result<StatusAnd<Self>, ParseError> {
1180 return Err(ParseError("Invalid string length"));
1183 // Handle special cases.
1185 "inf" | "INFINITY" => return Ok(Status::OK.and(Self::INFINITY)),
1186 "-inf" | "-INFINITY" => return Ok(Status::OK.and(-Self::INFINITY)),
1187 "nan" | "NaN" => return Ok(Status::OK.and(Self::NAN)),
1188 "-nan" | "-NaN" => return Ok(Status::OK.and(-Self::NAN)),
1192 // Handle a leading minus sign.
1193 let minus = s.starts_with('-');
1194 if minus || s.starts_with('+') {
1197 return Err(ParseError("String has no digits"));
1201 // Adjust the rounding mode for the absolute value below.
1206 let r = if s.starts_with("0x") || s.starts_with("0X") {
1209 return Err(ParseError("Invalid string"));
1211 Self::from_hexadecimal_string(s, round)?
1213 Self::from_decimal_string(s, round)?
1216 Ok(r.map(|r| if minus { -r } else { r }))
1219 fn to_bits(self) -> u128 {
1220 // Dispatch to semantics.
1224 fn to_u128_r(self, width: usize, round: Round, is_exact: &mut bool) -> StatusAnd<u128> {
1225 // The result of trying to convert a number too large.
1226 let overflow = if self.sign {
1227 // Negative numbers cannot be represented as unsigned.
1230 // Largest unsigned integer of the given width.
1236 match self.category {
1237 Category::NaN => Status::INVALID_OP.and(0),
1239 Category::Infinity => Status::INVALID_OP.and(overflow),
1242 // Negative zero can't be represented as an int.
1243 *is_exact = !self.sign;
1247 Category::Normal => {
1250 // Step 1: place our absolute value, with any fraction truncated, in
1252 let truncated_bits = if self.exp < 0 {
1253 // Our absolute value is less than one; truncate everything.
1254 // For exponent -1 the integer bit represents .5, look at that.
1255 // For smaller exponents leftmost truncated bit is 0.
1256 S::PRECISION - 1 + (-self.exp) as usize
1258 // We want the most significant (exponent + 1) bits; the rest are
1260 let bits = self.exp as usize + 1;
1262 // Hopelessly large in magnitude?
1264 return Status::INVALID_OP.and(overflow);
1267 if bits < S::PRECISION {
1268 // We truncate (S::PRECISION - bits) bits.
1269 r = self.sig[0] >> (S::PRECISION - bits);
1272 // We want at least as many bits as are available.
1273 r = self.sig[0] << (bits - S::PRECISION);
1278 // Step 2: work out any lost fraction, and increment the absolute
1279 // value if we would round away from zero.
1280 let mut loss = Loss::ExactlyZero;
1281 if truncated_bits > 0 {
1282 loss = Loss::through_truncation(&self.sig, truncated_bits);
1283 if loss != Loss::ExactlyZero
1284 && self.round_away_from_zero(round, loss, truncated_bits)
1286 r = r.wrapping_add(1);
1288 return Status::INVALID_OP.and(overflow); // Overflow.
1293 // Step 3: check if we fit in the destination.
1295 return Status::INVALID_OP.and(overflow);
1298 if loss == Loss::ExactlyZero {
1302 Status::INEXACT.and(r)
1308 fn cmp_abs_normal(self, rhs: Self) -> Ordering {
1309 assert!(self.is_finite_non_zero());
1310 assert!(rhs.is_finite_non_zero());
1312 // If exponents are equal, do an unsigned comparison of the significands.
1313 self.exp.cmp(&rhs.exp).then_with(|| sig::cmp(&self.sig, &rhs.sig))
1316 fn bitwise_eq(self, rhs: Self) -> bool {
1317 if self.category != rhs.category || self.sign != rhs.sign {
1321 if self.category == Category::Zero || self.category == Category::Infinity {
1325 if self.is_finite_non_zero() && self.exp != rhs.exp {
1332 fn is_negative(self) -> bool {
1336 fn is_denormal(self) -> bool {
1337 self.is_finite_non_zero()
1338 && self.exp == S::MIN_EXP
1339 && !sig::get_bit(&self.sig, S::PRECISION - 1)
1342 fn is_signaling(self) -> bool {
1343 // IEEE-754R 2008 6.2.1: A signaling NaN bit string should be encoded with the
1344 // first bit of the trailing significand being 0.
1345 self.is_nan() && !sig::get_bit(&self.sig, S::QNAN_BIT)
1348 fn category(self) -> Category {
1352 fn get_exact_inverse(self) -> Option<Self> {
1353 // Special floats and denormals have no exact inverse.
1354 if !self.is_finite_non_zero() {
1358 // Check that the number is a power of two by making sure that only the
1359 // integer bit is set in the significand.
1360 if self.sig != [1 << (S::PRECISION - 1)] {
1365 let mut reciprocal = Self::from_u128(1).value;
1367 reciprocal = unpack!(status=, reciprocal / self);
1368 if status != Status::OK {
1372 // Avoid multiplication with a denormal, it is not safe on all platforms and
1373 // may be slower than a normal division.
1374 if reciprocal.is_denormal() {
1378 assert!(reciprocal.is_finite_non_zero());
1379 assert_eq!(reciprocal.sig, [1 << (S::PRECISION - 1)]);
1384 fn ilogb(mut self) -> ExpInt {
1391 if self.is_infinite() {
1394 if !self.is_denormal() {
1398 let sig_bits = (S::PRECISION - 1) as ExpInt;
1399 self.exp += sig_bits;
1400 self = self.normalize(Round::NearestTiesToEven, Loss::ExactlyZero).value;
1404 fn scalbn_r(mut self, exp: ExpInt, round: Round) -> Self {
1405 // If exp is wildly out-of-scale, simply adding it to self.exp will
1406 // overflow; clamp it to a safe range before adding, but ensure that the range
1407 // is large enough that the clamp does not change the result. The range we
1408 // need to support is the difference between the largest possible exponent and
1409 // the normalized exponent of half the smallest denormal.
1411 let sig_bits = (S::PRECISION - 1) as i32;
1412 let max_change = S::MAX_EXP as i32 - (S::MIN_EXP as i32 - sig_bits) + 1;
1414 // Clamp to one past the range ends to let normalize handle overflow.
1415 let exp_change = cmp::min(cmp::max(exp as i32, -max_change - 1), max_change);
1416 self.exp = self.exp.saturating_add(exp_change as ExpInt);
1417 self = self.normalize(round, Loss::ExactlyZero).value;
1419 sig::set_bit(&mut self.sig, S::QNAN_BIT);
1424 fn frexp_r(mut self, exp: &mut ExpInt, round: Round) -> Self {
1425 *exp = self.ilogb();
1427 // Quiet signalling nans.
1428 if *exp == IEK_NAN {
1429 sig::set_bit(&mut self.sig, S::QNAN_BIT);
1433 if *exp == IEK_INF {
1437 // 1 is added because frexp is defined to return a normalized fraction in
1438 // +/-[0.5, 1.0), rather than the usual +/-[1.0, 2.0).
1439 if *exp == IEK_ZERO {
1444 self.scalbn_r(-*exp, round)
1448 impl<S: Semantics, T: Semantics> FloatConvert<IeeeFloat<T>> for IeeeFloat<S> {
1449 fn convert_r(self, round: Round, loses_info: &mut bool) -> StatusAnd<IeeeFloat<T>> {
1450 let mut r = IeeeFloat {
1453 category: self.category,
1455 marker: PhantomData,
1458 // x86 has some unusual NaNs which cannot be represented in any other
1459 // format; note them here.
1460 fn is_x87_double_extended<S: Semantics>() -> bool {
1461 S::QNAN_SIGNIFICAND == X87DoubleExtendedS::QNAN_SIGNIFICAND
1463 let x87_special_nan = is_x87_double_extended::<S>()
1464 && !is_x87_double_extended::<T>()
1465 && r.category == Category::NaN
1466 && (r.sig[0] & S::QNAN_SIGNIFICAND) != S::QNAN_SIGNIFICAND;
1468 // If this is a truncation of a denormal number, and the target semantics
1469 // has larger exponent range than the source semantics (this can happen
1470 // when truncating from PowerPC double-double to double format), the
1471 // right shift could lose result mantissa bits. Adjust exponent instead
1472 // of performing excessive shift.
1473 let mut shift = T::PRECISION as ExpInt - S::PRECISION as ExpInt;
1474 if shift < 0 && r.is_finite_non_zero() {
1475 let mut exp_change = sig::omsb(&r.sig) as ExpInt - S::PRECISION as ExpInt;
1476 if r.exp + exp_change < T::MIN_EXP {
1477 exp_change = T::MIN_EXP - r.exp;
1479 if exp_change < shift {
1483 shift -= exp_change;
1484 r.exp += exp_change;
1488 // If this is a truncation, perform the shift.
1489 let loss = if shift < 0 && (r.is_finite_non_zero() || r.category == Category::NaN) {
1490 sig::shift_right(&mut r.sig, &mut 0, -shift as usize)
1495 // If this is an extension, perform the shift.
1496 if shift > 0 && (r.is_finite_non_zero() || r.category == Category::NaN) {
1497 sig::shift_left(&mut r.sig, &mut 0, shift as usize);
1501 if r.is_finite_non_zero() {
1502 r = unpack!(status=, r.normalize(round, loss));
1503 *loses_info = status != Status::OK;
1504 } else if r.category == Category::NaN {
1505 *loses_info = loss != Loss::ExactlyZero || x87_special_nan;
1507 // For x87 extended precision, we want to make a NaN, not a special NaN if
1508 // the input wasn't special either.
1509 if !x87_special_nan && is_x87_double_extended::<T>() {
1510 sig::set_bit(&mut r.sig, T::PRECISION - 1);
1513 // Convert of sNaN creates qNaN and raises an exception (invalid op).
1514 // This also guarantees that a sNaN does not become Inf on a truncation
1515 // that loses all payload bits.
1516 if self.is_signaling() {
1517 // Quiet signaling NaN.
1518 sig::set_bit(&mut r.sig, T::QNAN_BIT);
1519 status = Status::INVALID_OP;
1521 status = Status::OK;
1524 *loses_info = false;
1525 status = Status::OK;
1532 impl<S: Semantics> IeeeFloat<S> {
1533 /// Handle positive overflow. We either return infinity or
1534 /// the largest finite number. For negative overflow,
1535 /// negate the `round` argument before calling.
1536 fn overflow_result(round: Round) -> StatusAnd<Self> {
1539 Round::NearestTiesToEven | Round::NearestTiesToAway | Round::TowardPositive => {
1540 (Status::OVERFLOW | Status::INEXACT).and(Self::INFINITY)
1542 // Otherwise we become the largest finite number.
1543 Round::TowardNegative | Round::TowardZero => Status::INEXACT.and(Self::largest()),
1547 /// Returns `true` if, when truncating the current number, with `bit` the
1548 /// new LSB, with the given lost fraction and rounding mode, the result
1549 /// would need to be rounded away from zero (i.e., by increasing the
1550 /// signficand). This routine must work for `Category::Zero` of both signs, and
1551 /// `Category::Normal` numbers.
1552 fn round_away_from_zero(&self, round: Round, loss: Loss, bit: usize) -> bool {
1553 // NaNs and infinities should not have lost fractions.
1554 assert!(self.is_finite_non_zero() || self.is_zero());
1556 // Current callers never pass this so we don't handle it.
1557 assert_ne!(loss, Loss::ExactlyZero);
1560 Round::NearestTiesToAway => loss == Loss::ExactlyHalf || loss == Loss::MoreThanHalf,
1561 Round::NearestTiesToEven => {
1562 if loss == Loss::MoreThanHalf {
1566 // Our zeros don't have a significand to test.
1567 if loss == Loss::ExactlyHalf && self.category != Category::Zero {
1568 return sig::get_bit(&self.sig, bit);
1573 Round::TowardZero => false,
1574 Round::TowardPositive => !self.sign,
1575 Round::TowardNegative => self.sign,
1579 fn normalize(mut self, round: Round, mut loss: Loss) -> StatusAnd<Self> {
1580 if !self.is_finite_non_zero() {
1581 return Status::OK.and(self);
1584 // Before rounding normalize the exponent of Category::Normal numbers.
1585 let mut omsb = sig::omsb(&self.sig);
1588 // OMSB is numbered from 1. We want to place it in the integer
1589 // bit numbered PRECISION if possible, with a compensating change in
1591 let mut final_exp = self.exp.saturating_add(omsb as ExpInt - S::PRECISION as ExpInt);
1593 // If the resulting exponent is too high, overflow according to
1594 // the rounding mode.
1595 if final_exp > S::MAX_EXP {
1596 let round = if self.sign { -round } else { round };
1597 return Self::overflow_result(round).map(|r| r.copy_sign(self));
1600 // Subnormal numbers have exponent MIN_EXP, and their MSB
1601 // is forced based on that.
1602 if final_exp < S::MIN_EXP {
1603 final_exp = S::MIN_EXP;
1606 // Shifting left is easy as we don't lose precision.
1607 if final_exp < self.exp {
1608 assert_eq!(loss, Loss::ExactlyZero);
1610 let exp_change = (self.exp - final_exp) as usize;
1611 sig::shift_left(&mut self.sig, &mut self.exp, exp_change);
1613 return Status::OK.and(self);
1616 // Shift right and capture any new lost fraction.
1617 if final_exp > self.exp {
1618 let exp_change = (final_exp - self.exp) as usize;
1619 loss = sig::shift_right(&mut self.sig, &mut self.exp, exp_change).combine(loss);
1621 // Keep OMSB up-to-date.
1622 omsb = omsb.saturating_sub(exp_change);
1626 // Now round the number according to round given the lost
1629 // As specified in IEEE 754, since we do not trap we do not report
1630 // underflow for exact results.
1631 if loss == Loss::ExactlyZero {
1632 // Canonicalize zeros.
1634 self.category = Category::Zero;
1637 return Status::OK.and(self);
1640 // Increment the significand if we're rounding away from zero.
1641 if self.round_away_from_zero(round, loss, 0) {
1643 self.exp = S::MIN_EXP;
1646 // We should never overflow.
1647 assert_eq!(sig::increment(&mut self.sig), 0);
1648 omsb = sig::omsb(&self.sig);
1650 // Did the significand increment overflow?
1651 if omsb == S::PRECISION + 1 {
1652 // Renormalize by incrementing the exponent and shifting our
1653 // significand right one. However if we already have the
1654 // maximum exponent we overflow to infinity.
1655 if self.exp == S::MAX_EXP {
1656 self.category = Category::Infinity;
1658 return (Status::OVERFLOW | Status::INEXACT).and(self);
1661 let _: Loss = sig::shift_right(&mut self.sig, &mut self.exp, 1);
1663 return Status::INEXACT.and(self);
1667 // The normal case - we were and are not denormal, and any
1668 // significand increment above didn't overflow.
1669 if omsb == S::PRECISION {
1670 return Status::INEXACT.and(self);
1673 // We have a non-zero denormal.
1674 assert!(omsb < S::PRECISION);
1676 // Canonicalize zeros.
1678 self.category = Category::Zero;
1681 // The Category::Zero case is a denormal that underflowed to zero.
1682 (Status::UNDERFLOW | Status::INEXACT).and(self)
1685 fn from_hexadecimal_string(s: &str, round: Round) -> Result<StatusAnd<Self>, ParseError> {
1686 let mut r = IeeeFloat {
1689 category: Category::Normal,
1691 marker: PhantomData,
1694 let mut any_digits = false;
1695 let mut has_exp = false;
1696 let mut bit_pos = LIMB_BITS as isize;
1697 let mut loss = None;
1699 // Without leading or trailing zeros, irrespective of the dot.
1700 let mut first_sig_digit = None;
1701 let mut dot = s.len();
1703 for (p, c) in s.char_indices() {
1704 // Skip leading zeros and any (hexa)decimal point.
1707 return Err(ParseError("String contains multiple dots"));
1710 } else if let Some(hex_value) = c.to_digit(16) {
1713 if first_sig_digit.is_none() {
1717 first_sig_digit = Some(p);
1720 // Store the number while we have space.
1723 r.sig[0] |= (hex_value as Limb) << bit_pos;
1724 // If zero or one-half (the hexadecimal digit 8) are followed
1725 // by non-zero, they're a little more than zero or one-half.
1726 } else if let Some(ref mut loss) = loss {
1728 if *loss == Loss::ExactlyZero {
1729 *loss = Loss::LessThanHalf;
1731 if *loss == Loss::ExactlyHalf {
1732 *loss = Loss::MoreThanHalf;
1736 loss = Some(match hex_value {
1737 0 => Loss::ExactlyZero,
1738 1..=7 => Loss::LessThanHalf,
1739 8 => Loss::ExactlyHalf,
1740 9..=15 => Loss::MoreThanHalf,
1741 _ => unreachable!(),
1744 } else if c == 'p' || c == 'P' {
1746 return Err(ParseError("Significand has no digits"));
1753 let mut chars = s[p + 1..].chars().peekable();
1755 // Adjust for the given exponent.
1756 let exp_minus = chars.peek() == Some(&'-');
1757 if exp_minus || chars.peek() == Some(&'+') {
1762 if let Some(value) = c.to_digit(10) {
1764 r.exp = r.exp.saturating_mul(10).saturating_add(value as ExpInt);
1766 return Err(ParseError("Invalid character in exponent"));
1770 return Err(ParseError("Exponent has no digits"));
1779 return Err(ParseError("Invalid character in significand"));
1783 return Err(ParseError("Significand has no digits"));
1786 // Hex floats require an exponent but not a hexadecimal point.
1788 return Err(ParseError("Hex strings require an exponent"));
1791 // Ignore the exponent if we are zero.
1792 let first_sig_digit = match first_sig_digit {
1794 None => return Ok(Status::OK.and(Self::ZERO)),
1797 // Calculate the exponent adjustment implicit in the number of
1798 // significant digits and adjust for writing the significand starting
1799 // at the most significant nibble.
1800 let exp_adjustment = if dot > first_sig_digit {
1801 ExpInt::try_from(dot - first_sig_digit).unwrap()
1803 -ExpInt::try_from(first_sig_digit - dot - 1).unwrap()
1805 let exp_adjustment = exp_adjustment
1808 .saturating_add(S::PRECISION as ExpInt)
1809 .saturating_sub(LIMB_BITS as ExpInt);
1810 r.exp = r.exp.saturating_add(exp_adjustment);
1812 Ok(r.normalize(round, loss.unwrap_or(Loss::ExactlyZero)))
1815 fn from_decimal_string(s: &str, round: Round) -> Result<StatusAnd<Self>, ParseError> {
1816 // Given a normal decimal floating point number of the form
1818 // dddd.dddd[eE][+-]ddd
1820 // where the decimal point and exponent are optional, fill out the
1821 // variables below. Exponent is appropriate if the significand is
1822 // treated as an integer, and normalized_exp if the significand
1823 // is taken to have the decimal point after a single leading
1826 // If the value is zero, first_sig_digit is None.
1828 let mut any_digits = false;
1829 let mut dec_exp = 0i32;
1831 // Without leading or trailing zeros, irrespective of the dot.
1832 let mut first_sig_digit = None;
1833 let mut last_sig_digit = 0;
1834 let mut dot = s.len();
1836 for (p, c) in s.char_indices() {
1839 return Err(ParseError("String contains multiple dots"));
1842 } else if let Some(dec_value) = c.to_digit(10) {
1846 if first_sig_digit.is_none() {
1847 first_sig_digit = Some(p);
1851 } else if c == 'e' || c == 'E' {
1853 return Err(ParseError("Significand has no digits"));
1860 let mut chars = s[p + 1..].chars().peekable();
1862 // Adjust for the given exponent.
1863 let exp_minus = chars.peek() == Some(&'-');
1864 if exp_minus || chars.peek() == Some(&'+') {
1870 if let Some(value) = c.to_digit(10) {
1872 dec_exp = dec_exp.saturating_mul(10).saturating_add(value as i32);
1874 return Err(ParseError("Invalid character in exponent"));
1878 return Err(ParseError("Exponent has no digits"));
1887 return Err(ParseError("Invalid character in significand"));
1891 return Err(ParseError("Significand has no digits"));
1894 // Test if we have a zero number allowing for non-zero exponents.
1895 let first_sig_digit = match first_sig_digit {
1897 None => return Ok(Status::OK.and(Self::ZERO)),
1900 // Adjust the exponents for any decimal point.
1901 if dot > last_sig_digit {
1902 dec_exp = dec_exp.saturating_add((dot - last_sig_digit - 1) as i32);
1904 dec_exp = dec_exp.saturating_sub((last_sig_digit - dot) as i32);
1906 let significand_digits = last_sig_digit - first_sig_digit + 1
1907 - (dot > first_sig_digit && dot < last_sig_digit) as usize;
1908 let normalized_exp = dec_exp.saturating_add(significand_digits as i32 - 1);
1910 // Handle the cases where exponents are obviously too large or too
1911 // small. Writing L for log 10 / log 2, a number d.ddddd*10^dec_exp
1912 // definitely overflows if
1914 // (dec_exp - 1) * L >= MAX_EXP
1916 // and definitely underflows to zero where
1918 // (dec_exp + 1) * L <= MIN_EXP - PRECISION
1920 // With integer arithmetic the tightest bounds for L are
1922 // 93/28 < L < 196/59 [ numerator <= 256 ]
1923 // 42039/12655 < L < 28738/8651 [ numerator <= 65536 ]
1925 // Check for MAX_EXP.
1926 if normalized_exp.saturating_sub(1).saturating_mul(42039) >= 12655 * S::MAX_EXP as i32 {
1927 // Overflow and round.
1928 return Ok(Self::overflow_result(round));
1931 // Check for MIN_EXP.
1932 if normalized_exp.saturating_add(1).saturating_mul(28738)
1933 <= 8651 * (S::MIN_EXP as i32 - S::PRECISION as i32)
1935 // Underflow to zero and round.
1937 if round == Round::TowardPositive { IeeeFloat::SMALLEST } else { IeeeFloat::ZERO };
1938 return Ok((Status::UNDERFLOW | Status::INEXACT).and(r));
1941 // A tight upper bound on number of bits required to hold an
1942 // N-digit decimal integer is N * 196 / 59. Allocate enough space
1943 // to hold the full significand, and an extra limb required by
1945 let max_limbs = limbs_for_bits(1 + 196 * significand_digits / 59);
1946 let mut dec_sig: SmallVec<[Limb; 1]> = SmallVec::with_capacity(max_limbs);
1948 // Convert to binary efficiently - we do almost all multiplication
1949 // in a Limb. When this would overflow do we do a single
1950 // bignum multiplication, and then revert again to multiplication
1952 let mut chars = s[first_sig_digit..=last_sig_digit].chars();
1955 let mut multiplier = 1;
1958 let dec_value = match chars.next() {
1959 Some('.') => continue,
1960 Some(c) => c.to_digit(10).unwrap(),
1965 val = val * 10 + dec_value as Limb;
1967 // The maximum number that can be multiplied by ten with any
1968 // digit added without overflowing a Limb.
1969 if multiplier > (!0 - 9) / 10 {
1974 // If we've consumed no digits, we're done.
1975 if multiplier == 1 {
1979 // Multiply out the current limb.
1980 let mut carry = val;
1981 for x in &mut dec_sig {
1982 let [low, mut high] = sig::widening_mul(*x, multiplier);
1985 let (low, overflow) = low.overflowing_add(carry);
1986 high += overflow as Limb;
1992 // If we had carry, we need another limb (likely but not guaranteed).
1994 dec_sig.push(carry);
1998 // Calculate pow(5, abs(dec_exp)) into `pow5_full`.
1999 // The *_calc Vec's are reused scratch space, as an optimization.
2000 let (pow5_full, mut pow5_calc, mut sig_calc, mut sig_scratch_calc) = {
2001 let mut power = dec_exp.abs() as usize;
2003 const FIRST_EIGHT_POWERS: [Limb; 8] = [1, 5, 25, 125, 625, 3125, 15625, 78125];
2005 let mut p5_scratch = smallvec![];
2006 let mut p5: SmallVec<[Limb; 1]> = smallvec![FIRST_EIGHT_POWERS[4]];
2008 let mut r_scratch = smallvec![];
2009 let mut r: SmallVec<[Limb; 1]> = smallvec![FIRST_EIGHT_POWERS[power & 7]];
2013 // Calculate pow(5,pow(2,n+3)).
2014 p5_scratch.resize(p5.len() * 2, 0);
2015 let _: Loss = sig::mul(&mut p5_scratch, &mut 0, &p5, &p5, p5.len() * 2 * LIMB_BITS);
2016 while p5_scratch.last() == Some(&0) {
2019 mem::swap(&mut p5, &mut p5_scratch);
2022 r_scratch.resize(r.len() + p5.len(), 0);
2024 sig::mul(&mut r_scratch, &mut 0, &r, &p5, (r.len() + p5.len()) * LIMB_BITS);
2025 while r_scratch.last() == Some(&0) {
2028 mem::swap(&mut r, &mut r_scratch);
2034 (r, r_scratch, p5, p5_scratch)
2037 // Attempt dec_sig * 10^dec_exp with increasing precision.
2038 let mut attempt = 0;
2040 let calc_precision = (LIMB_BITS << attempt) - 1;
2043 let calc_normal_from_limbs = |sig: &mut SmallVec<[Limb; 1]>,
2045 -> StatusAnd<ExpInt> {
2046 sig.resize(limbs_for_bits(calc_precision), 0);
2047 let (mut loss, mut exp) = sig::from_limbs(sig, limbs, calc_precision);
2049 // Before rounding normalize the exponent of Category::Normal numbers.
2050 let mut omsb = sig::omsb(sig);
2052 assert_ne!(omsb, 0);
2054 // OMSB is numbered from 1. We want to place it in the integer
2055 // bit numbered PRECISION if possible, with a compensating change in
2057 let final_exp = exp.saturating_add(omsb as ExpInt - calc_precision as ExpInt);
2059 // Shifting left is easy as we don't lose precision.
2060 if final_exp < exp {
2061 assert_eq!(loss, Loss::ExactlyZero);
2063 let exp_change = (exp - final_exp) as usize;
2064 sig::shift_left(sig, &mut exp, exp_change);
2066 return Status::OK.and(exp);
2069 // Shift right and capture any new lost fraction.
2070 if final_exp > exp {
2071 let exp_change = (final_exp - exp) as usize;
2072 loss = sig::shift_right(sig, &mut exp, exp_change).combine(loss);
2074 // Keep OMSB up-to-date.
2075 omsb = omsb.saturating_sub(exp_change);
2078 assert_eq!(omsb, calc_precision);
2080 // Now round the number according to round given the lost
2083 // As specified in IEEE 754, since we do not trap we do not report
2084 // underflow for exact results.
2085 if loss == Loss::ExactlyZero {
2086 return Status::OK.and(exp);
2089 // Increment the significand if we're rounding away from zero.
2090 if loss == Loss::MoreThanHalf || loss == Loss::ExactlyHalf && sig::get_bit(sig, 0) {
2091 // We should never overflow.
2092 assert_eq!(sig::increment(sig), 0);
2093 omsb = sig::omsb(sig);
2095 // Did the significand increment overflow?
2096 if omsb == calc_precision + 1 {
2097 let _: Loss = sig::shift_right(sig, &mut exp, 1);
2099 return Status::INEXACT.and(exp);
2103 // The normal case - we were and are not denormal, and any
2104 // significand increment above didn't overflow.
2105 Status::INEXACT.and(exp)
2109 let mut exp = unpack!(status=,
2110 calc_normal_from_limbs(&mut sig_calc, &dec_sig));
2112 let pow5_exp = unpack!(pow5_status=,
2113 calc_normal_from_limbs(&mut pow5_calc, &pow5_full));
2115 // Add dec_exp, as 10^n = 5^n * 2^n.
2116 exp += dec_exp as ExpInt;
2118 let mut used_bits = S::PRECISION;
2119 let mut truncated_bits = calc_precision - used_bits;
2121 let half_ulp_err1 = (status != Status::OK) as Limb;
2122 let (calc_loss, half_ulp_err2);
2126 sig_scratch_calc.resize(sig_calc.len() + pow5_calc.len(), 0);
2127 calc_loss = sig::mul(
2128 &mut sig_scratch_calc,
2134 mem::swap(&mut sig_calc, &mut sig_scratch_calc);
2136 half_ulp_err2 = (pow5_status != Status::OK) as Limb;
2140 sig_scratch_calc.resize(sig_calc.len(), 0);
2141 calc_loss = sig::div(
2142 &mut sig_scratch_calc,
2148 mem::swap(&mut sig_calc, &mut sig_scratch_calc);
2150 // Denormal numbers have less precision.
2151 if exp < S::MIN_EXP {
2152 truncated_bits += (S::MIN_EXP - exp) as usize;
2153 used_bits = calc_precision.saturating_sub(truncated_bits);
2155 // Extra half-ulp lost in reciprocal of exponent.
2157 2 * (pow5_status != Status::OK || calc_loss != Loss::ExactlyZero) as Limb;
2160 // Both sig::mul and sig::div return the
2161 // result with the integer bit set.
2162 assert!(sig::get_bit(&sig_calc, calc_precision - 1));
2164 // The error from the true value, in half-ulps, on multiplying two
2165 // floating point numbers, which differ from the value they
2166 // approximate by at most half_ulp_err1 and half_ulp_err2 half-ulps, is strictly less
2167 // than the returned value.
2169 // See "How to Read Floating Point Numbers Accurately" by William D Clinger.
2170 assert!(half_ulp_err1 < 2 || half_ulp_err2 < 2 || (half_ulp_err1 + half_ulp_err2 < 8));
2172 let inexact = (calc_loss != Loss::ExactlyZero) as Limb;
2173 let half_ulp_err = if half_ulp_err1 + half_ulp_err2 == 0 {
2174 inexact * 2 // <= inexact half-ulps.
2176 inexact + 2 * (half_ulp_err1 + half_ulp_err2)
2179 let ulps_from_boundary = {
2180 let bits = calc_precision - used_bits - 1;
2182 let i = bits / LIMB_BITS;
2183 let limb = sig_calc[i] & (!0 >> (LIMB_BITS - 1 - bits % LIMB_BITS));
2184 let boundary = match round {
2185 Round::NearestTiesToEven | Round::NearestTiesToAway => 1 << (bits % LIMB_BITS),
2189 let delta = limb.wrapping_sub(boundary);
2190 cmp::min(delta, delta.wrapping_neg())
2191 } else if limb == boundary {
2192 if !sig::is_all_zeros(&sig_calc[1..i]) {
2197 } else if limb == boundary.wrapping_sub(1) {
2198 if sig_calc[1..i].iter().any(|&x| x.wrapping_neg() != 1) {
2201 sig_calc[0].wrapping_neg()
2208 // Are we guaranteed to round correctly if we truncate?
2209 if ulps_from_boundary.saturating_mul(2) >= half_ulp_err {
2210 let mut r = IeeeFloat {
2213 category: Category::Normal,
2215 marker: PhantomData,
2217 sig::extract(&mut r.sig, &sig_calc, used_bits, calc_precision - used_bits);
2218 // If we extracted less bits above we must adjust our exponent
2219 // to compensate for the implicit right shift.
2220 r.exp += (S::PRECISION - used_bits) as ExpInt;
2221 let loss = Loss::through_truncation(&sig_calc, truncated_bits);
2222 return Ok(r.normalize(round, loss));
2229 /// Combine the effect of two lost fractions.
2230 fn combine(self, less_significant: Loss) -> Loss {
2231 let mut more_significant = self;
2232 if less_significant != Loss::ExactlyZero {
2233 if more_significant == Loss::ExactlyZero {
2234 more_significant = Loss::LessThanHalf;
2235 } else if more_significant == Loss::ExactlyHalf {
2236 more_significant = Loss::MoreThanHalf;
2243 /// Returns the fraction lost were a bignum truncated losing the least
2244 /// significant `bits` bits.
2245 fn through_truncation(limbs: &[Limb], bits: usize) -> Loss {
2247 return Loss::ExactlyZero;
2250 let half_bit = bits - 1;
2251 let half_limb = half_bit / LIMB_BITS;
2252 let (half_limb, rest) = if half_limb < limbs.len() {
2253 (limbs[half_limb], &limbs[..half_limb])
2257 let half = 1 << (half_bit % LIMB_BITS);
2258 let has_half = half_limb & half != 0;
2259 let has_rest = half_limb & (half - 1) != 0 || !sig::is_all_zeros(rest);
2261 match (has_half, has_rest) {
2262 (false, false) => Loss::ExactlyZero,
2263 (false, true) => Loss::LessThanHalf,
2264 (true, false) => Loss::ExactlyHalf,
2265 (true, true) => Loss::MoreThanHalf,
2270 /// Implementation details of IeeeFloat significands, such as big integer arithmetic.
2271 /// As a rule of thumb, no functions in this module should dynamically allocate.
2273 use super::{limbs_for_bits, ExpInt, Limb, Loss, LIMB_BITS};
2274 use core::cmp::Ordering;
2278 pub(super) fn is_all_zeros(limbs: &[Limb]) -> bool {
2279 limbs.iter().all(|&l| l == 0)
2282 /// One, not zero, based LSB. That is, returns 0 for a zeroed significand.
2283 pub(super) fn olsb(limbs: &[Limb]) -> usize {
2287 .find(|(_, &limb)| limb != 0)
2288 .map_or(0, |(i, limb)| i * LIMB_BITS + limb.trailing_zeros() as usize + 1)
2291 /// One, not zero, based MSB. That is, returns 0 for a zeroed significand.
2292 pub(super) fn omsb(limbs: &[Limb]) -> usize {
2296 .rfind(|(_, &limb)| limb != 0)
2297 .map_or(0, |(i, limb)| (i + 1) * LIMB_BITS - limb.leading_zeros() as usize)
2300 /// Comparison (unsigned) of two significands.
2301 pub(super) fn cmp(a: &[Limb], b: &[Limb]) -> Ordering {
2302 assert_eq!(a.len(), b.len());
2303 for (a, b) in a.iter().zip(b).rev() {
2305 Ordering::Equal => {}
2313 /// Extracts the given bit.
2314 pub(super) fn get_bit(limbs: &[Limb], bit: usize) -> bool {
2315 limbs[bit / LIMB_BITS] & (1 << (bit % LIMB_BITS)) != 0
2318 /// Sets the given bit.
2319 pub(super) fn set_bit(limbs: &mut [Limb], bit: usize) {
2320 limbs[bit / LIMB_BITS] |= 1 << (bit % LIMB_BITS);
2323 /// Clear the given bit.
2324 pub(super) fn clear_bit(limbs: &mut [Limb], bit: usize) {
2325 limbs[bit / LIMB_BITS] &= !(1 << (bit % LIMB_BITS));
2328 /// Shifts `dst` left `bits` bits, subtract `bits` from its exponent.
2329 pub(super) fn shift_left(dst: &mut [Limb], exp: &mut ExpInt, bits: usize) {
2331 // Our exponent should not underflow.
2332 *exp = exp.checked_sub(bits as ExpInt).unwrap();
2334 // Jump is the inter-limb jump; shift is the intra-limb shift.
2335 let jump = bits / LIMB_BITS;
2336 let shift = bits % LIMB_BITS;
2338 for i in (0..dst.len()).rev() {
2344 // dst[i] comes from the two limbs src[i - jump] and, if we have
2345 // an intra-limb shift, src[i - jump - 1].
2346 limb = dst[i - jump];
2350 limb |= dst[i - jump - 1] >> (LIMB_BITS - shift);
2360 /// Shifts `dst` right `bits` bits noting lost fraction.
2361 pub(super) fn shift_right(dst: &mut [Limb], exp: &mut ExpInt, bits: usize) -> Loss {
2362 let loss = Loss::through_truncation(dst, bits);
2365 // Our exponent should not overflow.
2366 *exp = exp.checked_add(bits as ExpInt).unwrap();
2368 // Jump is the inter-limb jump; shift is the intra-limb shift.
2369 let jump = bits / LIMB_BITS;
2370 let shift = bits % LIMB_BITS;
2372 // Perform the shift. This leaves the most significant `bits` bits
2373 // of the result at zero.
2374 for i in 0..dst.len() {
2377 if i + jump >= dst.len() {
2380 limb = dst[i + jump];
2383 if i + jump + 1 < dst.len() {
2384 limb |= dst[i + jump + 1] << (LIMB_BITS - shift);
2396 /// Copies the bit vector of width `src_bits` from `src`, starting at bit SRC_LSB,
2397 /// to `dst`, such that the bit SRC_LSB becomes the least significant bit of `dst`.
2398 /// All high bits above `src_bits` in `dst` are zero-filled.
2399 pub(super) fn extract(dst: &mut [Limb], src: &[Limb], src_bits: usize, src_lsb: usize) {
2404 let dst_limbs = limbs_for_bits(src_bits);
2405 assert!(dst_limbs <= dst.len());
2407 let src = &src[src_lsb / LIMB_BITS..];
2408 dst[..dst_limbs].copy_from_slice(&src[..dst_limbs]);
2410 let shift = src_lsb % LIMB_BITS;
2411 let _: Loss = shift_right(&mut dst[..dst_limbs], &mut 0, shift);
2413 // We now have (dst_limbs * LIMB_BITS - shift) bits from `src`
2414 // in `dst`. If this is less that src_bits, append the rest, else
2415 // clear the high bits.
2416 let n = dst_limbs * LIMB_BITS - shift;
2418 let mask = (1 << (src_bits - n)) - 1;
2419 dst[dst_limbs - 1] |= (src[dst_limbs] & mask) << (n % LIMB_BITS);
2420 } else if n > src_bits && src_bits % LIMB_BITS > 0 {
2421 dst[dst_limbs - 1] &= (1 << (src_bits % LIMB_BITS)) - 1;
2424 // Clear high limbs.
2425 for x in &mut dst[dst_limbs..] {
2430 /// We want the most significant PRECISION bits of `src`. There may not
2431 /// be that many; extract what we can.
2432 pub(super) fn from_limbs(dst: &mut [Limb], src: &[Limb], precision: usize) -> (Loss, ExpInt) {
2433 let omsb = omsb(src);
2435 if precision <= omsb {
2436 extract(dst, src, precision, omsb - precision);
2437 (Loss::through_truncation(src, omsb - precision), omsb as ExpInt - 1)
2439 extract(dst, src, omsb, 0);
2440 (Loss::ExactlyZero, precision as ExpInt - 1)
2444 /// For every consecutive chunk of `bits` bits from `limbs`,
2445 /// going from most significant to the least significant bits,
2446 /// call `f` to transform those bits and store the result back.
2447 pub(super) fn each_chunk<F: FnMut(Limb) -> Limb>(limbs: &mut [Limb], bits: usize, mut f: F) {
2448 assert_eq!(LIMB_BITS % bits, 0);
2449 for limb in limbs.iter_mut().rev() {
2451 for i in (0..LIMB_BITS / bits).rev() {
2452 r |= f((*limb >> (i * bits)) & ((1 << bits) - 1)) << (i * bits);
2458 /// Increment in-place, return the carry flag.
2459 pub(super) fn increment(dst: &mut [Limb]) -> Limb {
2461 *x = x.wrapping_add(1);
2470 /// Decrement in-place, return the borrow flag.
2471 pub(super) fn decrement(dst: &mut [Limb]) -> Limb {
2473 *x = x.wrapping_sub(1);
2482 /// `a += b + c` where `c` is zero or one. Returns the carry flag.
2483 pub(super) fn add(a: &mut [Limb], b: &[Limb], mut c: Limb) -> Limb {
2486 for (a, &b) in iter::zip(a, b) {
2487 let (r, overflow) = a.overflowing_add(b);
2488 let (r, overflow2) = r.overflowing_add(c);
2490 c = (overflow | overflow2) as Limb;
2496 /// `a -= b + c` where `c` is zero or one. Returns the borrow flag.
2497 pub(super) fn sub(a: &mut [Limb], b: &[Limb], mut c: Limb) -> Limb {
2500 for (a, &b) in iter::zip(a, b) {
2501 let (r, overflow) = a.overflowing_sub(b);
2502 let (r, overflow2) = r.overflowing_sub(c);
2504 c = (overflow | overflow2) as Limb;
2510 /// `a += b` or `a -= b`. Does not preserve `b`.
2511 pub(super) fn add_or_sub(
2519 // Are we bigger exponent-wise than the RHS?
2520 let bits = *a_exp - b_exp;
2522 // Determine if the operation on the absolute values is effectively
2523 // an addition or subtraction.
2524 // Subtraction is more subtle than one might naively expect.
2525 if *a_sign ^ b_sign {
2526 let (reverse, loss);
2529 reverse = cmp(a_sig, b_sig) == Ordering::Less;
2530 loss = Loss::ExactlyZero;
2531 } else if bits > 0 {
2532 loss = shift_right(b_sig, &mut 0, (bits - 1) as usize);
2533 shift_left(a_sig, a_exp, 1);
2536 loss = shift_right(a_sig, a_exp, (-bits - 1) as usize);
2537 shift_left(b_sig, &mut 0, 1);
2541 let borrow = (loss != Loss::ExactlyZero) as Limb;
2543 // The code above is intended to ensure that no borrow is necessary.
2544 assert_eq!(sub(b_sig, a_sig, borrow), 0);
2545 a_sig.copy_from_slice(b_sig);
2548 // The code above is intended to ensure that no borrow is necessary.
2549 assert_eq!(sub(a_sig, b_sig, borrow), 0);
2552 // Invert the lost fraction - it was on the RHS and subtracted.
2554 Loss::LessThanHalf => Loss::MoreThanHalf,
2555 Loss::MoreThanHalf => Loss::LessThanHalf,
2559 let loss = if bits > 0 {
2560 shift_right(b_sig, &mut 0, bits as usize)
2562 shift_right(a_sig, a_exp, -bits as usize)
2564 // We have a guard bit; generating a carry cannot happen.
2565 assert_eq!(add(a_sig, b_sig, 0), 0);
2570 /// `[low, high] = a * b`.
2572 /// This cannot overflow, because
2574 /// `(n - 1) * (n - 1) + 2 * (n - 1) == (n - 1) * (n + 1)`
2576 /// which is less than n<sup>2</sup>.
2577 pub(super) fn widening_mul(a: Limb, b: Limb) -> [Limb; 2] {
2578 let mut wide = [0, 0];
2580 if a == 0 || b == 0 {
2584 const HALF_BITS: usize = LIMB_BITS / 2;
2586 let select = |limb, i| (limb >> (i * HALF_BITS)) & ((1 << HALF_BITS) - 1);
2589 let mut x = [select(a, i) * select(b, j), 0];
2590 shift_left(&mut x, &mut 0, (i + j) * HALF_BITS);
2591 assert_eq!(add(&mut wide, &x, 0), 0);
2598 /// `dst = a * b` (for normal `a` and `b`). Returns the lost fraction.
2599 pub(super) fn mul<'a>(
2606 // Put the narrower number on the `a` for less loops below.
2607 if a.len() > b.len() {
2608 mem::swap(&mut a, &mut b);
2611 for x in &mut dst[..b.len()] {
2615 for i in 0..a.len() {
2617 for j in 0..b.len() {
2618 let [low, mut high] = widening_mul(a[i], b[j]);
2621 let (low, overflow) = low.overflowing_add(carry);
2622 high += overflow as Limb;
2624 // And now `dst[i + j]`, and store the new low part there.
2625 let (low, overflow) = low.overflowing_add(dst[i + j]);
2626 high += overflow as Limb;
2631 dst[i + b.len()] = carry;
2634 // Assume the operands involved in the multiplication are single-precision
2635 // FP, and the two multiplicants are:
2636 // a = a23 . a22 ... a0 * 2^e1
2637 // b = b23 . b22 ... b0 * 2^e2
2638 // the result of multiplication is:
2639 // dst = c48 c47 c46 . c45 ... c0 * 2^(e1+e2)
2640 // Note that there are three significant bits at the left-hand side of the
2641 // radix point: two for the multiplication, and an overflow bit for the
2642 // addition (that will always be zero at this point). Move the radix point
2643 // toward left by two bits, and adjust exponent accordingly.
2646 // Convert the result having "2 * precision" significant-bits back to the one
2647 // having "precision" significant-bits. First, move the radix point from
2648 // poision "2*precision - 1" to "precision - 1". The exponent need to be
2649 // adjusted by "2*precision - 1" - "precision - 1" = "precision".
2650 *exp -= precision as ExpInt + 1;
2652 // In case MSB resides at the left-hand side of radix point, shift the
2653 // mantissa right by some amount to make sure the MSB reside right before
2654 // the radix point (i.e., "MSB . rest-significant-bits").
2656 // Note that the result is not normalized when "omsb < precision". So, the
2657 // caller needs to call IeeeFloat::normalize() if normalized value is
2659 let omsb = omsb(dst);
2660 if omsb <= precision { Loss::ExactlyZero } else { shift_right(dst, exp, omsb - precision) }
2663 /// `quotient = dividend / divisor`. Returns the lost fraction.
2664 /// Does not preserve `dividend` or `divisor`.
2666 quotient: &mut [Limb],
2668 dividend: &mut [Limb],
2669 divisor: &mut [Limb],
2672 // Normalize the divisor.
2673 let bits = precision - omsb(divisor);
2674 shift_left(divisor, &mut 0, bits);
2675 *exp += bits as ExpInt;
2677 // Normalize the dividend.
2678 let bits = precision - omsb(dividend);
2679 shift_left(dividend, exp, bits);
2682 let olsb_divisor = olsb(divisor);
2683 if olsb_divisor == precision {
2684 quotient.copy_from_slice(dividend);
2685 return Loss::ExactlyZero;
2688 // Ensure the dividend >= divisor initially for the loop below.
2689 // Incidentally, this means that the division loop below is
2690 // guaranteed to set the integer bit to one.
2691 if cmp(dividend, divisor) == Ordering::Less {
2692 shift_left(dividend, exp, 1);
2693 assert_ne!(cmp(dividend, divisor), Ordering::Less)
2696 // Helper for figuring out the lost fraction.
2697 let lost_fraction = |dividend: &[Limb], divisor: &[Limb]| match cmp(dividend, divisor) {
2698 Ordering::Greater => Loss::MoreThanHalf,
2699 Ordering::Equal => Loss::ExactlyHalf,
2701 if is_all_zeros(dividend) {
2709 // Try to perform a (much faster) short division for small divisors.
2710 let divisor_bits = precision - (olsb_divisor - 1);
2711 macro_rules! try_short_div {
2712 ($W:ty, $H:ty, $half:expr) => {
2713 if divisor_bits * 2 <= $half {
2714 // Extract the small divisor.
2715 let _: Loss = shift_right(divisor, &mut 0, olsb_divisor - 1);
2716 let divisor = divisor[0] as $H as $W;
2718 // Shift the dividend to produce a quotient with the unit bit set.
2719 let top_limb = *dividend.last().unwrap();
2720 let mut rem = (top_limb >> (LIMB_BITS - (divisor_bits - 1))) as $H;
2721 shift_left(dividend, &mut 0, divisor_bits - 1);
2723 // Apply short division in place on $H (of $half bits) chunks.
2724 each_chunk(dividend, $half, |chunk| {
2725 let chunk = chunk as $H;
2726 let combined = ((rem as $W) << $half) | (chunk as $W);
2727 rem = (combined % divisor) as $H;
2728 (combined / divisor) as $H as Limb
2730 quotient.copy_from_slice(dividend);
2732 return lost_fraction(&[(rem as Limb) << 1], &[divisor as Limb]);
2737 try_short_div!(u32, u16, 16);
2738 try_short_div!(u64, u32, 32);
2739 try_short_div!(u128, u64, 64);
2741 // Zero the quotient before setting bits in it.
2742 for x in &mut quotient[..limbs_for_bits(precision)] {
2747 for bit in (0..precision).rev() {
2748 if cmp(dividend, divisor) != Ordering::Less {
2749 sub(dividend, divisor, 0);
2750 set_bit(quotient, bit);
2752 shift_left(dividend, &mut 0, 1);
2755 lost_fraction(dividend, divisor)