// Copyright 2014 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // This file implements multi-precision floating-point numbers. // Like in the GNU MPFR library (https://www.mpfr.org/), operands // can be of mixed precision. Unlike MPFR, the rounding mode is // not specified with each operation, but with each operand. The // rounding mode of the result operand determines the rounding // mode of an operation. This is a from-scratch implementation. package big import ( "fmt" "math" "math/bits" ) const debugFloat = false // enable for debugging // A nonzero finite Float represents a multi-precision floating point number // // sign × mantissa × 2**exponent // // with 0.5 <= mantissa < 1.0, and MinExp <= exponent <= MaxExp. // A Float may also be zero (+0, -0) or infinite (+Inf, -Inf). // All Floats are ordered, and the ordering of two Floats x and y // is defined by x.Cmp(y). // // Each Float value also has a precision, rounding mode, and accuracy. // The precision is the maximum number of mantissa bits available to // represent the value. The rounding mode specifies how a result should // be rounded to fit into the mantissa bits, and accuracy describes the // rounding error with respect to the exact result. // // Unless specified otherwise, all operations (including setters) that // specify a *Float variable for the result (usually via the receiver // with the exception of [Float.MantExp]), round the numeric result according // to the precision and rounding mode of the result variable. // // If the provided result precision is 0 (see below), it is set to the // precision of the argument with the largest precision value before any // rounding takes place, and the rounding mode remains unchanged. Thus, // uninitialized Floats provided as result arguments will have their // precision set to a reasonable value determined by the operands, and // their mode is the zero value for RoundingMode (ToNearestEven). // // By setting the desired precision to 24 or 53 and using matching rounding // mode (typically [ToNearestEven]), Float operations produce the same results // as the corresponding float32 or float64 IEEE-754 arithmetic for operands // that correspond to normal (i.e., not denormal) float32 or float64 numbers. // Exponent underflow and overflow lead to a 0 or an Infinity for different // values than IEEE-754 because Float exponents have a much larger range. // // The zero (uninitialized) value for a Float is ready to use and represents // the number +0.0 exactly, with precision 0 and rounding mode [ToNearestEven]. // // Operations always take pointer arguments (*Float) rather // than Float values, and each unique Float value requires // its own unique *Float pointer. To "copy" a Float value, // an existing (or newly allocated) Float must be set to // a new value using the [Float.Set] method; shallow copies // of Floats are not supported and may lead to errors. type Float struct { prec uint32 mode RoundingMode acc Accuracy form form neg bool mant nat exp int32 } // An ErrNaN panic is raised by a [Float] operation that would lead to // a NaN under IEEE-754 rules. An ErrNaN implements the error interface. type ErrNaN struct { msg string } func (err ErrNaN) Error() string { return err.msg } // NewFloat allocates and returns a new [Float] set to x, // with precision 53 and rounding mode [ToNearestEven]. // NewFloat panics with [ErrNaN] if x is a NaN. func NewFloat(x float64) *Float { if math.IsNaN(x) { panic(ErrNaN{"NewFloat(NaN)"}) } return new(Float).SetFloat64(x) } // Exponent and precision limits. const ( MaxExp = math.MaxInt32 // largest supported exponent MinExp = math.MinInt32 // smallest supported exponent MaxPrec = math.MaxUint32 // largest (theoretically) supported precision; likely memory-limited ) // Internal representation: The mantissa bits x.mant of a nonzero finite // Float x are stored in a nat slice long enough to hold up to x.prec bits; // the slice may (but doesn't have to) be shorter if the mantissa contains // trailing 0 bits. x.mant is normalized if the msb of x.mant == 1 (i.e., // the msb is shifted all the way "to the left"). Thus, if the mantissa has // trailing 0 bits or x.prec is not a multiple of the Word size _W, // x.mant[0] has trailing zero bits. The msb of the mantissa corresponds // to the value 0.5; the exponent x.exp shifts the binary point as needed. // // A zero or non-finite Float x ignores x.mant and x.exp. // // x form neg mant exp // ---------------------------------------------------------- // ±0 zero sign - - // 0 < |x| < +Inf finite sign mantissa exponent // ±Inf inf sign - - // A form value describes the internal representation. type form byte // The form value order is relevant - do not change! const ( zero form = iota finite inf ) // RoundingMode determines how a [Float] value is rounded to the // desired precision. Rounding may change the [Float] value; the // rounding error is described by the [Float]'s [Accuracy]. type RoundingMode byte // These constants define supported rounding modes. const ( ToNearestEven RoundingMode = iota // == IEEE 754-2008 roundTiesToEven ToNearestAway // == IEEE 754-2008 roundTiesToAway ToZero // == IEEE 754-2008 roundTowardZero AwayFromZero // no IEEE 754-2008 equivalent ToNegativeInf // == IEEE 754-2008 roundTowardNegative ToPositiveInf // == IEEE 754-2008 roundTowardPositive ) //go:generate stringer -type=RoundingMode // Accuracy describes the rounding error produced by the most recent // operation that generated a [Float] value, relative to the exact value. type Accuracy int8 // Constants describing the [Accuracy] of a [Float]. const ( Below Accuracy = -1 Exact Accuracy = 0 Above Accuracy = +1 ) //go:generate stringer -type=Accuracy // SetPrec sets z's precision to prec and returns the (possibly) rounded // value of z. Rounding occurs according to z's rounding mode if the mantissa // cannot be represented in prec bits without loss of precision. // SetPrec(0) maps all finite values to ±0; infinite values remain unchanged. // If prec > [MaxPrec], it is set to [MaxPrec]. func (z *Float) SetPrec(prec uint) *Float { z.acc = Exact // optimistically assume no rounding is needed // special case if prec == 0 { z.prec = 0 if z.form == finite { // truncate z to 0 z.acc = makeAcc(z.neg) z.form = zero } return z } // general case if prec > MaxPrec { prec = MaxPrec } old := z.prec z.prec = uint32(prec) if z.prec < old { z.round(0) } return z } func makeAcc(above bool) Accuracy { if above { return Above } return Below } // SetMode sets z's rounding mode to mode and returns an exact z. // z remains unchanged otherwise. // z.SetMode(z.Mode()) is a cheap way to set z's accuracy to [Exact]. func (z *Float) SetMode(mode RoundingMode) *Float { z.mode = mode z.acc = Exact return z } // Prec returns the mantissa precision of x in bits. // The result may be 0 for |x| == 0 and |x| == Inf. func (x *Float) Prec() uint { return uint(x.prec) } // MinPrec returns the minimum precision required to represent x exactly // (i.e., the smallest prec before x.SetPrec(prec) would start rounding x). // The result is 0 for |x| == 0 and |x| == Inf. func (x *Float) MinPrec() uint { if x.form != finite { return 0 } return uint(len(x.mant))*_W - x.mant.trailingZeroBits() } // Mode returns the rounding mode of x. func (x *Float) Mode() RoundingMode { return x.mode } // Acc returns the accuracy of x produced by the most recent // operation, unless explicitly documented otherwise by that // operation. func (x *Float) Acc() Accuracy { return x.acc } // Sign returns: // // -1 if x < 0 // 0 if x is ±0 // +1 if x > 0 func (x *Float) Sign() int { if debugFloat { x.validate() } if x.form == zero { return 0 } if x.neg { return -1 } return 1 } // MantExp breaks x into its mantissa and exponent components // and returns the exponent. If a non-nil mant argument is // provided its value is set to the mantissa of x, with the // same precision and rounding mode as x. The components // satisfy x == mant × 2**exp, with 0.5 <= |mant| < 1.0. // Calling MantExp with a nil argument is an efficient way to // get the exponent of the receiver. // // Special cases are: // // ( ±0).MantExp(mant) = 0, with mant set to ±0 // (±Inf).MantExp(mant) = 0, with mant set to ±Inf // // x and mant may be the same in which case x is set to its // mantissa value. func (x *Float) MantExp(mant *Float) (exp int) { if debugFloat { x.validate() } if x.form == finite { exp = int(x.exp) } if mant != nil { mant.Copy(x) if mant.form == finite { mant.exp = 0 } } return } func (z *Float) setExpAndRound(exp int64, sbit uint) { if exp < MinExp { // underflow z.acc = makeAcc(z.neg) z.form = zero return } if exp > MaxExp { // overflow z.acc = makeAcc(!z.neg) z.form = inf return } z.form = finite z.exp = int32(exp) z.round(sbit) } // SetMantExp sets z to mant × 2**exp and returns z. // The result z has the same precision and rounding mode // as mant. SetMantExp is an inverse of [Float.MantExp] but does // not require 0.5 <= |mant| < 1.0. Specifically, for a // given x of type *[Float], SetMantExp relates to [Float.MantExp] // as follows: // // mant := new(Float) // new(Float).SetMantExp(mant, x.MantExp(mant)).Cmp(x) == 0 // // Special cases are: // // z.SetMantExp( ±0, exp) = ±0 // z.SetMantExp(±Inf, exp) = ±Inf // // z and mant may be the same in which case z's exponent // is set to exp. func (z *Float) SetMantExp(mant *Float, exp int) *Float { if debugFloat { z.validate() mant.validate() } z.Copy(mant) if z.form == finite { // 0 < |mant| < +Inf z.setExpAndRound(int64(z.exp)+int64(exp), 0) } return z } // Signbit reports whether x is negative or negative zero. func (x *Float) Signbit() bool { return x.neg } // IsInf reports whether x is +Inf or -Inf. func (x *Float) IsInf() bool { return x.form == inf } // IsInt reports whether x is an integer. // ±Inf values are not integers. func (x *Float) IsInt() bool { if debugFloat { x.validate() } // special cases if x.form != finite { return x.form == zero } // x.form == finite if x.exp <= 0 { return false } // x.exp > 0 return x.prec <= uint32(x.exp) || x.MinPrec() <= uint(x.exp) // not enough bits for fractional mantissa } // debugging support func (x *Float) validate() { if !debugFloat { // avoid performance bugs panic("validate called but debugFloat is not set") } if msg := x.validate0(); msg != "" { panic(msg) } } func (x *Float) validate0() string { if x.form != finite { return "" } m := len(x.mant) if m == 0 { return "nonzero finite number with empty mantissa" } const msb = 1 << (_W - 1) if x.mant[m-1]&msb == 0 { return fmt.Sprintf("msb not set in last word %#x of %s", x.mant[m-1], x.Text('p', 0)) } if x.prec == 0 { return "zero precision finite number" } return "" } // round rounds z according to z.mode to z.prec bits and sets z.acc accordingly. // sbit must be 0 or 1 and summarizes any "sticky bit" information one might // have before calling round. z's mantissa must be normalized (with the msb set) // or empty. // // CAUTION: The rounding modes ToNegativeInf, ToPositiveInf are affected by the // sign of z. For correct rounding, the sign of z must be set correctly before // calling round. func (z *Float) round(sbit uint) { if debugFloat { z.validate() } z.acc = Exact if z.form != finite { // ±0 or ±Inf => nothing left to do return } // z.form == finite && len(z.mant) > 0 // m > 0 implies z.prec > 0 (checked by validate) m := uint32(len(z.mant)) // present mantissa length in words bits := m * _W // present mantissa bits; bits > 0 if bits <= z.prec { // mantissa fits => nothing to do return } // bits > z.prec // Rounding is based on two bits: the rounding bit (rbit) and the // sticky bit (sbit). The rbit is the bit immediately before the // z.prec leading mantissa bits (the "0.5"). The sbit is set if any // of the bits before the rbit are set (the "0.25", "0.125", etc.): // // rbit sbit => "fractional part" // // 0 0 == 0 // 0 1 > 0 , < 0.5 // 1 0 == 0.5 // 1 1 > 0.5, < 1.0 // bits > z.prec: mantissa too large => round r := uint(bits - z.prec - 1) // rounding bit position; r >= 0 rbit := z.mant.bit(r) & 1 // rounding bit; be safe and ensure it's a single bit // The sticky bit is only needed for rounding ToNearestEven // or when the rounding bit is zero. Avoid computation otherwise. if sbit == 0 && (rbit == 0 || z.mode == ToNearestEven) { sbit = z.mant.sticky(r) } sbit &= 1 // be safe and ensure it's a single bit // cut off extra words n := (z.prec + (_W - 1)) / _W // mantissa length in words for desired precision if m > n { copy(z.mant, z.mant[m-n:]) // move n last words to front z.mant = z.mant[:n] } // determine number of trailing zero bits (ntz) and compute lsb mask of mantissa's least-significant word ntz := n*_W - z.prec // 0 <= ntz < _W lsb := Word(1) << ntz // round if result is inexact if rbit|sbit != 0 { // Make rounding decision: The result mantissa is truncated ("rounded down") // by default. Decide if we need to increment, or "round up", the (unsigned) // mantissa. inc := false switch z.mode { case ToNegativeInf: inc = z.neg case ToZero: // nothing to do case ToNearestEven: inc = rbit != 0 && (sbit != 0 || z.mant[0]&lsb != 0) case ToNearestAway: inc = rbit != 0 case AwayFromZero: inc = true case ToPositiveInf: inc = !z.neg default: panic("unreachable") } // A positive result (!z.neg) is Above the exact result if we increment, // and it's Below if we truncate (Exact results require no rounding). // For a negative result (z.neg) it is exactly the opposite. z.acc = makeAcc(inc != z.neg) if inc { // add 1 to mantissa if addVW(z.mant, z.mant, lsb) != 0 { // mantissa overflow => adjust exponent if z.exp >= MaxExp { // exponent overflow z.form = inf return } z.exp++ // adjust mantissa: divide by 2 to compensate for exponent adjustment shrVU(z.mant, z.mant, 1) // set msb == carry == 1 from the mantissa overflow above const msb = 1 << (_W - 1) z.mant[n-1] |= msb } } } // zero out trailing bits in least-significant word z.mant[0] &^= lsb - 1 if debugFloat { z.validate() } } func (z *Float) setBits64(neg bool, x uint64) *Float { if z.prec == 0 { z.prec = 64 } z.acc = Exact z.neg = neg if x == 0 { z.form = zero return z } // x != 0 z.form = finite s := bits.LeadingZeros64(x) z.mant = z.mant.setUint64(x << uint(s)) z.exp = int32(64 - s) // always fits if z.prec < 64 { z.round(0) } return z } // SetUint64 sets z to the (possibly rounded) value of x and returns z. // If z's precision is 0, it is changed to 64 (and rounding will have // no effect). func (z *Float) SetUint64(x uint64) *Float { return z.setBits64(false, x) } // SetInt64 sets z to the (possibly rounded) value of x and returns z. // If z's precision is 0, it is changed to 64 (and rounding will have // no effect). func (z *Float) SetInt64(x int64) *Float { u := x if u < 0 { u = -u } // We cannot simply call z.SetUint64(uint64(u)) and change // the sign afterwards because the sign affects rounding. return z.setBits64(x < 0, uint64(u)) } // SetFloat64 sets z to the (possibly rounded) value of x and returns z. // If z's precision is 0, it is changed to 53 (and rounding will have // no effect). SetFloat64 panics with [ErrNaN] if x is a NaN. func (z *Float) SetFloat64(x float64) *Float { if z.prec == 0 { z.prec = 53 } if math.IsNaN(x) { panic(ErrNaN{"Float.SetFloat64(NaN)"}) } z.acc = Exact z.neg = math.Signbit(x) // handle -0, -Inf correctly if x == 0 { z.form = zero return z } if math.IsInf(x, 0) { z.form = inf return z } // normalized x != 0 z.form = finite fmant, exp := math.Frexp(x) // get normalized mantissa z.mant = z.mant.setUint64(1<<63 | math.Float64bits(fmant)<<11) z.exp = int32(exp) // always fits if z.prec < 53 { z.round(0) } return z } // fnorm normalizes mantissa m by shifting it to the left // such that the msb of the most-significant word (msw) is 1. // It returns the shift amount. It assumes that len(m) != 0. func fnorm(m nat) int64 { if debugFloat && (len(m) == 0 || m[len(m)-1] == 0) { panic("msw of mantissa is 0") } s := nlz(m[len(m)-1]) if s > 0 { c := shlVU(m, m, s) if debugFloat && c != 0 { panic("nlz or shlVU incorrect") } } return int64(s) } // SetInt sets z to the (possibly rounded) value of x and returns z. // If z's precision is 0, it is changed to the larger of x.BitLen() // or 64 (and rounding will have no effect). func (z *Float) SetInt(x *Int) *Float { // TODO(gri) can be more efficient if z.prec > 0 // but small compared to the size of x, or if there // are many trailing 0's. bits := uint32(x.BitLen()) if z.prec == 0 { z.prec = umax32(bits, 64) } z.acc = Exact z.neg = x.neg if len(x.abs) == 0 { z.form = zero return z } // x != 0 z.mant = z.mant.set(x.abs) fnorm(z.mant) z.setExpAndRound(int64(bits), 0) return z } // SetRat sets z to the (possibly rounded) value of x and returns z. // If z's precision is 0, it is changed to the largest of a.BitLen(), // b.BitLen(), or 64; with x = a/b. func (z *Float) SetRat(x *Rat) *Float { if x.IsInt() { return z.SetInt(x.Num()) } var a, b Float a.SetInt(x.Num()) b.SetInt(x.Denom()) if z.prec == 0 { z.prec = umax32(a.prec, b.prec) } return z.Quo(&a, &b) } // SetInf sets z to the infinite Float -Inf if signbit is // set, or +Inf if signbit is not set, and returns z. The // precision of z is unchanged and the result is always // [Exact]. func (z *Float) SetInf(signbit bool) *Float { z.acc = Exact z.form = inf z.neg = signbit return z } // Set sets z to the (possibly rounded) value of x and returns z. // If z's precision is 0, it is changed to the precision of x // before setting z (and rounding will have no effect). // Rounding is performed according to z's precision and rounding // mode; and z's accuracy reports the result error relative to the // exact (not rounded) result. func (z *Float) Set(x *Float) *Float { if debugFloat { x.validate() } z.acc = Exact if z != x { z.form = x.form z.neg = x.neg if x.form == finite { z.exp = x.exp z.mant = z.mant.set(x.mant) } if z.prec == 0 { z.prec = x.prec } else if z.prec < x.prec { z.round(0) } } return z } // Copy sets z to x, with the same precision, rounding mode, and // accuracy as x, and returns z. x is not changed even if z and // x are the same. func (z *Float) Copy(x *Float) *Float { if debugFloat { x.validate() } if z != x { z.prec = x.prec z.mode = x.mode z.acc = x.acc z.form = x.form z.neg = x.neg if z.form == finite { z.mant = z.mant.set(x.mant) z.exp = x.exp } } return z } // msb32 returns the 32 most significant bits of x. func msb32(x nat) uint32 { i := len(x) - 1 if i < 0 { return 0 } if debugFloat && x[i]&(1<<(_W-1)) == 0 { panic("x not normalized") } switch _W { case 32: return uint32(x[i]) case 64: return uint32(x[i] >> 32) } panic("unreachable") } // msb64 returns the 64 most significant bits of x. func msb64(x nat) uint64 { i := len(x) - 1 if i < 0 { return 0 } if debugFloat && x[i]&(1<<(_W-1)) == 0 { panic("x not normalized") } switch _W { case 32: v := uint64(x[i]) << 32 if i > 0 { v |= uint64(x[i-1]) } return v case 64: return uint64(x[i]) } panic("unreachable") } // Uint64 returns the unsigned integer resulting from truncating x // towards zero. If 0 <= x <= math.MaxUint64, the result is [Exact] // if x is an integer and [Below] otherwise. // The result is (0, [Above]) for x < 0, and ([math.MaxUint64], [Below]) // for x > [math.MaxUint64]. func (x *Float) Uint64() (uint64, Accuracy) { if debugFloat { x.validate() } switch x.form { case finite: if x.neg { return 0, Above } // 0 < x < +Inf if x.exp <= 0 { // 0 < x < 1 return 0, Below } // 1 <= x < Inf if x.exp <= 64 { // u = trunc(x) fits into a uint64 u := msb64(x.mant) >> (64 - uint32(x.exp)) if x.MinPrec() <= 64 { return u, Exact } return u, Below // x truncated } // x too large return math.MaxUint64, Below case zero: return 0, Exact case inf: if x.neg { return 0, Above } return math.MaxUint64, Below } panic("unreachable") } // Int64 returns the integer resulting from truncating x towards zero. // If [math.MinInt64] <= x <= [math.MaxInt64], the result is [Exact] if x is // an integer, and [Above] (x < 0) or [Below] (x > 0) otherwise. // The result is ([math.MinInt64], [Above]) for x < [math.MinInt64], // and ([math.MaxInt64], [Below]) for x > [math.MaxInt64]. func (x *Float) Int64() (int64, Accuracy) { if debugFloat { x.validate() } switch x.form { case finite: // 0 < |x| < +Inf acc := makeAcc(x.neg) if x.exp <= 0 { // 0 < |x| < 1 return 0, acc } // x.exp > 0 // 1 <= |x| < +Inf if x.exp <= 63 { // i = trunc(x) fits into an int64 (excluding math.MinInt64) i := int64(msb64(x.mant) >> (64 - uint32(x.exp))) if x.neg { i = -i } if x.MinPrec() <= uint(x.exp) { return i, Exact } return i, acc // x truncated } if x.neg { // check for special case x == math.MinInt64 (i.e., x == -(0.5 << 64)) if x.exp == 64 && x.MinPrec() == 1 { acc = Exact } return math.MinInt64, acc } // x too large return math.MaxInt64, Below case zero: return 0, Exact case inf: if x.neg { return math.MinInt64, Above } return math.MaxInt64, Below } panic("unreachable") } // Float32 returns the float32 value nearest to x. If x is too small to be // represented by a float32 (|x| < [math.SmallestNonzeroFloat32]), the result // is (0, [Below]) or (-0, [Above]), respectively, depending on the sign of x. // If x is too large to be represented by a float32 (|x| > [math.MaxFloat32]), // the result is (+Inf, [Above]) or (-Inf, [Below]), depending on the sign of x. func (x *Float) Float32() (float32, Accuracy) { if debugFloat { x.validate() } switch x.form { case finite: // 0 < |x| < +Inf const ( fbits = 32 // float size mbits = 23 // mantissa size (excluding implicit msb) ebits = fbits - mbits - 1 // 8 exponent size bias = 1<<(ebits-1) - 1 // 127 exponent bias dmin = 1 - bias - mbits // -149 smallest unbiased exponent (denormal) emin = 1 - bias // -126 smallest unbiased exponent (normal) emax = bias // 127 largest unbiased exponent (normal) ) // Float mantissa m is 0.5 <= m < 1.0; compute exponent e for float32 mantissa. e := x.exp - 1 // exponent for normal mantissa m with 1.0 <= m < 2.0 // Compute precision p for float32 mantissa. // If the exponent is too small, we have a denormal number before // rounding and fewer than p mantissa bits of precision available // (the exponent remains fixed but the mantissa gets shifted right). p := mbits + 1 // precision of normal float if e < emin { // recompute precision p = mbits + 1 - emin + int(e) // If p == 0, the mantissa of x is shifted so much to the right // that its msb falls immediately to the right of the float32 // mantissa space. In other words, if the smallest denormal is // considered "1.0", for p == 0, the mantissa value m is >= 0.5. // If m > 0.5, it is rounded up to 1.0; i.e., the smallest denormal. // If m == 0.5, it is rounded down to even, i.e., 0.0. // If p < 0, the mantissa value m is <= "0.25" which is never rounded up. if p < 0 /* m <= 0.25 */ || p == 0 && x.mant.sticky(uint(len(x.mant))*_W-1) == 0 /* m == 0.5 */ { // underflow to ±0 if x.neg { var z float32 return -z, Above } return 0.0, Below } // otherwise, round up // We handle p == 0 explicitly because it's easy and because // Float.round doesn't support rounding to 0 bits of precision. if p == 0 { if x.neg { return -math.SmallestNonzeroFloat32, Below } return math.SmallestNonzeroFloat32, Above } } // p > 0 // round var r Float r.prec = uint32(p) r.Set(x) e = r.exp - 1 // Rounding may have caused r to overflow to ±Inf // (rounding never causes underflows to 0). // If the exponent is too large, also overflow to ±Inf. if r.form == inf || e > emax { // overflow if x.neg { return float32(math.Inf(-1)), Below } return float32(math.Inf(+1)), Above } // e <= emax // Determine sign, biased exponent, and mantissa. var sign, bexp, mant uint32 if x.neg { sign = 1 << (fbits - 1) } // Rounding may have caused a denormal number to // become normal. Check again. if e < emin { // denormal number: recompute precision // Since rounding may have at best increased precision // and we have eliminated p <= 0 early, we know p > 0. // bexp == 0 for denormals p = mbits + 1 - emin + int(e) mant = msb32(r.mant) >> uint(fbits-p) } else { // normal number: emin <= e <= emax bexp = uint32(e+bias) << mbits mant = msb32(r.mant) >> ebits & (1< [math.MaxFloat64]), // the result is (+Inf, [Above]) or (-Inf, [Below]), depending on the sign of x. func (x *Float) Float64() (float64, Accuracy) { if debugFloat { x.validate() } switch x.form { case finite: // 0 < |x| < +Inf const ( fbits = 64 // float size mbits = 52 // mantissa size (excluding implicit msb) ebits = fbits - mbits - 1 // 11 exponent size bias = 1<<(ebits-1) - 1 // 1023 exponent bias dmin = 1 - bias - mbits // -1074 smallest unbiased exponent (denormal) emin = 1 - bias // -1022 smallest unbiased exponent (normal) emax = bias // 1023 largest unbiased exponent (normal) ) // Float mantissa m is 0.5 <= m < 1.0; compute exponent e for float64 mantissa. e := x.exp - 1 // exponent for normal mantissa m with 1.0 <= m < 2.0 // Compute precision p for float64 mantissa. // If the exponent is too small, we have a denormal number before // rounding and fewer than p mantissa bits of precision available // (the exponent remains fixed but the mantissa gets shifted right). p := mbits + 1 // precision of normal float if e < emin { // recompute precision p = mbits + 1 - emin + int(e) // If p == 0, the mantissa of x is shifted so much to the right // that its msb falls immediately to the right of the float64 // mantissa space. In other words, if the smallest denormal is // considered "1.0", for p == 0, the mantissa value m is >= 0.5. // If m > 0.5, it is rounded up to 1.0; i.e., the smallest denormal. // If m == 0.5, it is rounded down to even, i.e., 0.0. // If p < 0, the mantissa value m is <= "0.25" which is never rounded up. if p < 0 /* m <= 0.25 */ || p == 0 && x.mant.sticky(uint(len(x.mant))*_W-1) == 0 /* m == 0.5 */ { // underflow to ±0 if x.neg { var z float64 return -z, Above } return 0.0, Below } // otherwise, round up // We handle p == 0 explicitly because it's easy and because // Float.round doesn't support rounding to 0 bits of precision. if p == 0 { if x.neg { return -math.SmallestNonzeroFloat64, Below } return math.SmallestNonzeroFloat64, Above } } // p > 0 // round var r Float r.prec = uint32(p) r.Set(x) e = r.exp - 1 // Rounding may have caused r to overflow to ±Inf // (rounding never causes underflows to 0). // If the exponent is too large, also overflow to ±Inf. if r.form == inf || e > emax { // overflow if x.neg { return math.Inf(-1), Below } return math.Inf(+1), Above } // e <= emax // Determine sign, biased exponent, and mantissa. var sign, bexp, mant uint64 if x.neg { sign = 1 << (fbits - 1) } // Rounding may have caused a denormal number to // become normal. Check again. if e < emin { // denormal number: recompute precision // Since rounding may have at best increased precision // and we have eliminated p <= 0 early, we know p > 0. // bexp == 0 for denormals p = mbits + 1 - emin + int(e) mant = msb64(r.mant) >> uint(fbits-p) } else { // normal number: emin <= e <= emax bexp = uint64(e+bias) << mbits mant = msb64(r.mant) >> ebits & (1< 0, and [Above] for x < 0. // If a non-nil *[Int] argument z is provided, [Int] stores // the result in z instead of allocating a new [Int]. func (x *Float) Int(z *Int) (*Int, Accuracy) { if debugFloat { x.validate() } if z == nil && x.form <= finite { z = new(Int) } switch x.form { case finite: // 0 < |x| < +Inf acc := makeAcc(x.neg) if x.exp <= 0 { // 0 < |x| < 1 return z.SetInt64(0), acc } // x.exp > 0 // 1 <= |x| < +Inf // determine minimum required precision for x allBits := uint(len(x.mant)) * _W exp := uint(x.exp) if x.MinPrec() <= exp { acc = Exact } // shift mantissa as needed if z == nil { z = new(Int) } z.neg = x.neg switch { case exp > allBits: z.abs = z.abs.shl(x.mant, exp-allBits) default: z.abs = z.abs.set(x.mant) case exp < allBits: z.abs = z.abs.shr(x.mant, allBits-exp) } return z, acc case zero: return z.SetInt64(0), Exact case inf: return nil, makeAcc(x.neg) } panic("unreachable") } // Rat returns the rational number corresponding to x; // or nil if x is an infinity. // The result is [Exact] if x is not an Inf. // If a non-nil *[Rat] argument z is provided, [Rat] stores // the result in z instead of allocating a new [Rat]. func (x *Float) Rat(z *Rat) (*Rat, Accuracy) { if debugFloat { x.validate() } if z == nil && x.form <= finite { z = new(Rat) } switch x.form { case finite: // 0 < |x| < +Inf allBits := int32(len(x.mant)) * _W // build up numerator and denominator z.a.neg = x.neg switch { case x.exp > allBits: z.a.abs = z.a.abs.shl(x.mant, uint(x.exp-allBits)) z.b.abs = z.b.abs[:0] // == 1 (see Rat) // z already in normal form default: z.a.abs = z.a.abs.set(x.mant) z.b.abs = z.b.abs[:0] // == 1 (see Rat) // z already in normal form case x.exp < allBits: z.a.abs = z.a.abs.set(x.mant) t := z.b.abs.setUint64(1) z.b.abs = t.shl(t, uint(allBits-x.exp)) z.norm() } return z, Exact case zero: return z.SetInt64(0), Exact case inf: return nil, makeAcc(x.neg) } panic("unreachable") } // Abs sets z to the (possibly rounded) value |x| (the absolute value of x) // and returns z. func (z *Float) Abs(x *Float) *Float { z.Set(x) z.neg = false return z } // Neg sets z to the (possibly rounded) value of x with its sign negated, // and returns z. func (z *Float) Neg(x *Float) *Float { z.Set(x) z.neg = !z.neg return z } func validateBinaryOperands(x, y *Float) { if !debugFloat { // avoid performance bugs panic("validateBinaryOperands called but debugFloat is not set") } if len(x.mant) == 0 { panic("empty mantissa for x") } if len(y.mant) == 0 { panic("empty mantissa for y") } } // z = x + y, ignoring signs of x and y for the addition // but using the sign of z for rounding the result. // x and y must have a non-empty mantissa and valid exponent. func (z *Float) uadd(x, y *Float) { // Note: This implementation requires 2 shifts most of the // time. It is also inefficient if exponents or precisions // differ by wide margins. The following article describes // an efficient (but much more complicated) implementation // compatible with the internal representation used here: // // Vincent Lefèvre: "The Generic Multiple-Precision Floating- // Point Addition With Exact Rounding (as in the MPFR Library)" // http://www.vinc17.net/research/papers/rnc6.pdf if debugFloat { validateBinaryOperands(x, y) } // compute exponents ex, ey for mantissa with "binary point" // on the right (mantissa.0) - use int64 to avoid overflow ex := int64(x.exp) - int64(len(x.mant))*_W ey := int64(y.exp) - int64(len(y.mant))*_W al := alias(z.mant, x.mant) || alias(z.mant, y.mant) // TODO(gri) having a combined add-and-shift primitive // could make this code significantly faster switch { case ex < ey: if al { t := nat(nil).shl(y.mant, uint(ey-ex)) z.mant = z.mant.add(x.mant, t) } else { z.mant = z.mant.shl(y.mant, uint(ey-ex)) z.mant = z.mant.add(x.mant, z.mant) } default: // ex == ey, no shift needed z.mant = z.mant.add(x.mant, y.mant) case ex > ey: if al { t := nat(nil).shl(x.mant, uint(ex-ey)) z.mant = z.mant.add(t, y.mant) } else { z.mant = z.mant.shl(x.mant, uint(ex-ey)) z.mant = z.mant.add(z.mant, y.mant) } ex = ey } // len(z.mant) > 0 z.setExpAndRound(ex+int64(len(z.mant))*_W-fnorm(z.mant), 0) } // z = x - y for |x| > |y|, ignoring signs of x and y for the subtraction // but using the sign of z for rounding the result. // x and y must have a non-empty mantissa and valid exponent. func (z *Float) usub(x, y *Float) { // This code is symmetric to uadd. // We have not factored the common code out because // eventually uadd (and usub) should be optimized // by special-casing, and the code will diverge. if debugFloat { validateBinaryOperands(x, y) } ex := int64(x.exp) - int64(len(x.mant))*_W ey := int64(y.exp) - int64(len(y.mant))*_W al := alias(z.mant, x.mant) || alias(z.mant, y.mant) switch { case ex < ey: if al { t := nat(nil).shl(y.mant, uint(ey-ex)) z.mant = t.sub(x.mant, t) } else { z.mant = z.mant.shl(y.mant, uint(ey-ex)) z.mant = z.mant.sub(x.mant, z.mant) } default: // ex == ey, no shift needed z.mant = z.mant.sub(x.mant, y.mant) case ex > ey: if al { t := nat(nil).shl(x.mant, uint(ex-ey)) z.mant = t.sub(t, y.mant) } else { z.mant = z.mant.shl(x.mant, uint(ex-ey)) z.mant = z.mant.sub(z.mant, y.mant) } ex = ey } // operands may have canceled each other out if len(z.mant) == 0 { z.acc = Exact z.form = zero z.neg = false return } // len(z.mant) > 0 z.setExpAndRound(ex+int64(len(z.mant))*_W-fnorm(z.mant), 0) } // z = x * y, ignoring signs of x and y for the multiplication // but using the sign of z for rounding the result. // x and y must have a non-empty mantissa and valid exponent. func (z *Float) umul(x, y *Float) { if debugFloat { validateBinaryOperands(x, y) } // Note: This is doing too much work if the precision // of z is less than the sum of the precisions of x // and y which is often the case (e.g., if all floats // have the same precision). // TODO(gri) Optimize this for the common case. e := int64(x.exp) + int64(y.exp) if x == y { z.mant = z.mant.sqr(x.mant) } else { z.mant = z.mant.mul(x.mant, y.mant) } z.setExpAndRound(e-fnorm(z.mant), 0) } // z = x / y, ignoring signs of x and y for the division // but using the sign of z for rounding the result. // x and y must have a non-empty mantissa and valid exponent. func (z *Float) uquo(x, y *Float) { if debugFloat { validateBinaryOperands(x, y) } // mantissa length in words for desired result precision + 1 // (at least one extra bit so we get the rounding bit after // the division) n := int(z.prec/_W) + 1 // compute adjusted x.mant such that we get enough result precision xadj := x.mant if d := n - len(x.mant) + len(y.mant); d > 0 { // d extra words needed => add d "0 digits" to x xadj = make(nat, len(x.mant)+d) copy(xadj[d:], x.mant) } // TODO(gri): If we have too many digits (d < 0), we should be able // to shorten x for faster division. But we must be extra careful // with rounding in that case. // Compute d before division since there may be aliasing of x.mant // (via xadj) or y.mant with z.mant. d := len(xadj) - len(y.mant) // divide var r nat z.mant, r = z.mant.div(nil, xadj, y.mant) e := int64(x.exp) - int64(y.exp) - int64(d-len(z.mant))*_W // The result is long enough to include (at least) the rounding bit. // If there's a non-zero remainder, the corresponding fractional part // (if it were computed), would have a non-zero sticky bit (if it were // zero, it couldn't have a non-zero remainder). var sbit uint if len(r) > 0 { sbit = 1 } z.setExpAndRound(e-fnorm(z.mant), sbit) } // ucmp returns -1, 0, or +1, depending on whether // |x| < |y|, |x| == |y|, or |x| > |y|. // x and y must have a non-empty mantissa and valid exponent. func (x *Float) ucmp(y *Float) int { if debugFloat { validateBinaryOperands(x, y) } switch { case x.exp < y.exp: return -1 case x.exp > y.exp: return +1 } // x.exp == y.exp // compare mantissas i := len(x.mant) j := len(y.mant) for i > 0 || j > 0 { var xm, ym Word if i > 0 { i-- xm = x.mant[i] } if j > 0 { j-- ym = y.mant[j] } switch { case xm < ym: return -1 case xm > ym: return +1 } } return 0 } // Handling of sign bit as defined by IEEE 754-2008, section 6.3: // // When neither the inputs nor result are NaN, the sign of a product or // quotient is the exclusive OR of the operands’ signs; the sign of a sum, // or of a difference x−y regarded as a sum x+(−y), differs from at most // one of the addends’ signs; and the sign of the result of conversions, // the quantize operation, the roundToIntegral operations, and the // roundToIntegralExact (see 5.3.1) is the sign of the first or only operand. // These rules shall apply even when operands or results are zero or infinite. // // When the sum of two operands with opposite signs (or the difference of // two operands with like signs) is exactly zero, the sign of that sum (or // difference) shall be +0 in all rounding-direction attributes except // roundTowardNegative; under that attribute, the sign of an exact zero // sum (or difference) shall be −0. However, x+x = x−(−x) retains the same // sign as x even when x is zero. // // See also: https://play.golang.org/p/RtH3UCt5IH // Add sets z to the rounded sum x+y and returns z. If z's precision is 0, // it is changed to the larger of x's or y's precision before the operation. // Rounding is performed according to z's precision and rounding mode; and // z's accuracy reports the result error relative to the exact (not rounded) // result. Add panics with [ErrNaN] if x and y are infinities with opposite // signs. The value of z is undefined in that case. func (z *Float) Add(x, y *Float) *Float { if debugFloat { x.validate() y.validate() } if z.prec == 0 { z.prec = umax32(x.prec, y.prec) } if x.form == finite && y.form == finite { // x + y (common case) // Below we set z.neg = x.neg, and when z aliases y this will // change the y operand's sign. This is fine, because if an // operand aliases the receiver it'll be overwritten, but we still // want the original x.neg and y.neg values when we evaluate // x.neg != y.neg, so we need to save y.neg before setting z.neg. yneg := y.neg z.neg = x.neg if x.neg == yneg { // x + y == x + y // (-x) + (-y) == -(x + y) z.uadd(x, y) } else { // x + (-y) == x - y == -(y - x) // (-x) + y == y - x == -(x - y) if x.ucmp(y) > 0 { z.usub(x, y) } else { z.neg = !z.neg z.usub(y, x) } } if z.form == zero && z.mode == ToNegativeInf && z.acc == Exact { z.neg = true } return z } if x.form == inf && y.form == inf && x.neg != y.neg { // +Inf + -Inf // -Inf + +Inf // value of z is undefined but make sure it's valid z.acc = Exact z.form = zero z.neg = false panic(ErrNaN{"addition of infinities with opposite signs"}) } if x.form == zero && y.form == zero { // ±0 + ±0 z.acc = Exact z.form = zero z.neg = x.neg && y.neg // -0 + -0 == -0 return z } if x.form == inf || y.form == zero { // ±Inf + y // x + ±0 return z.Set(x) } // ±0 + y // x + ±Inf return z.Set(y) } // Sub sets z to the rounded difference x-y and returns z. // Precision, rounding, and accuracy reporting are as for [Float.Add]. // Sub panics with [ErrNaN] if x and y are infinities with equal // signs. The value of z is undefined in that case. func (z *Float) Sub(x, y *Float) *Float { if debugFloat { x.validate() y.validate() } if z.prec == 0 { z.prec = umax32(x.prec, y.prec) } if x.form == finite && y.form == finite { // x - y (common case) yneg := y.neg z.neg = x.neg if x.neg != yneg { // x - (-y) == x + y // (-x) - y == -(x + y) z.uadd(x, y) } else { // x - y == x - y == -(y - x) // (-x) - (-y) == y - x == -(x - y) if x.ucmp(y) > 0 { z.usub(x, y) } else { z.neg = !z.neg z.usub(y, x) } } if z.form == zero && z.mode == ToNegativeInf && z.acc == Exact { z.neg = true } return z } if x.form == inf && y.form == inf && x.neg == y.neg { // +Inf - +Inf // -Inf - -Inf // value of z is undefined but make sure it's valid z.acc = Exact z.form = zero z.neg = false panic(ErrNaN{"subtraction of infinities with equal signs"}) } if x.form == zero && y.form == zero { // ±0 - ±0 z.acc = Exact z.form = zero z.neg = x.neg && !y.neg // -0 - +0 == -0 return z } if x.form == inf || y.form == zero { // ±Inf - y // x - ±0 return z.Set(x) } // ±0 - y // x - ±Inf return z.Neg(y) } // Mul sets z to the rounded product x*y and returns z. // Precision, rounding, and accuracy reporting are as for [Float.Add]. // Mul panics with [ErrNaN] if one operand is zero and the other // operand an infinity. The value of z is undefined in that case. func (z *Float) Mul(x, y *Float) *Float { if debugFloat { x.validate() y.validate() } if z.prec == 0 { z.prec = umax32(x.prec, y.prec) } z.neg = x.neg != y.neg if x.form == finite && y.form == finite { // x * y (common case) z.umul(x, y) return z } z.acc = Exact if x.form == zero && y.form == inf || x.form == inf && y.form == zero { // ±0 * ±Inf // ±Inf * ±0 // value of z is undefined but make sure it's valid z.form = zero z.neg = false panic(ErrNaN{"multiplication of zero with infinity"}) } if x.form == inf || y.form == inf { // ±Inf * y // x * ±Inf z.form = inf return z } // ±0 * y // x * ±0 z.form = zero return z } // Quo sets z to the rounded quotient x/y and returns z. // Precision, rounding, and accuracy reporting are as for [Float.Add]. // Quo panics with [ErrNaN] if both operands are zero or infinities. // The value of z is undefined in that case. func (z *Float) Quo(x, y *Float) *Float { if debugFloat { x.validate() y.validate() } if z.prec == 0 { z.prec = umax32(x.prec, y.prec) } z.neg = x.neg != y.neg if x.form == finite && y.form == finite { // x / y (common case) z.uquo(x, y) return z } z.acc = Exact if x.form == zero && y.form == zero || x.form == inf && y.form == inf { // ±0 / ±0 // ±Inf / ±Inf // value of z is undefined but make sure it's valid z.form = zero z.neg = false panic(ErrNaN{"division of zero by zero or infinity by infinity"}) } if x.form == zero || y.form == inf { // ±0 / y // x / ±Inf z.form = zero return z } // x / ±0 // ±Inf / y z.form = inf return z } // Cmp compares x and y and returns: // // -1 if x < y // 0 if x == y (incl. -0 == 0, -Inf == -Inf, and +Inf == +Inf) // +1 if x > y func (x *Float) Cmp(y *Float) int { if debugFloat { x.validate() y.validate() } mx := x.ord() my := y.ord() switch { case mx < my: return -1 case mx > my: return +1 } // mx == my // only if |mx| == 1 we have to compare the mantissae switch mx { case -1: return y.ucmp(x) case +1: return x.ucmp(y) } return 0 } // ord classifies x and returns: // // -2 if -Inf == x // -1 if -Inf < x < 0 // 0 if x == 0 (signed or unsigned) // +1 if 0 < x < +Inf // +2 if x == +Inf func (x *Float) ord() int { var m int switch x.form { case finite: m = 1 case zero: return 0 case inf: m = 2 } if x.neg { m = -m } return m } func umax32(x, y uint32) uint32 { if x > y { return x } return y }