Lines Matching +full:high +full:- +full:z +full:- +full:input
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
15 http://www.jhauser.us/arithmetic/SoftFloat-2b/SoftFloat-source.txt
38 -------------------------------------------------------------------------------
39 Primitive arithmetic functions, including multi-word arithmetic, and
42 -------------------------------------------------------------------------------
44 #include "softfloat-macros"
47 -------------------------------------------------------------------------------
52 are propagated from function inputs to output. These details are target-
54 -------------------------------------------------------------------------------
56 #include "softfloat-specialize"
59 -------------------------------------------------------------------------------
60 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
61 and 7, and returns the properly rounded 32-bit integer corresponding to the
62 input. If `zSign' is nonzero, the input is negated before being converted
63 to an integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point
64 input is simply rounded to an integer, with the inexact exception raised if
65 the input cannot be represented exactly as an integer. If the fixed-point
66 input is too large, however, the invalid exception is raised and the largest
68 -------------------------------------------------------------------------------
75 int32 z; in roundAndPackInt32() local
77 roundingMode = roundData->mode; in roundAndPackInt32()
97 z = absZ; in roundAndPackInt32()
98 if ( zSign ) z = - z; in roundAndPackInt32()
99 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { in roundAndPackInt32()
100 roundData->exception |= float_flag_invalid; in roundAndPackInt32()
103 if ( roundBits ) roundData->exception |= float_flag_inexact; in roundAndPackInt32()
104 return z; in roundAndPackInt32()
109 -------------------------------------------------------------------------------
110 Returns the fraction bits of the single-precision floating-point value `a'.
111 -------------------------------------------------------------------------------
121 -------------------------------------------------------------------------------
122 Returns the exponent bits of the single-precision floating-point value `a'.
123 -------------------------------------------------------------------------------
133 -------------------------------------------------------------------------------
134 Returns the sign bit of the single-precision floating-point value `a'.
135 -------------------------------------------------------------------------------
147 -------------------------------------------------------------------------------
148 Normalizes the subnormal single-precision floating-point value represented
152 -------------------------------------------------------------------------------
159 shiftCount = countLeadingZeros32( aSig ) - 8; in normalizeFloat32Subnormal()
161 *zExpPtr = 1 - shiftCount; in normalizeFloat32Subnormal()
166 -------------------------------------------------------------------------------
168 single-precision floating-point value, returning the result. After being
172 will have an integer portion equal to 1, the `zExp' input should be 1 less
175 -------------------------------------------------------------------------------
195 -------------------------------------------------------------------------------
196 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
197 and significand `zSig', and returns the proper single-precision floating-
198 point value corresponding to the abstract input. Ordinarily, the abstract
199 value is simply rounded and packed into the single-precision format, with
200 the inexact exception raised if the abstract input cannot be represented
203 returned. If the abstract value is too small, the input value is rounded to
205 the abstract input cannot be represented exactly as a subnormal single-
206 precision floating-point number.
207 The input significand `zSig' has its binary point between bits 30
212 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
214 Binary Floating-point Arithmetic.
215 -------------------------------------------------------------------------------
224 roundingMode = roundData->mode; in roundAndPackFloat32()
247 roundData->exception |= float_flag_overflow | float_flag_inexact; in roundAndPackFloat32()
248 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 ); in roundAndPackFloat32()
253 || ( zExp < -1 ) in roundAndPackFloat32()
255 shift32RightJamming( zSig, - zExp, &zSig ); in roundAndPackFloat32()
258 if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow; in roundAndPackFloat32()
261 if ( roundBits ) roundData->exception |= float_flag_inexact; in roundAndPackFloat32()
270 -------------------------------------------------------------------------------
271 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
272 and significand `zSig', and returns the proper single-precision floating-
273 point value corresponding to the abstract input. This routine is just like
275 any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
277 -------------------------------------------------------------------------------
284 shiftCount = countLeadingZeros32( zSig ) - 1; in normalizeRoundAndPackFloat32()
285 return roundAndPackFloat32( roundData, zSign, zExp - shiftCount, zSig<<shiftCount ); in normalizeRoundAndPackFloat32()
290 -------------------------------------------------------------------------------
291 Returns the fraction bits of the double-precision floating-point value `a'.
292 -------------------------------------------------------------------------------
302 -------------------------------------------------------------------------------
303 Returns the exponent bits of the double-precision floating-point value `a'.
304 -------------------------------------------------------------------------------
314 -------------------------------------------------------------------------------
315 Returns the sign bit of the double-precision floating-point value `a'.
316 -------------------------------------------------------------------------------
328 -------------------------------------------------------------------------------
329 Normalizes the subnormal double-precision floating-point value represented
333 -------------------------------------------------------------------------------
340 shiftCount = countLeadingZeros64( aSig ) - 11; in normalizeFloat64Subnormal()
342 *zExpPtr = 1 - shiftCount; in normalizeFloat64Subnormal()
347 -------------------------------------------------------------------------------
349 double-precision floating-point value, returning the result. After being
353 will have an integer portion equal to 1, the `zExp' input should be 1 less
356 -------------------------------------------------------------------------------
366 -------------------------------------------------------------------------------
367 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
368 and significand `zSig', and returns the proper double-precision floating-
369 point value corresponding to the abstract input. Ordinarily, the abstract
370 value is simply rounded and packed into the double-precision format, with
371 the inexact exception raised if the abstract input cannot be represented
374 returned. If the abstract value is too small, the input value is rounded to
376 the abstract input cannot be represented exactly as a subnormal double-
377 precision floating-point number.
378 The input significand `zSig' has its binary point between bits 62
383 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
385 Binary Floating-point Arithmetic.
386 -------------------------------------------------------------------------------
395 roundingMode = roundData->mode; in roundAndPackFloat64()
420 roundData->exception |= float_flag_overflow | float_flag_inexact; in roundAndPackFloat64()
421 return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 ); in roundAndPackFloat64()
426 || ( zExp < -1 ) in roundAndPackFloat64()
428 shift64RightJamming( zSig, - zExp, &zSig ); in roundAndPackFloat64()
431 if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow; in roundAndPackFloat64()
434 if ( roundBits ) roundData->exception |= float_flag_inexact; in roundAndPackFloat64()
443 -------------------------------------------------------------------------------
444 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
445 and significand `zSig', and returns the proper double-precision floating-
446 point value corresponding to the abstract input. This routine is just like
448 any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
450 -------------------------------------------------------------------------------
457 shiftCount = countLeadingZeros64( zSig ) - 1; in normalizeRoundAndPackFloat64()
458 return roundAndPackFloat64( roundData, zSign, zExp - shiftCount, zSig<<shiftCount ); in normalizeRoundAndPackFloat64()
465 -------------------------------------------------------------------------------
466 Returns the fraction bits of the extended double-precision floating-point
468 -------------------------------------------------------------------------------
478 -------------------------------------------------------------------------------
479 Returns the exponent bits of the extended double-precision floating-point
481 -------------------------------------------------------------------------------
486 return a.high & 0x7FFF; in extractFloatx80Exp()
491 -------------------------------------------------------------------------------
492 Returns the sign bit of the extended double-precision floating-point value
494 -------------------------------------------------------------------------------
499 return a.high>>15; in extractFloatx80Sign()
504 -------------------------------------------------------------------------------
505 Normalizes the subnormal extended double-precision floating-point value
509 -------------------------------------------------------------------------------
518 *zExpPtr = 1 - shiftCount; in normalizeFloatx80Subnormal()
523 -------------------------------------------------------------------------------
525 extended double-precision floating-point value, returning the result.
526 -------------------------------------------------------------------------------
530 floatx80 z; in packFloatx80() local
532 z.low = zSig; in packFloatx80()
533 z.high = ( ( (bits16) zSign )<<15 ) + zExp; in packFloatx80()
534 z.__padding = 0; in packFloatx80()
535 return z; in packFloatx80()
540 -------------------------------------------------------------------------------
541 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
543 and returns the proper extended double-precision floating-point value
544 corresponding to the abstract input. Ordinarily, the abstract value is
545 rounded and packed into the extended double-precision format, with the
546 inexact exception raised if the abstract input cannot be represented
549 returned. If the abstract value is too small, the input value is rounded to
551 the abstract input cannot be represented exactly as a subnormal extended
552 double-precision floating-point number.
555 result is rounded to the full precision of the extended double-precision
557 The input significand must be normalized or smaller. If the input
561 Floating-point Arithmetic.
562 -------------------------------------------------------------------------------
573 roundingMode = roundData->mode; in roundAndPackFloatx80()
574 roundingPrecision = roundData->precision; in roundAndPackFloatx80()
604 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { in roundAndPackFloatx80()
615 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); in roundAndPackFloatx80()
618 if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow; in roundAndPackFloatx80()
619 if ( roundBits ) roundData->exception |= float_flag_inexact; in roundAndPackFloatx80()
630 if ( roundBits ) roundData->exception |= float_flag_inexact; in roundAndPackFloatx80()
658 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { in roundAndPackFloatx80()
667 roundData->exception |= float_flag_overflow | float_flag_inexact; in roundAndPackFloatx80()
682 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); in roundAndPackFloatx80()
684 if ( isTiny && zSig1 ) roundData->exception |= float_flag_underflow; in roundAndPackFloatx80()
685 if ( zSig1 ) roundData->exception |= float_flag_inexact; in roundAndPackFloatx80()
705 if ( zSig1 ) roundData->exception |= float_flag_inexact; in roundAndPackFloatx80()
724 -------------------------------------------------------------------------------
725 Takes an abstract floating-point value having sign `zSign', exponent
727 and returns the proper extended double-precision floating-point value
728 corresponding to the abstract input. This routine is just like
729 `roundAndPackFloatx80' except that the input significand does not have to be
731 -------------------------------------------------------------------------------
743 zExp -= 64; in normalizeRoundAndPackFloatx80()
747 zExp -= shiftCount; in normalizeRoundAndPackFloatx80()
756 -------------------------------------------------------------------------------
757 Returns the result of converting the 32-bit two's complement integer `a' to
758 the single-precision floating-point format. The conversion is performed
759 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
760 -------------------------------------------------------------------------------
769 return normalizeRoundAndPackFloat32( roundData, zSign, 0x9C, zSign ? - a : a ); in int32_to_float32()
774 -------------------------------------------------------------------------------
775 Returns the result of converting the 32-bit two's complement integer `a' to
776 the double-precision floating-point format. The conversion is performed
777 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
778 -------------------------------------------------------------------------------
789 absA = aSign ? - a : a; in int32_to_float64()
792 return packFloat64( aSign, 0x432 - shiftCount, zSig<<shiftCount ); in int32_to_float64()
799 -------------------------------------------------------------------------------
800 Returns the result of converting the 32-bit two's complement integer `a'
801 to the extended double-precision floating-point format. The conversion
802 is performed according to the IEC/IEEE Standard for Binary Floating-point
804 -------------------------------------------------------------------------------
815 absA = zSign ? - a : a; in int32_to_floatx80()
818 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); in int32_to_floatx80()
825 -------------------------------------------------------------------------------
826 Returns the result of converting the single-precision floating-point value
827 `a' to the 32-bit two's complement integer format. The conversion is
828 performed according to the IEC/IEEE Standard for Binary Floating-point
829 Arithmetic---which means in particular that the conversion is rounded
833 -------------------------------------------------------------------------------
847 shiftCount = 0xAF - aExp; in float32_to_int32()
856 -------------------------------------------------------------------------------
857 Returns the result of converting the single-precision floating-point value
858 `a' to the 32-bit two's complement integer format. The conversion is
859 performed according to the IEC/IEEE Standard for Binary Floating-point
864 -------------------------------------------------------------------------------
871 int32 z; in float32_to_int32_round_to_zero() local
876 shiftCount = aExp - 0x9E; in float32_to_int32_round_to_zero()
888 z = aSig>>( - shiftCount ); in float32_to_int32_round_to_zero()
892 return aSign ? - z : z; in float32_to_int32_round_to_zero()
897 -------------------------------------------------------------------------------
898 Returns the result of converting the single-precision floating-point value
899 `a' to the double-precision floating-point format. The conversion is
900 performed according to the IEC/IEEE Standard for Binary Floating-point
902 -------------------------------------------------------------------------------
920 --aExp; in float32_to_float64()
929 -------------------------------------------------------------------------------
930 Returns the result of converting the single-precision floating-point value
931 `a' to the extended double-precision floating-point format. The conversion
932 is performed according to the IEC/IEEE Standard for Binary Floating-point
934 -------------------------------------------------------------------------------
961 -------------------------------------------------------------------------------
962 Rounds the single-precision floating-point value `a' to an integer, and
963 returns the result as a single-precision floating-point value. The
965 Floating-point Arithmetic.
966 -------------------------------------------------------------------------------
974 float32 z; in float32_round_to_int() local
983 roundingMode = roundData->mode; in float32_round_to_int()
986 roundData->exception |= float_flag_inexact; in float32_round_to_int()
1002 lastBitMask <<= 0x96 - aExp; in float32_round_to_int()
1003 roundBitsMask = lastBitMask - 1; in float32_round_to_int()
1004 z = a; in float32_round_to_int()
1006 z += lastBitMask>>1; in float32_round_to_int()
1007 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; in float32_round_to_int()
1010 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) { in float32_round_to_int()
1011 z += roundBitsMask; in float32_round_to_int()
1014 z &= ~ roundBitsMask; in float32_round_to_int()
1015 if ( z != a ) roundData->exception |= float_flag_inexact; in float32_round_to_int()
1016 return z; in float32_round_to_int()
1021 -------------------------------------------------------------------------------
1022 Returns the result of adding the absolute values of the single-precision
1023 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
1026 Floating-point Arithmetic.
1027 -------------------------------------------------------------------------------
1039 expDiff = aExp - bExp; in addFloat32Sigs()
1048 --expDiff; in addFloat32Sigs()
1067 shift32RightJamming( aSig, - expDiff, &aSig ); in addFloat32Sigs()
1082 --zExp; in addFloat32Sigs()
1093 -------------------------------------------------------------------------------
1094 Returns the result of subtracting the absolute values of the single-
1095 precision floating-point values `a' and `b'. If `zSign' is true, the
1098 Standard for Binary Floating-point Arithmetic.
1099 -------------------------------------------------------------------------------
1111 expDiff = aExp - bExp; in subFloat32Sigs()
1118 roundData->exception |= float_flag_invalid; in subFloat32Sigs()
1127 return packFloat32( roundData->mode == float_round_down, 0, 0 ); in subFloat32Sigs()
1139 shift32RightJamming( aSig, - expDiff, &aSig ); in subFloat32Sigs()
1142 zSig = bSig - aSig; in subFloat32Sigs()
1152 --expDiff; in subFloat32Sigs()
1160 zSig = aSig - bSig; in subFloat32Sigs()
1163 --zExp; in subFloat32Sigs()
1169 -------------------------------------------------------------------------------
1170 Returns the result of adding the single-precision floating-point values `a'
1172 Binary Floating-point Arithmetic.
1173 -------------------------------------------------------------------------------
1191 -------------------------------------------------------------------------------
1192 Returns the result of subtracting the single-precision floating-point values
1194 for Binary Floating-point Arithmetic.
1195 -------------------------------------------------------------------------------
1213 -------------------------------------------------------------------------------
1214 Returns the result of multiplying the single-precision floating-point values
1216 for Binary Floating-point Arithmetic.
1217 -------------------------------------------------------------------------------
1239 roundData->exception |= float_flag_invalid; in float32_mul()
1247 roundData->exception |= float_flag_invalid; in float32_mul()
1260 zExp = aExp + bExp - 0x7F; in float32_mul()
1267 --zExp; in float32_mul()
1274 -------------------------------------------------------------------------------
1275 Returns the result of dividing the single-precision floating-point value `a'
1277 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1278 -------------------------------------------------------------------------------
1297 roundData->exception |= float_flag_invalid; in float32_div()
1309 roundData->exception |= float_flag_invalid; in float32_div()
1312 roundData->exception |= float_flag_divbyzero; in float32_div()
1321 zExp = aExp - bExp + 0x7D; in float32_div()
1341 -------------------------------------------------------------------------------
1342 Returns the remainder of the single-precision floating-point value `a'
1344 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1345 -------------------------------------------------------------------------------
1367 roundData->exception |= float_flag_invalid; in float32_rem()
1376 roundData->exception |= float_flag_invalid; in float32_rem()
1385 expDiff = aExp - bExp; in float32_rem()
1392 if ( expDiff < -1 ) return a; in float32_rem()
1396 if ( q ) aSig -= bSig; in float32_rem()
1401 q >>= 32 - expDiff; in float32_rem()
1403 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; in float32_rem()
1411 if ( bSig <= aSig ) aSig -= bSig; in float32_rem()
1414 expDiff -= 64; in float32_rem()
1417 q64 = ( 2 < q64 ) ? q64 - 2 : 0; in float32_rem()
1418 aSig64 = - ( ( bSig * q64 )<<38 ); in float32_rem()
1419 expDiff -= 62; in float32_rem()
1423 q64 = ( 2 < q64 ) ? q64 - 2 : 0; in float32_rem()
1424 q = q64>>( 64 - expDiff ); in float32_rem()
1426 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; in float32_rem()
1431 aSig -= bSig; in float32_rem()
1438 if ( zSign ) aSig = - aSig; in float32_rem()
1444 -------------------------------------------------------------------------------
1445 Returns the square root of the single-precision floating-point value `a'.
1447 Floating-point Arithmetic.
1448 -------------------------------------------------------------------------------
1463 roundData->exception |= float_flag_invalid; in float32_sqrt()
1468 roundData->exception |= float_flag_invalid; in float32_sqrt()
1475 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; in float32_sqrt()
1485 rem = ( ( (bits64) aSig )<<32 ) - term; in float32_sqrt()
1487 --zSig; in float32_sqrt()
1499 -------------------------------------------------------------------------------
1500 Returns 1 if the single-precision floating-point value `a' is equal to the
1502 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1503 -------------------------------------------------------------------------------
1521 -------------------------------------------------------------------------------
1522 Returns 1 if the single-precision floating-point value `a' is less than or
1524 performed according to the IEC/IEEE Standard for Binary Floating-point
1526 -------------------------------------------------------------------------------
1546 -------------------------------------------------------------------------------
1547 Returns 1 if the single-precision floating-point value `a' is less than
1549 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1550 -------------------------------------------------------------------------------
1570 -------------------------------------------------------------------------------
1571 Returns 1 if the single-precision floating-point value `a' is equal to the
1574 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1575 -------------------------------------------------------------------------------
1591 -------------------------------------------------------------------------------
1592 Returns 1 if the single-precision floating-point value `a' is less than or
1595 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1596 -------------------------------------------------------------------------------
1617 -------------------------------------------------------------------------------
1618 Returns 1 if the single-precision floating-point value `a' is less than
1621 Standard for Binary Floating-point Arithmetic.
1622 -------------------------------------------------------------------------------
1642 -------------------------------------------------------------------------------
1643 Returns the result of converting the double-precision floating-point value
1644 `a' to the 32-bit two's complement integer format. The conversion is
1645 performed according to the IEC/IEEE Standard for Binary Floating-point
1646 Arithmetic---which means in particular that the conversion is rounded
1650 -------------------------------------------------------------------------------
1663 shiftCount = 0x42C - aExp; in float64_to_int32()
1670 -------------------------------------------------------------------------------
1671 Returns the result of converting the double-precision floating-point value
1672 `a' to the 32-bit two's complement integer format. The conversion is
1673 performed according to the IEC/IEEE Standard for Binary Floating-point
1678 -------------------------------------------------------------------------------
1685 int32 z; in float64_to_int32_round_to_zero() local
1690 shiftCount = 0x433 - aExp; in float64_to_int32_round_to_zero()
1702 z = aSig; in float64_to_int32_round_to_zero()
1703 if ( aSign ) z = - z; in float64_to_int32_round_to_zero()
1704 if ( ( z < 0 ) ^ aSign ) { in float64_to_int32_round_to_zero()
1712 return z; in float64_to_int32_round_to_zero()
1717 -------------------------------------------------------------------------------
1718 Returns the result of converting the double-precision floating-point value
1719 `a' to the 32-bit unsigned integer format. The conversion
1720 is performed according to the IEC/IEEE Standard for Binary Floating-point
1721 Arithmetic---which means in particular that the conversion is rounded
1725 -------------------------------------------------------------------------------
1738 shiftCount = 0x42C - aExp; in float64_to_uint32()
1744 -------------------------------------------------------------------------------
1745 Returns the result of converting the double-precision floating-point value
1746 `a' to the 32-bit unsigned integer format. The conversion is
1747 performed according to the IEC/IEEE Standard for Binary Floating-point
1751 -------------------------------------------------------------------------------
1758 int32 z; in float64_to_uint32_round_to_zero() local
1763 shiftCount = 0x433 - aExp; in float64_to_uint32_round_to_zero()
1775 z = aSig; in float64_to_uint32_round_to_zero()
1776 if ( aSign ) z = - z; in float64_to_uint32_round_to_zero()
1777 if ( ( z < 0 ) ^ aSign ) { in float64_to_uint32_round_to_zero()
1785 return z; in float64_to_uint32_round_to_zero()
1789 -------------------------------------------------------------------------------
1790 Returns the result of converting the double-precision floating-point value
1791 `a' to the single-precision floating-point format. The conversion is
1792 performed according to the IEC/IEEE Standard for Binary Floating-point
1794 -------------------------------------------------------------------------------
1814 aExp -= 0x381; in float64_to_float32()
1823 -------------------------------------------------------------------------------
1824 Returns the result of converting the double-precision floating-point value
1825 `a' to the extended double-precision floating-point format. The conversion
1826 is performed according to the IEC/IEEE Standard for Binary Floating-point
1828 -------------------------------------------------------------------------------
1856 -------------------------------------------------------------------------------
1857 Rounds the double-precision floating-point value `a' to an integer, and
1858 returns the result as a double-precision floating-point value. The
1860 Floating-point Arithmetic.
1861 -------------------------------------------------------------------------------
1869 float64 z; in float64_round_to_int() local
1880 roundData->exception |= float_flag_inexact; in float64_round_to_int()
1882 switch ( roundData->mode ) { in float64_round_to_int()
1897 lastBitMask <<= 0x433 - aExp; in float64_round_to_int()
1898 roundBitsMask = lastBitMask - 1; in float64_round_to_int()
1899 z = a; in float64_round_to_int()
1900 roundingMode = roundData->mode; in float64_round_to_int()
1902 z += lastBitMask>>1; in float64_round_to_int()
1903 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; in float64_round_to_int()
1906 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) { in float64_round_to_int()
1907 z += roundBitsMask; in float64_round_to_int()
1910 z &= ~ roundBitsMask; in float64_round_to_int()
1911 if ( z != a ) roundData->exception |= float_flag_inexact; in float64_round_to_int()
1912 return z; in float64_round_to_int()
1917 -------------------------------------------------------------------------------
1918 Returns the result of adding the absolute values of the double-precision
1919 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
1922 Floating-point Arithmetic.
1923 -------------------------------------------------------------------------------
1935 expDiff = aExp - bExp; in addFloat64Sigs()
1944 --expDiff; in addFloat64Sigs()
1963 shift64RightJamming( aSig, - expDiff, &aSig ); in addFloat64Sigs()
1978 --zExp; in addFloat64Sigs()
1989 -------------------------------------------------------------------------------
1990 Returns the result of subtracting the absolute values of the double-
1991 precision floating-point values `a' and `b'. If `zSign' is true, the
1994 Standard for Binary Floating-point Arithmetic.
1995 -------------------------------------------------------------------------------
2007 expDiff = aExp - bExp; in subFloat64Sigs()
2014 roundData->exception |= float_flag_invalid; in subFloat64Sigs()
2023 return packFloat64( roundData->mode == float_round_down, 0, 0 ); in subFloat64Sigs()
2035 shift64RightJamming( aSig, - expDiff, &aSig ); in subFloat64Sigs()
2038 zSig = bSig - aSig; in subFloat64Sigs()
2048 --expDiff; in subFloat64Sigs()
2056 zSig = aSig - bSig; in subFloat64Sigs()
2059 --zExp; in subFloat64Sigs()
2065 -------------------------------------------------------------------------------
2066 Returns the result of adding the double-precision floating-point values `a'
2068 Binary Floating-point Arithmetic.
2069 -------------------------------------------------------------------------------
2087 -------------------------------------------------------------------------------
2088 Returns the result of subtracting the double-precision floating-point values
2090 for Binary Floating-point Arithmetic.
2091 -------------------------------------------------------------------------------
2109 -------------------------------------------------------------------------------
2110 Returns the result of multiplying the double-precision floating-point values
2112 for Binary Floating-point Arithmetic.
2113 -------------------------------------------------------------------------------
2133 roundData->exception |= float_flag_invalid; in float64_mul()
2141 roundData->exception |= float_flag_invalid; in float64_mul()
2154 zExp = aExp + bExp - 0x3FF; in float64_mul()
2161 --zExp; in float64_mul()
2168 -------------------------------------------------------------------------------
2169 Returns the result of dividing the double-precision floating-point value `a'
2171 the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2172 -------------------------------------------------------------------------------
2193 roundData->exception |= float_flag_invalid; in float64_div()
2205 roundData->exception |= float_flag_invalid; in float64_div()
2208 roundData->exception |= float_flag_divbyzero; in float64_div()
2217 zExp = aExp - bExp + 0x3FD; in float64_div()
2229 --zSig; in float64_div()
2239 -------------------------------------------------------------------------------
2240 Returns the remainder of the double-precision floating-point value `a'
2242 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2243 -------------------------------------------------------------------------------
2263 roundData->exception |= float_flag_invalid; in float64_rem()
2272 roundData->exception |= float_flag_invalid; in float64_rem()
2281 expDiff = aExp - bExp; in float64_rem()
2285 if ( expDiff < -1 ) return a; in float64_rem()
2289 if ( q ) aSig -= bSig; in float64_rem()
2290 expDiff -= 64; in float64_rem()
2293 q = ( 2 < q ) ? q - 2 : 0; in float64_rem()
2294 aSig = - ( ( bSig>>2 ) * q ); in float64_rem()
2295 expDiff -= 62; in float64_rem()
2300 q = ( 2 < q ) ? q - 2 : 0; in float64_rem()
2301 q >>= 64 - expDiff; in float64_rem()
2303 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; in float64_rem()
2312 aSig -= bSig; in float64_rem()
2319 if ( zSign ) aSig = - aSig; in float64_rem()
2325 -------------------------------------------------------------------------------
2326 Returns the square root of the double-precision floating-point value `a'.
2328 Floating-point Arithmetic.
2329 -------------------------------------------------------------------------------
2337 //float64 z; in float64_sqrt()
2345 roundData->exception |= float_flag_invalid; in float64_sqrt()
2350 roundData->exception |= float_flag_invalid; in float64_sqrt()
2357 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; in float64_sqrt()
2361 aSig <<= 9 - ( aExp & 1 ); in float64_sqrt()
2372 --zSig; in float64_sqrt()
2386 -------------------------------------------------------------------------------
2387 Returns 1 if the double-precision floating-point value `a' is equal to the
2389 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2390 -------------------------------------------------------------------------------
2408 -------------------------------------------------------------------------------
2409 Returns 1 if the double-precision floating-point value `a' is less than or
2411 performed according to the IEC/IEEE Standard for Binary Floating-point
2413 -------------------------------------------------------------------------------
2433 -------------------------------------------------------------------------------
2434 Returns 1 if the double-precision floating-point value `a' is less than
2436 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2437 -------------------------------------------------------------------------------
2457 -------------------------------------------------------------------------------
2458 Returns 1 if the double-precision floating-point value `a' is equal to the
2461 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2462 -------------------------------------------------------------------------------
2478 -------------------------------------------------------------------------------
2479 Returns 1 if the double-precision floating-point value `a' is less than or
2482 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2483 -------------------------------------------------------------------------------
2504 -------------------------------------------------------------------------------
2505 Returns 1 if the double-precision floating-point value `a' is less than
2508 Standard for Binary Floating-point Arithmetic.
2509 -------------------------------------------------------------------------------
2531 -------------------------------------------------------------------------------
2532 Returns the result of converting the extended double-precision floating-
2533 point value `a' to the 32-bit two's complement integer format. The
2535 Floating-point Arithmetic---which means in particular that the conversion
2539 -------------------------------------------------------------------------------
2551 shiftCount = 0x4037 - aExp; in floatx80_to_int32()
2559 -------------------------------------------------------------------------------
2560 Returns the result of converting the extended double-precision floating-
2561 point value `a' to the 32-bit two's complement integer format. The
2563 Floating-point Arithmetic, except that the conversion is always rounded
2567 -------------------------------------------------------------------------------
2574 int32 z; in floatx80_to_int32_round_to_zero() local
2579 shiftCount = 0x403E - aExp; in floatx80_to_int32_round_to_zero()
2590 z = aSig; in floatx80_to_int32_round_to_zero()
2591 if ( aSign ) z = - z; in floatx80_to_int32_round_to_zero()
2592 if ( ( z < 0 ) ^ aSign ) { in floatx80_to_int32_round_to_zero()
2600 return z; in floatx80_to_int32_round_to_zero()
2605 -------------------------------------------------------------------------------
2606 Returns the result of converting the extended double-precision floating-
2607 point value `a' to the single-precision floating-point format. The
2609 Floating-point Arithmetic.
2610 -------------------------------------------------------------------------------
2628 if ( aExp || aSig ) aExp -= 0x3F81; in floatx80_to_float32()
2634 -------------------------------------------------------------------------------
2635 Returns the result of converting the extended double-precision floating-
2636 point value `a' to the double-precision floating-point format. The
2638 Floating-point Arithmetic.
2639 -------------------------------------------------------------------------------
2657 if ( aExp || aSig ) aExp -= 0x3C01; in floatx80_to_float64()
2663 -------------------------------------------------------------------------------
2664 Rounds the extended double-precision floating-point value `a' to an integer,
2665 and returns the result as an extended double-precision floating-point
2667 Binary Floating-point Arithmetic.
2668 -------------------------------------------------------------------------------
2676 floatx80 z; in floatx80_round_to_int() local
2690 roundData->exception |= float_flag_inexact; in floatx80_round_to_int()
2692 switch ( roundData->mode ) { in floatx80_round_to_int()
2713 lastBitMask <<= 0x403E - aExp; in floatx80_round_to_int()
2714 roundBitsMask = lastBitMask - 1; in floatx80_round_to_int()
2715 z = a; in floatx80_round_to_int()
2716 roundingMode = roundData->mode; in floatx80_round_to_int()
2718 z.low += lastBitMask>>1; in floatx80_round_to_int()
2719 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; in floatx80_round_to_int()
2722 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) { in floatx80_round_to_int()
2723 z.low += roundBitsMask; in floatx80_round_to_int()
2726 z.low &= ~ roundBitsMask; in floatx80_round_to_int()
2727 if ( z.low == 0 ) { in floatx80_round_to_int()
2728 ++z.high; in floatx80_round_to_int()
2729 z.low = LIT64( 0x8000000000000000 ); in floatx80_round_to_int()
2731 if ( z.low != a.low ) roundData->exception |= float_flag_inexact; in floatx80_round_to_int()
2732 return z; in floatx80_round_to_int()
2737 -------------------------------------------------------------------------------
2738 Returns the result of adding the absolute values of the extended double-
2739 precision floating-point values `a' and `b'. If `zSign' is true, the sum is
2742 Floating-point Arithmetic.
2743 -------------------------------------------------------------------------------
2755 expDiff = aExp - bExp; in addFloatx80Sigs()
2761 if ( bExp == 0 ) --expDiff; in addFloatx80Sigs()
2771 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); in addFloatx80Sigs()
2806 -------------------------------------------------------------------------------
2808 double-precision floating-point values `a' and `b'. If `zSign' is true,
2811 Standard for Binary Floating-point Arithmetic.
2812 -------------------------------------------------------------------------------
2819 floatx80 z; in subFloatx80Sigs() local
2825 expDiff = aExp - bExp; in subFloatx80Sigs()
2832 roundData->exception |= float_flag_invalid; in subFloatx80Sigs()
2833 z.low = floatx80_default_nan_low; in subFloatx80Sigs()
2834 z.high = floatx80_default_nan_high; in subFloatx80Sigs()
2835 z.__padding = 0; in subFloatx80Sigs()
2836 return z; in subFloatx80Sigs()
2845 return packFloatx80( roundData->mode == float_round_down, 0, 0 ); in subFloatx80Sigs()
2852 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); in subFloatx80Sigs()
2863 if ( bExp == 0 ) --expDiff; in subFloatx80Sigs()
2876 -------------------------------------------------------------------------------
2877 Returns the result of adding the extended double-precision floating-point
2879 Standard for Binary Floating-point Arithmetic.
2880 -------------------------------------------------------------------------------
2898 -------------------------------------------------------------------------------
2899 Returns the result of subtracting the extended double-precision floating-
2901 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2902 -------------------------------------------------------------------------------
2920 -------------------------------------------------------------------------------
2921 Returns the result of multiplying the extended double-precision floating-
2923 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2924 -------------------------------------------------------------------------------
2931 floatx80 z; in floatx80_mul() local
2952 roundData->exception |= float_flag_invalid; in floatx80_mul()
2953 z.low = floatx80_default_nan_low; in floatx80_mul()
2954 z.high = floatx80_default_nan_high; in floatx80_mul()
2955 z.__padding = 0; in floatx80_mul()
2956 return z; in floatx80_mul()
2968 zExp = aExp + bExp - 0x3FFE; in floatx80_mul()
2972 --zExp; in floatx80_mul()
2981 -------------------------------------------------------------------------------
2982 Returns the result of dividing the extended double-precision floating-point
2984 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2985 -------------------------------------------------------------------------------
2993 floatx80 z; in floatx80_div() local
3018 roundData->exception |= float_flag_invalid; in floatx80_div()
3019 z.low = floatx80_default_nan_low; in floatx80_div()
3020 z.high = floatx80_default_nan_high; in floatx80_div()
3021 z.__padding = 0; in floatx80_div()
3022 return z; in floatx80_div()
3024 roundData->exception |= float_flag_divbyzero; in floatx80_div()
3033 zExp = aExp - bExp + 0x3FFE; in floatx80_div()
3043 --zSig0; in floatx80_div()
3051 --zSig1; in floatx80_div()
3063 -------------------------------------------------------------------------------
3064 Returns the remainder of the extended double-precision floating-point value
3066 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3067 -------------------------------------------------------------------------------
3075 floatx80 z; in floatx80_rem() local
3097 roundData->exception |= float_flag_invalid; in floatx80_rem()
3098 z.low = floatx80_default_nan_low; in floatx80_rem()
3099 z.high = floatx80_default_nan_high; in floatx80_rem()
3100 z.__padding = 0; in floatx80_rem()
3101 return z; in floatx80_rem()
3111 expDiff = aExp - bExp; in floatx80_rem()
3114 if ( expDiff < -1 ) return a; in floatx80_rem()
3119 if ( q ) aSig0 -= bSig; in floatx80_rem()
3120 expDiff -= 64; in floatx80_rem()
3123 q = ( 2 < q ) ? q - 2 : 0; in floatx80_rem()
3127 expDiff -= 62; in floatx80_rem()
3132 q = ( 2 < q ) ? q - 2 : 0; in floatx80_rem()
3133 q >>= 64 - expDiff; in floatx80_rem()
3134 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); in floatx80_rem()
3136 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); in floatx80_rem()
3163 -------------------------------------------------------------------------------
3164 Returns the square root of the extended double-precision floating-point
3166 for Binary Floating-point Arithmetic.
3167 -------------------------------------------------------------------------------
3176 floatx80 z; in floatx80_sqrt() local
3189 roundData->exception |= float_flag_invalid; in floatx80_sqrt()
3190 z.low = floatx80_default_nan_low; in floatx80_sqrt()
3191 z.high = floatx80_default_nan_high; in floatx80_sqrt()
3192 z.__padding = 0; in floatx80_sqrt()
3193 return z; in floatx80_sqrt()
3199 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; in floatx80_sqrt()
3210 --zSig0; in floatx80_sqrt()
3225 --zSig1; in floatx80_sqrt()
3240 -------------------------------------------------------------------------------
3241 Returns 1 if the extended double-precision floating-point value `a' is
3243 performed according to the IEC/IEEE Standard for Binary Floating-point
3245 -------------------------------------------------------------------------------
3263 && ( ( a.high == b.high ) in floatx80_eq()
3265 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) in floatx80_eq()
3271 -------------------------------------------------------------------------------
3272 Returns 1 if the extended double-precision floating-point value `a' is
3275 Floating-point Arithmetic.
3276 -------------------------------------------------------------------------------
3295 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) in floatx80_le()
3299 aSign ? le128( b.high, b.low, a.high, a.low ) in floatx80_le()
3300 : le128( a.high, a.low, b.high, b.low ); in floatx80_le()
3305 -------------------------------------------------------------------------------
3306 Returns 1 if the extended double-precision floating-point value `a' is
3308 is performed according to the IEC/IEEE Standard for Binary Floating-point
3310 -------------------------------------------------------------------------------
3329 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) in floatx80_lt()
3333 aSign ? lt128( b.high, b.low, a.high, a.low ) in floatx80_lt()
3334 : lt128( a.high, a.low, b.high, b.low ); in floatx80_lt()
3339 -------------------------------------------------------------------------------
3340 Returns 1 if the extended double-precision floating-point value `a' is equal
3343 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3344 -------------------------------------------------------------------------------
3359 && ( ( a.high == b.high ) in floatx80_eq_signaling()
3361 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) in floatx80_eq_signaling()
3367 -------------------------------------------------------------------------------
3368 Returns 1 if the extended double-precision floating-point value `a' is less
3371 to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3372 -------------------------------------------------------------------------------
3391 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) in floatx80_le_quiet()
3395 aSign ? le128( b.high, b.low, a.high, a.low ) in floatx80_le_quiet()
3396 : le128( a.high, a.low, b.high, b.low ); in floatx80_le_quiet()
3401 -------------------------------------------------------------------------------
3402 Returns 1 if the extended double-precision floating-point value `a' is less
3405 IEC/IEEE Standard for Binary Floating-point Arithmetic.
3406 -------------------------------------------------------------------------------
3425 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) in floatx80_lt_quiet()
3429 aSign ? lt128( b.high, b.low, a.high, a.low ) in floatx80_lt_quiet()
3430 : lt128( a.high, a.low, b.high, b.low ); in floatx80_lt_quiet()