Arithmetic & Logic Operations
Header aarith/float/float_operations.hpp
-
namespace aarith
Functions
-
template<size_t E, size_t M, class Function_add, class Function_sub>
auto add_(const floating_point<E, M> lhs, const floating_point<E, M> rhs, Function_add fun_add, Function_sub fun_sub) -> floating_point<E, M> Generic addition of two
floating_point
values. This method computes the sum of two floating-point values using the provided functions
fun_add
and fun_sub
to compute the new mantissa. This generic function makes it easy to implement custom adders, e.g. to develop new hardware implementations.
Note
As an end-user of aarith, you will, most likely, never need to call this function.
- Template Parameters
E – Exponent width
M – Mantissa width
Function_add – Function object type for performing an addition
Function_sub – Function object type for performing a subtraction
- Parameters
lhs – Left-hand side argument of the sum
rhs – Right-hand side argument of the sum
fun_add – Function performing the addition of the mantissae
fun_sub – Function performing the subtraction of the mantissae
- Returns
The sum of lhs + rhs using the provided functions
-
template<size_t E, size_t M, class Function_add, class Function_sub>
auto sub_(const floating_point<E, M> lhs, const floating_point<E, M> rhs, Function_add fun_add, Function_sub fun_sub) -> floating_point<E, M> Generic subtraction of two
floating_point
values. This method computes the difference of two floating-point values using the provided functions
fun_add
and fun_sub
to compute the new mantissa. This generic function makes it easy to implement custom adders, e.g. to develop new hardware implementations.
Note
As an end-user of aarith, you will, most likely, never need to call this function.
- Template Parameters
E – Exponent width
M – Mantissa width
Function_add – Function object type for performing an addition
Function_sub – Function object type for performing a subtraction
- Parameters
lhs – Left-hand side argument of the difference (the minuend)
rhs – Right-hand side argument of the difference (the subtrahend)
fun_add – Function performing the addition of the mantissae
fun_sub – Function performing the subtraction of the mantissae
- Returns
The difference lhs - rhs using the provided functions
-
template<size_t E, size_t M>
auto add(const floating_point<E, M> lhs, const floating_point<E, M> rhs) -> floating_point<E, M> Adds two
floating_point
values.
- Parameters
lhs – The first number that is to be summed up
rhs – The second number that is to be summed up
- Template Parameters
E – Width of exponent
M – Width of mantissa including the leading 1
- Returns
The sum
-
template<size_t E, size_t M>
auto sub(const floating_point<E, M> lhs, const floating_point<E, M> rhs) -> floating_point<E, M> Subtract two
floating_point
values.
- Parameters
lhs – The minuend
rhs – The subtrahend
- Template Parameters
E – Width of exponent
M – Width of mantissa including the leading 1
- Returns
The difference lhs-rhs
-
template<size_t E, size_t M, typename WordType>
auto mul(const floating_point<E, M, WordType> lhs, const floating_point<E, M, WordType> rhs) -> floating_point<E, M, WordType> Multiplies two
floating_point
numbers.
- Parameters
lhs – The multiplicand
rhs – The multiplier
- Template Parameters
E – Width of exponent
M – Width of mantissa including the leading 1
- Returns
The product lhs*rhs
-
template<size_t E, size_t M, typename WordType>
auto div(const floating_point<E, M, WordType> lhs, const floating_point<E, M, WordType> rhs) -> floating_point<E, M, WordType> Division with floating_points: lhs/rhs.
- Parameters
lhs – The dividend
rhs – The divisor
- Template Parameters
E – Width of exponent
M – Width of mantissa including the leading 1
WordType – The word type used to internally store the data
- Returns
The quotient lhs/rhs
-
template<size_t E, size_t M, typename WordType = uint64_t>
constexpr floating_point<E, M, WordType> negate(const floating_point<E, M, WordType> &x) Computes the negative value of the floating-point number.
Quoting the standard: copies a floating-point operand x to a destination in the same format, reversing the sign bit. negate(x) is not the same as subtraction(0, x)
Note
This method ignores NaN values in the sense that they are also copied and their sign bit is flipped.
- Template Parameters
E – Width of exponent
M – Width of mantissa
WordType – The word type used to internally store the data
- Returns
The negated value of the provided number
-
template<size_t E, size_t M, typename WordType = uint64_t>
constexpr floating_point<E, M, WordType> copy(const floating_point<E, M, WordType> &x) Copies the floating-point number.
Quoting the standard: copies a floating-point operand x to a destination in the same format, with no change to the sign bit.
Note
This method ignores NaN values in the sense that they are also copied, without signalling any error.
Note
This is a rather useless method that only exists to be more compliant with the IEEE 754 (2019) standard.
- Template Parameters
E – Width of exponent
M – Width of mantissa
WordType – The word type used to internally store the data
- Returns
The copied value
-
template<size_t E, size_t M, typename WordType = uint64_t>
constexpr floating_point<E, M, WordType> copySign(const floating_point<E, M, WordType> &x, const floating_point<E, M, WordType> &y) Copies a floating-point number using the sign of another number.
Quoting the standard: copies a floating-point operand x to a destination in the same format as x, but with the sign bit of y.
Note
This method ignores NaN values in the sense that they are also copied, without signalling any error.
- Template Parameters
E – Width of exponent
M – Width of mantissa
WordType – The word type used to internally store the data
- Returns
The copied value
-
template<size_t Start, size_t End, size_t E, size_t M, typename WordType>
constexpr word_array<(Start - End) + 1, WordType> bit_range(const floating_point<E, M, WordType> &f) Extracts a bitstring range from the bit representation of the float.
Note that the indexing is
- zero-based, starting from the LSB
- inclusive (i.e. the start and end points are part of the range)
- Template Parameters
Start – Starting index (inclusive, from left to right)
End – Ending index (inclusive, from left to right)
E – Width of the exponent
M – Width of the mantissa
- Parameters
f – Float from which the range is taken
- Returns
Range float[End,Start], inclusive
-
namespace float_operators
This additional nesting of a namespace makes it possible to include aarith without having the usual operator names imported as well.
The use case for this is to allow explicitly replacing the conventional arithmetic operations with specialized ones. This can, e.g., be used when evaluating approximate operations in the context of neural networks. The name lookup rules of C++ make it necessary that the operators are not visible earlier.
Functions
-
template<size_t E, size_t M, typename WordType>
auto operator+(const floating_point<E, M, WordType> &lhs, const floating_point<E, M, WordType> &rhs) -> floating_point<E, M, WordType>
-
template<size_t E, size_t M, typename WordType>
auto operator-(const floating_point<E, M, WordType> &lhs, const floating_point<E, M, WordType> &rhs) -> floating_point<E, M, WordType>
-
template<size_t E, size_t M, typename WordType>
auto operator*(const floating_point<E, M, WordType> &lhs, const floating_point<E, M, WordType> &rhs) -> floating_point<E, M, WordType>
-
template<size_t E, size_t M, typename WordType>
auto operator/(const floating_point<E, M, WordType> &lhs, const floating_point<E, M, WordType> &rhs) -> floating_point<E, M, WordType>
-
template<size_t E, size_t M, typename WordType>
auto operator-(const floating_point<E, M, WordType> &x) -> floating_point<E, M, WordType>
-
template<size_t E, size_t M, typename WordType>
-
template<size_t E, size_t M, class Function_add, class Function_sub>