/* This file was generated by etc/gen-rounded-common.sh. */

//
// double
//

static inline double rounded_add_impl_double(native_rounding_mode mode, VOLATILE double a, VOLATILE double b)
{
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, mode);
  MAY_MODIFY(a);
  MAY_MODIFY(b);
  VOLATILE double c = a + b;
  FORCE_EVAL(c);
  restore_fp_reg(oldreg);
  return c;
}
extern double rounded_hw_add_double(HsInt mode, double a, double b) { return rounded_add_impl_double(hs_rounding_mode_to_native(mode), a, b); }
extern double rounded_hw_add_double_up(double a, double b) { return rounded_add_impl_double(ROUND_UPWARD, a, b); }
extern double rounded_hw_add_double_down(double a, double b) { return rounded_add_impl_double(ROUND_DOWNWARD, a, b); }
extern double rounded_hw_add_double_zero(double a, double b) { return rounded_add_impl_double(ROUND_TOWARDZERO, a, b); }

static inline double rounded_sub_impl_double(native_rounding_mode mode, VOLATILE double a, VOLATILE double b)
{
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, mode);
  MAY_MODIFY(a);
  MAY_MODIFY(b);
  VOLATILE double c = a - b;
  FORCE_EVAL(c);
  restore_fp_reg(oldreg);
  return c;
}
extern double rounded_hw_sub_double(HsInt mode, double a, double b) { return rounded_sub_impl_double(hs_rounding_mode_to_native(mode), a, b); }
extern double rounded_hw_sub_double_up(double a, double b) { return rounded_sub_impl_double(ROUND_UPWARD, a, b); }
extern double rounded_hw_sub_double_down(double a, double b) { return rounded_sub_impl_double(ROUND_DOWNWARD, a, b); }
extern double rounded_hw_sub_double_zero(double a, double b) { return rounded_sub_impl_double(ROUND_TOWARDZERO, a, b); }

static inline double rounded_mul_impl_double(native_rounding_mode mode, VOLATILE double a, VOLATILE double b)
{
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, mode);
  MAY_MODIFY(a);
  MAY_MODIFY(b);
  VOLATILE double c = a * b;
  FORCE_EVAL(c);
  restore_fp_reg(oldreg);
  return c;
}
extern double rounded_hw_mul_double(HsInt mode, double a, double b) { return rounded_mul_impl_double(hs_rounding_mode_to_native(mode), a, b); }
extern double rounded_hw_mul_double_up(double a, double b) { return rounded_mul_impl_double(ROUND_UPWARD, a, b); }
extern double rounded_hw_mul_double_down(double a, double b) { return rounded_mul_impl_double(ROUND_DOWNWARD, a, b); }
extern double rounded_hw_mul_double_zero(double a, double b) { return rounded_mul_impl_double(ROUND_TOWARDZERO, a, b); }

static inline double rounded_div_impl_double(native_rounding_mode mode, VOLATILE double a, VOLATILE double b)
{
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, mode);
  MAY_MODIFY(a);
  MAY_MODIFY(b);
  VOLATILE double c = a / b;
  FORCE_EVAL(c);
  restore_fp_reg(oldreg);
  return c;
}
extern double rounded_hw_div_double(HsInt mode, double a, double b) { return rounded_div_impl_double(hs_rounding_mode_to_native(mode), a, b); }
extern double rounded_hw_div_double_up(double a, double b) { return rounded_div_impl_double(ROUND_UPWARD, a, b); }
extern double rounded_hw_div_double_down(double a, double b) { return rounded_div_impl_double(ROUND_DOWNWARD, a, b); }
extern double rounded_hw_div_double_zero(double a, double b) { return rounded_div_impl_double(ROUND_TOWARDZERO, a, b); }
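/*
 * Note: the types and macros used above -- fp_reg, get_fp_reg, set_rounding,
 * restore_fp_reg, VOLATILE, MAY_MODIFY, FORCE_EVAL, native_rounding_mode,
 * ROUND_UPWARD and friends -- are assumed to be supplied by the translation
 * unit that includes this generated fragment; they are not defined here.
 * Every operation follows the same bracket pattern: save the FP control
 * state, switch the rounding mode, push the operands and the result through
 * the volatile barriers (presumably so the compiler cannot hoist or
 * constant-fold the arithmetic outside the bracket), evaluate, then restore
 * the previous state.
 *
 * Illustrative sketch, not part of the generated API (assumes the wrappers
 * behave as their names suggest): a downward/upward pair of calls brackets
 * the exact result.
 */
static inline void example_add_enclosure_double(double a, double b, double *lo, double *hi)
{
  *lo = rounded_hw_add_double_down(a, b); /* rounded toward -infinity */
  *hi = rounded_hw_add_double_up(a, b);   /* rounded toward +infinity */
  /* Now *lo <= a + b <= *hi holds for the exact (real-number) sum. */
}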
static inline double rounded_sqrt_impl_double(native_rounding_mode mode, VOLATILE double a)
{
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, mode);
#if defined(__GNUC__) && defined(USE_SSE2)
  // mingw-w64's sqrt may use x87 instead of SSE2
  double c;
  __asm__ volatile("sqrtsd %1, %0" : "=x"(c) : "x"(a));
#else
  MAY_MODIFY(a);
  VOLATILE double c = sqrt(a);
  FORCE_EVAL(c);
#endif
  restore_fp_reg(oldreg);
  return c;
}
extern double rounded_hw_sqrt_double(HsInt mode, double a) { return rounded_sqrt_impl_double(hs_rounding_mode_to_native(mode), a); }
extern double rounded_hw_sqrt_double_up(double a) { return rounded_sqrt_impl_double(ROUND_UPWARD, a); }
extern double rounded_hw_sqrt_double_down(double a) { return rounded_sqrt_impl_double(ROUND_DOWNWARD, a); }
extern double rounded_hw_sqrt_double_zero(double a) { return rounded_sqrt_impl_double(ROUND_TOWARDZERO, a); }

static inline double rounded_fma_impl_double(native_rounding_mode mode, VOLATILE double a, VOLATILE double b, VOLATILE double c)
{
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, mode);
  MAY_MODIFY(a);
  MAY_MODIFY(b);
  MAY_MODIFY(c);
  VOLATILE double result = fma(a, b, c);
  FORCE_EVAL(result);
  restore_fp_reg(oldreg);
  return result;
}
extern double rounded_hw_fma_double(HsInt mode, double a, double b, double c) { return rounded_fma_impl_double(hs_rounding_mode_to_native(mode), a, b, c); }
extern double rounded_hw_fma_double_up(double a, double b, double c) { return rounded_fma_impl_double(ROUND_UPWARD, a, b, c); }
extern double rounded_hw_fma_double_down(double a, double b, double c) { return rounded_fma_impl_double(ROUND_DOWNWARD, a, b, c); }
extern double rounded_hw_fma_double_zero(double a, double b, double c) { return rounded_fma_impl_double(ROUND_TOWARDZERO, a, b, c); }

static inline double rounded_fma_if_fast_impl_double(native_rounding_mode mode, VOLATILE double a, VOLATILE double b, VOLATILE double c)
{
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, mode);
  MAY_MODIFY(a);
  MAY_MODIFY(b);
  MAY_MODIFY(c);
#ifdef FP_FAST_FMA
  VOLATILE double result = fma(a, b, c);
#else
  VOLATILE double result = a * b + c;
#endif
  FORCE_EVAL(result);
  restore_fp_reg(oldreg);
  return result;
}
extern double rounded_hw_fma_if_fast_double(HsInt mode, double a, double b, double c) { return rounded_fma_if_fast_impl_double(hs_rounding_mode_to_native(mode), a, b, c); }
extern double rounded_hw_fma_if_fast_double_up(double a, double b, double c) { return rounded_fma_if_fast_impl_double(ROUND_UPWARD, a, b, c); }
extern double rounded_hw_fma_if_fast_double_down(double a, double b, double c) { return rounded_fma_if_fast_impl_double(ROUND_DOWNWARD, a, b, c); }
extern double rounded_hw_fma_if_fast_double_zero(double a, double b, double c) { return rounded_fma_if_fast_impl_double(ROUND_TOWARDZERO, a, b, c); }
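/*
 * Note: rounded_hw_fma_if_fast_* use the fused operation only when <math.h>
 * defines FP_FAST_FMA, i.e. when fma() is expected to be as fast as a
 * separate multiply and add; otherwise they fall back to a * b + c, which
 * rounds twice.  Under a directed rounding mode the fallback still brackets
 * the exact a*b + c from the requested side (each individual rounding errs
 * in that direction), it is merely less tight than a true fused multiply-add.
 */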
//
// Conversion
//

static inline double rounded_int64_to_double_impl(native_rounding_mode mode, int64_t x)
{
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, mode);
  VOLATILE double result = (double)x;
  FORCE_EVAL(result);
  restore_fp_reg(oldreg);
  return result;
}
extern double rounded_hw_int64_to_double(HsInt mode, int64_t x) { return rounded_int64_to_double_impl(hs_rounding_mode_to_native(mode), x); }
extern double rounded_hw_int64_to_double_up(int64_t x) { return rounded_int64_to_double_impl(ROUND_UPWARD, x); }
extern double rounded_hw_int64_to_double_down(int64_t x) { return rounded_int64_to_double_impl(ROUND_DOWNWARD, x); }
extern double rounded_hw_int64_to_double_zero(int64_t x) { return rounded_int64_to_double_impl(ROUND_TOWARDZERO, x); }

static inline double rounded_word64_to_double_impl(native_rounding_mode mode, uint64_t x)
{
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, mode);
  VOLATILE double result = (double)x;
  FORCE_EVAL(result);
  restore_fp_reg(oldreg);
  return result;
}
extern double rounded_hw_word64_to_double(HsInt mode, uint64_t x) { return rounded_word64_to_double_impl(hs_rounding_mode_to_native(mode), x); }
extern double rounded_hw_word64_to_double_up(uint64_t x) { return rounded_word64_to_double_impl(ROUND_UPWARD, x); }
extern double rounded_hw_word64_to_double_down(uint64_t x) { return rounded_word64_to_double_impl(ROUND_DOWNWARD, x); }
extern double rounded_hw_word64_to_double_zero(uint64_t x) { return rounded_word64_to_double_impl(ROUND_TOWARDZERO, x); }

//
// Interval arithmetic
//

static inline double fast_fmax_double(double x, double y)
{
  // should compile to MAX[SP][SD] instruction on x86
  return x > y ? x : y;
}
static inline double fast_fmax4_double(double x, double y, double z, double w)
{
  return fast_fmax_double(fast_fmax_double(x, y), fast_fmax_double(z, w));
}
static inline double fast_fmin_double(double x, double y)
{
  // should compile to MIN[SP][SD] instruction on x86
  return x < y ? x : y;
}
static inline double fast_fmin4_double(double x, double y, double z, double w)
{
  return fast_fmin_double(fast_fmin_double(x, y), fast_fmin_double(z, w));
}

extern double rounded_hw_interval_mul_double_up(VOLATILE double lo1, VOLATILE double hi1, VOLATILE double lo2, VOLATILE double hi2)
{
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, ROUND_UPWARD);
  MAY_MODIFY(lo1);
  MAY_MODIFY(hi1);
  MAY_MODIFY(lo2);
  MAY_MODIFY(hi2);
  VOLATILE double x = lo1 * lo2;
  VOLATILE double y = lo1 * hi2;
  VOLATILE double z = hi1 * lo2;
  VOLATILE double w = hi1 * hi2;
  if (isnan(x)) x = 0.0; /* 0 * inf -> 0 */
  if (isnan(y)) y = 0.0; /* 0 * inf -> 0 */
  if (isnan(z)) z = 0.0; /* 0 * inf -> 0 */
  if (isnan(w)) w = 0.0; /* 0 * inf -> 0 */
  VOLATILE double hi = fast_fmax4_double(x, y, z, w);
  FORCE_EVAL(hi);
  restore_fp_reg(oldreg);
  return hi;
}
extern double rounded_hw_interval_mul_double_down(VOLATILE double lo1, VOLATILE double hi1, VOLATILE double lo2, VOLATILE double hi2)
{
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, ROUND_DOWNWARD);
  MAY_MODIFY(lo1);
  MAY_MODIFY(hi1);
  MAY_MODIFY(lo2);
  MAY_MODIFY(hi2);
  VOLATILE double x = lo1 * lo2;
  VOLATILE double y = lo1 * hi2;
  VOLATILE double z = hi1 * lo2;
  VOLATILE double w = hi1 * hi2;
  if (isnan(x)) x = 0.0; /* 0 * inf -> 0 */
  if (isnan(y)) y = 0.0; /* 0 * inf -> 0 */
  if (isnan(z)) z = 0.0; /* 0 * inf -> 0 */
  if (isnan(w)) w = 0.0; /* 0 * inf -> 0 */
  VOLATILE double lo = fast_fmin4_double(x, y, z, w);
  FORCE_EVAL(lo);
  restore_fp_reg(oldreg);
  return lo;
}
extern double rounded_hw_interval_mul_add_double_up(VOLATILE double lo1, VOLATILE double hi1, VOLATILE double lo2, VOLATILE double hi2, VOLATILE double hi3)
{
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, ROUND_UPWARD);
  MAY_MODIFY(lo1);
  MAY_MODIFY(hi1);
  MAY_MODIFY(lo2);
  MAY_MODIFY(hi2);
  MAY_MODIFY(hi3);
  double x = lo1 * lo2;
  double y = lo1 * hi2;
  double z = hi1 * lo2;
  double w = hi1 * hi2;
  if (isnan(x)) x = 0.0; /* 0 * inf -> 0 */
  if (isnan(y)) y = 0.0; /* 0 * inf -> 0 */
  if (isnan(z)) z = 0.0; /* 0 * inf -> 0 */
  if (isnan(w)) w = 0.0; /* 0 * inf -> 0 */
  VOLATILE double hi = fast_fmax4_double(x, y, z, w) + hi3;
  FORCE_EVAL(hi);
  restore_fp_reg(oldreg);
  return hi;
}
extern double rounded_hw_interval_mul_add_double_down(VOLATILE double lo1, VOLATILE double hi1, VOLATILE double lo2, VOLATILE double hi2, VOLATILE double lo3)
{
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, ROUND_DOWNWARD);
  MAY_MODIFY(lo1);
  MAY_MODIFY(hi1);
  MAY_MODIFY(lo2);
  MAY_MODIFY(hi2);
  MAY_MODIFY(lo3);
  double x = lo1 * lo2;
  double y = lo1 * hi2;
  double z = hi1 * lo2;
  double w = hi1 * hi2;
  if (isnan(x)) x = 0.0; /* 0 * inf -> 0 */
  if (isnan(y)) y = 0.0; /* 0 * inf -> 0 */
  if (isnan(z)) z = 0.0; /* 0 * inf -> 0 */
  if (isnan(w)) w = 0.0; /* 0 * inf -> 0 */
  VOLATILE double lo = fast_fmin4_double(x, y, z, w) + lo3;
  FORCE_EVAL(lo);
  restore_fp_reg(oldreg);
  return lo;
}
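/*
 * Note: the bound of a product of intervals [lo1,hi1] * [lo2,hi2] is the max
 * (upper) or min (lower) of the four corner products, evaluated under the
 * corresponding directed rounding.  A corner product is NaN only for
 * 0 * (+/-)inf; the exact contribution of such a corner is 0, hence the
 * NaN -> 0 substitution.  Worked example: [0,1] * [2,+inf] has corners
 * {0*2, 0*inf, 1*2, 1*inf} = {0, NaN->0, 2, +inf}, so the lower bound is 0
 * and the upper bound is +inf, giving [0,+inf] as expected.  The *_mul_add_*
 * variants then add a third bound (lo3/hi3) under the same rounding mode,
 * i.e. one step of an interval multiply-accumulate.
 */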
extern double rounded_hw_interval_div_double_up(VOLATILE double lo1, VOLATILE double hi1, VOLATILE double lo2, VOLATILE double hi2)
{
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, ROUND_UPWARD);
  MAY_MODIFY(lo1);
  MAY_MODIFY(hi1);
  MAY_MODIFY(lo2);
  MAY_MODIFY(hi2);
  double x = lo1 / lo2;
  double y = lo1 / hi2;
  double z = hi1 / lo2;
  double w = hi1 / hi2;
  if (isnan(x)) x = 0.0; /* 0 / 0, +-inf / +-inf -> 0 */
  if (isnan(y)) y = 0.0; /* 0 / 0, +-inf / +-inf -> 0 */
  if (isnan(z)) z = 0.0; /* 0 / 0, +-inf / +-inf -> 0 */
  if (isnan(w)) w = 0.0; /* 0 / 0, +-inf / +-inf -> 0 */
  VOLATILE double hi = fast_fmax4_double(x, y, z, w);
  FORCE_EVAL(hi);
  restore_fp_reg(oldreg);
  return hi;
}
extern double rounded_hw_interval_div_double_down(VOLATILE double lo1, VOLATILE double hi1, VOLATILE double lo2, VOLATILE double hi2)
{
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, ROUND_DOWNWARD);
  MAY_MODIFY(lo1);
  MAY_MODIFY(hi1);
  MAY_MODIFY(lo2);
  MAY_MODIFY(hi2);
  double x = lo1 / lo2;
  double y = lo1 / hi2;
  double z = hi1 / lo2;
  double w = hi1 / hi2;
  if (isnan(x)) x = 0.0; /* 0 / 0, +-inf / +-inf -> 0 */
  if (isnan(y)) y = 0.0; /* 0 / 0, +-inf / +-inf -> 0 */
  if (isnan(z)) z = 0.0; /* 0 / 0, +-inf / +-inf -> 0 */
  if (isnan(w)) w = 0.0; /* 0 / 0, +-inf / +-inf -> 0 */
  VOLATILE double lo = fast_fmin4_double(x, y, z, w);
  FORCE_EVAL(lo);
  restore_fp_reg(oldreg);
  return lo;
}
extern double rounded_hw_interval_div_add_double_up(VOLATILE double lo1, VOLATILE double hi1, VOLATILE double lo2, VOLATILE double hi2, VOLATILE double hi3)
{
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, ROUND_UPWARD);
  MAY_MODIFY(lo1);
  MAY_MODIFY(hi1);
  MAY_MODIFY(lo2);
  MAY_MODIFY(hi2);
  MAY_MODIFY(hi3);
  double x = lo1 / lo2;
  double y = lo1 / hi2;
  double z = hi1 / lo2;
  double w = hi1 / hi2;
  if (isnan(x)) x = 0.0; /* 0 / 0, +-inf / +-inf -> 0 */
  if (isnan(y)) y = 0.0; /* 0 / 0, +-inf / +-inf -> 0 */
  if (isnan(z)) z = 0.0; /* 0 / 0, +-inf / +-inf -> 0 */
  if (isnan(w)) w = 0.0; /* 0 / 0, +-inf / +-inf -> 0 */
  VOLATILE double hi = fast_fmax4_double(x, y, z, w) + hi3;
  FORCE_EVAL(hi);
  restore_fp_reg(oldreg);
  return hi;
}
extern double rounded_hw_interval_div_add_double_down(VOLATILE double lo1, VOLATILE double hi1, VOLATILE double lo2, VOLATILE double hi2, VOLATILE double lo3)
{
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, ROUND_DOWNWARD);
  MAY_MODIFY(lo1);
  MAY_MODIFY(hi1);
  MAY_MODIFY(lo2);
  MAY_MODIFY(hi2);
  MAY_MODIFY(lo3);
  double x = lo1 / lo2;
  double y = lo1 / hi2;
  double z = hi1 / lo2;
  double w = hi1 / hi2;
  if (isnan(x)) x = 0.0; /* 0 / 0, +-inf / +-inf -> 0 */
  if (isnan(y)) y = 0.0; /* 0 / 0, +-inf / +-inf -> 0 */
  if (isnan(z)) z = 0.0; /* 0 / 0, +-inf / +-inf -> 0 */
  if (isnan(w)) w = 0.0; /* 0 / 0, +-inf / +-inf -> 0 */
  VOLATILE double lo = fast_fmin4_double(x, y, z, w) + lo3;
  FORCE_EVAL(lo);
  restore_fp_reg(oldreg);
  return lo;
}

//
// Vector Operations
//

extern double rounded_hw_vector_sum_double(HsInt mode, HsInt length, HsInt offset, const double *a)
{
  native_rounding_mode nmode = hs_rounding_mode_to_native(mode);
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, nmode);
  VOLATILE double s = 0.0;
  MAY_MODIFY(s);
  for (HsInt i = 0; i < length; ++i) {
    s += a[offset + i];
  }
  FORCE_EVAL(s);
  restore_fp_reg(oldreg);
  return s;
}
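/*
 * Note: rounded_hw_vector_sum_double runs the entire accumulation loop under
 * the rounding mode selected by `mode`, and `offset`/`length` are element
 * counts into `a`.  Assuming the Haskell side passes the encodings that
 * hs_rounding_mode_to_native expects, summing the same slice once with the
 * downward mode and once with the upward mode brackets the exact sum, e.g.
 *
 *   double lo = rounded_hw_vector_sum_double(mode_down, n, 0, xs);
 *   double hi = rounded_hw_vector_sum_double(mode_up,   n, 0, xs);
 *
 * where mode_down/mode_up stand for whatever HsInt values the caller uses
 * for downward/upward rounding (hypothetical names, not defined here).
 */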
extern void rounded_hw_vector_add_double(HsInt mode, HsInt length, HsInt offsetR, double * restrict result, HsInt offsetA, const double * restrict a, HsInt offsetB, const double * restrict b)
{
  native_rounding_mode nmode = hs_rounding_mode_to_native(mode);
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, nmode);
  for (HsInt i = 0; i < length; ++i) {
    result[offsetR + i] = a[offsetA + i] + b[offsetB + i];
  }
  restore_fp_reg(oldreg);
}
extern void rounded_hw_vector_sub_double(HsInt mode, HsInt length, HsInt offsetR, double * restrict result, HsInt offsetA, const double * restrict a, HsInt offsetB, const double * restrict b)
{
  native_rounding_mode nmode = hs_rounding_mode_to_native(mode);
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, nmode);
  for (HsInt i = 0; i < length; ++i) {
    result[offsetR + i] = a[offsetA + i] - b[offsetB + i];
  }
  restore_fp_reg(oldreg);
}
extern void rounded_hw_vector_mul_double(HsInt mode, HsInt length, HsInt offsetR, double * restrict result, HsInt offsetA, const double * restrict a, HsInt offsetB, const double * restrict b)
{
  native_rounding_mode nmode = hs_rounding_mode_to_native(mode);
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, nmode);
  for (HsInt i = 0; i < length; ++i) {
    result[offsetR + i] = a[offsetA + i] * b[offsetB + i];
  }
  restore_fp_reg(oldreg);
}
extern void rounded_hw_vector_fma_double(HsInt mode, HsInt length, HsInt offsetR, double * restrict result, HsInt offsetA, const double * restrict a, HsInt offsetB, const double * restrict b, HsInt offsetC, const double * restrict c)
{
  native_rounding_mode nmode = hs_rounding_mode_to_native(mode);
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, nmode);
  for (HsInt i = 0; i < length; ++i) {
    result[offsetR + i] = fma(a[offsetA + i], b[offsetB + i], c[offsetC + i]);
  }
  restore_fp_reg(oldreg);
}
extern void rounded_hw_vector_div_double(HsInt mode, HsInt length, HsInt offsetR, double * restrict result, HsInt offsetA, const double * restrict a, HsInt offsetB, const double * restrict b)
{
  native_rounding_mode nmode = hs_rounding_mode_to_native(mode);
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, nmode);
  for (HsInt i = 0; i < length; ++i) {
    result[offsetR + i] = a[offsetA + i] / b[offsetB + i];
  }
  restore_fp_reg(oldreg);
}
extern void rounded_hw_vector_sqrt_double(HsInt mode, HsInt length, HsInt offsetR, double * restrict result, HsInt offsetA, const double * restrict a)
{
  native_rounding_mode nmode = hs_rounding_mode_to_native(mode);
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, nmode);
#if defined(__GNUC__) && defined(USE_SSE2)
  // mingw-w64's sqrt may use x87 instead of SSE2
  HsInt i = 0;
  for (; i + 2 <= length; i += 2) {
    __m128d aa = _mm_loadu_pd(&a[offsetA + i]);
    __m128d bb = _mm_sqrt_pd(aa);
    _mm_storeu_pd(&result[offsetR + i], bb);
  }
  for (; i < length; ++i) {
    double aa = a[offsetA + i];
    double bb;
    __asm__ volatile("sqrtsd %1, %0" : "=x"(bb) : "x"(aa));
    result[offsetR + i] = bb;
  }
#else
  for (HsInt i = 0; i < length; ++i) {
    result[offsetR + i] = sqrt(a[offsetA + i]);
  }
#endif
  restore_fp_reg(oldreg);
}
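/*
 * Note: the inline asm / SSE2 intrinsics above exist because on mingw-w64
 * the libm sqrt may be evaluated on the x87 unit, whose rounding and
 * precision are governed by the x87 control word (and 80-bit intermediates)
 * rather than by the SSE control register that set_rounding presumably
 * manipulates when USE_SSE2 is defined.  SQRTPD/SQRTSD round according to
 * the rounding field of MXCSR, so the two-wide _mm_sqrt_pd loop plus the
 * scalar sqrtsd tail deliver the requested directed rounding; the generic
 * fallback simply calls sqrt() under the ambient mode.
 */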
//
// float
//

static inline float rounded_add_impl_float(native_rounding_mode mode, VOLATILE float a, VOLATILE float b)
{
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, mode);
  MAY_MODIFY(a);
  MAY_MODIFY(b);
  VOLATILE float c = a + b;
  FORCE_EVAL(c);
  restore_fp_reg(oldreg);
  return c;
}
extern float rounded_hw_add_float(HsInt mode, float a, float b) { return rounded_add_impl_float(hs_rounding_mode_to_native(mode), a, b); }
extern float rounded_hw_add_float_up(float a, float b) { return rounded_add_impl_float(ROUND_UPWARD, a, b); }
extern float rounded_hw_add_float_down(float a, float b) { return rounded_add_impl_float(ROUND_DOWNWARD, a, b); }
extern float rounded_hw_add_float_zero(float a, float b) { return rounded_add_impl_float(ROUND_TOWARDZERO, a, b); }

static inline float rounded_sub_impl_float(native_rounding_mode mode, VOLATILE float a, VOLATILE float b)
{
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, mode);
  MAY_MODIFY(a);
  MAY_MODIFY(b);
  VOLATILE float c = a - b;
  FORCE_EVAL(c);
  restore_fp_reg(oldreg);
  return c;
}
extern float rounded_hw_sub_float(HsInt mode, float a, float b) { return rounded_sub_impl_float(hs_rounding_mode_to_native(mode), a, b); }
extern float rounded_hw_sub_float_up(float a, float b) { return rounded_sub_impl_float(ROUND_UPWARD, a, b); }
extern float rounded_hw_sub_float_down(float a, float b) { return rounded_sub_impl_float(ROUND_DOWNWARD, a, b); }
extern float rounded_hw_sub_float_zero(float a, float b) { return rounded_sub_impl_float(ROUND_TOWARDZERO, a, b); }

static inline float rounded_mul_impl_float(native_rounding_mode mode, VOLATILE float a, VOLATILE float b)
{
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, mode);
  MAY_MODIFY(a);
  MAY_MODIFY(b);
  VOLATILE float c = a * b;
  FORCE_EVAL(c);
  restore_fp_reg(oldreg);
  return c;
}
extern float rounded_hw_mul_float(HsInt mode, float a, float b) { return rounded_mul_impl_float(hs_rounding_mode_to_native(mode), a, b); }
extern float rounded_hw_mul_float_up(float a, float b) { return rounded_mul_impl_float(ROUND_UPWARD, a, b); }
extern float rounded_hw_mul_float_down(float a, float b) { return rounded_mul_impl_float(ROUND_DOWNWARD, a, b); }
extern float rounded_hw_mul_float_zero(float a, float b) { return rounded_mul_impl_float(ROUND_TOWARDZERO, a, b); }

static inline float rounded_div_impl_float(native_rounding_mode mode, VOLATILE float a, VOLATILE float b)
{
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, mode);
  MAY_MODIFY(a);
  MAY_MODIFY(b);
  VOLATILE float c = a / b;
  FORCE_EVAL(c);
  restore_fp_reg(oldreg);
  return c;
}
extern float rounded_hw_div_float(HsInt mode, float a, float b) { return rounded_div_impl_float(hs_rounding_mode_to_native(mode), a, b); }
extern float rounded_hw_div_float_up(float a, float b) { return rounded_div_impl_float(ROUND_UPWARD, a, b); }
extern float rounded_hw_div_float_down(float a, float b) { return rounded_div_impl_float(ROUND_DOWNWARD, a, b); }
extern float rounded_hw_div_float_zero(float a, float b) { return rounded_div_impl_float(ROUND_TOWARDZERO, a, b); }

static inline float rounded_sqrt_impl_float(native_rounding_mode mode, VOLATILE float a)
{
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, mode);
#if defined(__GNUC__) && defined(USE_SSE2)
  // mingw-w64's sqrtf may use x87 instead of SSE2
  float c;
  __asm__ volatile("sqrtss %1, %0" : "=x"(c) : "x"(a));
#else
  MAY_MODIFY(a);
  VOLATILE float c = sqrtf(a);
  FORCE_EVAL(c);
#endif
  restore_fp_reg(oldreg);
  return c;
}
extern float rounded_hw_sqrt_float(HsInt mode, float a) { return rounded_sqrt_impl_float(hs_rounding_mode_to_native(mode), a); }
extern float rounded_hw_sqrt_float_up(float a) { return rounded_sqrt_impl_float(ROUND_UPWARD, a); }
extern float rounded_hw_sqrt_float_down(float a) { return rounded_sqrt_impl_float(ROUND_DOWNWARD, a); }
extern float rounded_hw_sqrt_float_zero(float a) { return rounded_sqrt_impl_float(ROUND_TOWARDZERO, a); }
static inline float rounded_fma_impl_float(native_rounding_mode mode, VOLATILE float a, VOLATILE float b, VOLATILE float c)
{
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, mode);
  MAY_MODIFY(a);
  MAY_MODIFY(b);
  MAY_MODIFY(c);
  VOLATILE float result = fmaf(a, b, c);
  FORCE_EVAL(result);
  restore_fp_reg(oldreg);
  return result;
}
extern float rounded_hw_fma_float(HsInt mode, float a, float b, float c) { return rounded_fma_impl_float(hs_rounding_mode_to_native(mode), a, b, c); }
extern float rounded_hw_fma_float_up(float a, float b, float c) { return rounded_fma_impl_float(ROUND_UPWARD, a, b, c); }
extern float rounded_hw_fma_float_down(float a, float b, float c) { return rounded_fma_impl_float(ROUND_DOWNWARD, a, b, c); }
extern float rounded_hw_fma_float_zero(float a, float b, float c) { return rounded_fma_impl_float(ROUND_TOWARDZERO, a, b, c); }

static inline float rounded_fma_if_fast_impl_float(native_rounding_mode mode, VOLATILE float a, VOLATILE float b, VOLATILE float c)
{
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, mode);
  MAY_MODIFY(a);
  MAY_MODIFY(b);
  MAY_MODIFY(c);
#ifdef FP_FAST_FMAF
  VOLATILE float result = fmaf(a, b, c);
#else
  VOLATILE float result = a * b + c;
#endif
  FORCE_EVAL(result);
  restore_fp_reg(oldreg);
  return result;
}
extern float rounded_hw_fma_if_fast_float(HsInt mode, float a, float b, float c) { return rounded_fma_if_fast_impl_float(hs_rounding_mode_to_native(mode), a, b, c); }
extern float rounded_hw_fma_if_fast_float_up(float a, float b, float c) { return rounded_fma_if_fast_impl_float(ROUND_UPWARD, a, b, c); }
extern float rounded_hw_fma_if_fast_float_down(float a, float b, float c) { return rounded_fma_if_fast_impl_float(ROUND_DOWNWARD, a, b, c); }
extern float rounded_hw_fma_if_fast_float_zero(float a, float b, float c) { return rounded_fma_if_fast_impl_float(ROUND_TOWARDZERO, a, b, c); }

//
// Conversion
//

static inline float rounded_int64_to_float_impl(native_rounding_mode mode, int64_t x)
{
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, mode);
  VOLATILE float result = (float)x;
  FORCE_EVAL(result);
  restore_fp_reg(oldreg);
  return result;
}
extern float rounded_hw_int64_to_float(HsInt mode, int64_t x) { return rounded_int64_to_float_impl(hs_rounding_mode_to_native(mode), x); }
extern float rounded_hw_int64_to_float_up(int64_t x) { return rounded_int64_to_float_impl(ROUND_UPWARD, x); }
extern float rounded_hw_int64_to_float_down(int64_t x) { return rounded_int64_to_float_impl(ROUND_DOWNWARD, x); }
extern float rounded_hw_int64_to_float_zero(int64_t x) { return rounded_int64_to_float_impl(ROUND_TOWARDZERO, x); }

static inline float rounded_word64_to_float_impl(native_rounding_mode mode, uint64_t x)
{
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, mode);
  VOLATILE float result = (float)x;
  FORCE_EVAL(result);
  restore_fp_reg(oldreg);
  return result;
}
extern float rounded_hw_word64_to_float(HsInt mode, uint64_t x) { return rounded_word64_to_float_impl(hs_rounding_mode_to_native(mode), x); }
extern float rounded_hw_word64_to_float_up(uint64_t x) { return rounded_word64_to_float_impl(ROUND_UPWARD, x); }
extern float rounded_hw_word64_to_float_down(uint64_t x) { return rounded_word64_to_float_impl(ROUND_DOWNWARD, x); }
extern float rounded_hw_word64_to_float_zero(uint64_t x) { return rounded_word64_to_float_impl(ROUND_TOWARDZERO, x); }
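/*
 * Note: 64-bit integer to float (and, for magnitudes above 2^53, to double)
 * conversions can be inexact, which is why they go through the rounding-mode
 * bracket as well.  For instance, 2^24 + 1 = 16777217 has no float
 * representation: converting it yields 16777216.0f under downward or
 * toward-zero rounding and 16777218.0f under upward rounding.
 */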
//
// Interval arithmetic
//

static inline float fast_fmax_float(float x, float y)
{
  // should compile to MAX[SP][SD] instruction on x86
  return x > y ? x : y;
}
static inline float fast_fmax4_float(float x, float y, float z, float w)
{
  return fast_fmax_float(fast_fmax_float(x, y), fast_fmax_float(z, w));
}
static inline float fast_fmin_float(float x, float y)
{
  // should compile to MIN[SP][SD] instruction on x86
  return x < y ? x : y;
}
static inline float fast_fmin4_float(float x, float y, float z, float w)
{
  return fast_fmin_float(fast_fmin_float(x, y), fast_fmin_float(z, w));
}

extern float rounded_hw_interval_mul_float_up(VOLATILE float lo1, VOLATILE float hi1, VOLATILE float lo2, VOLATILE float hi2)
{
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, ROUND_UPWARD);
  MAY_MODIFY(lo1);
  MAY_MODIFY(hi1);
  MAY_MODIFY(lo2);
  MAY_MODIFY(hi2);
  VOLATILE float x = lo1 * lo2;
  VOLATILE float y = lo1 * hi2;
  VOLATILE float z = hi1 * lo2;
  VOLATILE float w = hi1 * hi2;
  if (isnan(x)) x = 0.0f; /* 0 * inf -> 0 */
  if (isnan(y)) y = 0.0f; /* 0 * inf -> 0 */
  if (isnan(z)) z = 0.0f; /* 0 * inf -> 0 */
  if (isnan(w)) w = 0.0f; /* 0 * inf -> 0 */
  VOLATILE float hi = fast_fmax4_float(x, y, z, w);
  FORCE_EVAL(hi);
  restore_fp_reg(oldreg);
  return hi;
}
extern float rounded_hw_interval_mul_float_down(VOLATILE float lo1, VOLATILE float hi1, VOLATILE float lo2, VOLATILE float hi2)
{
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, ROUND_DOWNWARD);
  MAY_MODIFY(lo1);
  MAY_MODIFY(hi1);
  MAY_MODIFY(lo2);
  MAY_MODIFY(hi2);
  VOLATILE float x = lo1 * lo2;
  VOLATILE float y = lo1 * hi2;
  VOLATILE float z = hi1 * lo2;
  VOLATILE float w = hi1 * hi2;
  if (isnan(x)) x = 0.0f; /* 0 * inf -> 0 */
  if (isnan(y)) y = 0.0f; /* 0 * inf -> 0 */
  if (isnan(z)) z = 0.0f; /* 0 * inf -> 0 */
  if (isnan(w)) w = 0.0f; /* 0 * inf -> 0 */
  VOLATILE float lo = fast_fmin4_float(x, y, z, w);
  FORCE_EVAL(lo);
  restore_fp_reg(oldreg);
  return lo;
}
extern float rounded_hw_interval_mul_add_float_up(VOLATILE float lo1, VOLATILE float hi1, VOLATILE float lo2, VOLATILE float hi2, VOLATILE float hi3)
{
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, ROUND_UPWARD);
  MAY_MODIFY(lo1);
  MAY_MODIFY(hi1);
  MAY_MODIFY(lo2);
  MAY_MODIFY(hi2);
  MAY_MODIFY(hi3);
  float x = lo1 * lo2;
  float y = lo1 * hi2;
  float z = hi1 * lo2;
  float w = hi1 * hi2;
  if (isnan(x)) x = 0.0f; /* 0 * inf -> 0 */
  if (isnan(y)) y = 0.0f; /* 0 * inf -> 0 */
  if (isnan(z)) z = 0.0f; /* 0 * inf -> 0 */
  if (isnan(w)) w = 0.0f; /* 0 * inf -> 0 */
  VOLATILE float hi = fast_fmax4_float(x, y, z, w) + hi3;
  FORCE_EVAL(hi);
  restore_fp_reg(oldreg);
  return hi;
}
extern float rounded_hw_interval_mul_add_float_down(VOLATILE float lo1, VOLATILE float hi1, VOLATILE float lo2, VOLATILE float hi2, VOLATILE float lo3)
{
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, ROUND_DOWNWARD);
  MAY_MODIFY(lo1);
  MAY_MODIFY(hi1);
  MAY_MODIFY(lo2);
  MAY_MODIFY(hi2);
  MAY_MODIFY(lo3);
  float x = lo1 * lo2;
  float y = lo1 * hi2;
  float z = hi1 * lo2;
  float w = hi1 * hi2;
  if (isnan(x)) x = 0.0f; /* 0 * inf -> 0 */
  if (isnan(y)) y = 0.0f; /* 0 * inf -> 0 */
  if (isnan(z)) z = 0.0f; /* 0 * inf -> 0 */
  if (isnan(w)) w = 0.0f; /* 0 * inf -> 0 */
  VOLATILE float lo = fast_fmin4_float(x, y, z, w) + lo3;
  FORCE_EVAL(lo);
  restore_fp_reg(oldreg);
  return lo;
}
extern float rounded_hw_interval_div_float_up(VOLATILE float lo1, VOLATILE float hi1, VOLATILE float lo2, VOLATILE float hi2)
{
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, ROUND_UPWARD);
  MAY_MODIFY(lo1);
  MAY_MODIFY(hi1);
  MAY_MODIFY(lo2);
  MAY_MODIFY(hi2);
  float x = lo1 / lo2;
  float y = lo1 / hi2;
  float z = hi1 / lo2;
  float w = hi1 / hi2;
  if (isnan(x)) x = 0.0f; /* 0 / 0, +-inf / +-inf -> 0 */
  if (isnan(y)) y = 0.0f; /* 0 / 0, +-inf / +-inf -> 0 */
  if (isnan(z)) z = 0.0f; /* 0 / 0, +-inf / +-inf -> 0 */
  if (isnan(w)) w = 0.0f; /* 0 / 0, +-inf / +-inf -> 0 */
  VOLATILE float hi = fast_fmax4_float(x, y, z, w);
  FORCE_EVAL(hi);
  restore_fp_reg(oldreg);
  return hi;
}
extern float rounded_hw_interval_div_float_down(VOLATILE float lo1, VOLATILE float hi1, VOLATILE float lo2, VOLATILE float hi2)
{
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, ROUND_DOWNWARD);
  MAY_MODIFY(lo1);
  MAY_MODIFY(hi1);
  MAY_MODIFY(lo2);
  MAY_MODIFY(hi2);
  float x = lo1 / lo2;
  float y = lo1 / hi2;
  float z = hi1 / lo2;
  float w = hi1 / hi2;
  if (isnan(x)) x = 0.0f; /* 0 / 0, +-inf / +-inf -> 0 */
  if (isnan(y)) y = 0.0f; /* 0 / 0, +-inf / +-inf -> 0 */
  if (isnan(z)) z = 0.0f; /* 0 / 0, +-inf / +-inf -> 0 */
  if (isnan(w)) w = 0.0f; /* 0 / 0, +-inf / +-inf -> 0 */
  VOLATILE float lo = fast_fmin4_float(x, y, z, w);
  FORCE_EVAL(lo);
  restore_fp_reg(oldreg);
  return lo;
}
extern float rounded_hw_interval_div_add_float_up(VOLATILE float lo1, VOLATILE float hi1, VOLATILE float lo2, VOLATILE float hi2, VOLATILE float hi3)
{
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, ROUND_UPWARD);
  MAY_MODIFY(lo1);
  MAY_MODIFY(hi1);
  MAY_MODIFY(lo2);
  MAY_MODIFY(hi2);
  MAY_MODIFY(hi3);
  float x = lo1 / lo2;
  float y = lo1 / hi2;
  float z = hi1 / lo2;
  float w = hi1 / hi2;
  if (isnan(x)) x = 0.0f; /* 0 / 0, +-inf / +-inf -> 0 */
  if (isnan(y)) y = 0.0f; /* 0 / 0, +-inf / +-inf -> 0 */
  if (isnan(z)) z = 0.0f; /* 0 / 0, +-inf / +-inf -> 0 */
  if (isnan(w)) w = 0.0f; /* 0 / 0, +-inf / +-inf -> 0 */
  VOLATILE float hi = fast_fmax4_float(x, y, z, w) + hi3;
  FORCE_EVAL(hi);
  restore_fp_reg(oldreg);
  return hi;
}
extern float rounded_hw_interval_div_add_float_down(VOLATILE float lo1, VOLATILE float hi1, VOLATILE float lo2, VOLATILE float hi2, VOLATILE float lo3)
{
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, ROUND_DOWNWARD);
  MAY_MODIFY(lo1);
  MAY_MODIFY(hi1);
  MAY_MODIFY(lo2);
  MAY_MODIFY(hi2);
  MAY_MODIFY(lo3);
  float x = lo1 / lo2;
  float y = lo1 / hi2;
  float z = hi1 / lo2;
  float w = hi1 / hi2;
  if (isnan(x)) x = 0.0f; /* 0 / 0, +-inf / +-inf -> 0 */
  if (isnan(y)) y = 0.0f; /* 0 / 0, +-inf / +-inf -> 0 */
  if (isnan(z)) z = 0.0f; /* 0 / 0, +-inf / +-inf -> 0 */
  if (isnan(w)) w = 0.0f; /* 0 / 0, +-inf / +-inf -> 0 */
  VOLATILE float lo = fast_fmin4_float(x, y, z, w) + lo3;
  FORCE_EVAL(lo);
  restore_fp_reg(oldreg);
  return lo;
}

//
// Vector Operations
//

extern float rounded_hw_vector_sum_float(HsInt mode, HsInt length, HsInt offset, const float *a)
{
  native_rounding_mode nmode = hs_rounding_mode_to_native(mode);
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, nmode);
  VOLATILE float s = 0.0f;
  MAY_MODIFY(s);
  for (HsInt i = 0; i < length; ++i) {
    s += a[offset + i];
  }
  FORCE_EVAL(s);
  restore_fp_reg(oldreg);
  return s;
}
extern void rounded_hw_vector_add_float(HsInt mode, HsInt length, HsInt offsetR, float * restrict result, HsInt offsetA, const float * restrict a, HsInt offsetB, const float * restrict b)
{
  native_rounding_mode nmode = hs_rounding_mode_to_native(mode);
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, nmode);
  for (HsInt i = 0; i < length; ++i) {
    result[offsetR + i] = a[offsetA + i] + b[offsetB + i];
  }
  restore_fp_reg(oldreg);
}
extern void rounded_hw_vector_sub_float(HsInt mode, HsInt length, HsInt offsetR, float * restrict result, HsInt offsetA, const float * restrict a, HsInt offsetB, const float * restrict b)
{
  native_rounding_mode nmode = hs_rounding_mode_to_native(mode);
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, nmode);
  for (HsInt i = 0; i < length; ++i) {
    result[offsetR + i] = a[offsetA + i] - b[offsetB + i];
  }
  restore_fp_reg(oldreg);
}
extern void rounded_hw_vector_mul_float(HsInt mode, HsInt length, HsInt offsetR, float * restrict result, HsInt offsetA, const float * restrict a, HsInt offsetB, const float * restrict b)
{
  native_rounding_mode nmode = hs_rounding_mode_to_native(mode);
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, nmode);
  for (HsInt i = 0; i < length; ++i) {
    result[offsetR + i] = a[offsetA + i] * b[offsetB + i];
  }
  restore_fp_reg(oldreg);
}
extern void rounded_hw_vector_fma_float(HsInt mode, HsInt length, HsInt offsetR, float * restrict result, HsInt offsetA, const float * restrict a, HsInt offsetB, const float * restrict b, HsInt offsetC, const float * restrict c)
{
  native_rounding_mode nmode = hs_rounding_mode_to_native(mode);
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, nmode);
  for (HsInt i = 0; i < length; ++i) {
    result[offsetR + i] = fmaf(a[offsetA + i], b[offsetB + i], c[offsetC + i]);
  }
  restore_fp_reg(oldreg);
}
extern void rounded_hw_vector_div_float(HsInt mode, HsInt length, HsInt offsetR, float * restrict result, HsInt offsetA, const float * restrict a, HsInt offsetB, const float * restrict b)
{
  native_rounding_mode nmode = hs_rounding_mode_to_native(mode);
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, nmode);
  for (HsInt i = 0; i < length; ++i) {
    result[offsetR + i] = a[offsetA + i] / b[offsetB + i];
  }
  restore_fp_reg(oldreg);
}
extern void rounded_hw_vector_sqrt_float(HsInt mode, HsInt length, HsInt offsetR, float * restrict result, HsInt offsetA, const float * restrict a)
{
  native_rounding_mode nmode = hs_rounding_mode_to_native(mode);
  fp_reg oldreg = get_fp_reg();
  set_rounding(oldreg, nmode);
#if defined(__GNUC__) && defined(USE_SSE2)
  // mingw-w64's sqrtf may use x87 instead of SSE2
  HsInt i = 0;
  for (; i + 4 <= length; i += 4) {
    __m128 aa = _mm_loadu_ps(&a[offsetA + i]);
    __m128 bb = _mm_sqrt_ps(aa);
    _mm_storeu_ps(&result[offsetR + i], bb);
  }
  for (; i < length; ++i) {
    float aa = a[offsetA + i];
    float bb;
    __asm__ volatile("sqrtss %1, %0" : "=x"(bb) : "x"(aa));
    result[offsetR + i] = bb;
  }
#else
  for (HsInt i = 0; i < length; ++i) {
    result[offsetR + i] = sqrtf(a[offsetA + i]);
  }
#endif
  restore_fp_reg(oldreg);
}
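/*
 * Illustrative sketch, not part of the generated API: when a directed-rounded
 * result is needed per element but the caller does not want to thread a mode
 * value through, the scalar _down/_up wrappers defined above can be paired,
 * at the cost of switching the rounding mode on every call.  The
 * rounded_hw_vector_* entry points above switch the mode once per slice
 * instead.
 */
static inline void example_mul_enclosure_float(HsInt n, const float *a, const float *b, float * restrict lo, float * restrict hi)
{
  for (HsInt i = 0; i < n; ++i) {
    lo[i] = rounded_hw_mul_float_down(a[i], b[i]); /* lower bound of a[i]*b[i] */
    hi[i] = rounded_hw_mul_float_up(a[i], b[i]);   /* upper bound of a[i]*b[i] */
  }
}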