mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-23 22:00:13 +01:00
glsl: Rewrite atan2 implementation to fix accuracy and handling of zero/infinity.
This addresses several issues of the current atan2 implementation:
- Negative zero (and negative denorms which end up getting flushed to
zero) isn't handled correctly by the current implementation. The
reason is that it does 'y >= 0' and 'x < 0' comparisons to decide
on which side of the branch cut the argument is, which causes us to
return incorrect results (off by up to 2π) for very small negative
values.
- There is a serious precision problem for x values of large enough
magnitude introduced by the floating point division operation being
implemented as a mul+rcp sequence. This can lead to the quotient
getting flushed to zero in some cases introducing an error of over
8e6 ULP in the result -- Or in the most catastrophic case will
cause us to return NaN instead of the correct value ±π/2 for y=±∞
and x very large. We can fix this easily by scaling down both
arguments when the absolute value of the denominator goes above
certain threshold. The error of this atan2 implementation remains
below 25 ULP in most of its domain except for a neighborhood of y=0
where it reaches a maximum error of about 180 ULP.
- It emits a bunch of instructions including no less than three
if-else branches per scalar component that don't seem to get
optimized out later on. This implementation uses about 13% less
instructions on Intel SKL hardware and doesn't emit any control
flow instructions.
v2: Fix up argument scaling to take into account the range and
precision of exotic FP24 hardware. Flip coordinate system for
arguments along the vertical line as if they were on the left
half-plane in order to avoid division by zero which may give
unspecified results on non-GLSL 4.1-capable hardware. Sprinkle in
some more comments.
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
This commit is contained in:
parent
69042a5be4
commit
e9ffd12827
1 changed files with 58 additions and 34 deletions
|
|
@ -3560,44 +3560,68 @@ builtin_builder::_acos(const glsl_type *type)
|
|||
ir_function_signature *
|
||||
builtin_builder::_atan2(const glsl_type *type)
|
||||
{
|
||||
ir_variable *vec_y = in_var(type, "vec_y");
|
||||
ir_variable *vec_x = in_var(type, "vec_x");
|
||||
MAKE_SIG(type, always_available, 2, vec_y, vec_x);
|
||||
const unsigned n = type->vector_elements;
|
||||
ir_variable *y = in_var(type, "y");
|
||||
ir_variable *x = in_var(type, "x");
|
||||
MAKE_SIG(type, always_available, 2, y, x);
|
||||
|
||||
ir_variable *vec_result = body.make_temp(type, "vec_result");
|
||||
ir_variable *r = body.make_temp(glsl_type::float_type, "r");
|
||||
for (int i = 0; i < type->vector_elements; i++) {
|
||||
ir_variable *y = body.make_temp(glsl_type::float_type, "y");
|
||||
ir_variable *x = body.make_temp(glsl_type::float_type, "x");
|
||||
body.emit(assign(y, swizzle(vec_y, i, 1)));
|
||||
body.emit(assign(x, swizzle(vec_x, i, 1)));
|
||||
/* If we're on the left half-plane rotate the coordinates π/2 clock-wise
|
||||
* for the y=0 discontinuity to end up aligned with the vertical
|
||||
* discontinuity of atan(s/t) along t=0. This also makes sure that we
|
||||
* don't attempt to divide by zero along the vertical line, which may give
|
||||
* unspecified results on non-GLSL 4.1-capable hardware.
|
||||
*/
|
||||
ir_variable *flip = body.make_temp(glsl_type::bvec(n), "flip");
|
||||
body.emit(assign(flip, gequal(imm(0.0f, n), x)));
|
||||
ir_variable *s = body.make_temp(type, "s");
|
||||
body.emit(assign(s, csel(flip, abs(x), y)));
|
||||
ir_variable *t = body.make_temp(type, "t");
|
||||
body.emit(assign(t, csel(flip, y, abs(x))));
|
||||
|
||||
/* If |x| >= 1.0e-8 * |y|: */
|
||||
ir_if *outer_if =
|
||||
new(mem_ctx) ir_if(greater(abs(x), mul(imm(1.0e-8f), abs(y))));
|
||||
/* If the magnitude of the denominator exceeds some huge value, scale down
|
||||
* the arguments in order to prevent the reciprocal operation from flushing
|
||||
* its result to zero, which would cause precision problems, and for s
|
||||
* infinite would cause us to return a NaN instead of the correct finite
|
||||
* value.
|
||||
*
|
||||
* If fmin and fmax are respectively the smallest and largest positive
|
||||
* normalized floating point values representable by the implementation,
|
||||
* the constants below should be in agreement with:
|
||||
*
|
||||
* huge <= 1 / fmin
|
||||
* scale <= 1 / fmin / fmax (for |t| >= huge)
|
||||
*
|
||||
* In addition scale should be a negative power of two in order to avoid
|
||||
* loss of precision. The values chosen below should work for most usual
|
||||
* floating point representations with at least the dynamic range of ATI's
|
||||
* 24-bit representation.
|
||||
*/
|
||||
ir_constant *huge = imm(1e18f, n);
|
||||
ir_variable *scale = body.make_temp(type, "scale");
|
||||
body.emit(assign(scale, csel(gequal(abs(t), huge),
|
||||
imm(0.25f, n), imm(1.0f, n))));
|
||||
ir_variable *rcp_scaled_t = body.make_temp(type, "rcp_scaled_t");
|
||||
body.emit(assign(rcp_scaled_t, rcp(mul(t, scale))));
|
||||
ir_expression *s_over_t = mul(mul(s, scale), rcp_scaled_t);
|
||||
|
||||
ir_factory outer_then(&outer_if->then_instructions, mem_ctx);
|
||||
/* Calculate the arctangent and fix up the result if we had flipped the
|
||||
* coordinate system.
|
||||
*/
|
||||
ir_variable *arc = body.make_temp(type, "arc");
|
||||
do_atan(body, type, arc, abs(s_over_t));
|
||||
body.emit(assign(arc, add(arc, mul(b2f(flip), imm(M_PI_2f)))));
|
||||
|
||||
/* Then...call atan(y/x) */
|
||||
do_atan(outer_then, glsl_type::float_type, r, div(y, x));
|
||||
|
||||
/* ...and fix it up: */
|
||||
ir_if *inner_if = new(mem_ctx) ir_if(less(x, imm(0.0f)));
|
||||
inner_if->then_instructions.push_tail(
|
||||
if_tree(gequal(y, imm(0.0f)),
|
||||
assign(r, add(r, imm(M_PIf))),
|
||||
assign(r, sub(r, imm(M_PIf)))));
|
||||
outer_then.emit(inner_if);
|
||||
|
||||
/* Else... */
|
||||
outer_if->else_instructions.push_tail(
|
||||
assign(r, mul(sign(y), imm(M_PI_2f))));
|
||||
|
||||
body.emit(outer_if);
|
||||
|
||||
body.emit(assign(vec_result, r, 1 << i));
|
||||
}
|
||||
body.emit(ret(vec_result));
|
||||
/* Rather convoluted calculation of the sign of the result. When x < 0 we
|
||||
* cannot use fsign because we need to be able to distinguish between
|
||||
* negative and positive zero. Unfortunately we cannot use bitwise
|
||||
* arithmetic tricks either because of back-ends without integer support.
|
||||
* When x >= 0 rcp_scaled_t will always be non-negative so this won't be
|
||||
* able to distinguish between negative and positive zero, but we don't
|
||||
* care because atan2 is continuous along the whole positive y = 0
|
||||
* half-line, so it won't affect the result significantly.
|
||||
*/
|
||||
body.emit(ret(csel(less(min2(y, rcp_scaled_t), imm(0.0f, n)),
|
||||
neg(arc), arc)));
|
||||
|
||||
return sig;
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue