nir: use a separate enum for per-ALU floating point math control
We don't need one bit per bit size per instruction if only one of them
actually matters in the end. This is a first step towards moving NIR to
full float_controls2 only.

Also rename the field from fp_fast_math, because that name implied that 0
is the no-fast-math mode, while the opposite was the case.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39026>
This commit is contained in:
parent 44318091d8 · commit f3290219ab

21 changed files with 87 additions and 96 deletions
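The point of the rename is that in the old encoding, fp_fast_math == 0 meant every fast-math optimization was allowed, which the name obscured. The following standalone C sketch (not part of the commit; the enum names are illustrative stand-ins for the nir_fp_* values added to nir.h below) shows why the new encoding composes: every set bit adds a restriction, so 0 really is "fast math", the full mask is "no fast math", and a bitwise OR of two controls is never less restrictive than either input — exactly the property the CSE and vectorization hunks below rely on.

#include <assert.h>
#include <stdint.h>

/* Illustrative mirror of the nir_fp_math_control enum: each bit, when
 * set, RESTRICTS optimization by requiring a property to be preserved. */
enum fp_math_control {
   fp_preserve_signed_zero = 1u << 0,
   fp_preserve_inf         = 1u << 1,
   fp_preserve_nan         = 1u << 2,

   fp_fast_math    = 0,             /* no restrictions at all     */
   fp_no_fast_math = (1u << 3) - 1, /* every restriction in place */
};

/* When two equivalent ALU instructions are merged (CSE, vectorization),
 * the result must honor the restrictions of BOTH originals, so the
 * controls are combined with a plain bitwise OR. */
static uint32_t merge_fp_ctrl(uint32_t a, uint32_t b)
{
   return a | b;
}

int main(void)
{
   uint32_t a = fp_preserve_nan;         /* one instr must keep NaNs */
   uint32_t b = fp_preserve_signed_zero; /* the other must keep -0.0 */
   uint32_t merged = merge_fp_ctrl(a, b);

   /* The merged control is at least as restrictive as each input... */
   assert((merged & a) == a && (merged & b) == b);
   /* ...and fast math (0) is the identity: it adds no restrictions. */
   assert(merge_fp_ctrl(fp_fast_math, a) == a);
   return 0;
}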
@@ -1522,6 +1522,24 @@ nir_op_is_selection(nir_op op)
 {
    return (nir_op_infos[op].algebraic_properties & NIR_OP_IS_SELECTION) != 0;
 }
 
+/**
+ * Floating point fast math control.
+ *
+ * All new bits must restrict optimizations when they are set, not when they
+ * are missing. This means a bitwise OR always produces a no less restrictive set.
+ *
+ * See also nir_alu_instr::exact, which should (and hopefully will be) moved
+ * to this enum in the future.
+ */
+typedef enum {
+   nir_fp_preserve_signed_zero = BITFIELD_BIT(0),
+   nir_fp_preserve_inf = BITFIELD_BIT(1),
+   nir_fp_preserve_nan = BITFIELD_BIT(2),
+
+   nir_fp_preserve_sz_inf_nan = BITFIELD_MASK(3),
+   nir_fp_fast_math = 0,
+   nir_fp_no_fast_math = BITFIELD_MASK(3),
+} nir_fp_math_control;
+
 /***/
 typedef struct nir_alu_instr {

@@ -1562,7 +1580,7 @@ typedef struct nir_alu_instr {
     * still handled through the exact bit, and the other float controls bits
     * (rounding mode and denorm handling) remain in the execution mode only.
     */
-   uint32_t fp_fast_math : 9;
+   uint32_t fp_math_ctrl : 3;
 
    /** Sources
    *

@@ -1574,25 +1592,25 @@
 static inline bool
 nir_alu_instr_is_signed_zero_preserve(nir_alu_instr *alu)
 {
-   return nir_is_float_control_signed_zero_preserve(alu->fp_fast_math, alu->def.bit_size);
+   return alu->fp_math_ctrl & nir_fp_preserve_signed_zero;
 }
 
 static inline bool
 nir_alu_instr_is_inf_preserve(nir_alu_instr *alu)
 {
-   return nir_is_float_control_inf_preserve(alu->fp_fast_math, alu->def.bit_size);
+   return alu->fp_math_ctrl & nir_fp_preserve_inf;
 }
 
 static inline bool
 nir_alu_instr_is_nan_preserve(nir_alu_instr *alu)
 {
-   return nir_is_float_control_nan_preserve(alu->fp_fast_math, alu->def.bit_size);
+   return alu->fp_math_ctrl & nir_fp_preserve_nan;
 }
 
 static inline bool
 nir_alu_instr_is_signed_zero_inf_nan_preserve(nir_alu_instr *alu)
 {
-   return nir_is_float_control_signed_zero_inf_nan_preserve(alu->fp_fast_math, alu->def.bit_size);
+   return alu->fp_math_ctrl & nir_fp_preserve_sz_inf_nan;
 }
 
 void nir_alu_src_copy(nir_alu_src *dest, const nir_alu_src *src);

@@ -72,7 +72,7 @@ nir_builder_alu_instr_finish_and_insert(nir_builder *build, nir_alu_instr *instr
    const nir_op_info *op_info = &nir_op_infos[instr->op];
 
    instr->exact = build->exact;
-   instr->fp_fast_math = build->fp_fast_math;
+   instr->fp_math_ctrl = build->fp_math_ctrl;
 
    /* Guess the number of components the destination temporary should have
     * based on our input sizes, if it's not fixed for the op.

@@ -388,7 +388,7 @@ nir_vec_scalars(nir_builder *build, nir_scalar *comp, unsigned num_components)
       instr->src[i].swizzle[0] = comp[i].comp;
    }
    instr->exact = build->exact;
-   instr->fp_fast_math = build->fp_fast_math;
+   instr->fp_math_ctrl = build->fp_math_ctrl;
 
    /* Note: not reusing nir_builder_alu_instr_finish_and_insert() because it
    * can't re-guess the num_components when num_components == 1 (nir_op_mov).

@@ -44,7 +44,7 @@ typedef struct nir_builder {
    bool constant_fold_alu;
 
    /* Float_controls2 bits. See nir_alu_instr for details. */
-   uint32_t fp_fast_math;
+   uint32_t fp_math_ctrl;
 
    nir_shader *shader;
    nir_function_impl *impl;

@@ -725,7 +725,7 @@ nir_mov_alu(nir_builder *build, nir_alu_src src, unsigned num_components)
    nir_def_init(&mov->instr, &mov->def, num_components,
                 nir_src_bit_size(src.src));
    mov->exact = build->exact;
-   mov->fp_fast_math = build->fp_fast_math;
+   mov->fp_math_ctrl = build->fp_math_ctrl;
    mov->src[0] = src;
    nir_builder_instr_insert(build, &mov->instr);

@@ -268,7 +268,7 @@ clone_alu(clone_state *state, const nir_alu_instr *alu)
    nir_alu_instr *nalu = nir_alu_instr_create(state->ns, alu->op);
    clone_debug_info(state, &nalu->instr, &alu->instr);
    nalu->exact = alu->exact;
-   nalu->fp_fast_math = alu->fp_fast_math;
+   nalu->fp_math_ctrl = alu->fp_math_ctrl;
    nalu->no_signed_wrap = alu->no_signed_wrap;
    nalu->no_unsigned_wrap = alu->no_unsigned_wrap;

@@ -807,7 +807,7 @@ nir_instr_set_add_or_rewrite(struct set *instr_set, nir_instr *instr,
     */
    if (instr->type == nir_instr_type_alu) {
       nir_instr_as_alu(match)->exact |= nir_instr_as_alu(instr)->exact;
-      nir_instr_as_alu(match)->fp_fast_math |= nir_instr_as_alu(instr)->fp_fast_math;
+      nir_instr_as_alu(match)->fp_math_ctrl |= nir_instr_as_alu(instr)->fp_math_ctrl;
    }
 
    assert(!def == !new_def);

@@ -44,7 +44,7 @@ lower_alu_instr(nir_builder *b, nir_alu_instr *instr, UNUSED void *cb_data)
 
    b->cursor = nir_before_instr(&instr->instr);
    b->exact = instr->exact;
-   b->fp_fast_math = instr->fp_fast_math;
+   b->fp_math_ctrl = instr->fp_math_ctrl;
 
    switch (instr->op) {
    case nir_op_bitfield_reverse:

@@ -176,9 +176,9 @@ lower_alu_instr(nir_builder *b, nir_alu_instr *instr, UNUSED void *cb_data)
        * nir_lower_alu is idempotent, and allows the backend to implement
        * soundly the no_signed_zero subset of fmin/fmax.
        */
-      b->fp_fast_math &= ~FLOAT_CONTROLS_SIGNED_ZERO_PRESERVE;
+      b->fp_math_ctrl &= ~nir_fp_preserve_signed_zero;
       nir_def *fminmax = max ? nir_fmax(b, s0, s1) : nir_fmin(b, s0, s1);
-      b->fp_fast_math = instr->fp_fast_math;
+      b->fp_math_ctrl = instr->fp_math_ctrl;
 
       /* If we have a constant source, we can usually optimize */
       if (s0->num_components == 1 && s0->bit_size == 32) {

@@ -111,7 +111,7 @@ lower_reduction(nir_alu_instr *alu, nir_op chan_op, nir_op merge_op,
          chan->src[1].swizzle[0] = chan->src[1].swizzle[channel];
       }
       chan->exact = alu->exact;
-      chan->fp_fast_math = alu->fp_fast_math;
+      chan->fp_math_ctrl = alu->fp_math_ctrl;
 
       nir_builder_instr_insert(builder, &chan->instr);

@@ -164,7 +164,7 @@ lower_bfdot_to_bfdot2_bfadd(nir_builder *b, nir_alu_instr *alu)
       }
       instr->src[2].src = nir_src_for_ssa(acc);
       instr->exact = b->exact;
-      instr->fp_fast_math = b->fp_fast_math;
+      instr->fp_math_ctrl = b->fp_math_ctrl;
 
       nir_builder_instr_insert(b, &instr->instr);
       acc = &instr->def;

@@ -206,7 +206,7 @@ lower_fdot(nir_alu_instr *alu, nir_builder *builder, bool is_bfloat16)
       if (i != 0)
          instr->src[2].src = nir_src_for_ssa(prev);
       instr->exact = builder->exact;
-      instr->fp_fast_math = builder->fp_fast_math;
+      instr->fp_math_ctrl = builder->fp_math_ctrl;
 
       nir_builder_instr_insert(builder, &instr->instr);

@@ -225,7 +225,7 @@ lower_alu_instr_width(nir_builder *b, nir_instr *instr, void *_data)
    unsigned i, chan;
 
    b->exact = alu->exact;
-   b->fp_fast_math = alu->fp_fast_math;
+   b->fp_math_ctrl = alu->fp_math_ctrl;
 
    unsigned num_components = alu->def.num_components;
    unsigned target_width = 1;

@@ -449,7 +449,7 @@ lower_alu_instr_width(nir_builder *b, nir_instr *instr, void *_data)
 
       nir_alu_ssa_dest_init(lower, components, alu->def.bit_size);
       lower->exact = alu->exact;
-      lower->fp_fast_math = alu->fp_fast_math;
+      lower->fp_math_ctrl = alu->fp_math_ctrl;
 
       for (i = 0; i < components; i++) {
         vec->src[chan + i].src = nir_src_for_ssa(&lower->def);

@@ -83,7 +83,7 @@ build_atomic(nir_builder *b, nir_intrinsic_instr *intr)
       b, nir_atomic_op_to_alu(nir_intrinsic_atomic_op(intr)), before, data);
    nir_alu_instr *op = nir_def_as_alu(expected);
    op->exact = true;
-   op->fp_fast_math = 0;
+   op->fp_math_ctrl = nir_fp_no_fast_math;
    switch (intr->intrinsic) {
    case nir_intrinsic_ssbo_atomic:
       xchg = nir_ssbo_atomic_swap(b, intr->def.bit_size,

@@ -88,10 +88,8 @@ get_signed_inf(nir_builder *b, nir_def *zero)
 static nir_def *
 get_signed_zero(nir_builder *b, nir_def *src)
 {
-   uint32_t exec_mode = b->fp_fast_math;
-
    nir_def *zero;
-   if (nir_is_float_control_signed_zero_preserve(exec_mode, 64)) {
+   if (b->fp_math_ctrl & nir_fp_preserve_signed_zero) {
       nir_def *hi = nir_unpack_64_2x32_split_y(b, src);
       nir_def *sign = nir_iand_imm(b, hi, 0x80000000);
       zero = nir_pack_64_2x32_split(b, nir_imm_int(b, 0), sign);

@@ -105,9 +103,7 @@ get_signed_zero(nir_builder *b, nir_def *src)
 static nir_def *
 preserve_nan(nir_builder *b, nir_def *src, nir_def *res)
 {
-   uint32_t exec_mode = b->fp_fast_math;
-
-   if (nir_is_float_control_nan_preserve(exec_mode, 64)) {
+   if (b->fp_math_ctrl & nir_fp_preserve_nan) {
       nir_def *is_nan = nir_fneu(b, src, src);
       return nir_bcsel(b, is_nan, src, res);
    }

@@ -317,7 +313,6 @@ lower_sqrt_rsq(nir_builder *b, nir_def *src, bool sqrt)
       res = nir_ffma(b, y_1, r_1, y_1);
    }
 
-   uint32_t exec_mode = b->fp_fast_math;
    if (sqrt) {
       /* Here, the special cases we need to handle are
        * 0 -> 0 (sign preserving)

@@ -343,7 +338,7 @@ lower_sqrt_rsq(nir_builder *b, nir_def *src, bool sqrt)
       res = fix_inv_result(b, res, src, new_exp);
    }
 
-   if (nir_is_float_control_nan_preserve(exec_mode, 64))
+   if (b->fp_math_ctrl & nir_fp_preserve_nan)
       res = nir_bcsel(b, nir_feq_imm(b, src, -INFINITY),
                       nir_imm_double(b, NAN), res);

@@ -504,7 +499,7 @@ lower_minmax(nir_builder *b, nir_op cmp, nir_def *src0, nir_def *src1)
    /* IEEE-754-2019 requires that fmin/fmax compare -0 < 0, but -0 and 0 are
     * indistinguishable for flt/fge. So, we fix up signed zeroes.
     */
-   if (nir_is_float_control_signed_zero_preserve(b->fp_fast_math, 64)) {
+   if (b->fp_math_ctrl & nir_fp_preserve_signed_zero) {
      nir_def *src0_is_negzero = nir_ieq_imm(b, src0, 1ull << 63);
      nir_def *src1_is_poszero = nir_ieq_imm(b, src1, 0x0);
      nir_def *neg_pos_zero = nir_iand(b, src0_is_negzero, src1_is_poszero);

@@ -772,7 +767,7 @@ lower_doubles_instr(nir_builder *b, nir_instr *instr, void *_data)
    nir_alu_instr *alu = nir_instr_as_alu(instr);
 
    /* Easier to set it here than pass it around all over ther place. */
-   b->fp_fast_math = alu->fp_fast_math;
+   b->fp_math_ctrl = alu->fp_math_ctrl;
 
    nir_def *soft_def =
       lower_doubles_instr_to_soft(b, alu, data->softfp64, options);

@@ -345,7 +345,7 @@ convert_flrp_instruction(nir_builder *bld,
 
    bld->cursor = nir_before_instr(&alu->instr);
    bld->exact = alu->exact;
-   bld->fp_fast_math = alu->fp_fast_math;
+   bld->fp_math_ctrl = alu->fp_math_ctrl;
 
    /* There are two methods to implement flrp(x, y, t). The strictly correct
     * implementation according to the GLSL spec is:

@@ -866,7 +866,7 @@ clone_alu_and_replace_src_defs(nir_builder *b, const nir_alu_instr *alu,
 {
    nir_alu_instr *nalu = nir_alu_instr_create(b->shader, alu->op);
    nalu->exact = alu->exact;
-   nalu->fp_fast_math = alu->fp_fast_math;
+   nalu->fp_math_ctrl = alu->fp_math_ctrl;
 
    nir_def_init(&nalu->instr, &nalu->def,
                 alu->def.num_components,

@@ -881,7 +881,6 @@ clone_alu_and_replace_src_defs(nir_builder *b, const nir_alu_instr *alu,
    nir_builder_instr_insert(b, &nalu->instr);
 
    return &nalu->def;
-   ;
 }
 
 /*

@@ -174,7 +174,7 @@ struct chain {
    unsigned length;
    nir_scalar srcs[MAX_CHAIN_LENGTH];
    bool do_global_cse, exact;
-   unsigned fp_fast_math;
+   unsigned fp_math_ctrl;
 };
 
 UNUSED static void

@@ -222,7 +222,7 @@ build_chain(struct chain *c, nir_scalar def, unsigned reserved_count)
     * It is safe to add `exact` or float control bits, but not the reverse.
     */
    c->exact |= alu->exact;
-   c->fp_fast_math |= alu->fp_fast_math;
+   c->fp_math_ctrl |= alu->fp_math_ctrl;
 
    for (unsigned i = 0; i < 2; ++i) {
       nir_scalar src = nir_scalar_chase_alu_src(def, i);

@@ -451,7 +451,7 @@ reassociate_chain(struct chain *c, void *pair_freq)
 {
    nir_builder b = nir_builder_at(nir_before_instr(&c->root->instr));
    b.exact = c->exact;
-   b.fp_fast_math = c->fp_fast_math;
+   b.fp_math_ctrl = c->fp_math_ctrl;
 
    /* Pick a new order using sort-by-rank and possibly the CSE heuristics */
    unsigned pinned = 0;

@@ -503,7 +503,7 @@ reassociate_chain(struct chain *c, void *pair_freq)
    /* Set flags conservatively, matching the rest of the chain */
    c->root->no_signed_wrap = c->root->no_unsigned_wrap = false;
    c->root->exact = c->exact;
-   c->root->fp_fast_math = c->fp_fast_math;
+   c->root->fp_math_ctrl = c->fp_math_ctrl;
    return true;
 }

@@ -47,11 +47,11 @@ phi_srcs_equal(nir_def *a, nir_def *b)
    if (!nir_instrs_equal(a_instr, b_instr))
       return false;
 
-   /* nir_instrs_equal ignores exact/fast_math */
+   /* nir_instrs_equal ignores exact/fp_math_ctrl */
    if (a_instr->type == nir_instr_type_alu) {
       nir_alu_instr *a_alu = nir_def_as_alu(a);
       nir_alu_instr *b_alu = nir_def_as_alu(b);
-      if (a_alu->exact != b_alu->exact || a_alu->fp_fast_math != b_alu->fp_fast_math)
+      if (a_alu->exact != b_alu->exact || a_alu->fp_math_ctrl != b_alu->fp_math_ctrl)
         return false;
    }

@@ -458,10 +458,9 @@ instr_try_combine_alu(struct set *instr_set, nir_alu_instr *alu1, nir_alu_instr
     */
    new_alu->exact = alu1->exact || alu2->exact;
 
-   /* fp_fast_math is a set of FLOAT_CONTROLS_*_PRESERVE_*. Preserve anything
-    * preserved by either instruction.
+   /* fp_math_ctrl is a set of restrictions, take the union of both.
     */
-   new_alu->fp_fast_math = alu1->fp_fast_math | alu2->fp_fast_math;
+   new_alu->fp_math_ctrl = alu1->fp_math_ctrl | alu2->fp_math_ctrl;
 
    /* If all channels don't wrap, we can say that the whole vector doesn't
    * wrap.

@@ -480,7 +480,7 @@ construct_value(nir_builder *build,
     * replacement should be exact.
     */
    alu->exact = state->has_exact_alu || expr->exact;
-   alu->fp_fast_math = nir_instr_as_alu(instr)->fp_fast_math;
+   alu->fp_math_ctrl = nir_instr_as_alu(instr)->fp_math_ctrl;
 
    for (unsigned i = 0; i < nir_op_infos[op].num_inputs; i++) {
       /* If the source is an explicitly sized source, then we need to reset

@@ -733,7 +733,7 @@ write_alu(write_ctx *ctx, const nir_alu_instr *alu)
    }
 
    write_def(ctx, &alu->def, header, alu->instr.type);
-   blob_write_uint32(ctx->blob, alu->fp_fast_math);
+   blob_write_uint32(ctx->blob, alu->fp_math_ctrl);
 
    if (header.alu.packed_src_ssa_16bit) {
       for (unsigned i = 0; i < num_srcs; i++) {

@@ -788,7 +788,7 @@ read_alu(read_ctx *ctx, union packed_instr header)
    alu->no_unsigned_wrap = header.alu.no_unsigned_wrap;
 
    read_def(ctx, &alu->def, &alu->instr, header);
-   alu->fp_fast_math = blob_read_uint32(ctx->blob);
+   alu->fp_math_ctrl = blob_read_uint32(ctx->blob);
 
    if (header.alu.packed_src_ssa_16bit) {
       for (unsigned i = 0; i < num_srcs; i++) {

@@ -1550,10 +1550,6 @@ enum gl_derivative_group {
 
 enum float_controls
 {
-   /* The order of these matters. For float_controls2, only the first 9 bits
-    * are used and stored per-instruction in nir_alu_instr::fp_fast_math.
-    * Any changes in this enum need to be synchronized with that.
-    */
    FLOAT_CONTROLS_DEFAULT_FLOAT_CONTROL_MODE = 0,
    FLOAT_CONTROLS_SIGNED_ZERO_PRESERVE_FP16 = BITFIELD_BIT(0),
    FLOAT_CONTROLS_SIGNED_ZERO_PRESERVE_FP32 = BITFIELD_BIT(1),

@@ -417,22 +417,13 @@ handle_fp_fast_math(struct vtn_builder *b, UNUSED struct vtn_value *val,
    b->nb.exact = true;
 
    /* Decoration overrides defaults */
-   b->nb.fp_fast_math = 0;
+   b->nb.fp_math_ctrl = 0;
    if (!(dec->operands[0] & SpvFPFastMathModeNSZMask))
-      b->nb.fp_fast_math |=
-         FLOAT_CONTROLS_SIGNED_ZERO_PRESERVE_FP16 |
-         FLOAT_CONTROLS_SIGNED_ZERO_PRESERVE_FP32 |
-         FLOAT_CONTROLS_SIGNED_ZERO_PRESERVE_FP64;
+      b->nb.fp_math_ctrl |= nir_fp_preserve_signed_zero;
    if (!(dec->operands[0] & SpvFPFastMathModeNotNaNMask))
-      b->nb.fp_fast_math |=
-         FLOAT_CONTROLS_NAN_PRESERVE_FP16 |
-         FLOAT_CONTROLS_NAN_PRESERVE_FP32 |
-         FLOAT_CONTROLS_NAN_PRESERVE_FP64;
+      b->nb.fp_math_ctrl |= nir_fp_preserve_nan;
    if (!(dec->operands[0] & SpvFPFastMathModeNotInfMask))
-      b->nb.fp_fast_math |=
-         FLOAT_CONTROLS_INF_PRESERVE_FP16 |
-         FLOAT_CONTROLS_INF_PRESERVE_FP32 |
-         FLOAT_CONTROLS_INF_PRESERVE_FP64;
+      b->nb.fp_math_ctrl |= nir_fp_preserve_inf;
 }
 
 void

@@ -441,18 +432,30 @@ vtn_handle_fp_fast_math(struct vtn_builder *b, struct vtn_value *val)
    /* Take the NaN/Inf/SZ preserve bits from the execution mode and set them
     * on the builder, so the generated instructions can take it from it.
     * We only care about some of them, check nir_alu_instr for details.
-    * We also copy all bit widths, because we can't easily get the correct one
-    * here.
     */
-#define FLOAT_CONTROLS2_BITS (FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16 | \
-                              FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP32 | \
-                              FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP64)
-   static_assert(FLOAT_CONTROLS2_BITS == BITSET_MASK(9),
-                 "enum float_controls and fp_fast_math out of sync!");
-   b->nb.fp_fast_math = b->shader->info.float_controls_execution_mode &
-                        FLOAT_CONTROLS2_BITS;
+   b->nb.fp_math_ctrl = 0;
+   unsigned exec_mode = b->shader->info.float_controls_execution_mode;
+   if (val->type) {
+      unsigned bit_size;
+
+      /* Some ALU like modf and frexp return a struct of two values. */
+      if (glsl_type_is_struct(val->type->type))
+         bit_size = glsl_get_bit_size(val->type->type->fields.structure[0].type);
+      else
+         bit_size = glsl_get_bit_size(val->type->type);
+
+      if (bit_size >= 16 && bit_size <= 64) {
+         if (nir_is_float_control_signed_zero_preserve(exec_mode, bit_size))
+            b->nb.fp_math_ctrl |= nir_fp_preserve_signed_zero;
+         if (nir_is_float_control_inf_preserve(exec_mode, bit_size))
+            b->nb.fp_math_ctrl |= nir_fp_preserve_inf;
+         if (nir_is_float_control_nan_preserve(exec_mode, bit_size))
+            b->nb.fp_math_ctrl |= nir_fp_preserve_nan;
+      }
+   }
 
    vtn_foreach_decoration(b, val, handle_fp_fast_math, NULL);
-#undef FLOAT_CONTROLS2_BITS
 }
 
 nir_rounding_mode

@@ -870,15 +873,15 @@ vtn_handle_alu(struct vtn_builder *b, SpvOp opcode,
 
    case SpvOpIsInf: {
       const bool save_exact = b->nb.exact;
-      const unsigned save_fast_math = b->nb.fp_fast_math;
+      const unsigned save_math_ctrl = b->nb.fp_math_ctrl;
 
       b->nb.exact = true;
-      b->nb.fp_fast_math = 0;
+      b->nb.fp_math_ctrl = nir_fp_no_fast_math;
       nir_def *inf = nir_imm_floatN_t(&b->nb, INFINITY, src[0]->bit_size);
       dest->def = nir_feq(&b->nb, nir_fabs(&b->nb, src[0]), inf);
 
       b->nb.exact = save_exact;
-      b->nb.fp_fast_math = save_fast_math;
+      b->nb.fp_math_ctrl = save_math_ctrl;
       break;
    }

@@ -38,21 +38,6 @@
 #define M_PI_4f ((float) M_PI_4)
 #endif
 
-/**
- * Some fp16 instructions (i.e., asin and acos) are lowered as fp32. In these cases the
- * generated fp32 instructions need the same fp_fast_math settings as fp16.
- */
-static void
-propagate_fp16_fast_math_to_fp32(struct nir_builder *b)
-{
-   static_assert(FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP32 ==
-                 (FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16 << 1),
-                 "FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP32 is not "
-                 "FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16 << 1.");
-
-   b->fp_fast_math |= (b->fp_fast_math & FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16) << 1;
-}
-
 static nir_def *build_det(nir_builder *b, nir_def **col, unsigned cols);
 
 /* Computes the determinate of the submatrix given by taking src and

@@ -178,13 +163,9 @@ build_asin(nir_builder *b, nir_def *x, float p0, float p1, bool piecewise)
        * approximation in 32-bit math and then we convert the result back to
        * 16-bit.
        */
-      const uint32_t save = b->fp_fast_math;
-      propagate_fp16_fast_math_to_fp32(b);
-
       nir_def *result =
          nir_f2f16(b, build_asin(b, nir_f2f32(b, x), p0, p1, piecewise));
 
-      b->fp_fast_math = save;
       return result;
    }
    nir_def *one = nir_imm_floatN_t(b, 1.0f, x->bit_size);

@@ -219,7 +219,7 @@ intel_nir_opt_peephole_ffma_instr(nir_builder *b,
       mul_src[0] = nir_fneg(b, mul_src[0]);
 
    nir_alu_instr *ffma = nir_alu_instr_create(b->shader, nir_op_ffma);
-   ffma->fp_fast_math = mul->fp_fast_math | add->fp_fast_math;
+   ffma->fp_math_ctrl = mul->fp_math_ctrl | add->fp_math_ctrl;
 
    for (unsigned i = 0; i < 2; i++) {
       ffma->src[i].src = nir_src_for_ssa(mul_src[i]);

@@ -51,7 +51,7 @@ lower_atomic_in_lock(nir_builder *b, nir_intrinsic_instr *intr, nir_def *loaded)
       b, nir_atomic_op_to_alu(nir_intrinsic_atomic_op(intr)), loaded, data);
    nir_alu_instr *alu = nir_def_as_alu(to_store);
    alu->exact = true;
-   alu->fp_fast_math = 0;
+   alu->fp_math_ctrl = nir_fp_no_fast_math;
    break;
 }
 case nir_atomic_op_xchg: {