From f047a67fba7bcdcb1f8426c35280daf84d7e36ac Mon Sep 17 00:00:00 2001 From: Georg Lehmann Date: Tue, 10 Jun 2025 15:10:02 +0200 Subject: [PATCH] nir,aco: optimize FP16_OVFL pattern created by vkd3d-proton Reviewed-by: Rhys Perry Part-of: --- src/amd/common/nir/ac_nir.c | 1 + .../instruction_selection/aco_isel_setup.cpp | 1 + .../aco_select_nir_alu.cpp | 4 +++- src/compiler/nir/nir_opcodes.py | 2 ++ src/compiler/nir/nir_opt_algebraic.py | 24 +++++++++++++++++++ .../nir/nir_shader_compiler_options.h | 3 +++ 6 files changed, 34 insertions(+), 1 deletion(-) diff --git a/src/amd/common/nir/ac_nir.c b/src/amd/common/nir/ac_nir.c index 2a58272f7ef..52911038ae0 100644 --- a/src/amd/common/nir/ac_nir.c +++ b/src/amd/common/nir/ac_nir.c @@ -88,6 +88,7 @@ void ac_nir_set_options(struct radeon_info *info, bool use_llvm, options->has_msad = true; options->has_shfr32 = true; options->has_mul24_relaxed = true; + options->has_f2e4m3fn_satfn = !use_llvm && info->gfx_level >= GFX12; options->lower_int64_options = nir_lower_imul64 | nir_lower_imul_high64 | nir_lower_imul_2x32_64 | nir_lower_divmod64 | nir_lower_minmax64 | nir_lower_iabs64 | nir_lower_iadd_sat64 | nir_lower_conv64 | nir_lower_bitfield_extract64; diff --git a/src/amd/compiler/instruction_selection/aco_isel_setup.cpp b/src/amd/compiler/instruction_selection/aco_isel_setup.cpp index 4368698ff32..489299b3655 100644 --- a/src/amd/compiler/instruction_selection/aco_isel_setup.cpp +++ b/src/amd/compiler/instruction_selection/aco_isel_setup.cpp @@ -417,6 +417,7 @@ init_context(isel_context* ctx, nir_shader* shader) break; case nir_op_f2e4m3fn: case nir_op_f2e4m3fn_sat: + case nir_op_f2e4m3fn_satfn: case nir_op_f2e5m2: case nir_op_f2e5m2_sat: case nir_op_fmulz: diff --git a/src/amd/compiler/instruction_selection/aco_select_nir_alu.cpp b/src/amd/compiler/instruction_selection/aco_select_nir_alu.cpp index 7007c4b0118..0fb7a81df1e 100644 --- a/src/amd/compiler/instruction_selection/aco_select_nir_alu.cpp +++ 
b/src/amd/compiler/instruction_selection/aco_select_nir_alu.cpp @@ -2555,6 +2555,7 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr) } case nir_op_f2e4m3fn: case nir_op_f2e4m3fn_sat: + case nir_op_f2e4m3fn_satfn: case nir_op_f2e5m2: case nir_op_f2e5m2_sat: { Operand src[2]; @@ -2588,7 +2589,8 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr) aco_opcode opcode = instr->op == nir_op_f2e4m3fn || instr->op == nir_op_f2e4m3fn_sat ? aco_opcode::v_cvt_pk_fp8_f32 - : aco_opcode::v_cvt_pk_bf8_f32; + : instr->op == nir_op_f2e4m3fn_satfn ? aco_opcode::p_v_cvt_pk_fp8_f32_ovfl + : aco_opcode::v_cvt_pk_bf8_f32; bld.vop3(opcode, Definition(dst), src[0], src[1]); if (instr->def.num_components == 2) emit_split_vector(ctx, dst, 2); diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py index 3ae903b8654..05a100201dd 100644 --- a/src/compiler/nir/nir_opcodes.py +++ b/src/compiler/nir/nir_opcodes.py @@ -1770,6 +1770,8 @@ opcode("bfdot2_bfadd", 1, tint16, [2, 2, 1], [tint16, tint16, tint16], unop_numeric_convert("e4m3fn2f", tfloat32, tuint8, "_mesa_e4m3fn_to_float(src0)") unop_numeric_convert("f2e4m3fn", tuint8, tfloat32, "_mesa_float_to_e4m3fn(src0)") unop_numeric_convert("f2e4m3fn_sat", tuint8, tfloat32, "_mesa_float_to_e4m3fn_sat(src0)") +# AMD specific conversion that clamps finite values but not inf (GFX12 FP16_OVFL=1 behavior) +unop_numeric_convert("f2e4m3fn_satfn", tuint8, tfloat32, "isinf(src0) ? 
0x7f : _mesa_float_to_e4m3fn_sat(src0)") unop_numeric_convert("e5m22f", tfloat32, tuint8, "_mesa_e5m2_to_float(src0)") unop_numeric_convert("f2e5m2", tuint8, tfloat32, "_mesa_float_to_e5m2(src0)") diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py index f6b360ccf01..e9744c7c6aa 100644 --- a/src/compiler/nir/nir_opt_algebraic.py +++ b/src/compiler/nir/nir_opt_algebraic.py @@ -3144,6 +3144,30 @@ optimizations += [ (('iadd', ('msad_4x8', a, b, 0), c), ('msad_4x8', a, b, c)), ] +# VKD3D-Proton patterns for FP16_OVFL=1 conversion to e4m3fn +def vkd3d_proton_f2e4m3_ovfl(variant, x, nan): + if variant == 0: + cond = ('feq', ('fabs', x), float('inf')) + elif variant == 1: + cond = ('feq', f'{x}(is_not_negative)', float('inf')) + elif variant == 2: + cond = ('feq', f'{x}(is_not_positive)', -float('inf')) + + return ('bcsel', cond, f'#{nan}(is_nan)', x) + + +for var in range(3): + optimizations += [ + (('f2e4m3fn_sat', vkd3d_proton_f2e4m3_ovfl(var, a, b)), + ('f2e4m3fn_satfn', a), 'options->has_f2e4m3fn_satfn'), + ] + +for var0, var1 in itertools.product(range(3), repeat=2): + optimizations += [ + (('f2e4m3fn_sat', ('vec2', vkd3d_proton_f2e4m3_ovfl(var0, a, b), + vkd3d_proton_f2e4m3_ovfl(var1, c, d))), + ('f2e4m3fn_satfn', ('vec2', a, c)), 'options->has_f2e4m3fn_satfn'), + ] # "all_equal(eq(a, b), vec(~0))" is the same as "all_equal(a, b)" # "any_nequal(neq(a, b), vec(0))" is the same as "any_nequal(a, b)" diff --git a/src/compiler/nir/nir_shader_compiler_options.h b/src/compiler/nir/nir_shader_compiler_options.h index b1be138e135..e2815072bef 100644 --- a/src/compiler/nir/nir_shader_compiler_options.h +++ b/src/compiler/nir/nir_shader_compiler_options.h @@ -638,6 +638,9 @@ typedef struct nir_shader_compiler_options { /** Backend support msad_u4x8. */ bool has_msad; + /** Backend supports f2e4m3fn_satfn */ + bool has_f2e4m3fn_satfn; + /** * Is this the Intel vec4 backend? *