From dc55b47a584deee576c27c229b3fe0faeaa50dc8 Mon Sep 17 00:00:00 2001 From: Emma Anholt Date: Fri, 21 Feb 2025 10:49:36 -0800 Subject: [PATCH] intel/elk: Move pre-gen6 smooth interpolation 1/w multiply to NIR. NIR catches that if you're just doing something like adding two smooth inputs, we can do the multiply once on the result instead of on each input. BRW shader-db results: total instructions in shared programs: 4409146 -> 4408303 (-0.02%) instructions in affected programs: 800761 -> 799918 (-0.11%) total cycles in shared programs: 143203198 -> 142485036 (-0.50%) cycles in affected programs: 79081682 -> 78363520 (-0.91%) total sends in shared programs: 363044 -> 363042 (<.01%) sends in affected programs: 33 -> 31 (-6.06%) Reviewed-by: Alyssa Rosenzweig Part-of: --- src/intel/compiler/elk/elk_fs.h | 1 - src/intel/compiler/elk/elk_fs_nir.cpp | 10 +----- src/intel/compiler/elk/elk_fs_visitor.cpp | 7 ++-- src/intel/compiler/elk/elk_nir.c | 44 +++++++++++++++++++++++ 4 files changed, 47 insertions(+), 15 deletions(-) diff --git a/src/intel/compiler/elk/elk_fs.h b/src/intel/compiler/elk/elk_fs.h index a5b807e18a8..0aedb4a2e92 100644 --- a/src/intel/compiler/elk/elk_fs.h +++ b/src/intel/compiler/elk/elk_fs.h @@ -393,7 +393,6 @@ public: elk_fs_reg uw_pixel_y; elk_fs_reg pixel_z; elk_fs_reg wpos_w; - elk_fs_reg pixel_w; elk_fs_reg delta_xy[ELK_BARYCENTRIC_MODE_COUNT]; elk_fs_reg final_gs_vertex_count; elk_fs_reg control_data_bits; diff --git a/src/intel/compiler/elk/elk_fs_nir.cpp b/src/intel/compiler/elk/elk_fs_nir.cpp index da239cda24e..721a66ead5c 100644 --- a/src/intel/compiler/elk/elk_fs_nir.cpp +++ b/src/intel/compiler/elk/elk_fs_nir.cpp @@ -3941,8 +3941,6 @@ fs_nir_emit_fs_intrinsic(nir_to_elk_state &ntb, nir_intrinsic_instr *bary_intrinsic = nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr); nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic; - enum glsl_interp_mode interp_mode = - (enum glsl_interp_mode) nir_intrinsic_interp_mode(bary_intrinsic); elk_fs_reg dst_xy; if (bary_intrin == nir_intrinsic_load_barycentric_at_offset || @@ -3962,13 +3960,7 @@ fs_nir_emit_fs_intrinsic(nir_to_elk_state &ntb, interp.type = ELK_REGISTER_TYPE_F; dest.type = ELK_REGISTER_TYPE_F; - if (devinfo->ver < 6 && interp_mode == INTERP_MODE_SMOOTH) { - elk_fs_reg tmp = s.vgrf(glsl_float_type()); - bld.emit(ELK_FS_OPCODE_LINTERP, tmp, dst_xy, interp); - bld.MUL(offset(dest, bld, i), tmp, s.pixel_w); - } else { - bld.emit(ELK_FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, interp); - } + bld.emit(ELK_FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, interp); } break; } diff --git a/src/intel/compiler/elk/elk_fs_visitor.cpp b/src/intel/compiler/elk/elk_fs_visitor.cpp index 79bce09e1d2..9c2d3b5ab90 100644 --- a/src/intel/compiler/elk/elk_fs_visitor.cpp +++ b/src/intel/compiler/elk/elk_fs_visitor.cpp @@ -146,9 +146,6 @@ elk_fs_visitor::emit_interpolation_setup_gfx4() this->wpos_w = vgrf(glsl_float_type()); abld.emit(ELK_FS_OPCODE_LINTERP, wpos_w, delta_xy, interp_reg(abld, VARYING_SLOT_POS, 3, 0)); - /* Compute the pixel 1/W value from wpos.w. */ - this->pixel_w = vgrf(glsl_float_type()); - abld.emit(ELK_SHADER_OPCODE_RCP, this->pixel_w, wpos_w); } /** Emits the interpolation for the varying inputs. */ @@ -278,9 +275,9 @@ elk_fs_visitor::emit_interpolation_setup_gfx6() if (wm_prog_data->uses_src_w) { abld = bld.annotate("compute pos.w"); - this->pixel_w = fetch_payload_reg(abld, fs_payload().source_w_reg); this->wpos_w = vgrf(glsl_float_type()); - abld.emit(ELK_SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w); + abld.emit(ELK_SHADER_OPCODE_RCP, this->wpos_w, + fetch_payload_reg(abld, fs_payload().source_w_reg)); } if (wm_key->persample_interp == ELK_SOMETIMES) { diff --git a/src/intel/compiler/elk/elk_nir.c b/src/intel/compiler/elk/elk_nir.c index 0642880064d..d8f1b8ca211 100644 --- a/src/intel/compiler/elk/elk_nir.c +++ b/src/intel/compiler/elk/elk_nir.c @@ -488,6 +488,49 @@ lower_barycentric_at_offset(nir_builder *b, nir_intrinsic_instr *intrin, return true; } +static bool +elk_nir_lower_fs_smooth_interp_gfx4_instr(nir_builder *b, nir_intrinsic_instr *intr, void *_data) +{ + if (intr->intrinsic != nir_intrinsic_load_deref) + return false; + + nir_deref_instr *deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr); + nir_variable *var = nir_deref_instr_get_variable(deref); + + if (var->data.interpolation != INTERP_MODE_SMOOTH) + return false; + + /* If we haven't computed pixel_w yet, do so now (once, at the start of the + * shader). CSE could do this, but this makes things more legible and saves + * followup optimization. + */ + nir_def **pixel_w = _data; + if (!*pixel_w) { + b->cursor = nir_before_block(nir_start_block(b->impl)); + + nir_def *w = nir_load_frag_coord_w(b); + BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD_W); + *pixel_w = nir_frcp(b, w); + } + + b->cursor = nir_after_instr(&intr->instr); + nir_def *result = nir_fmul(b, &intr->def, *pixel_w); + + nir_def_rewrite_uses_after(&intr->def, result, result->parent_instr); + return true; +} + +/* Multiplies all smooth interpolation outputs by 1/frag_w. */ +static bool +elk_nir_lower_fs_smooth_interp_gfx4(nir_shader *shader) +{ + nir_def *pixel_w = NULL; + return nir_shader_intrinsics_pass(shader, elk_nir_lower_fs_smooth_interp_gfx4_instr, + nir_metadata_block_index | nir_metadata_dominance, + &pixel_w); +} + + static bool elk_nir_lower_load_frag_coord_w_gfx4_instr(nir_builder *b, nir_intrinsic_instr *intr, void *_data) { @@ -553,6 +596,7 @@ elk_nir_lower_fs_inputs(nir_shader *nir, NIR_PASS(_, nir, nir_lower_frag_coord_to_pixel_coord); if (devinfo->ver < 6) { /* Needs to be run before nir_lower_io. */ + NIR_PASS(_, nir, elk_nir_lower_fs_smooth_interp_gfx4); NIR_PASS(_, nir, elk_nir_lower_load_frag_coord_w_gfx4); }