From 7b651ac6c3e9e5c5971ab162312acedc451a810c Mon Sep 17 00:00:00 2001 From: Caio Oliveira Date: Sun, 11 Feb 2024 00:45:45 -0800 Subject: [PATCH] intel/elk: Remove Gfx9+ from compile/run functions Reviewed-by: Lionel Landwerlin Part-of: --- src/intel/compiler/elk/elk_fs.cpp | 215 ++---------------- src/intel/compiler/elk/elk_fs.h | 1 - src/intel/compiler/elk/elk_shader.cpp | 2 +- src/intel/compiler/elk/elk_simd_selection.cpp | 10 +- src/intel/compiler/elk/elk_vec4.cpp | 2 +- src/intel/compiler/elk/elk_vec4_tcs.cpp | 2 +- 6 files changed, 25 insertions(+), 207 deletions(-) diff --git a/src/intel/compiler/elk/elk_fs.cpp b/src/intel/compiler/elk/elk_fs.cpp index 33d26fc7267..6847cbc3235 100644 --- a/src/intel/compiler/elk/elk_fs.cpp +++ b/src/intel/compiler/elk/elk_fs.cpp @@ -5935,35 +5935,6 @@ elk_fs_visitor::fixup_3src_null_dest() DEPENDENCY_VARIABLES); } -/* Wa_14015360517 - * - * The first instruction of any kernel should have non-zero emask. - * Make sure this happens by introducing a dummy mov instruction. - */ -void -elk_fs_visitor::emit_dummy_mov_instruction() -{ - if (!intel_needs_workaround(devinfo, 14015360517)) - return; - - struct elk_backend_instruction *first_inst = - cfg->first_block()->start(); - - /* We can skip the WA if first instruction is marked with - * force_writemask_all or exec_size equals dispatch_width. - */ - if (first_inst->force_writemask_all || - first_inst->exec_size == dispatch_width) - return; - - /* Insert dummy mov as first instruction. */ - const fs_builder ubld = - fs_builder(this, cfg->first_block(), (elk_fs_inst *)first_inst).exec_all().group(8, 0); - ubld.MOV(ubld.null_reg_ud(), elk_imm_ud(0u)); - - invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); -} - /** * Find the first instruction in the program that might start a region of * divergent control flow due to a HALT jump. There is no @@ -6350,9 +6321,6 @@ elk_fs_visitor::run_vs() fixup_3src_null_dest(); - /* Wa_14015360517 */ - emit_dummy_mov_instruction(); - allocate_registers(true /* allow_spilling */); return !failed; @@ -6479,9 +6447,6 @@ elk_fs_visitor::run_tcs() fixup_3src_null_dest(); - /* Wa_14015360517 */ - emit_dummy_mov_instruction(); - allocate_registers(true /* allow_spilling */); return !failed; @@ -6510,9 +6475,6 @@ elk_fs_visitor::run_tes() fixup_3src_null_dest(); - /* Wa_14015360517 */ - emit_dummy_mov_instruction(); - allocate_registers(true /* allow_spilling */); return !failed; @@ -6558,41 +6520,11 @@ elk_fs_visitor::run_gs() fixup_3src_null_dest(); - /* Wa_14015360517 */ - emit_dummy_mov_instruction(); - allocate_registers(true /* allow_spilling */); return !failed; } -/* From the SKL PRM, Volume 16, Workarounds: - * - * 0877 3D Pixel Shader Hang possible when pixel shader dispatched with - * only header phases (R0-R2) - * - * WA: Enable a non-header phase (e.g. push constant) when dispatch would - * have been header only. - * - * Instead of enabling push constants one can alternatively enable one of the - * inputs. Here one simply chooses "layer" which shouldn't impose much - * overhead. - */ -static void -gfx9_ps_header_only_workaround(struct elk_wm_prog_data *wm_prog_data) -{ - if (wm_prog_data->num_varying_inputs) - return; - - if (wm_prog_data->base.curb_read_length) - return; - - wm_prog_data->urb_setup[VARYING_SLOT_LAYER] = 0; - wm_prog_data->num_varying_inputs = 1; - - elk_compute_urb_setup_index(wm_prog_data); -} - bool elk_fs_visitor::run_fs(bool allow_spilling, bool do_rep_send) { @@ -6626,11 +6558,9 @@ elk_fs_visitor::run_fs(bool allow_spilling, bool do_rep_send) for (unsigned i = 0; i < dispatch_width / lower_width; i++) { /* According to the "PS Thread Payload for Normal * Dispatch" pages on the BSpec, the dispatch mask is - * stored in R0.15/R1.15 on gfx20+ and in R1.7/R2.7 on - * gfx6+. + * stored in R1.7/R2.7 on gfx6+. */ const elk_fs_reg dispatch_mask = - devinfo->ver >= 20 ? xe2_vec1_grf(i, 15) : devinfo->ver >= 6 ? elk_vec1_grf(i + 1, 7) : elk_vec1_grf(0, 0); bld.exec_all().group(1, 0) @@ -6658,16 +6588,10 @@ elk_fs_visitor::run_fs(bool allow_spilling, bool do_rep_send) assign_curb_setup(); - if (devinfo->ver == 9) - gfx9_ps_header_only_workaround(wm_prog_data); - assign_urb_setup(); fixup_3src_null_dest(); - /* Wa_14015360517 */ - emit_dummy_mov_instruction(); - allocate_registers(allow_spilling); } @@ -6705,9 +6629,6 @@ elk_fs_visitor::run_cs(bool allow_spilling) fixup_3src_null_dest(); - /* Wa_14015360517 */ - emit_dummy_mov_instruction(); - allocate_registers(allow_spilling); return !failed; @@ -7120,32 +7041,29 @@ elk_compile_fs(const struct elk_compiler *compiler, elk_nir_populate_wm_prog_data(nir, compiler->devinfo, key, prog_data); std::unique_ptr v8, v16, v32, vmulti; - elk_cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL, - *multi_cfg = NULL; + elk_cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL; float throughput = 0; bool has_spilled = false; - if (devinfo->ver < 20) { - v8 = std::make_unique(compiler, ¶ms->base, key, - prog_data, nir, 8, 1, - params->base.stats != NULL, - debug_enabled); - if (!v8->run_fs(allow_spilling, false /* do_rep_send */)) { - params->base.error_str = ralloc_strdup(params->base.mem_ctx, - v8->fail_msg); - return NULL; - } else if (INTEL_SIMD(FS, 8)) { - simd8_cfg = v8->cfg; + v8 = std::make_unique(compiler, ¶ms->base, key, + prog_data, nir, 8, 1, + params->base.stats != NULL, + debug_enabled); + if (!v8->run_fs(allow_spilling, false /* do_rep_send */)) { + params->base.error_str = ralloc_strdup(params->base.mem_ctx, + v8->fail_msg); + return NULL; + } else if (INTEL_SIMD(FS, 8)) { + simd8_cfg = v8->cfg; - assert(v8->payload().num_regs % reg_unit(devinfo) == 0); - prog_data->base.dispatch_grf_start_reg = v8->payload().num_regs / reg_unit(devinfo); + assert(v8->payload().num_regs % reg_unit(devinfo) == 0); + prog_data->base.dispatch_grf_start_reg = v8->payload().num_regs / reg_unit(devinfo); - prog_data->reg_blocks_8 = elk_register_blocks(v8->grf_used); - const performance &perf = v8->performance_analysis.require(); - throughput = MAX2(throughput, perf.throughput); - has_spilled = v8->spilled_any_registers; - allow_spilling = false; - } + prog_data->reg_blocks_8 = elk_register_blocks(v8->grf_used); + const performance &perf = v8->performance_analysis.require(); + throughput = MAX2(throughput, perf.throughput); + has_spilled = v8->spilled_any_registers; + allow_spilling = false; } /* Limit dispatch width to simd8 with dual source blending on gfx8. @@ -7158,18 +7076,6 @@ elk_compile_fs(const struct elk_compiler *compiler, "using SIMD8 when dual src blending.\n"); } - if (key->coarse_pixel && devinfo->ver < 20) { - if (prog_data->dual_src_blend) { - v8->limit_dispatch_width(8, "SIMD16 coarse pixel shading cannot" - " use SIMD8 messages.\n"); - } - v8->limit_dispatch_width(16, "SIMD32 not supported with coarse" - " pixel shading.\n"); - } - - if (nir->info.ray_queries > 0 && v8) - v8->limit_dispatch_width(16, "SIMD32 with ray queries.\n"); - if (!has_spilled && (!v8 || v8->max_dispatch_width >= 16) && (INTEL_SIMD(FS, 16) || params->use_rep_send)) { @@ -7238,78 +7144,6 @@ elk_compile_fs(const struct elk_compiler *compiler, } } - if (devinfo->ver >= 12 && !has_spilled && - params->max_polygons >= 2 && !key->coarse_pixel) { - elk_fs_visitor *vbase = v8 ? v8.get() : v16 ? v16.get() : v32.get(); - assert(vbase); - - if (devinfo->ver >= 20 && - params->max_polygons >= 4 && - vbase->max_dispatch_width >= 32 && - 4 * prog_data->num_varying_inputs <= MAX_VARYING && - INTEL_SIMD(FS, 4X8)) { - /* Try a quad-SIMD8 compile */ - vmulti = std::make_unique(compiler, ¶ms->base, key, - prog_data, nir, 32, 4, - params->base.stats != NULL, - debug_enabled); - vmulti->import_uniforms(vbase); - if (!vmulti->run_fs(false, params->use_rep_send)) { - elk_shader_perf_log(compiler, params->base.log_data, - "Quad-SIMD8 shader failed to compile: %s\n", - vmulti->fail_msg); - } else { - multi_cfg = vmulti->cfg; - assert(!vmulti->spilled_any_registers); - } - } - - if (!multi_cfg && devinfo->ver >= 20 && - vbase->max_dispatch_width >= 32 && - 2 * prog_data->num_varying_inputs <= MAX_VARYING && - INTEL_SIMD(FS, 2X16)) { - /* Try a dual-SIMD16 compile */ - vmulti = std::make_unique(compiler, ¶ms->base, key, - prog_data, nir, 32, 2, - params->base.stats != NULL, - debug_enabled); - vmulti->import_uniforms(vbase); - if (!vmulti->run_fs(false, params->use_rep_send)) { - elk_shader_perf_log(compiler, params->base.log_data, - "Dual-SIMD16 shader failed to compile: %s\n", - vmulti->fail_msg); - } else { - multi_cfg = vmulti->cfg; - assert(!vmulti->spilled_any_registers); - } - } - - if (!multi_cfg && vbase->max_dispatch_width >= 16 && - 2 * prog_data->num_varying_inputs <= MAX_VARYING && - INTEL_SIMD(FS, 2X8)) { - /* Try a dual-SIMD8 compile */ - vmulti = std::make_unique(compiler, ¶ms->base, key, - prog_data, nir, 16, 2, - params->base.stats != NULL, - debug_enabled); - vmulti->import_uniforms(vbase); - if (!vmulti->run_fs(allow_spilling, params->use_rep_send)) { - elk_shader_perf_log(compiler, params->base.log_data, - "Dual-SIMD8 shader failed to compile: %s\n", - vmulti->fail_msg); - } else { - multi_cfg = vmulti->cfg; - } - } - - if (multi_cfg) { - assert(vmulti->payload().num_regs % reg_unit(devinfo) == 0); - prog_data->base.dispatch_grf_start_reg = vmulti->payload().num_regs / reg_unit(devinfo); - - prog_data->reg_blocks_8 = elk_register_blocks(vmulti->grf_used); - } - } - /* When the caller requests a repclear shader, they want SIMD16-only */ if (params->use_rep_send) simd8_cfg = NULL; @@ -7358,16 +7192,7 @@ elk_compile_fs(const struct elk_compiler *compiler, struct elk_compile_stats *stats = params->base.stats; uint32_t max_dispatch_width = 0; - if (multi_cfg) { - prog_data->dispatch_multi = vmulti->dispatch_width; - prog_data->max_polygons = vmulti->max_polygons; - g.generate_code(multi_cfg, vmulti->dispatch_width, vmulti->shader_stats, - vmulti->performance_analysis.require(), - stats, vmulti->max_polygons); - stats = stats ? stats + 1 : NULL; - max_dispatch_width = vmulti->dispatch_width; - - } else if (simd8_cfg) { + if (simd8_cfg) { prog_data->dispatch_8 = true; g.generate_code(simd8_cfg, 8, v8->shader_stats, v8->performance_analysis.require(), stats, 1); diff --git a/src/intel/compiler/elk/elk_fs.h b/src/intel/compiler/elk/elk_fs.h index e97cd7e0de2..7c5642cc52d 100644 --- a/src/intel/compiler/elk/elk_fs.h +++ b/src/intel/compiler/elk/elk_fs.h @@ -213,7 +213,6 @@ public: void allocate_registers(bool allow_spilling); uint32_t compute_max_register_pressure(); void fixup_3src_null_dest(); - void emit_dummy_mov_instruction(); bool fixup_nomask_control_flow(); void assign_curb_setup(); void assign_urb_setup(); diff --git a/src/intel/compiler/elk/elk_shader.cpp b/src/intel/compiler/elk/elk_shader.cpp index 276b7c0284f..d0636efb3a9 100644 --- a/src/intel/compiler/elk/elk_shader.cpp +++ b/src/intel/compiler/elk/elk_shader.cpp @@ -1343,7 +1343,7 @@ elk_compile_tes(const struct elk_compiler *compiler, } if (is_scalar) { - const unsigned dispatch_width = devinfo->ver >= 20 ? 16 : 8; + const unsigned dispatch_width = 8; elk_fs_visitor v(compiler, ¶ms->base, &key->base, &prog_data->base.base, nir, dispatch_width, params->base.stats != NULL, debug_enabled); diff --git a/src/intel/compiler/elk/elk_simd_selection.cpp b/src/intel/compiler/elk/elk_simd_selection.cpp index 08815e4a27c..04b87c522c2 100644 --- a/src/intel/compiler/elk/elk_simd_selection.cpp +++ b/src/intel/compiler/elk/elk_simd_selection.cpp @@ -102,8 +102,7 @@ elk_simd_should_compile(elk_simd_selection_state &state, unsigned simd) unsigned max_threads = state.devinfo->max_cs_workgroup_threads; - const unsigned min_simd = state.devinfo->ver >= 20 ? 1 : 0; - if (simd > min_simd && state.compiled[simd - 1] && + if (simd > 0 && state.compiled[simd - 1] && workgroup_size <= (width / 2)) { state.error[simd] = "Workgroup size already fits in smaller SIMD"; return false; @@ -119,7 +118,7 @@ elk_simd_should_compile(elk_simd_selection_state &state, unsigned simd) * * TODO: Use performance_analysis and drop this rule. */ - if (width == 32 && state.devinfo->ver < 20) { + if (width == 32) { if (!INTEL_DEBUG(DEBUG_DO32) && (state.compiled[0] || state.compiled[1])) { state.error[simd] = "SIMD32 not required (use INTEL_DEBUG=do32 to force)"; return false; @@ -127,11 +126,6 @@ elk_simd_should_compile(elk_simd_selection_state &state, unsigned simd) } } - if (width == 8 && state.devinfo->ver >= 20) { - state.error[simd] = "SIMD8 not supported on Xe2+"; - return false; - } - uint64_t start; switch (prog_data->stage) { case MESA_SHADER_COMPUTE: diff --git a/src/intel/compiler/elk/elk_vec4.cpp b/src/intel/compiler/elk/elk_vec4.cpp index a51722e07e7..d6a24d23ccb 100644 --- a/src/intel/compiler/elk/elk_vec4.cpp +++ b/src/intel/compiler/elk/elk_vec4.cpp @@ -2648,7 +2648,7 @@ elk_compile_vs(const struct elk_compiler *compiler, } if (is_scalar) { - const unsigned dispatch_width = compiler->devinfo->ver >= 20 ? 16 : 8; + const unsigned dispatch_width = 8; prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8; elk_fs_visitor v(compiler, ¶ms->base, &key->base, diff --git a/src/intel/compiler/elk/elk_vec4_tcs.cpp b/src/intel/compiler/elk/elk_vec4_tcs.cpp index be28602c0c6..f1efa660f71 100644 --- a/src/intel/compiler/elk/elk_vec4_tcs.cpp +++ b/src/intel/compiler/elk/elk_vec4_tcs.cpp @@ -447,7 +447,7 @@ elk_compile_tcs(const struct elk_compiler *compiler, } if (is_scalar) { - const unsigned dispatch_width = devinfo->ver >= 20 ? 16 : 8; + const unsigned dispatch_width = 8; elk_fs_visitor v(compiler, ¶ms->base, &key->base, &prog_data->base.base, nir, dispatch_width, params->base.stats != NULL, debug_enabled);