From fdb029fe1b40a8d86b3fec5e2800a0bf81e065b0 Mon Sep 17 00:00:00 2001 From: Caio Oliveira Date: Fri, 12 Jul 2024 14:20:57 -0700 Subject: [PATCH] intel/brw: Move and reduce scope of run_*() functions Reviewed-by: Ian Romanick Part-of: --- src/intel/compiler/brw_compile_bs.cpp | 32 +- src/intel/compiler/brw_compile_cs.cpp | 42 ++- src/intel/compiler/brw_compile_fs.cpp | 117 ++++++- src/intel/compiler/brw_compile_gs.cpp | 53 +++- src/intel/compiler/brw_compile_mesh.cpp | 36 ++- src/intel/compiler/brw_compile_tcs.cpp | 59 +++- src/intel/compiler/brw_compile_tes.cpp | 32 +- src/intel/compiler/brw_compile_vs.cpp | 32 +- src/intel/compiler/brw_fs.cpp | 394 ------------------------ src/intel/compiler/brw_fs.h | 9 - 10 files changed, 389 insertions(+), 417 deletions(-) diff --git a/src/intel/compiler/brw_compile_bs.cpp b/src/intel/compiler/brw_compile_bs.cpp index 83d9f020194..2d6a3c54e4a 100644 --- a/src/intel/compiler/brw_compile_bs.cpp +++ b/src/intel/compiler/brw_compile_bs.cpp @@ -28,6 +28,36 @@ brw_bsr(const struct intel_device_info *devinfo, SET_BITS(local_arg_offset / 8, 2, 0); } +static bool +run_bs(fs_visitor &s, bool allow_spilling) +{ + assert(s.stage >= MESA_SHADER_RAYGEN && s.stage <= MESA_SHADER_CALLABLE); + + s.payload_ = new bs_thread_payload(s); + + nir_to_brw(&s); + + if (s.failed) + return false; + + /* TODO(RT): Perhaps rename this? */ + s.emit_cs_terminate(); + + s.calculate_cfg(); + + brw_fs_optimize(s); + + s.assign_curb_setup(); + + brw_fs_lower_3src_null_dest(s); + brw_fs_workaround_memory_fence_before_eot(s); + brw_fs_workaround_emit_dummy_mov_instruction(s); + + s.allocate_registers(allow_spilling); + + return !s.failed; +} + static uint8_t compile_single_bs(const struct brw_compiler *compiler, struct brw_compile_bs_params *params, @@ -78,7 +108,7 @@ compile_single_bs(const struct brw_compiler *compiler, debug_enabled); const bool allow_spilling = !brw_simd_any_compiled(simd_state); - if (v[simd]->run_bs(allow_spilling)) { + if (run_bs(*v[simd], allow_spilling)) { brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers); } else { simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, diff --git a/src/intel/compiler/brw_compile_cs.cpp b/src/intel/compiler/brw_compile_cs.cpp index 90cbfc7afd7..5dd387979d4 100644 --- a/src/intel/compiler/brw_compile_cs.cpp +++ b/src/intel/compiler/brw_compile_cs.cpp @@ -4,6 +4,7 @@ */ #include "brw_fs.h" +#include "brw_fs_builder.h" #include "brw_fs_live_variables.h" #include "brw_nir.h" #include "brw_cfg.h" @@ -15,6 +16,8 @@ #include +using namespace brw; + static void fill_push_const_block_info(struct brw_push_const_block *block, unsigned dwords) { @@ -56,6 +59,43 @@ cs_fill_push_const_info(const struct intel_device_info *devinfo, prog_data->nr_params); } +static bool +run_cs(fs_visitor &s, bool allow_spilling) +{ + assert(gl_shader_stage_is_compute(s.stage)); + const fs_builder bld = fs_builder(&s).at_end(); + + s.payload_ = new cs_thread_payload(s); + + if (s.devinfo->platform == INTEL_PLATFORM_HSW && s.prog_data->total_shared > 0) { + /* Move SLM index from g0.0[27:24] to sr0.1[11:8] */ + const fs_builder abld = bld.exec_all().group(1, 0); + abld.MOV(retype(brw_sr0_reg(1), BRW_TYPE_UW), + suboffset(retype(brw_vec1_grf(0, 0), BRW_TYPE_UW), 1)); + } + + nir_to_brw(&s); + + if (s.failed) + return false; + + s.emit_cs_terminate(); + + s.calculate_cfg(); + + brw_fs_optimize(s); + + s.assign_curb_setup(); + + brw_fs_lower_3src_null_dest(s); + brw_fs_workaround_memory_fence_before_eot(s); + brw_fs_workaround_emit_dummy_mov_instruction(s); + + s.allocate_registers(allow_spilling); + + return !s.failed; +} + const unsigned * brw_compile_cs(const struct brw_compiler *compiler, struct brw_compile_cs_params *params) @@ -119,7 +159,7 @@ brw_compile_cs(const struct brw_compiler *compiler, const bool allow_spilling = first < 0 || nir->info.workgroup_size_variable; - if (v[simd]->run_cs(allow_spilling)) { + if (run_cs(*v[simd], allow_spilling)) { cs_fill_push_const_info(compiler->devinfo, prog_data); brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers); diff --git a/src/intel/compiler/brw_compile_fs.cpp b/src/intel/compiler/brw_compile_fs.cpp index 59565502221..234050a977b 100644 --- a/src/intel/compiler/brw_compile_fs.cpp +++ b/src/intel/compiler/brw_compile_fs.cpp @@ -5,6 +5,7 @@ #include "brw_eu.h" #include "brw_fs.h" +#include "brw_fs_builder.h" #include "brw_fs_live_variables.h" #include "brw_nir.h" #include "brw_cfg.h" @@ -590,6 +591,110 @@ brw_nir_populate_wm_prog_data(nir_shader *shader, brw_compute_flat_inputs(prog_data, shader); } +/* From the SKL PRM, Volume 16, Workarounds: + * + * 0877 3D Pixel Shader Hang possible when pixel shader dispatched with + * only header phases (R0-R2) + * + * WA: Enable a non-header phase (e.g. push constant) when dispatch would + * have been header only. + * + * Instead of enabling push constants one can alternatively enable one of the + * inputs. Here one simply chooses "layer" which shouldn't impose much + * overhead. + */ +static void +gfx9_ps_header_only_workaround(struct brw_wm_prog_data *wm_prog_data) +{ + if (wm_prog_data->num_varying_inputs) + return; + + if (wm_prog_data->base.curb_read_length) + return; + + wm_prog_data->urb_setup[VARYING_SLOT_LAYER] = 0; + wm_prog_data->num_varying_inputs = 1; + + brw_compute_urb_setup_index(wm_prog_data); +} + +static bool +run_fs(fs_visitor &s, bool allow_spilling, bool do_rep_send) +{ + const struct intel_device_info *devinfo = s.devinfo; + struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(s.prog_data); + brw_wm_prog_key *wm_key = (brw_wm_prog_key *) s.key; + const fs_builder bld = fs_builder(&s).at_end(); + const nir_shader *nir = s.nir; + + assert(s.stage == MESA_SHADER_FRAGMENT); + + s.payload_ = new fs_thread_payload(s, s.source_depth_to_render_target); + + if (nir->info.ray_queries > 0) + s.limit_dispatch_width(16, "SIMD32 not supported with ray queries.\n"); + + if (do_rep_send) { + assert(s.dispatch_width == 16); + s.emit_repclear_shader(); + } else { + if (nir->info.inputs_read > 0 || + BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) || + (nir->info.outputs_read > 0 && !wm_key->coherent_fb_fetch)) { + s.emit_interpolation_setup(); + } + + /* We handle discards by keeping track of the still-live pixels in f0.1. + * Initialize it with the dispatched pixels. + */ + if (devinfo->ver >= 20 || wm_prog_data->uses_kill) { + const unsigned lower_width = MIN2(s.dispatch_width, 16); + for (unsigned i = 0; i < s.dispatch_width / lower_width; i++) { + /* According to the "PS Thread Payload for Normal + * Dispatch" pages on the BSpec, the dispatch mask is + * stored in R0.15/R1.15 on gfx20+ and in R1.7/R2.7 on + * gfx6+. + */ + const brw_reg dispatch_mask = + devinfo->ver >= 20 ? xe2_vec1_grf(i, 15) : + brw_vec1_grf(i + 1, 7); + bld.exec_all().group(1, 0) + .MOV(brw_sample_mask_reg(bld.group(lower_width, i)), + retype(dispatch_mask, BRW_TYPE_UW)); + } + } + + if (nir->info.writes_memory) + wm_prog_data->has_side_effects = true; + + nir_to_brw(&s); + + if (s.failed) + return false; + + s.emit_fb_writes(); + + s.calculate_cfg(); + + brw_fs_optimize(s); + + s.assign_curb_setup(); + + if (devinfo->ver == 9) + gfx9_ps_header_only_workaround(wm_prog_data); + + s.assign_urb_setup(); + + brw_fs_lower_3src_null_dest(s); + brw_fs_workaround_memory_fence_before_eot(s); + brw_fs_workaround_emit_dummy_mov_instruction(s); + + s.allocate_registers(allow_spilling); + } + + return !s.failed; +} + const unsigned * brw_compile_fs(const struct brw_compiler *compiler, struct brw_compile_fs_params *params) @@ -644,7 +749,7 @@ brw_compile_fs(const struct brw_compiler *compiler, prog_data, nir, 8, 1, params->base.stats != NULL, debug_enabled); - if (!v8->run_fs(allow_spilling, false /* do_rep_send */)) { + if (!run_fs(*v8, allow_spilling, false /* do_rep_send */)) { params->base.error_str = ralloc_strdup(params->base.mem_ctx, v8->fail_msg); return NULL; @@ -680,7 +785,7 @@ brw_compile_fs(const struct brw_compiler *compiler, debug_enabled); if (v8) v16->import_uniforms(v8.get()); - if (!v16->run_fs(allow_spilling, params->use_rep_send)) { + if (!run_fs(*v16, allow_spilling, params->use_rep_send)) { brw_shader_perf_log(compiler, params->base.log_data, "SIMD16 shader failed to compile: %s\n", v16->fail_msg); @@ -715,7 +820,7 @@ brw_compile_fs(const struct brw_compiler *compiler, else if (v16) v32->import_uniforms(v16.get()); - if (!v32->run_fs(allow_spilling, false)) { + if (!run_fs(*v32, allow_spilling, false)) { brw_shader_perf_log(compiler, params->base.log_data, "SIMD32 shader failed to compile: %s\n", v32->fail_msg); @@ -752,7 +857,7 @@ brw_compile_fs(const struct brw_compiler *compiler, params->base.stats != NULL, debug_enabled); vmulti->import_uniforms(vbase); - if (!vmulti->run_fs(false, params->use_rep_send)) { + if (!run_fs(*vmulti, false, params->use_rep_send)) { brw_shader_perf_log(compiler, params->base.log_data, "Quad-SIMD8 shader failed to compile: %s\n", vmulti->fail_msg); @@ -772,7 +877,7 @@ brw_compile_fs(const struct brw_compiler *compiler, params->base.stats != NULL, debug_enabled); vmulti->import_uniforms(vbase); - if (!vmulti->run_fs(false, params->use_rep_send)) { + if (!run_fs(*vmulti, false, params->use_rep_send)) { brw_shader_perf_log(compiler, params->base.log_data, "Dual-SIMD16 shader failed to compile: %s\n", vmulti->fail_msg); @@ -791,7 +896,7 @@ brw_compile_fs(const struct brw_compiler *compiler, params->base.stats != NULL, debug_enabled); vmulti->import_uniforms(vbase); - if (!vmulti->run_fs(allow_spilling, params->use_rep_send)) { + if (!run_fs(*vmulti, allow_spilling, params->use_rep_send)) { brw_shader_perf_log(compiler, params->base.log_data, "Dual-SIMD8 shader failed to compile: %s\n", vmulti->fail_msg); diff --git a/src/intel/compiler/brw_compile_gs.cpp b/src/intel/compiler/brw_compile_gs.cpp index 69055e58503..d8b645d21d0 100644 --- a/src/intel/compiler/brw_compile_gs.cpp +++ b/src/intel/compiler/brw_compile_gs.cpp @@ -5,11 +5,14 @@ #include "brw_eu.h" #include "brw_fs.h" +#include "brw_fs_builder.h" #include "brw_prim.h" #include "brw_nir.h" #include "brw_private.h" #include "dev/intel_debug.h" +using namespace brw; + static const GLuint gl_prim_to_hw_prim[MESA_PRIM_TRIANGLE_STRIP_ADJACENCY+1] = { [MESA_PRIM_POINTS] =_3DPRIM_POINTLIST, [MESA_PRIM_LINES] = _3DPRIM_LINELIST, @@ -27,6 +30,54 @@ static const GLuint gl_prim_to_hw_prim[MESA_PRIM_TRIANGLE_STRIP_ADJACENCY+1] = { [MESA_PRIM_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ, }; +static bool +run_gs(fs_visitor &s) +{ + assert(s.stage == MESA_SHADER_GEOMETRY); + + s.payload_ = new gs_thread_payload(s); + + const fs_builder bld = fs_builder(&s).at_end(); + + s.final_gs_vertex_count = bld.vgrf(BRW_TYPE_UD); + + if (s.gs_compile->control_data_header_size_bits > 0) { + /* Create a VGRF to store accumulated control data bits. */ + s.control_data_bits = bld.vgrf(BRW_TYPE_UD); + + /* If we're outputting more than 32 control data bits, then EmitVertex() + * will set control_data_bits to 0 after emitting the first vertex. + * Otherwise, we need to initialize it to 0 here. + */ + if (s.gs_compile->control_data_header_size_bits <= 32) { + const fs_builder abld = bld.annotate("initialize control data bits"); + abld.MOV(s.control_data_bits, brw_imm_ud(0u)); + } + } + + nir_to_brw(&s); + + s.emit_gs_thread_end(); + + if (s.failed) + return false; + + s.calculate_cfg(); + + brw_fs_optimize(s); + + s.assign_curb_setup(); + s.assign_gs_urb_setup(); + + brw_fs_lower_3src_null_dest(s); + brw_fs_workaround_memory_fence_before_eot(s); + brw_fs_workaround_emit_dummy_mov_instruction(s); + + s.allocate_registers(true /* allow_spilling */); + + return !s.failed; +} + extern "C" const unsigned * brw_compile_gs(const struct brw_compiler *compiler, struct brw_compile_gs_params *params) @@ -244,7 +295,7 @@ brw_compile_gs(const struct brw_compiler *compiler, fs_visitor v(compiler, ¶ms->base, &c, prog_data, nir, params->base.stats != NULL, debug_enabled); - if (v.run_gs()) { + if (run_gs(v)) { prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8; assert(v.payload().num_regs % reg_unit(compiler->devinfo) == 0); diff --git a/src/intel/compiler/brw_compile_mesh.cpp b/src/intel/compiler/brw_compile_mesh.cpp index 3095613ada4..0a272ea8d33 100644 --- a/src/intel/compiler/brw_compile_mesh.cpp +++ b/src/intel/compiler/brw_compile_mesh.cpp @@ -259,6 +259,38 @@ brw_nir_align_launch_mesh_workgroups(nir_shader *nir) NULL); } +static bool +run_task_mesh(fs_visitor &s, bool allow_spilling) +{ + assert(s.stage == MESA_SHADER_TASK || + s.stage == MESA_SHADER_MESH); + + s.payload_ = new task_mesh_thread_payload(s); + + nir_to_brw(&s); + + if (s.failed) + return false; + + s.emit_urb_fence(); + + s.emit_cs_terminate(); + + s.calculate_cfg(); + + brw_fs_optimize(s); + + s.assign_curb_setup(); + + brw_fs_lower_3src_null_dest(s); + brw_fs_workaround_memory_fence_before_eot(s); + brw_fs_workaround_emit_dummy_mov_instruction(s); + + s.allocate_registers(allow_spilling); + + return !s.failed; +} + const unsigned * brw_compile_task(const struct brw_compiler *compiler, struct brw_compile_task_params *params) @@ -331,7 +363,7 @@ brw_compile_task(const struct brw_compiler *compiler, } const bool allow_spilling = !brw_simd_any_compiled(simd_state); - if (v[simd]->run_task(allow_spilling)) + if (run_task_mesh(*v[simd], allow_spilling)) brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers); else simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg); @@ -1621,7 +1653,7 @@ brw_compile_mesh(const struct brw_compiler *compiler, } const bool allow_spilling = !brw_simd_any_compiled(simd_state); - if (v[simd]->run_mesh(allow_spilling)) + if (run_task_mesh(*v[simd], allow_spilling)) brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers); else simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg); diff --git a/src/intel/compiler/brw_compile_tcs.cpp b/src/intel/compiler/brw_compile_tcs.cpp index cfee80652a9..2502778f8b5 100644 --- a/src/intel/compiler/brw_compile_tcs.cpp +++ b/src/intel/compiler/brw_compile_tcs.cpp @@ -7,9 +7,12 @@ #include "intel_nir.h" #include "brw_nir.h" #include "brw_fs.h" +#include "brw_fs_builder.h" #include "brw_private.h" #include "dev/intel_debug.h" +using namespace brw; + /** * Return the number of patches to accumulate before a MULTI_PATCH mode thread is * launched. In cases with a large number of input control points and a large @@ -39,6 +42,60 @@ get_patch_count_threshold(int input_control_points) return 1; } +static bool +run_tcs(fs_visitor &s) +{ + assert(s.stage == MESA_SHADER_TESS_CTRL); + + struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(s.prog_data); + const fs_builder bld = fs_builder(&s).at_end(); + + assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH || + vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH); + + s.payload_ = new tcs_thread_payload(s); + + /* Initialize gl_InvocationID */ + s.set_tcs_invocation_id(); + + const bool fix_dispatch_mask = + vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH && + (s.nir->info.tess.tcs_vertices_out % 8) != 0; + + /* Fix the disptach mask */ + if (fix_dispatch_mask) { + bld.CMP(bld.null_reg_ud(), s.invocation_id, + brw_imm_ud(s.nir->info.tess.tcs_vertices_out), BRW_CONDITIONAL_L); + bld.IF(BRW_PREDICATE_NORMAL); + } + + nir_to_brw(&s); + + if (fix_dispatch_mask) { + bld.emit(BRW_OPCODE_ENDIF); + } + + s.emit_tcs_thread_end(); + + if (s.failed) + return false; + + s.calculate_cfg(); + + brw_fs_optimize(s); + + s.assign_curb_setup(); + s.assign_tcs_urb_setup(); + + brw_fs_lower_3src_null_dest(s); + brw_fs_workaround_memory_fence_before_eot(s); + brw_fs_workaround_emit_dummy_mov_instruction(s); + + s.allocate_registers(true /* allow_spilling */); + + return !s.failed; +} + extern "C" const unsigned * brw_compile_tcs(const struct brw_compiler *compiler, struct brw_compile_tcs_params *params) @@ -136,7 +193,7 @@ brw_compile_tcs(const struct brw_compiler *compiler, fs_visitor v(compiler, ¶ms->base, &key->base, &prog_data->base.base, nir, dispatch_width, params->base.stats != NULL, debug_enabled); - if (!v.run_tcs()) { + if (!run_tcs(v)) { params->base.error_str = ralloc_strdup(params->base.mem_ctx, v.fail_msg); return NULL; diff --git a/src/intel/compiler/brw_compile_tes.cpp b/src/intel/compiler/brw_compile_tes.cpp index c8baca58cb1..f22c51c9f52 100644 --- a/src/intel/compiler/brw_compile_tes.cpp +++ b/src/intel/compiler/brw_compile_tes.cpp @@ -11,6 +11,36 @@ #include "dev/intel_debug.h" #include "util/macros.h" +static bool +run_tes(fs_visitor &s) +{ + assert(s.stage == MESA_SHADER_TESS_EVAL); + + s.payload_ = new tes_thread_payload(s); + + nir_to_brw(&s); + + if (s.failed) + return false; + + s.emit_urb_writes(); + + s.calculate_cfg(); + + brw_fs_optimize(s); + + s.assign_curb_setup(); + s.assign_tes_urb_setup(); + + brw_fs_lower_3src_null_dest(s); + brw_fs_workaround_memory_fence_before_eot(s); + brw_fs_workaround_emit_dummy_mov_instruction(s); + + s.allocate_registers(true /* allow_spilling */); + + return !s.failed; +} + const unsigned * brw_compile_tes(const struct brw_compiler *compiler, brw_compile_tes_params *params) @@ -109,7 +139,7 @@ brw_compile_tes(const struct brw_compiler *compiler, fs_visitor v(compiler, ¶ms->base, &key->base, &prog_data->base.base, nir, dispatch_width, params->base.stats != NULL, debug_enabled); - if (!v.run_tes()) { + if (!run_tes(v)) { params->base.error_str = ralloc_strdup(params->base.mem_ctx, v.fail_msg); return NULL; diff --git a/src/intel/compiler/brw_compile_vs.cpp b/src/intel/compiler/brw_compile_vs.cpp index 5f103c65e4a..825e9a9af37 100644 --- a/src/intel/compiler/brw_compile_vs.cpp +++ b/src/intel/compiler/brw_compile_vs.cpp @@ -11,6 +11,36 @@ using namespace brw; +static bool +run_vs(fs_visitor &s) +{ + assert(s.stage == MESA_SHADER_VERTEX); + + s.payload_ = new vs_thread_payload(s); + + nir_to_brw(&s); + + if (s.failed) + return false; + + s.emit_urb_writes(); + + s.calculate_cfg(); + + brw_fs_optimize(s); + + s.assign_curb_setup(); + s.assign_vs_urb_setup(); + + brw_fs_lower_3src_null_dest(s); + brw_fs_workaround_memory_fence_before_eot(s); + brw_fs_workaround_emit_dummy_mov_instruction(s); + + s.allocate_registers(true /* allow_spilling */); + + return !s.failed; +} + extern "C" const unsigned * brw_compile_vs(const struct brw_compiler *compiler, struct brw_compile_vs_params *params) @@ -102,7 +132,7 @@ brw_compile_vs(const struct brw_compiler *compiler, fs_visitor v(compiler, ¶ms->base, &key->base, &prog_data->base.base, nir, dispatch_width, params->base.stats != NULL, debug_enabled); - if (!v.run_vs()) { + if (!run_vs(v)) { params->base.error_str = ralloc_strdup(params->base.mem_ctx, v.fail_msg); return NULL; diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 36d5679244b..dab28efaefa 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -2709,36 +2709,6 @@ fs_visitor::allocate_registers(bool allow_spilling) brw_fs_lower_scoreboard(*this); } -bool -fs_visitor::run_vs() -{ - assert(stage == MESA_SHADER_VERTEX); - - payload_ = new vs_thread_payload(*this); - - nir_to_brw(this); - - if (failed) - return false; - - emit_urb_writes(); - - calculate_cfg(); - - brw_fs_optimize(*this); - - assign_curb_setup(); - assign_vs_urb_setup(); - - brw_fs_lower_3src_null_dest(*this); - brw_fs_workaround_memory_fence_before_eot(*this); - brw_fs_workaround_emit_dummy_mov_instruction(*this); - - allocate_registers(true /* allow_spilling */); - - return !failed; -} - void fs_visitor::set_tcs_invocation_id() { @@ -2811,370 +2781,6 @@ fs_visitor::emit_tcs_thread_end() inst->eot = true; } -bool -fs_visitor::run_tcs() -{ - assert(stage == MESA_SHADER_TESS_CTRL); - - struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data); - const fs_builder bld = fs_builder(this).at_end(); - - assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH || - vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH); - - payload_ = new tcs_thread_payload(*this); - - /* Initialize gl_InvocationID */ - set_tcs_invocation_id(); - - const bool fix_dispatch_mask = - vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH && - (nir->info.tess.tcs_vertices_out % 8) != 0; - - /* Fix the disptach mask */ - if (fix_dispatch_mask) { - bld.CMP(bld.null_reg_ud(), invocation_id, - brw_imm_ud(nir->info.tess.tcs_vertices_out), BRW_CONDITIONAL_L); - bld.IF(BRW_PREDICATE_NORMAL); - } - - nir_to_brw(this); - - if (fix_dispatch_mask) { - bld.emit(BRW_OPCODE_ENDIF); - } - - emit_tcs_thread_end(); - - if (failed) - return false; - - calculate_cfg(); - - brw_fs_optimize(*this); - - assign_curb_setup(); - assign_tcs_urb_setup(); - - brw_fs_lower_3src_null_dest(*this); - brw_fs_workaround_memory_fence_before_eot(*this); - brw_fs_workaround_emit_dummy_mov_instruction(*this); - - allocate_registers(true /* allow_spilling */); - - return !failed; -} - -bool -fs_visitor::run_tes() -{ - assert(stage == MESA_SHADER_TESS_EVAL); - - payload_ = new tes_thread_payload(*this); - - nir_to_brw(this); - - if (failed) - return false; - - emit_urb_writes(); - - calculate_cfg(); - - brw_fs_optimize(*this); - - assign_curb_setup(); - assign_tes_urb_setup(); - - brw_fs_lower_3src_null_dest(*this); - brw_fs_workaround_memory_fence_before_eot(*this); - brw_fs_workaround_emit_dummy_mov_instruction(*this); - - allocate_registers(true /* allow_spilling */); - - return !failed; -} - -bool -fs_visitor::run_gs() -{ - assert(stage == MESA_SHADER_GEOMETRY); - - payload_ = new gs_thread_payload(*this); - - const fs_builder bld = fs_builder(this).at_end(); - - this->final_gs_vertex_count = bld.vgrf(BRW_TYPE_UD); - - if (gs_compile->control_data_header_size_bits > 0) { - /* Create a VGRF to store accumulated control data bits. */ - this->control_data_bits = bld.vgrf(BRW_TYPE_UD); - - /* If we're outputting more than 32 control data bits, then EmitVertex() - * will set control_data_bits to 0 after emitting the first vertex. - * Otherwise, we need to initialize it to 0 here. - */ - if (gs_compile->control_data_header_size_bits <= 32) { - const fs_builder abld = bld.annotate("initialize control data bits"); - abld.MOV(this->control_data_bits, brw_imm_ud(0u)); - } - } - - nir_to_brw(this); - - emit_gs_thread_end(); - - if (failed) - return false; - - calculate_cfg(); - - brw_fs_optimize(*this); - - assign_curb_setup(); - assign_gs_urb_setup(); - - brw_fs_lower_3src_null_dest(*this); - brw_fs_workaround_memory_fence_before_eot(*this); - brw_fs_workaround_emit_dummy_mov_instruction(*this); - - allocate_registers(true /* allow_spilling */); - - return !failed; -} - -/* From the SKL PRM, Volume 16, Workarounds: - * - * 0877 3D Pixel Shader Hang possible when pixel shader dispatched with - * only header phases (R0-R2) - * - * WA: Enable a non-header phase (e.g. push constant) when dispatch would - * have been header only. - * - * Instead of enabling push constants one can alternatively enable one of the - * inputs. Here one simply chooses "layer" which shouldn't impose much - * overhead. - */ -static void -gfx9_ps_header_only_workaround(struct brw_wm_prog_data *wm_prog_data) -{ - if (wm_prog_data->num_varying_inputs) - return; - - if (wm_prog_data->base.curb_read_length) - return; - - wm_prog_data->urb_setup[VARYING_SLOT_LAYER] = 0; - wm_prog_data->num_varying_inputs = 1; - - brw_compute_urb_setup_index(wm_prog_data); -} - -bool -fs_visitor::run_fs(bool allow_spilling, bool do_rep_send) -{ - struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data); - brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key; - const fs_builder bld = fs_builder(this).at_end(); - - assert(stage == MESA_SHADER_FRAGMENT); - - payload_ = new fs_thread_payload(*this, source_depth_to_render_target); - - if (nir->info.ray_queries > 0) - limit_dispatch_width(16, "SIMD32 not supported with ray queries.\n"); - - if (do_rep_send) { - assert(dispatch_width == 16); - emit_repclear_shader(); - } else { - if (nir->info.inputs_read > 0 || - BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) || - (nir->info.outputs_read > 0 && !wm_key->coherent_fb_fetch)) { - emit_interpolation_setup(); - } - - /* We handle discards by keeping track of the still-live pixels in f0.1. - * Initialize it with the dispatched pixels. - */ - if (devinfo->ver >= 20 || wm_prog_data->uses_kill) { - const unsigned lower_width = MIN2(dispatch_width, 16); - for (unsigned i = 0; i < dispatch_width / lower_width; i++) { - /* According to the "PS Thread Payload for Normal - * Dispatch" pages on the BSpec, the dispatch mask is - * stored in R0.15/R1.15 on gfx20+ and in R1.7/R2.7 on - * gfx6+. - */ - const brw_reg dispatch_mask = - devinfo->ver >= 20 ? xe2_vec1_grf(i, 15) : - brw_vec1_grf(i + 1, 7); - bld.exec_all().group(1, 0) - .MOV(brw_sample_mask_reg(bld.group(lower_width, i)), - retype(dispatch_mask, BRW_TYPE_UW)); - } - } - - if (nir->info.writes_memory) - wm_prog_data->has_side_effects = true; - - nir_to_brw(this); - - if (failed) - return false; - - emit_fb_writes(); - - calculate_cfg(); - - brw_fs_optimize(*this); - - assign_curb_setup(); - - if (devinfo->ver == 9) - gfx9_ps_header_only_workaround(wm_prog_data); - - assign_urb_setup(); - - brw_fs_lower_3src_null_dest(*this); - brw_fs_workaround_memory_fence_before_eot(*this); - brw_fs_workaround_emit_dummy_mov_instruction(*this); - - allocate_registers(allow_spilling); - } - - return !failed; -} - -bool -fs_visitor::run_cs(bool allow_spilling) -{ - assert(gl_shader_stage_is_compute(stage)); - const fs_builder bld = fs_builder(this).at_end(); - - payload_ = new cs_thread_payload(*this); - - if (devinfo->platform == INTEL_PLATFORM_HSW && prog_data->total_shared > 0) { - /* Move SLM index from g0.0[27:24] to sr0.1[11:8] */ - const fs_builder abld = bld.exec_all().group(1, 0); - abld.MOV(retype(brw_sr0_reg(1), BRW_TYPE_UW), - suboffset(retype(brw_vec1_grf(0, 0), BRW_TYPE_UW), 1)); - } - - nir_to_brw(this); - - if (failed) - return false; - - emit_cs_terminate(); - - calculate_cfg(); - - brw_fs_optimize(*this); - - assign_curb_setup(); - - brw_fs_lower_3src_null_dest(*this); - brw_fs_workaround_memory_fence_before_eot(*this); - brw_fs_workaround_emit_dummy_mov_instruction(*this); - - allocate_registers(allow_spilling); - - return !failed; -} - -bool -fs_visitor::run_bs(bool allow_spilling) -{ - assert(stage >= MESA_SHADER_RAYGEN && stage <= MESA_SHADER_CALLABLE); - - payload_ = new bs_thread_payload(*this); - - nir_to_brw(this); - - if (failed) - return false; - - /* TODO(RT): Perhaps rename this? */ - emit_cs_terminate(); - - calculate_cfg(); - - brw_fs_optimize(*this); - - assign_curb_setup(); - - brw_fs_lower_3src_null_dest(*this); - brw_fs_workaround_memory_fence_before_eot(*this); - brw_fs_workaround_emit_dummy_mov_instruction(*this); - - allocate_registers(allow_spilling); - - return !failed; -} - -bool -fs_visitor::run_task(bool allow_spilling) -{ - assert(stage == MESA_SHADER_TASK); - - payload_ = new task_mesh_thread_payload(*this); - - nir_to_brw(this); - - if (failed) - return false; - - emit_urb_fence(); - - emit_cs_terminate(); - - calculate_cfg(); - - brw_fs_optimize(*this); - - assign_curb_setup(); - - brw_fs_lower_3src_null_dest(*this); - brw_fs_workaround_memory_fence_before_eot(*this); - brw_fs_workaround_emit_dummy_mov_instruction(*this); - - allocate_registers(allow_spilling); - - return !failed; -} - -bool -fs_visitor::run_mesh(bool allow_spilling) -{ - assert(stage == MESA_SHADER_MESH); - - payload_ = new task_mesh_thread_payload(*this); - - nir_to_brw(this); - - if (failed) - return false; - - emit_urb_fence(); - - emit_cs_terminate(); - - calculate_cfg(); - - brw_fs_optimize(*this); - - assign_curb_setup(); - - brw_fs_lower_3src_null_dest(*this); - brw_fs_workaround_memory_fence_before_eot(*this); - brw_fs_workaround_emit_dummy_mov_instruction(*this); - - allocate_registers(allow_spilling); - - return !failed; -} - - /** * Move load_interpolated_input with simple (payload-based) barycentric modes * to the top of the program so we don't emit multiple PLNs for the same input. diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index 2a1cbaa095e..40f48864034 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -301,15 +301,6 @@ public: uint8_t alignment, unsigned components); - bool run_fs(bool allow_spilling, bool do_rep_send); - bool run_vs(); - bool run_tcs(); - bool run_tes(); - bool run_gs(); - bool run_cs(bool allow_spilling); - bool run_bs(bool allow_spilling); - bool run_task(bool allow_spilling); - bool run_mesh(bool allow_spilling); void allocate_registers(bool allow_spilling); uint32_t compute_max_register_pressure(); void assign_curb_setup();