From 7c23b90537e359853d8ecd11c275e13ff23b0469 Mon Sep 17 00:00:00 2001 From: Caio Oliveira Date: Wed, 14 Feb 2024 22:41:17 -0800 Subject: [PATCH] intel/brw: Always use scalar shaders Remove scalar_stage[] array, since now it is always scalar. This removes any usage of vec4 shaders in brw. Reviewed-by: Kenneth Graunke Part-of: --- src/intel/compiler/brw_compile_gs.cpp | 149 +++----------- src/intel/compiler/brw_compile_tcs.cpp | 81 +++----- src/intel/compiler/brw_compile_vs.cpp | 90 +++------ src/intel/compiler/brw_compiler.c | 122 ++++-------- src/intel/compiler/brw_compiler.h | 1 - src/intel/compiler/brw_nir.c | 181 +++++++----------- src/intel/compiler/brw_nir.h | 3 +- src/intel/compiler/brw_nir_rt.c | 3 +- src/intel/compiler/brw_shader.cpp | 81 +++----- src/intel/compiler/brw_shader.h | 14 +- src/intel/vulkan/anv_device.c | 8 +- .../vulkan/anv_nir_compute_push_layout.c | 14 +- 12 files changed, 224 insertions(+), 523 deletions(-) diff --git a/src/intel/compiler/brw_compile_gs.cpp b/src/intel/compiler/brw_compile_gs.cpp index a63b34de831..19728e2d5c2 100644 --- a/src/intel/compiler/brw_compile_gs.cpp +++ b/src/intel/compiler/brw_compile_gs.cpp @@ -3,8 +3,6 @@ * SPDX-License-Identifier: MIT */ -#include "brw_vec4_gs_visitor.h" -#include "gfx6_gs_visitor.h" #include "brw_eu.h" #include "brw_fs.h" #include "brw_prim.h" @@ -41,7 +39,6 @@ brw_compile_gs(const struct brw_compiler *compiler, memset(&c, 0, sizeof(c)); c.key = *key; - const bool is_scalar = compiler->scalar_stage[MESA_SHADER_GEOMETRY]; const bool debug_enabled = brw_should_print_shader(nir, DEBUG_GS); prog_data->base.base.stage = MESA_SHADER_GEOMETRY; @@ -266,135 +263,33 @@ brw_compile_gs(const struct brw_compiler *compiler, brw_print_vue_map(stderr, &prog_data->base.vue_map, MESA_SHADER_GEOMETRY); } - if (is_scalar) { - fs_visitor v(compiler, &params->base, &c, prog_data, nir, - params->base.stats != NULL, debug_enabled); - if (v.run_gs()) { - prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8; + 
fs_visitor v(compiler, &params->base, &c, prog_data, nir, + params->base.stats != NULL, debug_enabled); + if (v.run_gs()) { + prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8; - assert(v.payload().num_regs % reg_unit(compiler->devinfo) == 0); - prog_data->base.base.dispatch_grf_start_reg = - v.payload().num_regs / reg_unit(compiler->devinfo); + assert(v.payload().num_regs % reg_unit(compiler->devinfo) == 0); + prog_data->base.base.dispatch_grf_start_reg = + v.payload().num_regs / reg_unit(compiler->devinfo); - fs_generator g(compiler, &params->base, - &prog_data->base.base, false, MESA_SHADER_GEOMETRY); - if (unlikely(debug_enabled)) { - const char *label = - nir->info.label ? nir->info.label : "unnamed"; - char *name = ralloc_asprintf(params->base.mem_ctx, - "%s geometry shader %s", - label, nir->info.name); - g.enable_debug(name); - } - g.generate_code(v.cfg, v.dispatch_width, v.shader_stats, - v.performance_analysis.require(), params->base.stats); - g.add_const_data(nir->constant_data, nir->constant_data_size); - return g.get_assembly(); + fs_generator g(compiler, &params->base, + &prog_data->base.base, false, MESA_SHADER_GEOMETRY); + if (unlikely(debug_enabled)) { + const char *label = + nir->info.label ? nir->info.label : "unnamed"; + char *name = ralloc_asprintf(params->base.mem_ctx, + "%s geometry shader %s", + label, nir->info.name); + g.enable_debug(name); } - - params->base.error_str = ralloc_strdup(params->base.mem_ctx, v.fail_msg); - - return NULL; + g.generate_code(v.cfg, v.dispatch_width, v.shader_stats, + v.performance_analysis.require(), params->base.stats); + g.add_const_data(nir->constant_data, nir->constant_data_size); + return g.get_assembly(); } - if (compiler->devinfo->ver >= 7) { - /* Compile the geometry shader in DUAL_OBJECT dispatch mode, if we can do - * so without spilling. If the GS invocations count > 1, then we can't use - * dual object mode. 
- */ - if (prog_data->invocations <= 1 && - !INTEL_DEBUG(DEBUG_NO_DUAL_OBJECT_GS)) { - prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT; + params->base.error_str = ralloc_strdup(params->base.mem_ctx, v.fail_msg); - brw::vec4_gs_visitor v(compiler, &params->base, &c, prog_data, nir, - true /* no_spills */, - debug_enabled); - - /* Backup 'nr_params' and 'param' as they can be modified by the - * the DUAL_OBJECT visitor. If it fails, we will run the fallback - * (DUAL_INSTANCED or SINGLE mode) and we need to restore original - * values. - */ - const unsigned param_count = prog_data->base.base.nr_params; - uint32_t *param = ralloc_array(NULL, uint32_t, param_count); - memcpy(param, prog_data->base.base.param, - sizeof(uint32_t) * param_count); - - if (v.run()) { - /* Success! Backup is not needed */ - ralloc_free(param); - return brw_vec4_generate_assembly(compiler, &params->base, - nir, &prog_data->base, - v.cfg, - v.performance_analysis.require(), - debug_enabled); - } else { - /* These variables could be modified by the execution of the GS - * visitor if it packed the uniforms in the push constant buffer. - * As it failed, we need restore them so we can start again with - * DUAL_INSTANCED or SINGLE mode. - * - * FIXME: Could more variables be modified by this execution? - */ - memcpy(prog_data->base.base.param, param, - sizeof(uint32_t) * param_count); - prog_data->base.base.nr_params = param_count; - ralloc_free(param); - } - } - } - - /* Either we failed to compile in DUAL_OBJECT mode (probably because it - * would have required spilling) or DUAL_OBJECT mode is disabled. So fall - * back to DUAL_INSTANCED or SINGLE mode, which consumes fewer registers. - * - * FIXME: Single dispatch mode requires that the driver can handle - * interleaving of input registers, but this is already supported (dual - * instance mode has the same requirement). 
However, to take full advantage - * of single dispatch mode to reduce register pressure we would also need to - * do interleaved outputs, but currently, the vec4 visitor and generator - * classes do not support this, so at the moment register pressure in - * single and dual instance modes is the same. - * - * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 "3DSTATE_GS" - * "If InstanceCount>1, DUAL_OBJECT mode is invalid. Software will likely - * want to use DUAL_INSTANCE mode for higher performance, but SINGLE mode - * is also supported. When InstanceCount=1 (one instance per object) software - * can decide which dispatch mode to use. DUAL_OBJECT mode would likely be - * the best choice for performance, followed by SINGLE mode." - * - * So SINGLE mode is more performant when invocations == 1 and DUAL_INSTANCE - * mode is more performant when invocations > 1. Gfx6 only supports - * SINGLE mode. - */ - if (prog_data->invocations <= 1 || compiler->devinfo->ver < 7) - prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_4X1_SINGLE; - else - prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_4X2_DUAL_INSTANCE; - - brw::vec4_gs_visitor *gs = NULL; - const unsigned *ret = NULL; - - if (compiler->devinfo->ver >= 7) - gs = new brw::vec4_gs_visitor(compiler, &params->base, &c, prog_data, - nir, false /* no_spills */, - debug_enabled); - else - gs = new brw::gfx6_gs_visitor(compiler, &params->base, &c, prog_data, - nir, false /* no_spills */, - debug_enabled); - - if (!gs->run()) { - params->base.error_str = - ralloc_strdup(params->base.mem_ctx, gs->fail_msg); - } else { - ret = brw_vec4_generate_assembly(compiler, &params->base, nir, - &prog_data->base, gs->cfg, - gs->performance_analysis.require(), - debug_enabled); - } - - delete gs; - return ret; + return NULL; } diff --git a/src/intel/compiler/brw_compile_tcs.cpp b/src/intel/compiler/brw_compile_tcs.cpp index adce8e38e40..31b0a4ecdae 100644 --- a/src/intel/compiler/brw_compile_tcs.cpp +++ b/src/intel/compiler/brw_compile_tcs.cpp @@ 
-3,9 +3,9 @@ * SPDX-License-Identifier: MIT */ +#include "brw_eu.h" #include "intel_nir.h" #include "brw_nir.h" -#include "brw_vec4_tcs.h" #include "brw_fs.h" #include "brw_private.h" #include "dev/intel_debug.h" @@ -49,9 +49,7 @@ brw_compile_tcs(const struct brw_compiler *compiler, struct brw_tcs_prog_data *prog_data = params->prog_data; struct brw_vue_prog_data *vue_prog_data = &prog_data->base; - const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_CTRL]; const bool debug_enabled = brw_should_print_shader(nir, DEBUG_TCS); - const unsigned *assembly; vue_prog_data->base.stage = MESA_SHADER_TESS_CTRL; prog_data->base.base.ray_queries = nir->info.ray_queries; @@ -89,7 +87,7 @@ brw_compile_tcs(const struct brw_compiler *compiler, prog_data->instances = nir->info.tess.tcs_vertices_out; prog_data->include_primitive_id = has_primitive_id; } else { - unsigned verts_per_thread = is_scalar ? 8 : 2; + unsigned verts_per_thread = 8; vue_prog_data->dispatch_mode = INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH; prog_data->instances = DIV_ROUND_UP(nir->info.tess.tcs_vertices_out, verts_per_thread); @@ -135,54 +133,33 @@ brw_compile_tcs(const struct brw_compiler *compiler, brw_print_vue_map(stderr, &vue_prog_data->vue_map, MESA_SHADER_TESS_CTRL); } - if (is_scalar) { - const unsigned dispatch_width = devinfo->ver >= 20 ? 16 : 8; - fs_visitor v(compiler, &params->base, &key->base, - &prog_data->base.base, nir, dispatch_width, - params->base.stats != NULL, debug_enabled); - if (!v.run_tcs()) { - params->base.error_str = - ralloc_strdup(params->base.mem_ctx, v.fail_msg); - return NULL; - } - - assert(v.payload().num_regs % reg_unit(devinfo) == 0); - prog_data->base.base.dispatch_grf_start_reg = v.payload().num_regs / reg_unit(devinfo); - - fs_generator g(compiler, &params->base, - &prog_data->base.base, false, MESA_SHADER_TESS_CTRL); - if (unlikely(debug_enabled)) { - g.enable_debug(ralloc_asprintf(params->base.mem_ctx, - "%s tessellation control shader %s", - nir->info.label ? 
nir->info.label - : "unnamed", - nir->info.name)); - } - - g.generate_code(v.cfg, dispatch_width, v.shader_stats, - v.performance_analysis.require(), params->base.stats); - - g.add_const_data(nir->constant_data, nir->constant_data_size); - - assembly = g.get_assembly(); - } else { - brw::vec4_tcs_visitor v(compiler, &params->base, key, prog_data, - nir, debug_enabled); - if (!v.run()) { - params->base.error_str = - ralloc_strdup(params->base.mem_ctx, v.fail_msg); - return NULL; - } - - if (INTEL_DEBUG(DEBUG_TCS)) - v.dump_instructions(); - - - assembly = brw_vec4_generate_assembly(compiler, &params->base, nir, - &prog_data->base, v.cfg, - v.performance_analysis.require(), - debug_enabled); + const unsigned dispatch_width = devinfo->ver >= 20 ? 16 : 8; + fs_visitor v(compiler, &params->base, &key->base, + &prog_data->base.base, nir, dispatch_width, + params->base.stats != NULL, debug_enabled); + if (!v.run_tcs()) { + params->base.error_str = + ralloc_strdup(params->base.mem_ctx, v.fail_msg); + return NULL; } - return assembly; + assert(v.payload().num_regs % reg_unit(devinfo) == 0); + prog_data->base.base.dispatch_grf_start_reg = v.payload().num_regs / reg_unit(devinfo); + + fs_generator g(compiler, &params->base, + &prog_data->base.base, false, MESA_SHADER_TESS_CTRL); + if (unlikely(debug_enabled)) { + g.enable_debug(ralloc_asprintf(params->base.mem_ctx, + "%s tessellation control shader %s", + nir->info.label ? 
nir->info.label + : "unnamed", + nir->info.name)); + } + + g.generate_code(v.cfg, dispatch_width, v.shader_stats, + v.performance_analysis.require(), params->base.stats); + + g.add_const_data(nir->constant_data, nir->constant_data_size); + + return g.get_assembly(); } diff --git a/src/intel/compiler/brw_compile_vs.cpp b/src/intel/compiler/brw_compile_vs.cpp index c1e089ec280..983c2a837ec 100644 --- a/src/intel/compiler/brw_compile_vs.cpp +++ b/src/intel/compiler/brw_compile_vs.cpp @@ -3,11 +3,9 @@ * SPDX-License-Identifier: MIT */ -#include "brw_vec4.h" #include "brw_fs.h" #include "brw_eu.h" #include "brw_nir.h" -#include "brw_vec4_vs.h" #include "brw_private.h" #include "dev/intel_debug.h" @@ -28,11 +26,8 @@ brw_compile_vs(const struct brw_compiler *compiler, prog_data->base.base.ray_queries = nir->info.ray_queries; prog_data->base.base.total_scratch = 0; - const bool is_scalar = compiler->scalar_stage[MESA_SHADER_VERTEX]; brw_nir_apply_key(nir, compiler, &key->base, 8); - const unsigned *assembly = NULL; - prog_data->inputs_read = nir->info.inputs_read; prog_data->double_inputs_read = nir->info.vs.double_inputs; @@ -83,17 +78,7 @@ brw_compile_vs(const struct brw_compiler *compiler, if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID)) prog_data->uses_drawid = true; - /* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB Entry - * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode. Empirically, in - * vec4 mode, the hardware appears to wedge unless we read something. 
- */ - if (is_scalar) - prog_data->base.urb_read_length = - DIV_ROUND_UP(nr_attribute_slots, 2); - else - prog_data->base.urb_read_length = - DIV_ROUND_UP(MAX2(nr_attribute_slots, 1), 2); - + prog_data->base.urb_read_length = DIV_ROUND_UP(nr_attribute_slots, 2); prog_data->nr_attribute_slots = nr_attribute_slots; /* Since vertex shaders reuse the same VUE entry for inputs and outputs @@ -114,58 +99,37 @@ brw_compile_vs(const struct brw_compiler *compiler, brw_print_vue_map(stderr, &prog_data->base.vue_map, MESA_SHADER_VERTEX); } - if (is_scalar) { - const unsigned dispatch_width = compiler->devinfo->ver >= 20 ? 16 : 8; - prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8; + const unsigned dispatch_width = compiler->devinfo->ver >= 20 ? 16 : 8; + prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8; - fs_visitor v(compiler, &params->base, &key->base, - &prog_data->base.base, nir, dispatch_width, - params->base.stats != NULL, debug_enabled); - if (!v.run_vs()) { - params->base.error_str = - ralloc_strdup(params->base.mem_ctx, v.fail_msg); - return NULL; - } - - assert(v.payload().num_regs % reg_unit(compiler->devinfo) == 0); - prog_data->base.base.dispatch_grf_start_reg = - v.payload().num_regs / reg_unit(compiler->devinfo); - - fs_generator g(compiler, &params->base, - &prog_data->base.base, v.runtime_check_aads_emit, - MESA_SHADER_VERTEX); - if (unlikely(debug_enabled)) { - const char *debug_name = - ralloc_asprintf(params->base.mem_ctx, "%s vertex shader %s", - nir->info.label ? 
nir->info.label : - "unnamed", - nir->info.name); - - g.enable_debug(debug_name); - } - g.generate_code(v.cfg, dispatch_width, v.shader_stats, - v.performance_analysis.require(), params->base.stats); - g.add_const_data(nir->constant_data, nir->constant_data_size); - assembly = g.get_assembly(); + fs_visitor v(compiler, &params->base, &key->base, + &prog_data->base.base, nir, dispatch_width, + params->base.stats != NULL, debug_enabled); + if (!v.run_vs()) { + params->base.error_str = + ralloc_strdup(params->base.mem_ctx, v.fail_msg); + return NULL; } - if (!assembly) { - prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT; + assert(v.payload().num_regs % reg_unit(compiler->devinfo) == 0); + prog_data->base.base.dispatch_grf_start_reg = + v.payload().num_regs / reg_unit(compiler->devinfo); - vec4_vs_visitor v(compiler, &params->base, key, prog_data, - nir, debug_enabled); - if (!v.run()) { - params->base.error_str = - ralloc_strdup(params->base.mem_ctx, v.fail_msg); - return NULL; - } + fs_generator g(compiler, &params->base, + &prog_data->base.base, v.runtime_check_aads_emit, + MESA_SHADER_VERTEX); + if (unlikely(debug_enabled)) { + const char *debug_name = + ralloc_asprintf(params->base.mem_ctx, "%s vertex shader %s", + nir->info.label ? 
nir->info.label : + "unnamed", + nir->info.name); - assembly = brw_vec4_generate_assembly(compiler, &params->base, - nir, &prog_data->base, - v.cfg, - v.performance_analysis.require(), - debug_enabled); + g.enable_debug(debug_name); } + g.generate_code(v.cfg, dispatch_width, v.shader_stats, + v.performance_analysis.require(), params->base.stats); + g.add_const_data(nir->constant_data, nir->constant_data_size); - return assembly; + return g.get_assembly(); } diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c index 51034e3c78c..d7eac3ca69c 100644 --- a/src/intel/compiler/brw_compiler.c +++ b/src/intel/compiler/brw_compiler.c @@ -29,77 +29,51 @@ #include "compiler/nir/nir.h" #include "util/u_debug.h" -#define COMMON_OPTIONS \ - .has_uclz = true, \ - .lower_fdiv = true, \ - .lower_scmp = true, \ - .lower_flrp16 = true, \ - .lower_fmod = true, \ - .lower_ufind_msb = true, \ - .lower_uadd_carry = true, \ - .lower_usub_borrow = true, \ - .lower_flrp64 = true, \ - .lower_fisnormal = true, \ - .lower_isign = true, \ - .lower_ldexp = true, \ - .lower_bitfield_extract = true, \ - .lower_bitfield_insert = true, \ - .lower_device_index_to_zero = true, \ - .vectorize_io = true, \ - .vectorize_tess_levels = true, \ - .use_interpolated_input_intrinsics = true, \ - .lower_insert_byte = true, \ - .lower_insert_word = true, \ - .vertex_id_zero_based = true, \ - .lower_base_vertex = true, \ - .support_16bit_alu = true, \ - .lower_uniforms_to_ubo = true - -#define COMMON_SCALAR_OPTIONS \ - .lower_to_scalar = true, \ - .lower_pack_half_2x16 = true, \ - .lower_pack_snorm_2x16 = true, \ - .lower_pack_snorm_4x8 = true, \ - .lower_pack_unorm_2x16 = true, \ - .lower_pack_unorm_4x8 = true, \ - .lower_unpack_half_2x16 = true, \ - .lower_unpack_snorm_2x16 = true, \ - .lower_unpack_snorm_4x8 = true, \ - .lower_unpack_unorm_2x16 = true, \ - .lower_unpack_unorm_4x8 = true, \ - .lower_hadd64 = true, \ - .avoid_ternary_with_two_constants = true, \ - .has_pack_32_4x8 = 
true, \ - .max_unroll_iterations = 32, \ - .force_indirect_unrolling = nir_var_function_temp, \ - .divergence_analysis_options = \ - (nir_divergence_single_patch_per_tcs_subgroup | \ - nir_divergence_single_patch_per_tes_subgroup | \ - nir_divergence_shader_record_ptr_uniform) - const struct nir_shader_compiler_options brw_scalar_nir_options = { - COMMON_OPTIONS, - COMMON_SCALAR_OPTIONS, -}; - -const struct nir_shader_compiler_options brw_vector_nir_options = { - COMMON_OPTIONS, - - /* In the vec4 backend, our dpN instruction replicates its result to all the - * components of a vec4. We would like NIR to give us replicated fdot - * instructions because it can optimize better for us. - */ - .fdot_replicates = true, - - .lower_usub_sat = true, + .avoid_ternary_with_two_constants = true, + .divergence_analysis_options = + (nir_divergence_single_patch_per_tcs_subgroup | + nir_divergence_single_patch_per_tes_subgroup | + nir_divergence_shader_record_ptr_uniform), + .force_indirect_unrolling = nir_var_function_temp, + .has_pack_32_4x8 = true, + .has_uclz = true, + .lower_base_vertex = true, + .lower_bitfield_extract = true, + .lower_bitfield_insert = true, + .lower_device_index_to_zero = true, + .lower_fdiv = true, + .lower_fisnormal = true, + .lower_flrp16 = true, + .lower_flrp64 = true, + .lower_fmod = true, + .lower_hadd64 = true, + .lower_insert_byte = true, + .lower_insert_word = true, + .lower_isign = true, + .lower_ldexp = true, + .lower_pack_half_2x16 = true, .lower_pack_snorm_2x16 = true, + .lower_pack_snorm_4x8 = true, .lower_pack_unorm_2x16 = true, + .lower_pack_unorm_4x8 = true, + .lower_scmp = true, + .lower_to_scalar = true, + .lower_uadd_carry = true, + .lower_ufind_msb = true, + .lower_uniforms_to_ubo = true, + .lower_unpack_half_2x16 = true, .lower_unpack_snorm_2x16 = true, + .lower_unpack_snorm_4x8 = true, .lower_unpack_unorm_2x16 = true, - .lower_extract_byte = true, - .lower_extract_word = true, - .intel_vec4 = true, + .lower_unpack_unorm_4x8 = true, 
+ .lower_usub_borrow = true, .max_unroll_iterations = 32, + .support_16bit_alu = true, + .use_interpolated_input_intrinsics = true, + .vectorize_io = true, + .vectorize_tess_levels = true, + .vertex_id_zero_based = true, }; struct brw_compiler * @@ -129,15 +103,6 @@ brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo) devinfo->platform != INTEL_PLATFORM_ARL_H) || debug_get_bool_option("INTEL_LOWER_DPAS", false); - /* There is no vec4 mode on Gfx10+, and we don't use it at all on Gfx8+. */ - for (int i = MESA_SHADER_VERTEX; i < MESA_ALL_SHADER_STAGES; i++) { - compiler->scalar_stage[i] = devinfo->ver >= 8 || - i == MESA_SHADER_FRAGMENT || i == MESA_SHADER_COMPUTE; - } - - for (int i = MESA_SHADER_TASK; i < MESA_VULKAN_SHADER_STAGES; i++) - compiler->scalar_stage[i] = true; - nir_lower_int64_options int64_options = nir_lower_imul64 | nir_lower_isign64 | @@ -175,13 +140,8 @@ brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo) for (int i = 0; i < MESA_ALL_SHADER_STAGES; i++) { struct nir_shader_compiler_options *nir_options = rzalloc(compiler, struct nir_shader_compiler_options); - bool is_scalar = compiler->scalar_stage[i]; - if (is_scalar) { - *nir_options = brw_scalar_nir_options; - int64_options |= nir_lower_usub_sat64; - } else { - *nir_options = brw_vector_nir_options; - } + *nir_options = brw_scalar_nir_options; + int64_options |= nir_lower_usub_sat64; /* Prior to Gfx6, there are no three source operations, and Gfx11 loses * LRP. diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h index c7095b5cce5..0b2155a5626 100644 --- a/src/intel/compiler/brw_compiler.h +++ b/src/intel/compiler/brw_compiler.h @@ -86,7 +86,6 @@ struct brw_compiler { void (*shader_debug_log)(void *, unsigned *id, const char *str, ...) PRINTFLIKE(3, 4); void (*shader_perf_log)(void *, unsigned *id, const char *str, ...) 
PRINTFLIKE(3, 4); - bool scalar_stage[MESA_ALL_SHADER_STAGES]; bool use_tcs_multi_patch; struct nir_shader_compiler_options *nir_options[MESA_ALL_SHADER_STAGES]; diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c index 5e2b87d04e0..af691f26aac 100644 --- a/src/intel/compiler/brw_nir.c +++ b/src/intel/compiler/brw_nir.c @@ -714,7 +714,7 @@ brw_nir_lower_fs_outputs(nir_shader *nir) }) void -brw_nir_optimize(nir_shader *nir, bool is_scalar, +brw_nir_optimize(nir_shader *nir, const struct intel_device_info *devinfo) { bool progress; @@ -752,18 +752,11 @@ brw_nir_optimize(nir_shader *nir, bool is_scalar, OPT(nir_opt_ray_queries); OPT(nir_opt_ray_query_ranges); - if (is_scalar) { - OPT(nir_lower_alu_to_scalar, NULL, NULL); - } else { - OPT(nir_opt_shrink_stores, true); - OPT(nir_opt_shrink_vectors); - } + OPT(nir_lower_alu_to_scalar, NULL, NULL); OPT(nir_copy_prop); - if (is_scalar) { - OPT(nir_lower_phis_to_scalar, false); - } + OPT(nir_lower_phis_to_scalar, false); OPT(nir_copy_prop); OPT(nir_opt_dce); @@ -784,15 +777,9 @@ brw_nir_optimize(nir_shader *nir, bool is_scalar, * For indirect loads of uniforms (push constants), we assume that array * indices will nearly always be in bounds and the cost of the load is * low. Therefore there shouldn't be a performance benefit to avoid it. - * However, in vec4 tessellation shaders, these loads operate by - * actually pulling from memory. 
*/ - const bool is_vec4_tessellation = !is_scalar && - (nir->info.stage == MESA_SHADER_TESS_CTRL || - nir->info.stage == MESA_SHADER_TESS_EVAL); - OPT(nir_opt_peephole_select, 0, !is_vec4_tessellation, false); - OPT(nir_opt_peephole_select, 8, !is_vec4_tessellation, - devinfo->ver >= 6); + OPT(nir_opt_peephole_select, 0, true, false); + OPT(nir_opt_peephole_select, 8, true, devinfo->ver >= 6); OPT(nir_opt_intrinsics); OPT(nir_opt_idiv_const, 32); @@ -1014,15 +1001,11 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir, const struct intel_device_info *devinfo = compiler->devinfo; UNUSED bool progress; /* Written by OPT */ - const bool is_scalar = compiler->scalar_stage[nir->info.stage]; - nir_validate_ssa_dominance(nir, "before brw_preprocess_nir"); OPT(nir_lower_frexp); - if (is_scalar) { - OPT(nir_lower_alu_to_scalar, NULL, NULL); - } + OPT(nir_lower_alu_to_scalar, NULL, NULL); if (nir->info.stage == MESA_SHADER_GEOMETRY) OPT(nir_lower_gs_intrinsics, 0); @@ -1081,7 +1064,7 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir, OPT(nir_split_var_copies); OPT(nir_split_struct_vars, nir_var_function_temp); - brw_nir_optimize(nir, is_scalar, devinfo); + brw_nir_optimize(nir, devinfo); OPT(nir_lower_doubles, opts->softfp64, nir->options->lower_doubles_options); if (OPT(nir_lower_int64_float_conversions)) { @@ -1102,9 +1085,7 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir, OPT(nir_opt_large_constants, NULL, 32); } - if (is_scalar) { - OPT(nir_lower_load_const_to_scalar); - } + OPT(nir_lower_load_const_to_scalar); OPT(nir_lower_system_values); nir_lower_compute_system_values_options lower_csv_options = { @@ -1116,7 +1097,6 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir, .ballot_bit_size = 32, .ballot_components = 1, .lower_to_scalar = true, - .lower_vote_trivial = !is_scalar, .lower_relative_shuffle = true, .lower_quad_broadcast_dynamic = true, .lower_elect = true, @@ 
-1142,7 +1122,7 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir, * issues are helped but nothing else in shader-db is hurt except for maybe * that one kerbal space program shader. */ - if (is_scalar && !(indirect_mask & nir_var_function_temp)) + if (!(indirect_mask & nir_var_function_temp)) OPT(nir_lower_indirect_derefs, nir_var_function_temp, 16); /* Lower array derefs of vectors for SSBO and UBO loads. For both UBOs and @@ -1165,7 +1145,7 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir, OPT(intel_nir_clamp_per_vertex_loads); /* Get rid of split copies */ - brw_nir_optimize(nir, is_scalar, devinfo); + brw_nir_optimize(nir, devinfo); } static bool @@ -1321,18 +1301,13 @@ brw_nir_link_shaders(const struct brw_compiler *compiler, nir_validate_shader(producer, "after nir_lower_io_arrays_to_elements"); nir_validate_shader(consumer, "after nir_lower_io_arrays_to_elements"); - const bool p_is_scalar = compiler->scalar_stage[producer->info.stage]; - const bool c_is_scalar = compiler->scalar_stage[consumer->info.stage]; - - if (p_is_scalar && c_is_scalar) { - NIR_PASS(_, producer, nir_lower_io_to_scalar_early, nir_var_shader_out); - NIR_PASS(_, consumer, nir_lower_io_to_scalar_early, nir_var_shader_in); - brw_nir_optimize(producer, p_is_scalar, devinfo); - brw_nir_optimize(consumer, c_is_scalar, devinfo); - } + NIR_PASS(_, producer, nir_lower_io_to_scalar_early, nir_var_shader_out); + NIR_PASS(_, consumer, nir_lower_io_to_scalar_early, nir_var_shader_in); + brw_nir_optimize(producer, devinfo); + brw_nir_optimize(consumer, devinfo); if (nir_link_opt_varyings(producer, consumer)) - brw_nir_optimize(consumer, c_is_scalar, devinfo); + brw_nir_optimize(consumer, devinfo); NIR_PASS(_, producer, nir_remove_dead_variables, nir_var_shader_out, NULL); NIR_PASS(_, consumer, nir_remove_dead_variables, nir_var_shader_in, NULL); @@ -1361,8 +1336,8 @@ brw_nir_link_shaders(const struct brw_compiler *compiler, 
brw_nir_no_indirect_mask(compiler, consumer->info.stage), UINT32_MAX); - brw_nir_optimize(producer, p_is_scalar, devinfo); - brw_nir_optimize(consumer, c_is_scalar, devinfo); + brw_nir_optimize(producer, devinfo); + brw_nir_optimize(consumer, devinfo); if (producer->info.stage == MESA_SHADER_MESH && consumer->info.stage == MESA_SHADER_FRAGMENT) { @@ -1591,48 +1566,45 @@ brw_vectorize_lower_mem_access(nir_shader *nir, enum brw_robustness_flags robust_flags) { bool progress = false; - const bool is_scalar = compiler->scalar_stage[nir->info.stage]; - if (is_scalar) { - nir_load_store_vectorize_options options = { - .modes = nir_var_mem_ubo | nir_var_mem_ssbo | - nir_var_mem_global | nir_var_mem_shared | - nir_var_mem_task_payload, - .callback = brw_nir_should_vectorize_mem, - .robust_modes = (nir_variable_mode)0, - }; + nir_load_store_vectorize_options options = { + .modes = nir_var_mem_ubo | nir_var_mem_ssbo | + nir_var_mem_global | nir_var_mem_shared | + nir_var_mem_task_payload, + .callback = brw_nir_should_vectorize_mem, + .robust_modes = (nir_variable_mode)0, + }; - if (robust_flags & BRW_ROBUSTNESS_UBO) - options.robust_modes |= nir_var_mem_ubo | nir_var_mem_global; - if (robust_flags & BRW_ROBUSTNESS_SSBO) - options.robust_modes |= nir_var_mem_ssbo | nir_var_mem_global; + if (robust_flags & BRW_ROBUSTNESS_UBO) + options.robust_modes |= nir_var_mem_ubo | nir_var_mem_global; + if (robust_flags & BRW_ROBUSTNESS_SSBO) + options.robust_modes |= nir_var_mem_ssbo | nir_var_mem_global; - OPT(nir_opt_load_store_vectorize, &options); + OPT(nir_opt_load_store_vectorize, &options); - /* Only run the blockify optimization on Gfx9+ because although prior HW - * versions have support for block loads, they do have limitations on - * alignment as well as requiring split sends which are not supported - * there. 
+ /* Only run the blockify optimization on Gfx9+ because although prior HW + * versions have support for block loads, they do have limitations on + * alignment as well as requiring split sends which are not supported + * there. + */ + if (compiler->devinfo->ver >= 9) { + /* Required for nir_divergence_analysis() */ + OPT(nir_convert_to_lcssa, true, true); + + /* When HW supports block loads, using the divergence analysis, try + * to find uniform SSBO loads and turn them into block loads. + * + * Rerun the vectorizer after that to make the largest possible block + * loads. + * + * This is a win on 2 fronts : + * - fewer send messages + * - reduced register pressure */ - if (compiler->devinfo->ver >= 9) { - /* Required for nir_divergence_analysis() */ - OPT(nir_convert_to_lcssa, true, true); - - /* When HW supports block loads, using the divergence analysis, try - * to find uniform SSBO loads and turn them into block loads. - * - * Rerun the vectorizer after that to make the largest possible block - * loads. 
- * - * This is a win on 2 fronts : - * - fewer send messages - * - reduced register pressure - */ - nir_divergence_analysis(nir); - if (OPT(intel_nir_blockify_uniform_loads, compiler->devinfo)) - OPT(nir_opt_load_store_vectorize, &options); - OPT(nir_opt_remove_phis); - } + nir_divergence_analysis(nir); + if (OPT(intel_nir_blockify_uniform_loads, compiler->devinfo)) + OPT(nir_opt_load_store_vectorize, &options); + OPT(nir_opt_remove_phis); } nir_lower_mem_access_bit_sizes_options mem_access_options = { @@ -1683,7 +1655,6 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler, enum brw_robustness_flags robust_flags) { const struct intel_device_info *devinfo = compiler->devinfo; - const bool is_scalar = compiler->scalar_stage[nir->info.stage]; UNUSED bool progress; /* Written by OPT */ @@ -1710,20 +1681,20 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler, if (gl_shader_stage_can_set_fragment_shading_rate(nir->info.stage)) NIR_PASS(_, nir, intel_nir_lower_shading_rate_output); - brw_nir_optimize(nir, is_scalar, devinfo); + brw_nir_optimize(nir, devinfo); - if (is_scalar && nir_shader_has_local_variables(nir)) { + if (nir_shader_has_local_variables(nir)) { OPT(nir_lower_vars_to_explicit_types, nir_var_function_temp, glsl_get_natural_size_align_bytes); OPT(nir_lower_explicit_io, nir_var_function_temp, nir_address_format_32bit_offset); - brw_nir_optimize(nir, is_scalar, devinfo); + brw_nir_optimize(nir, devinfo); } brw_vectorize_lower_mem_access(nir, compiler, robust_flags); if (OPT(nir_lower_int64)) - brw_nir_optimize(nir, is_scalar, devinfo); + brw_nir_optimize(nir, devinfo); if (devinfo->ver >= 6) { /* Try and fuse multiply-adds, if successful, run shrink_vectors to @@ -1741,8 +1712,7 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler, OPT(nir_opt_shrink_vectors); } - if (is_scalar) - OPT(intel_nir_opt_peephole_imul32x16); + OPT(intel_nir_opt_peephole_imul32x16); if (OPT(nir_opt_comparison_pre)) 
{ OPT(nir_copy_prop); @@ -1753,27 +1723,15 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler, * the other optimization passes) will have removed at least one * instruction from one of the branches of the if-statement, so now it * might be under the threshold of conversion to bcsel. - * - * See brw_nir_optimize for the explanation of is_vec4_tessellation. */ - const bool is_vec4_tessellation = !is_scalar && - (nir->info.stage == MESA_SHADER_TESS_CTRL || - nir->info.stage == MESA_SHADER_TESS_EVAL); - OPT(nir_opt_peephole_select, 0, is_vec4_tessellation, false); - OPT(nir_opt_peephole_select, 1, is_vec4_tessellation, - compiler->devinfo->ver >= 6); + OPT(nir_opt_peephole_select, 0, false, false); + OPT(nir_opt_peephole_select, 1, false, compiler->devinfo->ver >= 6); } do { progress = false; if (OPT(nir_opt_algebraic_late)) { - /* At this late stage, anything that makes more constants will wreak - * havok on the vec4 backend. The handling of constants in the vec4 - * backend is not good. 
- */ - if (is_scalar) - OPT(nir_opt_constant_folding); - + OPT(nir_opt_constant_folding); OPT(nir_copy_prop); OPT(nir_opt_dce); OPT(nir_opt_cse); @@ -1783,19 +1741,16 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler, if (OPT(nir_lower_fp16_casts, nir_lower_fp16_split_fp64)) { if (OPT(nir_lower_int64)) { - brw_nir_optimize(nir, is_scalar, devinfo); + brw_nir_optimize(nir, devinfo); } } OPT(intel_nir_lower_conversions); - if (is_scalar) - OPT(nir_lower_alu_to_scalar, NULL, NULL); + OPT(nir_lower_alu_to_scalar, NULL, NULL); while (OPT(nir_opt_algebraic_distribute_src_mods)) { - if (is_scalar) - OPT(nir_opt_constant_folding); - + OPT(nir_opt_constant_folding); OPT(nir_copy_prop); OPT(nir_opt_dce); OPT(nir_opt_cse); @@ -1821,7 +1776,7 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler, OPT(nir_lower_subgroups, &subgroups_options); if (OPT(nir_lower_int64)) - brw_nir_optimize(nir, is_scalar, devinfo); + brw_nir_optimize(nir, devinfo); divergence_analysis_dirty = true; } @@ -1834,7 +1789,7 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler, * that must be lowered. 
*/ if (OPT(nir_lower_int64)) - brw_nir_optimize(nir, is_scalar, devinfo); + brw_nir_optimize(nir, devinfo); OPT(nir_lower_subgroups, &subgroups_options); } @@ -1880,11 +1835,6 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler, OPT(nir_convert_from_ssa, true); - if (!is_scalar) { - OPT(nir_move_vec_src_uses_to_dest, true); - OPT(nir_lower_vec_to_regs, NULL, NULL); - } - OPT(nir_opt_dce); if (OPT(nir_opt_rematerialize_compares)) @@ -2035,8 +1985,7 @@ brw_nir_apply_key(nir_shader *nir, OPT(brw_nir_limit_trig_input_range_workaround); if (progress) { - const bool is_scalar = compiler->scalar_stage[nir->info.stage]; - brw_nir_optimize(nir, is_scalar, compiler->devinfo); + brw_nir_optimize(nir, compiler->devinfo); } } diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h index 891d139cb3f..cf03b908b93 100644 --- a/src/intel/compiler/brw_nir.h +++ b/src/intel/compiler/brw_nir.h @@ -34,7 +34,6 @@ extern "C" { #endif extern const struct nir_shader_compiler_options brw_scalar_nir_options; -extern const struct nir_shader_compiler_options brw_vector_nir_options; int type_size_vec4(const struct glsl_type *type, bool bindless); int type_size_dvec4(const struct glsl_type *type, bool bindless); @@ -268,7 +267,7 @@ void brw_nir_analyze_ubo_ranges(const struct brw_compiler *compiler, nir_shader *nir, struct brw_ubo_range out_ranges[4]); -void brw_nir_optimize(nir_shader *nir, bool is_scalar, +void brw_nir_optimize(nir_shader *nir, const struct intel_device_info *devinfo); nir_shader *brw_nir_create_passthrough_tcs(void *mem_ctx, diff --git a/src/intel/compiler/brw_nir_rt.c b/src/intel/compiler/brw_nir_rt.c index b5daa1090de..81538732d71 100644 --- a/src/intel/compiler/brw_nir_rt.c +++ b/src/intel/compiler/brw_nir_rt.c @@ -529,8 +529,7 @@ brw_nir_create_raygen_trampoline(const struct brw_compiler *compiler, NIR_PASS_V(nir, brw_nir_lower_cs_intrinsics, devinfo, NULL); - const bool is_scalar = true; - brw_nir_optimize(nir, is_scalar, 
devinfo); + brw_nir_optimize(nir, devinfo); return nir; } diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp index f56ae8d68d1..2176c3d4912 100644 --- a/src/intel/compiler/brw_shader.cpp +++ b/src/intel/compiler/brw_shader.cpp @@ -26,7 +26,6 @@ #include "brw_fs.h" #include "brw_nir.h" #include "brw_private.h" -#include "brw_vec4_tes.h" #include "dev/intel_debug.h" #include "util/macros.h" #include "util/u_debug.h" @@ -1310,9 +1309,7 @@ brw_compile_tes(const struct brw_compiler *compiler, const struct intel_vue_map *input_vue_map = params->input_vue_map; struct brw_tes_prog_data *prog_data = params->prog_data; - const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_EVAL]; const bool debug_enabled = brw_should_print_shader(nir, DEBUG_TES); - const unsigned *assembly; prog_data->base.base.stage = MESA_SHADER_TESS_EVAL; prog_data->base.base.ray_queries = nir->info.ray_queries; @@ -1395,55 +1392,35 @@ brw_compile_tes(const struct brw_compiler *compiler, MESA_SHADER_TESS_EVAL); } - if (is_scalar) { - const unsigned dispatch_width = devinfo->ver >= 20 ? 16 : 8; - fs_visitor v(compiler, ¶ms->base, &key->base, - &prog_data->base.base, nir, dispatch_width, - params->base.stats != NULL, debug_enabled); - if (!v.run_tes()) { - params->base.error_str = - ralloc_strdup(params->base.mem_ctx, v.fail_msg); - return NULL; - } - - assert(v.payload().num_regs % reg_unit(devinfo) == 0); - prog_data->base.base.dispatch_grf_start_reg = v.payload().num_regs / reg_unit(devinfo); - - prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8; - - fs_generator g(compiler, ¶ms->base, - &prog_data->base.base, false, MESA_SHADER_TESS_EVAL); - if (unlikely(debug_enabled)) { - g.enable_debug(ralloc_asprintf(params->base.mem_ctx, - "%s tessellation evaluation shader %s", - nir->info.label ? 
nir->info.label - : "unnamed", - nir->info.name)); - } - - g.generate_code(v.cfg, dispatch_width, v.shader_stats, - v.performance_analysis.require(), params->base.stats); - - g.add_const_data(nir->constant_data, nir->constant_data_size); - - assembly = g.get_assembly(); - } else { - brw::vec4_tes_visitor v(compiler, ¶ms->base, key, prog_data, - nir, debug_enabled); - if (!v.run()) { - params->base.error_str = - ralloc_strdup(params->base.mem_ctx, v.fail_msg); - return NULL; - } - - if (unlikely(debug_enabled)) - v.dump_instructions(); - - assembly = brw_vec4_generate_assembly(compiler, ¶ms->base, nir, - &prog_data->base, v.cfg, - v.performance_analysis.require(), - debug_enabled); + const unsigned dispatch_width = devinfo->ver >= 20 ? 16 : 8; + fs_visitor v(compiler, ¶ms->base, &key->base, + &prog_data->base.base, nir, dispatch_width, + params->base.stats != NULL, debug_enabled); + if (!v.run_tes()) { + params->base.error_str = + ralloc_strdup(params->base.mem_ctx, v.fail_msg); + return NULL; } - return assembly; + assert(v.payload().num_regs % reg_unit(devinfo) == 0); + prog_data->base.base.dispatch_grf_start_reg = v.payload().num_regs / reg_unit(devinfo); + + prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8; + + fs_generator g(compiler, ¶ms->base, + &prog_data->base.base, false, MESA_SHADER_TESS_EVAL); + if (unlikely(debug_enabled)) { + g.enable_debug(ralloc_asprintf(params->base.mem_ctx, + "%s tessellation evaluation shader %s", + nir->info.label ? 
nir->info.label + : "unnamed", + nir->info.name)); + } + + g.generate_code(v.cfg, dispatch_width, v.shader_stats, + v.performance_analysis.require(), params->base.stats); + + g.add_const_data(nir->constant_data, nir->constant_data_size); + + return g.get_assembly(); } diff --git a/src/intel/compiler/brw_shader.h b/src/intel/compiler/brw_shader.h index ce666e4ed3b..fbd50c07e7e 100644 --- a/src/intel/compiler/brw_shader.h +++ b/src/intel/compiler/brw_shader.h @@ -134,7 +134,6 @@ brw_nir_no_indirect_mask(const struct brw_compiler *compiler, gl_shader_stage stage) { const struct intel_device_info *devinfo = compiler->devinfo; - const bool is_scalar = compiler->scalar_stage[stage]; nir_variable_mode indirect_mask = (nir_variable_mode) 0; switch (stage) { @@ -143,19 +142,14 @@ brw_nir_no_indirect_mask(const struct brw_compiler *compiler, indirect_mask |= nir_var_shader_in; break; - case MESA_SHADER_GEOMETRY: - if (!is_scalar) - indirect_mask |= nir_var_shader_in; - break; - default: /* Everything else can handle indirect inputs */ break; } - if (is_scalar && stage != MESA_SHADER_TESS_CTRL && - stage != MESA_SHADER_TASK && - stage != MESA_SHADER_MESH) + if (stage != MESA_SHADER_TESS_CTRL && + stage != MESA_SHADER_TASK && + stage != MESA_SHADER_MESH) indirect_mask |= nir_var_shader_out; /* On HSW+, we allow indirects in scalar shaders. They get implemented @@ -168,7 +162,7 @@ brw_nir_no_indirect_mask(const struct brw_compiler *compiler, * indirects as scratch all the time, we may easily exceed this limit * without having any fallback. 
*/ - if (is_scalar && devinfo->verx10 <= 70) + if (devinfo->verx10 <= 70) indirect_mask |= nir_var_function_temp; return indirect_mask; diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index 4390b45c3d4..ce483508884 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -468,10 +468,7 @@ get_features(const struct anv_physical_device *pdevice, .textureCompressionBC = true, .occlusionQueryPrecise = true, .pipelineStatisticsQuery = true, - /* We can't do image stores in vec4 shaders */ - .vertexPipelineStoresAndAtomics = - pdevice->compiler->scalar_stage[MESA_SHADER_VERTEX] && - pdevice->compiler->scalar_stage[MESA_SHADER_GEOMETRY], + .vertexPipelineStoresAndAtomics = true, .fragmentStoresAndAtomics = true, .shaderTessellationAndGeometryPointSize = true, .shaderImageGatherExtended = true, @@ -940,8 +937,7 @@ get_properties_1_1(const struct anv_physical_device *pdevice, p->subgroupSize = BRW_SUBGROUP_SIZE; VkShaderStageFlags scalar_stages = 0; for (unsigned stage = 0; stage < MESA_SHADER_STAGES; stage++) { - if (pdevice->compiler->scalar_stage[stage]) - scalar_stages |= mesa_to_vk_shader_stage(stage); + scalar_stages |= mesa_to_vk_shader_stage(stage); } if (pdevice->vk.supported_extensions.KHR_ray_tracing_pipeline) { scalar_stages |= VK_SHADER_STAGE_RAYGEN_BIT_KHR | diff --git a/src/intel/vulkan/anv_nir_compute_push_layout.c b/src/intel/vulkan/anv_nir_compute_push_layout.c index b25e3c36c46..fc766e927e7 100644 --- a/src/intel/vulkan/anv_nir_compute_push_layout.c +++ b/src/intel/vulkan/anv_nir_compute_push_layout.c @@ -130,10 +130,8 @@ anv_nir_compute_push_layout(nir_shader *nir, push_start = MIN2(push_start, push_end); push_start = ROUND_DOWN_TO(push_start, 32); - /* For vec4 our push data size needs to be aligned to a vec4 and for - * scalar, it needs to be aligned to a DWORD. - */ - const unsigned alignment = compiler->scalar_stage[nir->info.stage] ? 
4 : 16; + /* For scalar, push data size needs to be aligned to a DWORD. */ + const unsigned alignment = 4; nir->num_uniforms = ALIGN(push_end - push_start, alignment); prog_data->nr_params = nir->num_uniforms / 4; prog_data->param = rzalloc_array(mem_ctx, uint32_t, prog_data->nr_params); @@ -218,13 +216,7 @@ anv_nir_compute_push_layout(nir_shader *nir, if (push_ubo_ranges) { brw_nir_analyze_ubo_ranges(compiler, nir, prog_data->ubo_ranges); - /* The vec4 back-end pushes at most 32 regs while the scalar back-end - * pushes up to 64. This is primarily because the scalar back-end has a - * massively more competent register allocator and so the risk of - * spilling due to UBO pushing isn't nearly as high. - */ - const unsigned max_push_regs = - compiler->scalar_stage[nir->info.stage] ? 64 : 32; + const unsigned max_push_regs = 64; unsigned total_push_regs = push_constant_range.length; for (unsigned i = 0; i < 4; i++) {