intel/brw: Always use scalar shaders

Remove scalar_stage[] array, since now it is always scalar.  This
removes any usage of vec4 shaders in brw.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27691>
This commit is contained in:
Caio Oliveira 2024-02-14 22:41:17 -08:00 committed by Marge Bot
parent 303fd4e935
commit 7c23b90537
12 changed files with 224 additions and 523 deletions

View file

@ -3,8 +3,6 @@
* SPDX-License-Identifier: MIT
*/
#include "brw_vec4_gs_visitor.h"
#include "gfx6_gs_visitor.h"
#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_prim.h"
@ -41,7 +39,6 @@ brw_compile_gs(const struct brw_compiler *compiler,
memset(&c, 0, sizeof(c));
c.key = *key;
const bool is_scalar = compiler->scalar_stage[MESA_SHADER_GEOMETRY];
const bool debug_enabled = brw_should_print_shader(nir, DEBUG_GS);
prog_data->base.base.stage = MESA_SHADER_GEOMETRY;
@ -266,135 +263,33 @@ brw_compile_gs(const struct brw_compiler *compiler,
brw_print_vue_map(stderr, &prog_data->base.vue_map, MESA_SHADER_GEOMETRY);
}
if (is_scalar) {
fs_visitor v(compiler, &params->base, &c, prog_data, nir,
params->base.stats != NULL, debug_enabled);
if (v.run_gs()) {
prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;
fs_visitor v(compiler, &params->base, &c, prog_data, nir,
params->base.stats != NULL, debug_enabled);
if (v.run_gs()) {
prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;
assert(v.payload().num_regs % reg_unit(compiler->devinfo) == 0);
prog_data->base.base.dispatch_grf_start_reg =
v.payload().num_regs / reg_unit(compiler->devinfo);
assert(v.payload().num_regs % reg_unit(compiler->devinfo) == 0);
prog_data->base.base.dispatch_grf_start_reg =
v.payload().num_regs / reg_unit(compiler->devinfo);
fs_generator g(compiler, &params->base,
&prog_data->base.base, false, MESA_SHADER_GEOMETRY);
if (unlikely(debug_enabled)) {
const char *label =
nir->info.label ? nir->info.label : "unnamed";
char *name = ralloc_asprintf(params->base.mem_ctx,
"%s geometry shader %s",
label, nir->info.name);
g.enable_debug(name);
}
g.generate_code(v.cfg, v.dispatch_width, v.shader_stats,
v.performance_analysis.require(), params->base.stats);
g.add_const_data(nir->constant_data, nir->constant_data_size);
return g.get_assembly();
fs_generator g(compiler, &params->base,
&prog_data->base.base, false, MESA_SHADER_GEOMETRY);
if (unlikely(debug_enabled)) {
const char *label =
nir->info.label ? nir->info.label : "unnamed";
char *name = ralloc_asprintf(params->base.mem_ctx,
"%s geometry shader %s",
label, nir->info.name);
g.enable_debug(name);
}
params->base.error_str = ralloc_strdup(params->base.mem_ctx, v.fail_msg);
return NULL;
g.generate_code(v.cfg, v.dispatch_width, v.shader_stats,
v.performance_analysis.require(), params->base.stats);
g.add_const_data(nir->constant_data, nir->constant_data_size);
return g.get_assembly();
}
if (compiler->devinfo->ver >= 7) {
/* Compile the geometry shader in DUAL_OBJECT dispatch mode, if we can do
* so without spilling. If the GS invocations count > 1, then we can't use
* dual object mode.
*/
if (prog_data->invocations <= 1 &&
!INTEL_DEBUG(DEBUG_NO_DUAL_OBJECT_GS)) {
prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT;
params->base.error_str = ralloc_strdup(params->base.mem_ctx, v.fail_msg);
brw::vec4_gs_visitor v(compiler, &params->base, &c, prog_data, nir,
true /* no_spills */,
debug_enabled);
/* Back up 'nr_params' and 'param' as they can be modified by the
* DUAL_OBJECT visitor. If it fails, we will run the fallback
* (DUAL_INSTANCE or SINGLE mode) and we need to restore the original
* values.
*/
const unsigned param_count = prog_data->base.base.nr_params;
uint32_t *param = ralloc_array(NULL, uint32_t, param_count);
memcpy(param, prog_data->base.base.param,
sizeof(uint32_t) * param_count);
if (v.run()) {
/* Success! Backup is not needed */
ralloc_free(param);
return brw_vec4_generate_assembly(compiler, &params->base,
nir, &prog_data->base,
v.cfg,
v.performance_analysis.require(),
debug_enabled);
} else {
/* These variables could be modified by the execution of the GS
* visitor if it packed the uniforms in the push constant buffer.
* As it failed, we need to restore them so we can start again with
* DUAL_INSTANCE or SINGLE mode.
*
* FIXME: Could more variables be modified by this execution?
*/
memcpy(prog_data->base.base.param, param,
sizeof(uint32_t) * param_count);
prog_data->base.base.nr_params = param_count;
ralloc_free(param);
}
}
}
/* Either we failed to compile in DUAL_OBJECT mode (probably because it
* would have required spilling) or DUAL_OBJECT mode is disabled. So fall
* back to DUAL_INSTANCE or SINGLE mode, which consumes fewer registers.
*
* FIXME: Single dispatch mode requires that the driver can handle
* interleaving of input registers, but this is already supported (dual
* instance mode has the same requirement). However, to take full advantage
* of single dispatch mode to reduce register pressure we would also need to
* do interleaved outputs, but currently, the vec4 visitor and generator
* classes do not support this, so at the moment register pressure in
* single and dual instance modes is the same.
*
* From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 "3DSTATE_GS"
* "If InstanceCount>1, DUAL_OBJECT mode is invalid. Software will likely
* want to use DUAL_INSTANCE mode for higher performance, but SINGLE mode
* is also supported. When InstanceCount=1 (one instance per object) software
* can decide which dispatch mode to use. DUAL_OBJECT mode would likely be
* the best choice for performance, followed by SINGLE mode."
*
* So SINGLE mode is more performant when invocations == 1 and DUAL_INSTANCE
* mode is more performant when invocations > 1. Gfx6 only supports
* SINGLE mode.
*/
if (prog_data->invocations <= 1 || compiler->devinfo->ver < 7)
prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_4X1_SINGLE;
else
prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_4X2_DUAL_INSTANCE;
brw::vec4_gs_visitor *gs = NULL;
const unsigned *ret = NULL;
if (compiler->devinfo->ver >= 7)
gs = new brw::vec4_gs_visitor(compiler, &params->base, &c, prog_data,
nir, false /* no_spills */,
debug_enabled);
else
gs = new brw::gfx6_gs_visitor(compiler, &params->base, &c, prog_data,
nir, false /* no_spills */,
debug_enabled);
if (!gs->run()) {
params->base.error_str =
ralloc_strdup(params->base.mem_ctx, gs->fail_msg);
} else {
ret = brw_vec4_generate_assembly(compiler, &params->base, nir,
&prog_data->base, gs->cfg,
gs->performance_analysis.require(),
debug_enabled);
}
delete gs;
return ret;
return NULL;
}

View file

@ -3,9 +3,9 @@
* SPDX-License-Identifier: MIT
*/
#include "brw_eu.h"
#include "intel_nir.h"
#include "brw_nir.h"
#include "brw_vec4_tcs.h"
#include "brw_fs.h"
#include "brw_private.h"
#include "dev/intel_debug.h"
@ -49,9 +49,7 @@ brw_compile_tcs(const struct brw_compiler *compiler,
struct brw_tcs_prog_data *prog_data = params->prog_data;
struct brw_vue_prog_data *vue_prog_data = &prog_data->base;
const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_CTRL];
const bool debug_enabled = brw_should_print_shader(nir, DEBUG_TCS);
const unsigned *assembly;
vue_prog_data->base.stage = MESA_SHADER_TESS_CTRL;
prog_data->base.base.ray_queries = nir->info.ray_queries;
@ -89,7 +87,7 @@ brw_compile_tcs(const struct brw_compiler *compiler,
prog_data->instances = nir->info.tess.tcs_vertices_out;
prog_data->include_primitive_id = has_primitive_id;
} else {
unsigned verts_per_thread = is_scalar ? 8 : 2;
unsigned verts_per_thread = 8;
vue_prog_data->dispatch_mode = INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH;
prog_data->instances =
DIV_ROUND_UP(nir->info.tess.tcs_vertices_out, verts_per_thread);
@ -135,54 +133,33 @@ brw_compile_tcs(const struct brw_compiler *compiler,
brw_print_vue_map(stderr, &vue_prog_data->vue_map, MESA_SHADER_TESS_CTRL);
}
if (is_scalar) {
const unsigned dispatch_width = devinfo->ver >= 20 ? 16 : 8;
fs_visitor v(compiler, &params->base, &key->base,
&prog_data->base.base, nir, dispatch_width,
params->base.stats != NULL, debug_enabled);
if (!v.run_tcs()) {
params->base.error_str =
ralloc_strdup(params->base.mem_ctx, v.fail_msg);
return NULL;
}
assert(v.payload().num_regs % reg_unit(devinfo) == 0);
prog_data->base.base.dispatch_grf_start_reg = v.payload().num_regs / reg_unit(devinfo);
fs_generator g(compiler, &params->base,
&prog_data->base.base, false, MESA_SHADER_TESS_CTRL);
if (unlikely(debug_enabled)) {
g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
"%s tessellation control shader %s",
nir->info.label ? nir->info.label
: "unnamed",
nir->info.name));
}
g.generate_code(v.cfg, dispatch_width, v.shader_stats,
v.performance_analysis.require(), params->base.stats);
g.add_const_data(nir->constant_data, nir->constant_data_size);
assembly = g.get_assembly();
} else {
brw::vec4_tcs_visitor v(compiler, &params->base, key, prog_data,
nir, debug_enabled);
if (!v.run()) {
params->base.error_str =
ralloc_strdup(params->base.mem_ctx, v.fail_msg);
return NULL;
}
if (INTEL_DEBUG(DEBUG_TCS))
v.dump_instructions();
assembly = brw_vec4_generate_assembly(compiler, &params->base, nir,
&prog_data->base, v.cfg,
v.performance_analysis.require(),
debug_enabled);
const unsigned dispatch_width = devinfo->ver >= 20 ? 16 : 8;
fs_visitor v(compiler, &params->base, &key->base,
&prog_data->base.base, nir, dispatch_width,
params->base.stats != NULL, debug_enabled);
if (!v.run_tcs()) {
params->base.error_str =
ralloc_strdup(params->base.mem_ctx, v.fail_msg);
return NULL;
}
return assembly;
assert(v.payload().num_regs % reg_unit(devinfo) == 0);
prog_data->base.base.dispatch_grf_start_reg = v.payload().num_regs / reg_unit(devinfo);
fs_generator g(compiler, &params->base,
&prog_data->base.base, false, MESA_SHADER_TESS_CTRL);
if (unlikely(debug_enabled)) {
g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
"%s tessellation control shader %s",
nir->info.label ? nir->info.label
: "unnamed",
nir->info.name));
}
g.generate_code(v.cfg, dispatch_width, v.shader_stats,
v.performance_analysis.require(), params->base.stats);
g.add_const_data(nir->constant_data, nir->constant_data_size);
return g.get_assembly();
}

View file

@ -3,11 +3,9 @@
* SPDX-License-Identifier: MIT
*/
#include "brw_vec4.h"
#include "brw_fs.h"
#include "brw_eu.h"
#include "brw_nir.h"
#include "brw_vec4_vs.h"
#include "brw_private.h"
#include "dev/intel_debug.h"
@ -28,11 +26,8 @@ brw_compile_vs(const struct brw_compiler *compiler,
prog_data->base.base.ray_queries = nir->info.ray_queries;
prog_data->base.base.total_scratch = 0;
const bool is_scalar = compiler->scalar_stage[MESA_SHADER_VERTEX];
brw_nir_apply_key(nir, compiler, &key->base, 8);
const unsigned *assembly = NULL;
prog_data->inputs_read = nir->info.inputs_read;
prog_data->double_inputs_read = nir->info.vs.double_inputs;
@ -83,17 +78,7 @@ brw_compile_vs(const struct brw_compiler *compiler,
if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID))
prog_data->uses_drawid = true;
/* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB Entry
* Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode. Empirically, in
* vec4 mode, the hardware appears to wedge unless we read something.
*/
if (is_scalar)
prog_data->base.urb_read_length =
DIV_ROUND_UP(nr_attribute_slots, 2);
else
prog_data->base.urb_read_length =
DIV_ROUND_UP(MAX2(nr_attribute_slots, 1), 2);
prog_data->base.urb_read_length = DIV_ROUND_UP(nr_attribute_slots, 2);
prog_data->nr_attribute_slots = nr_attribute_slots;
/* Since vertex shaders reuse the same VUE entry for inputs and outputs
@ -114,58 +99,37 @@ brw_compile_vs(const struct brw_compiler *compiler,
brw_print_vue_map(stderr, &prog_data->base.vue_map, MESA_SHADER_VERTEX);
}
if (is_scalar) {
const unsigned dispatch_width = compiler->devinfo->ver >= 20 ? 16 : 8;
prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;
const unsigned dispatch_width = compiler->devinfo->ver >= 20 ? 16 : 8;
prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;
fs_visitor v(compiler, &params->base, &key->base,
&prog_data->base.base, nir, dispatch_width,
params->base.stats != NULL, debug_enabled);
if (!v.run_vs()) {
params->base.error_str =
ralloc_strdup(params->base.mem_ctx, v.fail_msg);
return NULL;
}
assert(v.payload().num_regs % reg_unit(compiler->devinfo) == 0);
prog_data->base.base.dispatch_grf_start_reg =
v.payload().num_regs / reg_unit(compiler->devinfo);
fs_generator g(compiler, &params->base,
&prog_data->base.base, v.runtime_check_aads_emit,
MESA_SHADER_VERTEX);
if (unlikely(debug_enabled)) {
const char *debug_name =
ralloc_asprintf(params->base.mem_ctx, "%s vertex shader %s",
nir->info.label ? nir->info.label :
"unnamed",
nir->info.name);
g.enable_debug(debug_name);
}
g.generate_code(v.cfg, dispatch_width, v.shader_stats,
v.performance_analysis.require(), params->base.stats);
g.add_const_data(nir->constant_data, nir->constant_data_size);
assembly = g.get_assembly();
fs_visitor v(compiler, &params->base, &key->base,
&prog_data->base.base, nir, dispatch_width,
params->base.stats != NULL, debug_enabled);
if (!v.run_vs()) {
params->base.error_str =
ralloc_strdup(params->base.mem_ctx, v.fail_msg);
return NULL;
}
if (!assembly) {
prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT;
assert(v.payload().num_regs % reg_unit(compiler->devinfo) == 0);
prog_data->base.base.dispatch_grf_start_reg =
v.payload().num_regs / reg_unit(compiler->devinfo);
vec4_vs_visitor v(compiler, &params->base, key, prog_data,
nir, debug_enabled);
if (!v.run()) {
params->base.error_str =
ralloc_strdup(params->base.mem_ctx, v.fail_msg);
return NULL;
}
fs_generator g(compiler, &params->base,
&prog_data->base.base, v.runtime_check_aads_emit,
MESA_SHADER_VERTEX);
if (unlikely(debug_enabled)) {
const char *debug_name =
ralloc_asprintf(params->base.mem_ctx, "%s vertex shader %s",
nir->info.label ? nir->info.label :
"unnamed",
nir->info.name);
assembly = brw_vec4_generate_assembly(compiler, &params->base,
nir, &prog_data->base,
v.cfg,
v.performance_analysis.require(),
debug_enabled);
g.enable_debug(debug_name);
}
g.generate_code(v.cfg, dispatch_width, v.shader_stats,
v.performance_analysis.require(), params->base.stats);
g.add_const_data(nir->constant_data, nir->constant_data_size);
return assembly;
return g.get_assembly();
}

View file

@ -29,77 +29,51 @@
#include "compiler/nir/nir.h"
#include "util/u_debug.h"
/* Designated initializers shared by both NIR option tables defined below
* (brw_scalar_nir_options and brw_vector_nir_options).
*
* The expansion is a comma-separated initializer list with no trailing
* comma, so it is only valid inside the braces of a
* struct nir_shader_compiler_options initializer.
*/
#define COMMON_OPTIONS \
.has_uclz = true, \
.lower_fdiv = true, \
.lower_scmp = true, \
.lower_flrp16 = true, \
.lower_fmod = true, \
.lower_ufind_msb = true, \
.lower_uadd_carry = true, \
.lower_usub_borrow = true, \
.lower_flrp64 = true, \
.lower_fisnormal = true, \
.lower_isign = true, \
.lower_ldexp = true, \
.lower_bitfield_extract = true, \
.lower_bitfield_insert = true, \
.lower_device_index_to_zero = true, \
.vectorize_io = true, \
.vectorize_tess_levels = true, \
.use_interpolated_input_intrinsics = true, \
.lower_insert_byte = true, \
.lower_insert_word = true, \
.vertex_id_zero_based = true, \
.lower_base_vertex = true, \
.support_16bit_alu = true, \
.lower_uniforms_to_ubo = true
/* Designated initializers used only by brw_scalar_nir_options, in addition
* to COMMON_OPTIONS: scalarization, pack/unpack lowering, loop unrolling
* settings and the divergence-analysis flags.
*
* Like COMMON_OPTIONS, this expands to a comma-separated initializer list
* with no trailing comma.
*/
#define COMMON_SCALAR_OPTIONS \
.lower_to_scalar = true, \
.lower_pack_half_2x16 = true, \
.lower_pack_snorm_2x16 = true, \
.lower_pack_snorm_4x8 = true, \
.lower_pack_unorm_2x16 = true, \
.lower_pack_unorm_4x8 = true, \
.lower_unpack_half_2x16 = true, \
.lower_unpack_snorm_2x16 = true, \
.lower_unpack_snorm_4x8 = true, \
.lower_unpack_unorm_2x16 = true, \
.lower_unpack_unorm_4x8 = true, \
.lower_hadd64 = true, \
.avoid_ternary_with_two_constants = true, \
.has_pack_32_4x8 = true, \
.max_unroll_iterations = 32, \
.force_indirect_unrolling = nir_var_function_temp, \
.divergence_analysis_options = \
(nir_divergence_single_patch_per_tcs_subgroup | \
nir_divergence_single_patch_per_tes_subgroup | \
nir_divergence_shader_record_ptr_uniform)
/* NIR compiler option table built from the shared initializer list plus the
* scalar-only initializer list.
*/
const struct nir_shader_compiler_options brw_scalar_nir_options = {
COMMON_OPTIONS,
COMMON_SCALAR_OPTIONS,
};
const struct nir_shader_compiler_options brw_vector_nir_options = {
COMMON_OPTIONS,
/* In the vec4 backend, our dpN instruction replicates its result to all the
* components of a vec4. We would like NIR to give us replicated fdot
* instructions because it can optimize better for us.
*/
.fdot_replicates = true,
.lower_usub_sat = true,
.avoid_ternary_with_two_constants = true,
.divergence_analysis_options =
(nir_divergence_single_patch_per_tcs_subgroup |
nir_divergence_single_patch_per_tes_subgroup |
nir_divergence_shader_record_ptr_uniform),
.force_indirect_unrolling = nir_var_function_temp,
.has_pack_32_4x8 = true,
.has_uclz = true,
.lower_base_vertex = true,
.lower_bitfield_extract = true,
.lower_bitfield_insert = true,
.lower_device_index_to_zero = true,
.lower_fdiv = true,
.lower_fisnormal = true,
.lower_flrp16 = true,
.lower_flrp64 = true,
.lower_fmod = true,
.lower_hadd64 = true,
.lower_insert_byte = true,
.lower_insert_word = true,
.lower_isign = true,
.lower_ldexp = true,
.lower_pack_half_2x16 = true,
.lower_pack_snorm_2x16 = true,
.lower_pack_snorm_4x8 = true,
.lower_pack_unorm_2x16 = true,
.lower_pack_unorm_4x8 = true,
.lower_scmp = true,
.lower_to_scalar = true,
.lower_uadd_carry = true,
.lower_ufind_msb = true,
.lower_uniforms_to_ubo = true,
.lower_unpack_half_2x16 = true,
.lower_unpack_snorm_2x16 = true,
.lower_unpack_snorm_4x8 = true,
.lower_unpack_unorm_2x16 = true,
.lower_extract_byte = true,
.lower_extract_word = true,
.intel_vec4 = true,
.lower_unpack_unorm_4x8 = true,
.lower_usub_borrow = true,
.max_unroll_iterations = 32,
.support_16bit_alu = true,
.use_interpolated_input_intrinsics = true,
.vectorize_io = true,
.vectorize_tess_levels = true,
.vertex_id_zero_based = true,
};
struct brw_compiler *
@ -129,15 +103,6 @@ brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo)
devinfo->platform != INTEL_PLATFORM_ARL_H) ||
debug_get_bool_option("INTEL_LOWER_DPAS", false);
/* There is no vec4 mode on Gfx10+, and we don't use it at all on Gfx8+. */
for (int i = MESA_SHADER_VERTEX; i < MESA_ALL_SHADER_STAGES; i++) {
compiler->scalar_stage[i] = devinfo->ver >= 8 ||
i == MESA_SHADER_FRAGMENT || i == MESA_SHADER_COMPUTE;
}
for (int i = MESA_SHADER_TASK; i < MESA_VULKAN_SHADER_STAGES; i++)
compiler->scalar_stage[i] = true;
nir_lower_int64_options int64_options =
nir_lower_imul64 |
nir_lower_isign64 |
@ -175,13 +140,8 @@ brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo)
for (int i = 0; i < MESA_ALL_SHADER_STAGES; i++) {
struct nir_shader_compiler_options *nir_options =
rzalloc(compiler, struct nir_shader_compiler_options);
bool is_scalar = compiler->scalar_stage[i];
if (is_scalar) {
*nir_options = brw_scalar_nir_options;
int64_options |= nir_lower_usub_sat64;
} else {
*nir_options = brw_vector_nir_options;
}
*nir_options = brw_scalar_nir_options;
int64_options |= nir_lower_usub_sat64;
/* Prior to Gfx6, there are no three source operations, and Gfx11 loses
* LRP.

View file

@ -86,7 +86,6 @@ struct brw_compiler {
void (*shader_debug_log)(void *, unsigned *id, const char *str, ...) PRINTFLIKE(3, 4);
void (*shader_perf_log)(void *, unsigned *id, const char *str, ...) PRINTFLIKE(3, 4);
bool scalar_stage[MESA_ALL_SHADER_STAGES];
bool use_tcs_multi_patch;
struct nir_shader_compiler_options *nir_options[MESA_ALL_SHADER_STAGES];

View file

@ -714,7 +714,7 @@ brw_nir_lower_fs_outputs(nir_shader *nir)
})
void
brw_nir_optimize(nir_shader *nir, bool is_scalar,
brw_nir_optimize(nir_shader *nir,
const struct intel_device_info *devinfo)
{
bool progress;
@ -752,18 +752,11 @@ brw_nir_optimize(nir_shader *nir, bool is_scalar,
OPT(nir_opt_ray_queries);
OPT(nir_opt_ray_query_ranges);
if (is_scalar) {
OPT(nir_lower_alu_to_scalar, NULL, NULL);
} else {
OPT(nir_opt_shrink_stores, true);
OPT(nir_opt_shrink_vectors);
}
OPT(nir_lower_alu_to_scalar, NULL, NULL);
OPT(nir_copy_prop);
if (is_scalar) {
OPT(nir_lower_phis_to_scalar, false);
}
OPT(nir_lower_phis_to_scalar, false);
OPT(nir_copy_prop);
OPT(nir_opt_dce);
@ -784,15 +777,9 @@ brw_nir_optimize(nir_shader *nir, bool is_scalar,
* For indirect loads of uniforms (push constants), we assume that array
* indices will nearly always be in bounds and the cost of the load is
* low. Therefore there shouldn't be a performance benefit to avoid it.
* However, in vec4 tessellation shaders, these loads operate by
* actually pulling from memory.
*/
const bool is_vec4_tessellation = !is_scalar &&
(nir->info.stage == MESA_SHADER_TESS_CTRL ||
nir->info.stage == MESA_SHADER_TESS_EVAL);
OPT(nir_opt_peephole_select, 0, !is_vec4_tessellation, false);
OPT(nir_opt_peephole_select, 8, !is_vec4_tessellation,
devinfo->ver >= 6);
OPT(nir_opt_peephole_select, 0, true, false);
OPT(nir_opt_peephole_select, 8, true, devinfo->ver >= 6);
OPT(nir_opt_intrinsics);
OPT(nir_opt_idiv_const, 32);
@ -1014,15 +1001,11 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
const struct intel_device_info *devinfo = compiler->devinfo;
UNUSED bool progress; /* Written by OPT */
const bool is_scalar = compiler->scalar_stage[nir->info.stage];
nir_validate_ssa_dominance(nir, "before brw_preprocess_nir");
OPT(nir_lower_frexp);
if (is_scalar) {
OPT(nir_lower_alu_to_scalar, NULL, NULL);
}
OPT(nir_lower_alu_to_scalar, NULL, NULL);
if (nir->info.stage == MESA_SHADER_GEOMETRY)
OPT(nir_lower_gs_intrinsics, 0);
@ -1081,7 +1064,7 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
OPT(nir_split_var_copies);
OPT(nir_split_struct_vars, nir_var_function_temp);
brw_nir_optimize(nir, is_scalar, devinfo);
brw_nir_optimize(nir, devinfo);
OPT(nir_lower_doubles, opts->softfp64, nir->options->lower_doubles_options);
if (OPT(nir_lower_int64_float_conversions)) {
@ -1102,9 +1085,7 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
OPT(nir_opt_large_constants, NULL, 32);
}
if (is_scalar) {
OPT(nir_lower_load_const_to_scalar);
}
OPT(nir_lower_load_const_to_scalar);
OPT(nir_lower_system_values);
nir_lower_compute_system_values_options lower_csv_options = {
@ -1116,7 +1097,6 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
.ballot_bit_size = 32,
.ballot_components = 1,
.lower_to_scalar = true,
.lower_vote_trivial = !is_scalar,
.lower_relative_shuffle = true,
.lower_quad_broadcast_dynamic = true,
.lower_elect = true,
@ -1142,7 +1122,7 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
* issues are helped but nothing else in shader-db is hurt except for maybe
* that one kerbal space program shader.
*/
if (is_scalar && !(indirect_mask & nir_var_function_temp))
if (!(indirect_mask & nir_var_function_temp))
OPT(nir_lower_indirect_derefs, nir_var_function_temp, 16);
/* Lower array derefs of vectors for SSBO and UBO loads. For both UBOs and
@ -1165,7 +1145,7 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
OPT(intel_nir_clamp_per_vertex_loads);
/* Get rid of split copies */
brw_nir_optimize(nir, is_scalar, devinfo);
brw_nir_optimize(nir, devinfo);
}
static bool
@ -1321,18 +1301,13 @@ brw_nir_link_shaders(const struct brw_compiler *compiler,
nir_validate_shader(producer, "after nir_lower_io_arrays_to_elements");
nir_validate_shader(consumer, "after nir_lower_io_arrays_to_elements");
const bool p_is_scalar = compiler->scalar_stage[producer->info.stage];
const bool c_is_scalar = compiler->scalar_stage[consumer->info.stage];
if (p_is_scalar && c_is_scalar) {
NIR_PASS(_, producer, nir_lower_io_to_scalar_early, nir_var_shader_out);
NIR_PASS(_, consumer, nir_lower_io_to_scalar_early, nir_var_shader_in);
brw_nir_optimize(producer, p_is_scalar, devinfo);
brw_nir_optimize(consumer, c_is_scalar, devinfo);
}
NIR_PASS(_, producer, nir_lower_io_to_scalar_early, nir_var_shader_out);
NIR_PASS(_, consumer, nir_lower_io_to_scalar_early, nir_var_shader_in);
brw_nir_optimize(producer, devinfo);
brw_nir_optimize(consumer, devinfo);
if (nir_link_opt_varyings(producer, consumer))
brw_nir_optimize(consumer, c_is_scalar, devinfo);
brw_nir_optimize(consumer, devinfo);
NIR_PASS(_, producer, nir_remove_dead_variables, nir_var_shader_out, NULL);
NIR_PASS(_, consumer, nir_remove_dead_variables, nir_var_shader_in, NULL);
@ -1361,8 +1336,8 @@ brw_nir_link_shaders(const struct brw_compiler *compiler,
brw_nir_no_indirect_mask(compiler, consumer->info.stage),
UINT32_MAX);
brw_nir_optimize(producer, p_is_scalar, devinfo);
brw_nir_optimize(consumer, c_is_scalar, devinfo);
brw_nir_optimize(producer, devinfo);
brw_nir_optimize(consumer, devinfo);
if (producer->info.stage == MESA_SHADER_MESH &&
consumer->info.stage == MESA_SHADER_FRAGMENT) {
@ -1591,48 +1566,45 @@ brw_vectorize_lower_mem_access(nir_shader *nir,
enum brw_robustness_flags robust_flags)
{
bool progress = false;
const bool is_scalar = compiler->scalar_stage[nir->info.stage];
if (is_scalar) {
nir_load_store_vectorize_options options = {
.modes = nir_var_mem_ubo | nir_var_mem_ssbo |
nir_var_mem_global | nir_var_mem_shared |
nir_var_mem_task_payload,
.callback = brw_nir_should_vectorize_mem,
.robust_modes = (nir_variable_mode)0,
};
nir_load_store_vectorize_options options = {
.modes = nir_var_mem_ubo | nir_var_mem_ssbo |
nir_var_mem_global | nir_var_mem_shared |
nir_var_mem_task_payload,
.callback = brw_nir_should_vectorize_mem,
.robust_modes = (nir_variable_mode)0,
};
if (robust_flags & BRW_ROBUSTNESS_UBO)
options.robust_modes |= nir_var_mem_ubo | nir_var_mem_global;
if (robust_flags & BRW_ROBUSTNESS_SSBO)
options.robust_modes |= nir_var_mem_ssbo | nir_var_mem_global;
if (robust_flags & BRW_ROBUSTNESS_UBO)
options.robust_modes |= nir_var_mem_ubo | nir_var_mem_global;
if (robust_flags & BRW_ROBUSTNESS_SSBO)
options.robust_modes |= nir_var_mem_ssbo | nir_var_mem_global;
OPT(nir_opt_load_store_vectorize, &options);
OPT(nir_opt_load_store_vectorize, &options);
/* Only run the blockify optimization on Gfx9+ because although prior HW
* versions have support for block loads, they do have limitations on
* alignment as well as requiring split sends which are not supported
* there.
/* Only run the blockify optimization on Gfx9+ because although prior HW
* versions have support for block loads, they do have limitations on
* alignment as well as requiring split sends which are not supported
* there.
*/
if (compiler->devinfo->ver >= 9) {
/* Required for nir_divergence_analysis() */
OPT(nir_convert_to_lcssa, true, true);
/* When HW supports block loads, using the divergence analysis, try
* to find uniform SSBO loads and turn them into block loads.
*
* Rerun the vectorizer after that to make the largest possible block
* loads.
*
* This is a win on 2 fronts :
* - fewer send messages
* - reduced register pressure
*/
if (compiler->devinfo->ver >= 9) {
/* Required for nir_divergence_analysis() */
OPT(nir_convert_to_lcssa, true, true);
/* When HW supports block loads, using the divergence analysis, try
* to find uniform SSBO loads and turn them into block loads.
*
* Rerun the vectorizer after that to make the largest possible block
* loads.
*
* This is a win on 2 fronts :
* - fewer send messages
* - reduced register pressure
*/
nir_divergence_analysis(nir);
if (OPT(intel_nir_blockify_uniform_loads, compiler->devinfo))
OPT(nir_opt_load_store_vectorize, &options);
OPT(nir_opt_remove_phis);
}
nir_divergence_analysis(nir);
if (OPT(intel_nir_blockify_uniform_loads, compiler->devinfo))
OPT(nir_opt_load_store_vectorize, &options);
OPT(nir_opt_remove_phis);
}
nir_lower_mem_access_bit_sizes_options mem_access_options = {
@ -1683,7 +1655,6 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
enum brw_robustness_flags robust_flags)
{
const struct intel_device_info *devinfo = compiler->devinfo;
const bool is_scalar = compiler->scalar_stage[nir->info.stage];
UNUSED bool progress; /* Written by OPT */
@ -1710,20 +1681,20 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
if (gl_shader_stage_can_set_fragment_shading_rate(nir->info.stage))
NIR_PASS(_, nir, intel_nir_lower_shading_rate_output);
brw_nir_optimize(nir, is_scalar, devinfo);
brw_nir_optimize(nir, devinfo);
if (is_scalar && nir_shader_has_local_variables(nir)) {
if (nir_shader_has_local_variables(nir)) {
OPT(nir_lower_vars_to_explicit_types, nir_var_function_temp,
glsl_get_natural_size_align_bytes);
OPT(nir_lower_explicit_io, nir_var_function_temp,
nir_address_format_32bit_offset);
brw_nir_optimize(nir, is_scalar, devinfo);
brw_nir_optimize(nir, devinfo);
}
brw_vectorize_lower_mem_access(nir, compiler, robust_flags);
if (OPT(nir_lower_int64))
brw_nir_optimize(nir, is_scalar, devinfo);
brw_nir_optimize(nir, devinfo);
if (devinfo->ver >= 6) {
/* Try and fuse multiply-adds, if successful, run shrink_vectors to
@ -1741,8 +1712,7 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
OPT(nir_opt_shrink_vectors);
}
if (is_scalar)
OPT(intel_nir_opt_peephole_imul32x16);
OPT(intel_nir_opt_peephole_imul32x16);
if (OPT(nir_opt_comparison_pre)) {
OPT(nir_copy_prop);
@ -1753,27 +1723,15 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
* the other optimization passes) will have removed at least one
* instruction from one of the branches of the if-statement, so now it
* might be under the threshold of conversion to bcsel.
*
* See brw_nir_optimize for the explanation of is_vec4_tessellation.
*/
const bool is_vec4_tessellation = !is_scalar &&
(nir->info.stage == MESA_SHADER_TESS_CTRL ||
nir->info.stage == MESA_SHADER_TESS_EVAL);
OPT(nir_opt_peephole_select, 0, is_vec4_tessellation, false);
OPT(nir_opt_peephole_select, 1, is_vec4_tessellation,
compiler->devinfo->ver >= 6);
OPT(nir_opt_peephole_select, 0, false, false);
OPT(nir_opt_peephole_select, 1, false, compiler->devinfo->ver >= 6);
}
do {
progress = false;
if (OPT(nir_opt_algebraic_late)) {
/* At this late stage, anything that makes more constants will wreak
* havoc on the vec4 backend. The handling of constants in the vec4
* backend is not good.
*/
if (is_scalar)
OPT(nir_opt_constant_folding);
OPT(nir_opt_constant_folding);
OPT(nir_copy_prop);
OPT(nir_opt_dce);
OPT(nir_opt_cse);
@ -1783,19 +1741,16 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
if (OPT(nir_lower_fp16_casts, nir_lower_fp16_split_fp64)) {
if (OPT(nir_lower_int64)) {
brw_nir_optimize(nir, is_scalar, devinfo);
brw_nir_optimize(nir, devinfo);
}
}
OPT(intel_nir_lower_conversions);
if (is_scalar)
OPT(nir_lower_alu_to_scalar, NULL, NULL);
OPT(nir_lower_alu_to_scalar, NULL, NULL);
while (OPT(nir_opt_algebraic_distribute_src_mods)) {
if (is_scalar)
OPT(nir_opt_constant_folding);
OPT(nir_opt_constant_folding);
OPT(nir_copy_prop);
OPT(nir_opt_dce);
OPT(nir_opt_cse);
@ -1821,7 +1776,7 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
OPT(nir_lower_subgroups, &subgroups_options);
if (OPT(nir_lower_int64))
brw_nir_optimize(nir, is_scalar, devinfo);
brw_nir_optimize(nir, devinfo);
divergence_analysis_dirty = true;
}
@ -1834,7 +1789,7 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
* that must be lowered.
*/
if (OPT(nir_lower_int64))
brw_nir_optimize(nir, is_scalar, devinfo);
brw_nir_optimize(nir, devinfo);
OPT(nir_lower_subgroups, &subgroups_options);
}
@ -1880,11 +1835,6 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
OPT(nir_convert_from_ssa, true);
if (!is_scalar) {
OPT(nir_move_vec_src_uses_to_dest, true);
OPT(nir_lower_vec_to_regs, NULL, NULL);
}
OPT(nir_opt_dce);
if (OPT(nir_opt_rematerialize_compares))
@ -2035,8 +1985,7 @@ brw_nir_apply_key(nir_shader *nir,
OPT(brw_nir_limit_trig_input_range_workaround);
if (progress) {
const bool is_scalar = compiler->scalar_stage[nir->info.stage];
brw_nir_optimize(nir, is_scalar, compiler->devinfo);
brw_nir_optimize(nir, compiler->devinfo);
}
}

View file

@ -34,7 +34,6 @@ extern "C" {
#endif
extern const struct nir_shader_compiler_options brw_scalar_nir_options;
extern const struct nir_shader_compiler_options brw_vector_nir_options;
int type_size_vec4(const struct glsl_type *type, bool bindless);
int type_size_dvec4(const struct glsl_type *type, bool bindless);
@ -268,7 +267,7 @@ void brw_nir_analyze_ubo_ranges(const struct brw_compiler *compiler,
nir_shader *nir,
struct brw_ubo_range out_ranges[4]);
void brw_nir_optimize(nir_shader *nir, bool is_scalar,
void brw_nir_optimize(nir_shader *nir,
const struct intel_device_info *devinfo);
nir_shader *brw_nir_create_passthrough_tcs(void *mem_ctx,

View file

@ -529,8 +529,7 @@ brw_nir_create_raygen_trampoline(const struct brw_compiler *compiler,
NIR_PASS_V(nir, brw_nir_lower_cs_intrinsics, devinfo, NULL);
const bool is_scalar = true;
brw_nir_optimize(nir, is_scalar, devinfo);
brw_nir_optimize(nir, devinfo);
return nir;
}

View file

@ -26,7 +26,6 @@
#include "brw_fs.h"
#include "brw_nir.h"
#include "brw_private.h"
#include "brw_vec4_tes.h"
#include "dev/intel_debug.h"
#include "util/macros.h"
#include "util/u_debug.h"
@ -1310,9 +1309,7 @@ brw_compile_tes(const struct brw_compiler *compiler,
const struct intel_vue_map *input_vue_map = params->input_vue_map;
struct brw_tes_prog_data *prog_data = params->prog_data;
const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_EVAL];
const bool debug_enabled = brw_should_print_shader(nir, DEBUG_TES);
const unsigned *assembly;
prog_data->base.base.stage = MESA_SHADER_TESS_EVAL;
prog_data->base.base.ray_queries = nir->info.ray_queries;
@ -1395,55 +1392,35 @@ brw_compile_tes(const struct brw_compiler *compiler,
MESA_SHADER_TESS_EVAL);
}
if (is_scalar) {
const unsigned dispatch_width = devinfo->ver >= 20 ? 16 : 8;
fs_visitor v(compiler, &params->base, &key->base,
&prog_data->base.base, nir, dispatch_width,
params->base.stats != NULL, debug_enabled);
if (!v.run_tes()) {
params->base.error_str =
ralloc_strdup(params->base.mem_ctx, v.fail_msg);
return NULL;
}
assert(v.payload().num_regs % reg_unit(devinfo) == 0);
prog_data->base.base.dispatch_grf_start_reg = v.payload().num_regs / reg_unit(devinfo);
prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;
fs_generator g(compiler, &params->base,
&prog_data->base.base, false, MESA_SHADER_TESS_EVAL);
if (unlikely(debug_enabled)) {
g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
"%s tessellation evaluation shader %s",
nir->info.label ? nir->info.label
: "unnamed",
nir->info.name));
}
g.generate_code(v.cfg, dispatch_width, v.shader_stats,
v.performance_analysis.require(), params->base.stats);
g.add_const_data(nir->constant_data, nir->constant_data_size);
assembly = g.get_assembly();
} else {
brw::vec4_tes_visitor v(compiler, &params->base, key, prog_data,
nir, debug_enabled);
if (!v.run()) {
params->base.error_str =
ralloc_strdup(params->base.mem_ctx, v.fail_msg);
return NULL;
}
if (unlikely(debug_enabled))
v.dump_instructions();
assembly = brw_vec4_generate_assembly(compiler, &params->base, nir,
&prog_data->base, v.cfg,
v.performance_analysis.require(),
debug_enabled);
const unsigned dispatch_width = devinfo->ver >= 20 ? 16 : 8;
fs_visitor v(compiler, &params->base, &key->base,
&prog_data->base.base, nir, dispatch_width,
params->base.stats != NULL, debug_enabled);
if (!v.run_tes()) {
params->base.error_str =
ralloc_strdup(params->base.mem_ctx, v.fail_msg);
return NULL;
}
return assembly;
assert(v.payload().num_regs % reg_unit(devinfo) == 0);
prog_data->base.base.dispatch_grf_start_reg = v.payload().num_regs / reg_unit(devinfo);
prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;
fs_generator g(compiler, &params->base,
&prog_data->base.base, false, MESA_SHADER_TESS_EVAL);
if (unlikely(debug_enabled)) {
g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
"%s tessellation evaluation shader %s",
nir->info.label ? nir->info.label
: "unnamed",
nir->info.name));
}
g.generate_code(v.cfg, dispatch_width, v.shader_stats,
v.performance_analysis.require(), params->base.stats);
g.add_const_data(nir->constant_data, nir->constant_data_size);
return g.get_assembly();
}

View file

@ -134,7 +134,6 @@ brw_nir_no_indirect_mask(const struct brw_compiler *compiler,
gl_shader_stage stage)
{
const struct intel_device_info *devinfo = compiler->devinfo;
const bool is_scalar = compiler->scalar_stage[stage];
nir_variable_mode indirect_mask = (nir_variable_mode) 0;
switch (stage) {
@ -143,19 +142,14 @@ brw_nir_no_indirect_mask(const struct brw_compiler *compiler,
indirect_mask |= nir_var_shader_in;
break;
case MESA_SHADER_GEOMETRY:
if (!is_scalar)
indirect_mask |= nir_var_shader_in;
break;
default:
/* Everything else can handle indirect inputs */
break;
}
if (is_scalar && stage != MESA_SHADER_TESS_CTRL &&
stage != MESA_SHADER_TASK &&
stage != MESA_SHADER_MESH)
if (stage != MESA_SHADER_TESS_CTRL &&
stage != MESA_SHADER_TASK &&
stage != MESA_SHADER_MESH)
indirect_mask |= nir_var_shader_out;
/* On HSW+, we allow indirects in scalar shaders. They get implemented
@ -168,7 +162,7 @@ brw_nir_no_indirect_mask(const struct brw_compiler *compiler,
* indirects as scratch all the time, we may easily exceed this limit
* without having any fallback.
*/
if (is_scalar && devinfo->verx10 <= 70)
if (devinfo->verx10 <= 70)
indirect_mask |= nir_var_function_temp;
return indirect_mask;

View file

@ -468,10 +468,7 @@ get_features(const struct anv_physical_device *pdevice,
.textureCompressionBC = true,
.occlusionQueryPrecise = true,
.pipelineStatisticsQuery = true,
/* We can't do image stores in vec4 shaders */
.vertexPipelineStoresAndAtomics =
pdevice->compiler->scalar_stage[MESA_SHADER_VERTEX] &&
pdevice->compiler->scalar_stage[MESA_SHADER_GEOMETRY],
.vertexPipelineStoresAndAtomics = true,
.fragmentStoresAndAtomics = true,
.shaderTessellationAndGeometryPointSize = true,
.shaderImageGatherExtended = true,
@ -940,8 +937,7 @@ get_properties_1_1(const struct anv_physical_device *pdevice,
p->subgroupSize = BRW_SUBGROUP_SIZE;
VkShaderStageFlags scalar_stages = 0;
for (unsigned stage = 0; stage < MESA_SHADER_STAGES; stage++) {
if (pdevice->compiler->scalar_stage[stage])
scalar_stages |= mesa_to_vk_shader_stage(stage);
scalar_stages |= mesa_to_vk_shader_stage(stage);
}
if (pdevice->vk.supported_extensions.KHR_ray_tracing_pipeline) {
scalar_stages |= VK_SHADER_STAGE_RAYGEN_BIT_KHR |

View file

@ -130,10 +130,8 @@ anv_nir_compute_push_layout(nir_shader *nir,
push_start = MIN2(push_start, push_end);
push_start = ROUND_DOWN_TO(push_start, 32);
/* For vec4 our push data size needs to be aligned to a vec4 and for
* scalar, it needs to be aligned to a DWORD.
*/
const unsigned alignment = compiler->scalar_stage[nir->info.stage] ? 4 : 16;
/* For scalar, push data size needs to be aligned to a DWORD. */
const unsigned alignment = 4;
nir->num_uniforms = ALIGN(push_end - push_start, alignment);
prog_data->nr_params = nir->num_uniforms / 4;
prog_data->param = rzalloc_array(mem_ctx, uint32_t, prog_data->nr_params);
@ -218,13 +216,7 @@ anv_nir_compute_push_layout(nir_shader *nir,
if (push_ubo_ranges) {
brw_nir_analyze_ubo_ranges(compiler, nir, prog_data->ubo_ranges);
/* The vec4 back-end pushes at most 32 regs while the scalar back-end
* pushes up to 64. This is primarily because the scalar back-end has a
* massively more competent register allocator and so the risk of
* spilling due to UBO pushing isn't nearly as high.
*/
const unsigned max_push_regs =
compiler->scalar_stage[nir->info.stage] ? 64 : 32;
const unsigned max_push_regs = 64;
unsigned total_push_regs = push_constant_range.length;
for (unsigned i = 0; i < 4; i++) {