mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-23 10:48:08 +02:00
intel/brw: Always use scalar shaders
Remove scalar_stage[] array, since now it is always scalar. This removes any usage of vec4 shaders in brw. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27691>
This commit is contained in:
parent
303fd4e935
commit
7c23b90537
12 changed files with 224 additions and 523 deletions
|
|
@ -3,8 +3,6 @@
|
|||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#include "brw_vec4_gs_visitor.h"
|
||||
#include "gfx6_gs_visitor.h"
|
||||
#include "brw_eu.h"
|
||||
#include "brw_fs.h"
|
||||
#include "brw_prim.h"
|
||||
|
|
@ -41,7 +39,6 @@ brw_compile_gs(const struct brw_compiler *compiler,
|
|||
memset(&c, 0, sizeof(c));
|
||||
c.key = *key;
|
||||
|
||||
const bool is_scalar = compiler->scalar_stage[MESA_SHADER_GEOMETRY];
|
||||
const bool debug_enabled = brw_should_print_shader(nir, DEBUG_GS);
|
||||
|
||||
prog_data->base.base.stage = MESA_SHADER_GEOMETRY;
|
||||
|
|
@ -266,135 +263,33 @@ brw_compile_gs(const struct brw_compiler *compiler,
|
|||
brw_print_vue_map(stderr, &prog_data->base.vue_map, MESA_SHADER_GEOMETRY);
|
||||
}
|
||||
|
||||
if (is_scalar) {
|
||||
fs_visitor v(compiler, ¶ms->base, &c, prog_data, nir,
|
||||
params->base.stats != NULL, debug_enabled);
|
||||
if (v.run_gs()) {
|
||||
prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;
|
||||
fs_visitor v(compiler, ¶ms->base, &c, prog_data, nir,
|
||||
params->base.stats != NULL, debug_enabled);
|
||||
if (v.run_gs()) {
|
||||
prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;
|
||||
|
||||
assert(v.payload().num_regs % reg_unit(compiler->devinfo) == 0);
|
||||
prog_data->base.base.dispatch_grf_start_reg =
|
||||
v.payload().num_regs / reg_unit(compiler->devinfo);
|
||||
assert(v.payload().num_regs % reg_unit(compiler->devinfo) == 0);
|
||||
prog_data->base.base.dispatch_grf_start_reg =
|
||||
v.payload().num_regs / reg_unit(compiler->devinfo);
|
||||
|
||||
fs_generator g(compiler, ¶ms->base,
|
||||
&prog_data->base.base, false, MESA_SHADER_GEOMETRY);
|
||||
if (unlikely(debug_enabled)) {
|
||||
const char *label =
|
||||
nir->info.label ? nir->info.label : "unnamed";
|
||||
char *name = ralloc_asprintf(params->base.mem_ctx,
|
||||
"%s geometry shader %s",
|
||||
label, nir->info.name);
|
||||
g.enable_debug(name);
|
||||
}
|
||||
g.generate_code(v.cfg, v.dispatch_width, v.shader_stats,
|
||||
v.performance_analysis.require(), params->base.stats);
|
||||
g.add_const_data(nir->constant_data, nir->constant_data_size);
|
||||
return g.get_assembly();
|
||||
fs_generator g(compiler, ¶ms->base,
|
||||
&prog_data->base.base, false, MESA_SHADER_GEOMETRY);
|
||||
if (unlikely(debug_enabled)) {
|
||||
const char *label =
|
||||
nir->info.label ? nir->info.label : "unnamed";
|
||||
char *name = ralloc_asprintf(params->base.mem_ctx,
|
||||
"%s geometry shader %s",
|
||||
label, nir->info.name);
|
||||
g.enable_debug(name);
|
||||
}
|
||||
|
||||
params->base.error_str = ralloc_strdup(params->base.mem_ctx, v.fail_msg);
|
||||
|
||||
return NULL;
|
||||
g.generate_code(v.cfg, v.dispatch_width, v.shader_stats,
|
||||
v.performance_analysis.require(), params->base.stats);
|
||||
g.add_const_data(nir->constant_data, nir->constant_data_size);
|
||||
return g.get_assembly();
|
||||
}
|
||||
|
||||
if (compiler->devinfo->ver >= 7) {
|
||||
/* Compile the geometry shader in DUAL_OBJECT dispatch mode, if we can do
|
||||
* so without spilling. If the GS invocations count > 1, then we can't use
|
||||
* dual object mode.
|
||||
*/
|
||||
if (prog_data->invocations <= 1 &&
|
||||
!INTEL_DEBUG(DEBUG_NO_DUAL_OBJECT_GS)) {
|
||||
prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT;
|
||||
params->base.error_str = ralloc_strdup(params->base.mem_ctx, v.fail_msg);
|
||||
|
||||
brw::vec4_gs_visitor v(compiler, ¶ms->base, &c, prog_data, nir,
|
||||
true /* no_spills */,
|
||||
debug_enabled);
|
||||
|
||||
/* Backup 'nr_params' and 'param' as they can be modified by the
|
||||
* DUAL_OBJECT visitor. If it fails, we will run the fallback
|
||||
* (DUAL_INSTANCED or SINGLE mode) and we need to restore original
|
||||
* values.
|
||||
*/
|
||||
const unsigned param_count = prog_data->base.base.nr_params;
|
||||
uint32_t *param = ralloc_array(NULL, uint32_t, param_count);
|
||||
memcpy(param, prog_data->base.base.param,
|
||||
sizeof(uint32_t) * param_count);
|
||||
|
||||
if (v.run()) {
|
||||
/* Success! Backup is not needed */
|
||||
ralloc_free(param);
|
||||
return brw_vec4_generate_assembly(compiler, ¶ms->base,
|
||||
nir, &prog_data->base,
|
||||
v.cfg,
|
||||
v.performance_analysis.require(),
|
||||
debug_enabled);
|
||||
} else {
|
||||
/* These variables could be modified by the execution of the GS
|
||||
* visitor if it packed the uniforms in the push constant buffer.
|
||||
* As it failed, we need restore them so we can start again with
|
||||
* DUAL_INSTANCED or SINGLE mode.
|
||||
*
|
||||
* FIXME: Could more variables be modified by this execution?
|
||||
*/
|
||||
memcpy(prog_data->base.base.param, param,
|
||||
sizeof(uint32_t) * param_count);
|
||||
prog_data->base.base.nr_params = param_count;
|
||||
ralloc_free(param);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Either we failed to compile in DUAL_OBJECT mode (probably because it
|
||||
* would have required spilling) or DUAL_OBJECT mode is disabled. So fall
|
||||
* back to DUAL_INSTANCED or SINGLE mode, which consumes fewer registers.
|
||||
*
|
||||
* FIXME: Single dispatch mode requires that the driver can handle
|
||||
* interleaving of input registers, but this is already supported (dual
|
||||
* instance mode has the same requirement). However, to take full advantage
|
||||
* of single dispatch mode to reduce register pressure we would also need to
|
||||
* do interleaved outputs, but currently, the vec4 visitor and generator
|
||||
* classes do not support this, so at the moment register pressure in
|
||||
* single and dual instance modes is the same.
|
||||
*
|
||||
* From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 "3DSTATE_GS"
|
||||
* "If InstanceCount>1, DUAL_OBJECT mode is invalid. Software will likely
|
||||
* want to use DUAL_INSTANCE mode for higher performance, but SINGLE mode
|
||||
* is also supported. When InstanceCount=1 (one instance per object) software
|
||||
* can decide which dispatch mode to use. DUAL_OBJECT mode would likely be
|
||||
* the best choice for performance, followed by SINGLE mode."
|
||||
*
|
||||
* So SINGLE mode is more performant when invocations == 1 and DUAL_INSTANCE
|
||||
* mode is more performant when invocations > 1. Gfx6 only supports
|
||||
* SINGLE mode.
|
||||
*/
|
||||
if (prog_data->invocations <= 1 || compiler->devinfo->ver < 7)
|
||||
prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_4X1_SINGLE;
|
||||
else
|
||||
prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_4X2_DUAL_INSTANCE;
|
||||
|
||||
brw::vec4_gs_visitor *gs = NULL;
|
||||
const unsigned *ret = NULL;
|
||||
|
||||
if (compiler->devinfo->ver >= 7)
|
||||
gs = new brw::vec4_gs_visitor(compiler, ¶ms->base, &c, prog_data,
|
||||
nir, false /* no_spills */,
|
||||
debug_enabled);
|
||||
else
|
||||
gs = new brw::gfx6_gs_visitor(compiler, ¶ms->base, &c, prog_data,
|
||||
nir, false /* no_spills */,
|
||||
debug_enabled);
|
||||
|
||||
if (!gs->run()) {
|
||||
params->base.error_str =
|
||||
ralloc_strdup(params->base.mem_ctx, gs->fail_msg);
|
||||
} else {
|
||||
ret = brw_vec4_generate_assembly(compiler, ¶ms->base, nir,
|
||||
&prog_data->base, gs->cfg,
|
||||
gs->performance_analysis.require(),
|
||||
debug_enabled);
|
||||
}
|
||||
|
||||
delete gs;
|
||||
return ret;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -3,9 +3,9 @@
|
|||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#include "brw_eu.h"
|
||||
#include "intel_nir.h"
|
||||
#include "brw_nir.h"
|
||||
#include "brw_vec4_tcs.h"
|
||||
#include "brw_fs.h"
|
||||
#include "brw_private.h"
|
||||
#include "dev/intel_debug.h"
|
||||
|
|
@ -49,9 +49,7 @@ brw_compile_tcs(const struct brw_compiler *compiler,
|
|||
struct brw_tcs_prog_data *prog_data = params->prog_data;
|
||||
struct brw_vue_prog_data *vue_prog_data = &prog_data->base;
|
||||
|
||||
const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_CTRL];
|
||||
const bool debug_enabled = brw_should_print_shader(nir, DEBUG_TCS);
|
||||
const unsigned *assembly;
|
||||
|
||||
vue_prog_data->base.stage = MESA_SHADER_TESS_CTRL;
|
||||
prog_data->base.base.ray_queries = nir->info.ray_queries;
|
||||
|
|
@ -89,7 +87,7 @@ brw_compile_tcs(const struct brw_compiler *compiler,
|
|||
prog_data->instances = nir->info.tess.tcs_vertices_out;
|
||||
prog_data->include_primitive_id = has_primitive_id;
|
||||
} else {
|
||||
unsigned verts_per_thread = is_scalar ? 8 : 2;
|
||||
unsigned verts_per_thread = 8;
|
||||
vue_prog_data->dispatch_mode = INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH;
|
||||
prog_data->instances =
|
||||
DIV_ROUND_UP(nir->info.tess.tcs_vertices_out, verts_per_thread);
|
||||
|
|
@ -135,54 +133,33 @@ brw_compile_tcs(const struct brw_compiler *compiler,
|
|||
brw_print_vue_map(stderr, &vue_prog_data->vue_map, MESA_SHADER_TESS_CTRL);
|
||||
}
|
||||
|
||||
if (is_scalar) {
|
||||
const unsigned dispatch_width = devinfo->ver >= 20 ? 16 : 8;
|
||||
fs_visitor v(compiler, ¶ms->base, &key->base,
|
||||
&prog_data->base.base, nir, dispatch_width,
|
||||
params->base.stats != NULL, debug_enabled);
|
||||
if (!v.run_tcs()) {
|
||||
params->base.error_str =
|
||||
ralloc_strdup(params->base.mem_ctx, v.fail_msg);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
assert(v.payload().num_regs % reg_unit(devinfo) == 0);
|
||||
prog_data->base.base.dispatch_grf_start_reg = v.payload().num_regs / reg_unit(devinfo);
|
||||
|
||||
fs_generator g(compiler, ¶ms->base,
|
||||
&prog_data->base.base, false, MESA_SHADER_TESS_CTRL);
|
||||
if (unlikely(debug_enabled)) {
|
||||
g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
|
||||
"%s tessellation control shader %s",
|
||||
nir->info.label ? nir->info.label
|
||||
: "unnamed",
|
||||
nir->info.name));
|
||||
}
|
||||
|
||||
g.generate_code(v.cfg, dispatch_width, v.shader_stats,
|
||||
v.performance_analysis.require(), params->base.stats);
|
||||
|
||||
g.add_const_data(nir->constant_data, nir->constant_data_size);
|
||||
|
||||
assembly = g.get_assembly();
|
||||
} else {
|
||||
brw::vec4_tcs_visitor v(compiler, ¶ms->base, key, prog_data,
|
||||
nir, debug_enabled);
|
||||
if (!v.run()) {
|
||||
params->base.error_str =
|
||||
ralloc_strdup(params->base.mem_ctx, v.fail_msg);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (INTEL_DEBUG(DEBUG_TCS))
|
||||
v.dump_instructions();
|
||||
|
||||
|
||||
assembly = brw_vec4_generate_assembly(compiler, ¶ms->base, nir,
|
||||
&prog_data->base, v.cfg,
|
||||
v.performance_analysis.require(),
|
||||
debug_enabled);
|
||||
const unsigned dispatch_width = devinfo->ver >= 20 ? 16 : 8;
|
||||
fs_visitor v(compiler, ¶ms->base, &key->base,
|
||||
&prog_data->base.base, nir, dispatch_width,
|
||||
params->base.stats != NULL, debug_enabled);
|
||||
if (!v.run_tcs()) {
|
||||
params->base.error_str =
|
||||
ralloc_strdup(params->base.mem_ctx, v.fail_msg);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return assembly;
|
||||
assert(v.payload().num_regs % reg_unit(devinfo) == 0);
|
||||
prog_data->base.base.dispatch_grf_start_reg = v.payload().num_regs / reg_unit(devinfo);
|
||||
|
||||
fs_generator g(compiler, ¶ms->base,
|
||||
&prog_data->base.base, false, MESA_SHADER_TESS_CTRL);
|
||||
if (unlikely(debug_enabled)) {
|
||||
g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
|
||||
"%s tessellation control shader %s",
|
||||
nir->info.label ? nir->info.label
|
||||
: "unnamed",
|
||||
nir->info.name));
|
||||
}
|
||||
|
||||
g.generate_code(v.cfg, dispatch_width, v.shader_stats,
|
||||
v.performance_analysis.require(), params->base.stats);
|
||||
|
||||
g.add_const_data(nir->constant_data, nir->constant_data_size);
|
||||
|
||||
return g.get_assembly();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,11 +3,9 @@
|
|||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#include "brw_vec4.h"
|
||||
#include "brw_fs.h"
|
||||
#include "brw_eu.h"
|
||||
#include "brw_nir.h"
|
||||
#include "brw_vec4_vs.h"
|
||||
#include "brw_private.h"
|
||||
#include "dev/intel_debug.h"
|
||||
|
||||
|
|
@ -28,11 +26,8 @@ brw_compile_vs(const struct brw_compiler *compiler,
|
|||
prog_data->base.base.ray_queries = nir->info.ray_queries;
|
||||
prog_data->base.base.total_scratch = 0;
|
||||
|
||||
const bool is_scalar = compiler->scalar_stage[MESA_SHADER_VERTEX];
|
||||
brw_nir_apply_key(nir, compiler, &key->base, 8);
|
||||
|
||||
const unsigned *assembly = NULL;
|
||||
|
||||
prog_data->inputs_read = nir->info.inputs_read;
|
||||
prog_data->double_inputs_read = nir->info.vs.double_inputs;
|
||||
|
||||
|
|
@ -83,17 +78,7 @@ brw_compile_vs(const struct brw_compiler *compiler,
|
|||
if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID))
|
||||
prog_data->uses_drawid = true;
|
||||
|
||||
/* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB Entry
|
||||
* Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode. Empirically, in
|
||||
* vec4 mode, the hardware appears to wedge unless we read something.
|
||||
*/
|
||||
if (is_scalar)
|
||||
prog_data->base.urb_read_length =
|
||||
DIV_ROUND_UP(nr_attribute_slots, 2);
|
||||
else
|
||||
prog_data->base.urb_read_length =
|
||||
DIV_ROUND_UP(MAX2(nr_attribute_slots, 1), 2);
|
||||
|
||||
prog_data->base.urb_read_length = DIV_ROUND_UP(nr_attribute_slots, 2);
|
||||
prog_data->nr_attribute_slots = nr_attribute_slots;
|
||||
|
||||
/* Since vertex shaders reuse the same VUE entry for inputs and outputs
|
||||
|
|
@ -114,58 +99,37 @@ brw_compile_vs(const struct brw_compiler *compiler,
|
|||
brw_print_vue_map(stderr, &prog_data->base.vue_map, MESA_SHADER_VERTEX);
|
||||
}
|
||||
|
||||
if (is_scalar) {
|
||||
const unsigned dispatch_width = compiler->devinfo->ver >= 20 ? 16 : 8;
|
||||
prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;
|
||||
const unsigned dispatch_width = compiler->devinfo->ver >= 20 ? 16 : 8;
|
||||
prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;
|
||||
|
||||
fs_visitor v(compiler, ¶ms->base, &key->base,
|
||||
&prog_data->base.base, nir, dispatch_width,
|
||||
params->base.stats != NULL, debug_enabled);
|
||||
if (!v.run_vs()) {
|
||||
params->base.error_str =
|
||||
ralloc_strdup(params->base.mem_ctx, v.fail_msg);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
assert(v.payload().num_regs % reg_unit(compiler->devinfo) == 0);
|
||||
prog_data->base.base.dispatch_grf_start_reg =
|
||||
v.payload().num_regs / reg_unit(compiler->devinfo);
|
||||
|
||||
fs_generator g(compiler, ¶ms->base,
|
||||
&prog_data->base.base, v.runtime_check_aads_emit,
|
||||
MESA_SHADER_VERTEX);
|
||||
if (unlikely(debug_enabled)) {
|
||||
const char *debug_name =
|
||||
ralloc_asprintf(params->base.mem_ctx, "%s vertex shader %s",
|
||||
nir->info.label ? nir->info.label :
|
||||
"unnamed",
|
||||
nir->info.name);
|
||||
|
||||
g.enable_debug(debug_name);
|
||||
}
|
||||
g.generate_code(v.cfg, dispatch_width, v.shader_stats,
|
||||
v.performance_analysis.require(), params->base.stats);
|
||||
g.add_const_data(nir->constant_data, nir->constant_data_size);
|
||||
assembly = g.get_assembly();
|
||||
fs_visitor v(compiler, ¶ms->base, &key->base,
|
||||
&prog_data->base.base, nir, dispatch_width,
|
||||
params->base.stats != NULL, debug_enabled);
|
||||
if (!v.run_vs()) {
|
||||
params->base.error_str =
|
||||
ralloc_strdup(params->base.mem_ctx, v.fail_msg);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (!assembly) {
|
||||
prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT;
|
||||
assert(v.payload().num_regs % reg_unit(compiler->devinfo) == 0);
|
||||
prog_data->base.base.dispatch_grf_start_reg =
|
||||
v.payload().num_regs / reg_unit(compiler->devinfo);
|
||||
|
||||
vec4_vs_visitor v(compiler, ¶ms->base, key, prog_data,
|
||||
nir, debug_enabled);
|
||||
if (!v.run()) {
|
||||
params->base.error_str =
|
||||
ralloc_strdup(params->base.mem_ctx, v.fail_msg);
|
||||
return NULL;
|
||||
}
|
||||
fs_generator g(compiler, ¶ms->base,
|
||||
&prog_data->base.base, v.runtime_check_aads_emit,
|
||||
MESA_SHADER_VERTEX);
|
||||
if (unlikely(debug_enabled)) {
|
||||
const char *debug_name =
|
||||
ralloc_asprintf(params->base.mem_ctx, "%s vertex shader %s",
|
||||
nir->info.label ? nir->info.label :
|
||||
"unnamed",
|
||||
nir->info.name);
|
||||
|
||||
assembly = brw_vec4_generate_assembly(compiler, ¶ms->base,
|
||||
nir, &prog_data->base,
|
||||
v.cfg,
|
||||
v.performance_analysis.require(),
|
||||
debug_enabled);
|
||||
g.enable_debug(debug_name);
|
||||
}
|
||||
g.generate_code(v.cfg, dispatch_width, v.shader_stats,
|
||||
v.performance_analysis.require(), params->base.stats);
|
||||
g.add_const_data(nir->constant_data, nir->constant_data_size);
|
||||
|
||||
return assembly;
|
||||
return g.get_assembly();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -29,77 +29,51 @@
|
|||
#include "compiler/nir/nir.h"
|
||||
#include "util/u_debug.h"
|
||||
|
||||
#define COMMON_OPTIONS \
|
||||
.has_uclz = true, \
|
||||
.lower_fdiv = true, \
|
||||
.lower_scmp = true, \
|
||||
.lower_flrp16 = true, \
|
||||
.lower_fmod = true, \
|
||||
.lower_ufind_msb = true, \
|
||||
.lower_uadd_carry = true, \
|
||||
.lower_usub_borrow = true, \
|
||||
.lower_flrp64 = true, \
|
||||
.lower_fisnormal = true, \
|
||||
.lower_isign = true, \
|
||||
.lower_ldexp = true, \
|
||||
.lower_bitfield_extract = true, \
|
||||
.lower_bitfield_insert = true, \
|
||||
.lower_device_index_to_zero = true, \
|
||||
.vectorize_io = true, \
|
||||
.vectorize_tess_levels = true, \
|
||||
.use_interpolated_input_intrinsics = true, \
|
||||
.lower_insert_byte = true, \
|
||||
.lower_insert_word = true, \
|
||||
.vertex_id_zero_based = true, \
|
||||
.lower_base_vertex = true, \
|
||||
.support_16bit_alu = true, \
|
||||
.lower_uniforms_to_ubo = true
|
||||
|
||||
#define COMMON_SCALAR_OPTIONS \
|
||||
.lower_to_scalar = true, \
|
||||
.lower_pack_half_2x16 = true, \
|
||||
.lower_pack_snorm_2x16 = true, \
|
||||
.lower_pack_snorm_4x8 = true, \
|
||||
.lower_pack_unorm_2x16 = true, \
|
||||
.lower_pack_unorm_4x8 = true, \
|
||||
.lower_unpack_half_2x16 = true, \
|
||||
.lower_unpack_snorm_2x16 = true, \
|
||||
.lower_unpack_snorm_4x8 = true, \
|
||||
.lower_unpack_unorm_2x16 = true, \
|
||||
.lower_unpack_unorm_4x8 = true, \
|
||||
.lower_hadd64 = true, \
|
||||
.avoid_ternary_with_two_constants = true, \
|
||||
.has_pack_32_4x8 = true, \
|
||||
.max_unroll_iterations = 32, \
|
||||
.force_indirect_unrolling = nir_var_function_temp, \
|
||||
.divergence_analysis_options = \
|
||||
(nir_divergence_single_patch_per_tcs_subgroup | \
|
||||
nir_divergence_single_patch_per_tes_subgroup | \
|
||||
nir_divergence_shader_record_ptr_uniform)
|
||||
|
||||
const struct nir_shader_compiler_options brw_scalar_nir_options = {
|
||||
COMMON_OPTIONS,
|
||||
COMMON_SCALAR_OPTIONS,
|
||||
};
|
||||
|
||||
const struct nir_shader_compiler_options brw_vector_nir_options = {
|
||||
COMMON_OPTIONS,
|
||||
|
||||
/* In the vec4 backend, our dpN instruction replicates its result to all the
|
||||
* components of a vec4. We would like NIR to give us replicated fdot
|
||||
* instructions because it can optimize better for us.
|
||||
*/
|
||||
.fdot_replicates = true,
|
||||
|
||||
.lower_usub_sat = true,
|
||||
.avoid_ternary_with_two_constants = true,
|
||||
.divergence_analysis_options =
|
||||
(nir_divergence_single_patch_per_tcs_subgroup |
|
||||
nir_divergence_single_patch_per_tes_subgroup |
|
||||
nir_divergence_shader_record_ptr_uniform),
|
||||
.force_indirect_unrolling = nir_var_function_temp,
|
||||
.has_pack_32_4x8 = true,
|
||||
.has_uclz = true,
|
||||
.lower_base_vertex = true,
|
||||
.lower_bitfield_extract = true,
|
||||
.lower_bitfield_insert = true,
|
||||
.lower_device_index_to_zero = true,
|
||||
.lower_fdiv = true,
|
||||
.lower_fisnormal = true,
|
||||
.lower_flrp16 = true,
|
||||
.lower_flrp64 = true,
|
||||
.lower_fmod = true,
|
||||
.lower_hadd64 = true,
|
||||
.lower_insert_byte = true,
|
||||
.lower_insert_word = true,
|
||||
.lower_isign = true,
|
||||
.lower_ldexp = true,
|
||||
.lower_pack_half_2x16 = true,
|
||||
.lower_pack_snorm_2x16 = true,
|
||||
.lower_pack_snorm_4x8 = true,
|
||||
.lower_pack_unorm_2x16 = true,
|
||||
.lower_pack_unorm_4x8 = true,
|
||||
.lower_scmp = true,
|
||||
.lower_to_scalar = true,
|
||||
.lower_uadd_carry = true,
|
||||
.lower_ufind_msb = true,
|
||||
.lower_uniforms_to_ubo = true,
|
||||
.lower_unpack_half_2x16 = true,
|
||||
.lower_unpack_snorm_2x16 = true,
|
||||
.lower_unpack_snorm_4x8 = true,
|
||||
.lower_unpack_unorm_2x16 = true,
|
||||
.lower_extract_byte = true,
|
||||
.lower_extract_word = true,
|
||||
.intel_vec4 = true,
|
||||
.lower_unpack_unorm_4x8 = true,
|
||||
.lower_usub_borrow = true,
|
||||
.max_unroll_iterations = 32,
|
||||
.support_16bit_alu = true,
|
||||
.use_interpolated_input_intrinsics = true,
|
||||
.vectorize_io = true,
|
||||
.vectorize_tess_levels = true,
|
||||
.vertex_id_zero_based = true,
|
||||
};
|
||||
|
||||
struct brw_compiler *
|
||||
|
|
@ -129,15 +103,6 @@ brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo)
|
|||
devinfo->platform != INTEL_PLATFORM_ARL_H) ||
|
||||
debug_get_bool_option("INTEL_LOWER_DPAS", false);
|
||||
|
||||
/* There is no vec4 mode on Gfx10+, and we don't use it at all on Gfx8+. */
|
||||
for (int i = MESA_SHADER_VERTEX; i < MESA_ALL_SHADER_STAGES; i++) {
|
||||
compiler->scalar_stage[i] = devinfo->ver >= 8 ||
|
||||
i == MESA_SHADER_FRAGMENT || i == MESA_SHADER_COMPUTE;
|
||||
}
|
||||
|
||||
for (int i = MESA_SHADER_TASK; i < MESA_VULKAN_SHADER_STAGES; i++)
|
||||
compiler->scalar_stage[i] = true;
|
||||
|
||||
nir_lower_int64_options int64_options =
|
||||
nir_lower_imul64 |
|
||||
nir_lower_isign64 |
|
||||
|
|
@ -175,13 +140,8 @@ brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo)
|
|||
for (int i = 0; i < MESA_ALL_SHADER_STAGES; i++) {
|
||||
struct nir_shader_compiler_options *nir_options =
|
||||
rzalloc(compiler, struct nir_shader_compiler_options);
|
||||
bool is_scalar = compiler->scalar_stage[i];
|
||||
if (is_scalar) {
|
||||
*nir_options = brw_scalar_nir_options;
|
||||
int64_options |= nir_lower_usub_sat64;
|
||||
} else {
|
||||
*nir_options = brw_vector_nir_options;
|
||||
}
|
||||
*nir_options = brw_scalar_nir_options;
|
||||
int64_options |= nir_lower_usub_sat64;
|
||||
|
||||
/* Prior to Gfx6, there are no three source operations, and Gfx11 loses
|
||||
* LRP.
|
||||
|
|
|
|||
|
|
@ -86,7 +86,6 @@ struct brw_compiler {
|
|||
void (*shader_debug_log)(void *, unsigned *id, const char *str, ...) PRINTFLIKE(3, 4);
|
||||
void (*shader_perf_log)(void *, unsigned *id, const char *str, ...) PRINTFLIKE(3, 4);
|
||||
|
||||
bool scalar_stage[MESA_ALL_SHADER_STAGES];
|
||||
bool use_tcs_multi_patch;
|
||||
struct nir_shader_compiler_options *nir_options[MESA_ALL_SHADER_STAGES];
|
||||
|
||||
|
|
|
|||
|
|
@ -714,7 +714,7 @@ brw_nir_lower_fs_outputs(nir_shader *nir)
|
|||
})
|
||||
|
||||
void
|
||||
brw_nir_optimize(nir_shader *nir, bool is_scalar,
|
||||
brw_nir_optimize(nir_shader *nir,
|
||||
const struct intel_device_info *devinfo)
|
||||
{
|
||||
bool progress;
|
||||
|
|
@ -752,18 +752,11 @@ brw_nir_optimize(nir_shader *nir, bool is_scalar,
|
|||
OPT(nir_opt_ray_queries);
|
||||
OPT(nir_opt_ray_query_ranges);
|
||||
|
||||
if (is_scalar) {
|
||||
OPT(nir_lower_alu_to_scalar, NULL, NULL);
|
||||
} else {
|
||||
OPT(nir_opt_shrink_stores, true);
|
||||
OPT(nir_opt_shrink_vectors);
|
||||
}
|
||||
OPT(nir_lower_alu_to_scalar, NULL, NULL);
|
||||
|
||||
OPT(nir_copy_prop);
|
||||
|
||||
if (is_scalar) {
|
||||
OPT(nir_lower_phis_to_scalar, false);
|
||||
}
|
||||
OPT(nir_lower_phis_to_scalar, false);
|
||||
|
||||
OPT(nir_copy_prop);
|
||||
OPT(nir_opt_dce);
|
||||
|
|
@ -784,15 +777,9 @@ brw_nir_optimize(nir_shader *nir, bool is_scalar,
|
|||
* For indirect loads of uniforms (push constants), we assume that array
|
||||
* indices will nearly always be in bounds and the cost of the load is
|
||||
* low. Therefore there shouldn't be a performance benefit to avoid it.
|
||||
* However, in vec4 tessellation shaders, these loads operate by
|
||||
* actually pulling from memory.
|
||||
*/
|
||||
const bool is_vec4_tessellation = !is_scalar &&
|
||||
(nir->info.stage == MESA_SHADER_TESS_CTRL ||
|
||||
nir->info.stage == MESA_SHADER_TESS_EVAL);
|
||||
OPT(nir_opt_peephole_select, 0, !is_vec4_tessellation, false);
|
||||
OPT(nir_opt_peephole_select, 8, !is_vec4_tessellation,
|
||||
devinfo->ver >= 6);
|
||||
OPT(nir_opt_peephole_select, 0, true, false);
|
||||
OPT(nir_opt_peephole_select, 8, true, devinfo->ver >= 6);
|
||||
|
||||
OPT(nir_opt_intrinsics);
|
||||
OPT(nir_opt_idiv_const, 32);
|
||||
|
|
@ -1014,15 +1001,11 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
|
|||
const struct intel_device_info *devinfo = compiler->devinfo;
|
||||
UNUSED bool progress; /* Written by OPT */
|
||||
|
||||
const bool is_scalar = compiler->scalar_stage[nir->info.stage];
|
||||
|
||||
nir_validate_ssa_dominance(nir, "before brw_preprocess_nir");
|
||||
|
||||
OPT(nir_lower_frexp);
|
||||
|
||||
if (is_scalar) {
|
||||
OPT(nir_lower_alu_to_scalar, NULL, NULL);
|
||||
}
|
||||
OPT(nir_lower_alu_to_scalar, NULL, NULL);
|
||||
|
||||
if (nir->info.stage == MESA_SHADER_GEOMETRY)
|
||||
OPT(nir_lower_gs_intrinsics, 0);
|
||||
|
|
@ -1081,7 +1064,7 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
|
|||
OPT(nir_split_var_copies);
|
||||
OPT(nir_split_struct_vars, nir_var_function_temp);
|
||||
|
||||
brw_nir_optimize(nir, is_scalar, devinfo);
|
||||
brw_nir_optimize(nir, devinfo);
|
||||
|
||||
OPT(nir_lower_doubles, opts->softfp64, nir->options->lower_doubles_options);
|
||||
if (OPT(nir_lower_int64_float_conversions)) {
|
||||
|
|
@ -1102,9 +1085,7 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
|
|||
OPT(nir_opt_large_constants, NULL, 32);
|
||||
}
|
||||
|
||||
if (is_scalar) {
|
||||
OPT(nir_lower_load_const_to_scalar);
|
||||
}
|
||||
OPT(nir_lower_load_const_to_scalar);
|
||||
|
||||
OPT(nir_lower_system_values);
|
||||
nir_lower_compute_system_values_options lower_csv_options = {
|
||||
|
|
@ -1116,7 +1097,6 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
|
|||
.ballot_bit_size = 32,
|
||||
.ballot_components = 1,
|
||||
.lower_to_scalar = true,
|
||||
.lower_vote_trivial = !is_scalar,
|
||||
.lower_relative_shuffle = true,
|
||||
.lower_quad_broadcast_dynamic = true,
|
||||
.lower_elect = true,
|
||||
|
|
@ -1142,7 +1122,7 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
|
|||
* issues are helped but nothing else in shader-db is hurt except for maybe
|
||||
* that one kerbal space program shader.
|
||||
*/
|
||||
if (is_scalar && !(indirect_mask & nir_var_function_temp))
|
||||
if (!(indirect_mask & nir_var_function_temp))
|
||||
OPT(nir_lower_indirect_derefs, nir_var_function_temp, 16);
|
||||
|
||||
/* Lower array derefs of vectors for SSBO and UBO loads. For both UBOs and
|
||||
|
|
@ -1165,7 +1145,7 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
|
|||
OPT(intel_nir_clamp_per_vertex_loads);
|
||||
|
||||
/* Get rid of split copies */
|
||||
brw_nir_optimize(nir, is_scalar, devinfo);
|
||||
brw_nir_optimize(nir, devinfo);
|
||||
}
|
||||
|
||||
static bool
|
||||
|
|
@ -1321,18 +1301,13 @@ brw_nir_link_shaders(const struct brw_compiler *compiler,
|
|||
nir_validate_shader(producer, "after nir_lower_io_arrays_to_elements");
|
||||
nir_validate_shader(consumer, "after nir_lower_io_arrays_to_elements");
|
||||
|
||||
const bool p_is_scalar = compiler->scalar_stage[producer->info.stage];
|
||||
const bool c_is_scalar = compiler->scalar_stage[consumer->info.stage];
|
||||
|
||||
if (p_is_scalar && c_is_scalar) {
|
||||
NIR_PASS(_, producer, nir_lower_io_to_scalar_early, nir_var_shader_out);
|
||||
NIR_PASS(_, consumer, nir_lower_io_to_scalar_early, nir_var_shader_in);
|
||||
brw_nir_optimize(producer, p_is_scalar, devinfo);
|
||||
brw_nir_optimize(consumer, c_is_scalar, devinfo);
|
||||
}
|
||||
NIR_PASS(_, producer, nir_lower_io_to_scalar_early, nir_var_shader_out);
|
||||
NIR_PASS(_, consumer, nir_lower_io_to_scalar_early, nir_var_shader_in);
|
||||
brw_nir_optimize(producer, devinfo);
|
||||
brw_nir_optimize(consumer, devinfo);
|
||||
|
||||
if (nir_link_opt_varyings(producer, consumer))
|
||||
brw_nir_optimize(consumer, c_is_scalar, devinfo);
|
||||
brw_nir_optimize(consumer, devinfo);
|
||||
|
||||
NIR_PASS(_, producer, nir_remove_dead_variables, nir_var_shader_out, NULL);
|
||||
NIR_PASS(_, consumer, nir_remove_dead_variables, nir_var_shader_in, NULL);
|
||||
|
|
@ -1361,8 +1336,8 @@ brw_nir_link_shaders(const struct brw_compiler *compiler,
|
|||
brw_nir_no_indirect_mask(compiler, consumer->info.stage),
|
||||
UINT32_MAX);
|
||||
|
||||
brw_nir_optimize(producer, p_is_scalar, devinfo);
|
||||
brw_nir_optimize(consumer, c_is_scalar, devinfo);
|
||||
brw_nir_optimize(producer, devinfo);
|
||||
brw_nir_optimize(consumer, devinfo);
|
||||
|
||||
if (producer->info.stage == MESA_SHADER_MESH &&
|
||||
consumer->info.stage == MESA_SHADER_FRAGMENT) {
|
||||
|
|
@ -1591,48 +1566,45 @@ brw_vectorize_lower_mem_access(nir_shader *nir,
|
|||
enum brw_robustness_flags robust_flags)
|
||||
{
|
||||
bool progress = false;
|
||||
const bool is_scalar = compiler->scalar_stage[nir->info.stage];
|
||||
|
||||
if (is_scalar) {
|
||||
nir_load_store_vectorize_options options = {
|
||||
.modes = nir_var_mem_ubo | nir_var_mem_ssbo |
|
||||
nir_var_mem_global | nir_var_mem_shared |
|
||||
nir_var_mem_task_payload,
|
||||
.callback = brw_nir_should_vectorize_mem,
|
||||
.robust_modes = (nir_variable_mode)0,
|
||||
};
|
||||
nir_load_store_vectorize_options options = {
|
||||
.modes = nir_var_mem_ubo | nir_var_mem_ssbo |
|
||||
nir_var_mem_global | nir_var_mem_shared |
|
||||
nir_var_mem_task_payload,
|
||||
.callback = brw_nir_should_vectorize_mem,
|
||||
.robust_modes = (nir_variable_mode)0,
|
||||
};
|
||||
|
||||
if (robust_flags & BRW_ROBUSTNESS_UBO)
|
||||
options.robust_modes |= nir_var_mem_ubo | nir_var_mem_global;
|
||||
if (robust_flags & BRW_ROBUSTNESS_SSBO)
|
||||
options.robust_modes |= nir_var_mem_ssbo | nir_var_mem_global;
|
||||
if (robust_flags & BRW_ROBUSTNESS_UBO)
|
||||
options.robust_modes |= nir_var_mem_ubo | nir_var_mem_global;
|
||||
if (robust_flags & BRW_ROBUSTNESS_SSBO)
|
||||
options.robust_modes |= nir_var_mem_ssbo | nir_var_mem_global;
|
||||
|
||||
OPT(nir_opt_load_store_vectorize, &options);
|
||||
OPT(nir_opt_load_store_vectorize, &options);
|
||||
|
||||
/* Only run the blockify optimization on Gfx9+ because although prior HW
|
||||
* versions have support for block loads, they do have limitations on
|
||||
* alignment as well as requiring split sends which are not supported
|
||||
* there.
|
||||
/* Only run the blockify optimization on Gfx9+ because although prior HW
|
||||
* versions have support for block loads, they do have limitations on
|
||||
* alignment as well as requiring split sends which are not supported
|
||||
* there.
|
||||
*/
|
||||
if (compiler->devinfo->ver >= 9) {
|
||||
/* Required for nir_divergence_analysis() */
|
||||
OPT(nir_convert_to_lcssa, true, true);
|
||||
|
||||
/* When HW supports block loads, using the divergence analysis, try
|
||||
* to find uniform SSBO loads and turn them into block loads.
|
||||
*
|
||||
* Rerun the vectorizer after that to make the largest possible block
|
||||
* loads.
|
||||
*
|
||||
* This is a win on 2 fronts :
|
||||
* - fewer send messages
|
||||
* - reduced register pressure
|
||||
*/
|
||||
if (compiler->devinfo->ver >= 9) {
|
||||
/* Required for nir_divergence_analysis() */
|
||||
OPT(nir_convert_to_lcssa, true, true);
|
||||
|
||||
/* When HW supports block loads, using the divergence analysis, try
|
||||
* to find uniform SSBO loads and turn them into block loads.
|
||||
*
|
||||
* Rerun the vectorizer after that to make the largest possible block
|
||||
* loads.
|
||||
*
|
||||
* This is a win on 2 fronts :
|
||||
* - fewer send messages
|
||||
* - reduced register pressure
|
||||
*/
|
||||
nir_divergence_analysis(nir);
|
||||
if (OPT(intel_nir_blockify_uniform_loads, compiler->devinfo))
|
||||
OPT(nir_opt_load_store_vectorize, &options);
|
||||
OPT(nir_opt_remove_phis);
|
||||
}
|
||||
nir_divergence_analysis(nir);
|
||||
if (OPT(intel_nir_blockify_uniform_loads, compiler->devinfo))
|
||||
OPT(nir_opt_load_store_vectorize, &options);
|
||||
OPT(nir_opt_remove_phis);
|
||||
}
|
||||
|
||||
nir_lower_mem_access_bit_sizes_options mem_access_options = {
|
||||
|
|
@ -1683,7 +1655,6 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
|
|||
enum brw_robustness_flags robust_flags)
|
||||
{
|
||||
const struct intel_device_info *devinfo = compiler->devinfo;
|
||||
const bool is_scalar = compiler->scalar_stage[nir->info.stage];
|
||||
|
||||
UNUSED bool progress; /* Written by OPT */
|
||||
|
||||
|
|
@ -1710,20 +1681,20 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
|
|||
if (gl_shader_stage_can_set_fragment_shading_rate(nir->info.stage))
|
||||
NIR_PASS(_, nir, intel_nir_lower_shading_rate_output);
|
||||
|
||||
brw_nir_optimize(nir, is_scalar, devinfo);
|
||||
brw_nir_optimize(nir, devinfo);
|
||||
|
||||
if (is_scalar && nir_shader_has_local_variables(nir)) {
|
||||
if (nir_shader_has_local_variables(nir)) {
|
||||
OPT(nir_lower_vars_to_explicit_types, nir_var_function_temp,
|
||||
glsl_get_natural_size_align_bytes);
|
||||
OPT(nir_lower_explicit_io, nir_var_function_temp,
|
||||
nir_address_format_32bit_offset);
|
||||
brw_nir_optimize(nir, is_scalar, devinfo);
|
||||
brw_nir_optimize(nir, devinfo);
|
||||
}
|
||||
|
||||
brw_vectorize_lower_mem_access(nir, compiler, robust_flags);
|
||||
|
||||
if (OPT(nir_lower_int64))
|
||||
brw_nir_optimize(nir, is_scalar, devinfo);
|
||||
brw_nir_optimize(nir, devinfo);
|
||||
|
||||
if (devinfo->ver >= 6) {
|
||||
/* Try and fuse multiply-adds, if successful, run shrink_vectors to
|
||||
|
|
@ -1741,8 +1712,7 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
|
|||
OPT(nir_opt_shrink_vectors);
|
||||
}
|
||||
|
||||
if (is_scalar)
|
||||
OPT(intel_nir_opt_peephole_imul32x16);
|
||||
OPT(intel_nir_opt_peephole_imul32x16);
|
||||
|
||||
if (OPT(nir_opt_comparison_pre)) {
|
||||
OPT(nir_copy_prop);
|
||||
|
|
@ -1753,27 +1723,15 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
|
|||
* the other optimization passes) will have removed at least one
|
||||
* instruction from one of the branches of the if-statement, so now it
|
||||
* might be under the threshold of conversion to bcsel.
|
||||
*
|
||||
* See brw_nir_optimize for the explanation of is_vec4_tessellation.
|
||||
*/
|
||||
const bool is_vec4_tessellation = !is_scalar &&
|
||||
(nir->info.stage == MESA_SHADER_TESS_CTRL ||
|
||||
nir->info.stage == MESA_SHADER_TESS_EVAL);
|
||||
OPT(nir_opt_peephole_select, 0, is_vec4_tessellation, false);
|
||||
OPT(nir_opt_peephole_select, 1, is_vec4_tessellation,
|
||||
compiler->devinfo->ver >= 6);
|
||||
OPT(nir_opt_peephole_select, 0, false, false);
|
||||
OPT(nir_opt_peephole_select, 1, false, compiler->devinfo->ver >= 6);
|
||||
}
|
||||
|
||||
do {
|
||||
progress = false;
|
||||
if (OPT(nir_opt_algebraic_late)) {
|
||||
/* At this late stage, anything that makes more constants will wreak
|
||||
* havoc on the vec4 backend.  The handling of constants in the vec4
|
||||
* backend is not good.
|
||||
*/
|
||||
if (is_scalar)
|
||||
OPT(nir_opt_constant_folding);
|
||||
|
||||
OPT(nir_opt_constant_folding);
|
||||
OPT(nir_copy_prop);
|
||||
OPT(nir_opt_dce);
|
||||
OPT(nir_opt_cse);
|
||||
|
|
@ -1783,19 +1741,16 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
|
|||
|
||||
if (OPT(nir_lower_fp16_casts, nir_lower_fp16_split_fp64)) {
|
||||
if (OPT(nir_lower_int64)) {
|
||||
brw_nir_optimize(nir, is_scalar, devinfo);
|
||||
brw_nir_optimize(nir, devinfo);
|
||||
}
|
||||
}
|
||||
|
||||
OPT(intel_nir_lower_conversions);
|
||||
|
||||
if (is_scalar)
|
||||
OPT(nir_lower_alu_to_scalar, NULL, NULL);
|
||||
OPT(nir_lower_alu_to_scalar, NULL, NULL);
|
||||
|
||||
while (OPT(nir_opt_algebraic_distribute_src_mods)) {
|
||||
if (is_scalar)
|
||||
OPT(nir_opt_constant_folding);
|
||||
|
||||
OPT(nir_opt_constant_folding);
|
||||
OPT(nir_copy_prop);
|
||||
OPT(nir_opt_dce);
|
||||
OPT(nir_opt_cse);
|
||||
|
|
@ -1821,7 +1776,7 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
|
|||
OPT(nir_lower_subgroups, &subgroups_options);
|
||||
|
||||
if (OPT(nir_lower_int64))
|
||||
brw_nir_optimize(nir, is_scalar, devinfo);
|
||||
brw_nir_optimize(nir, devinfo);
|
||||
|
||||
divergence_analysis_dirty = true;
|
||||
}
|
||||
|
|
@ -1834,7 +1789,7 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
|
|||
* that must be lowered.
|
||||
*/
|
||||
if (OPT(nir_lower_int64))
|
||||
brw_nir_optimize(nir, is_scalar, devinfo);
|
||||
brw_nir_optimize(nir, devinfo);
|
||||
|
||||
OPT(nir_lower_subgroups, &subgroups_options);
|
||||
}
|
||||
|
|
@ -1880,11 +1835,6 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
|
|||
|
||||
OPT(nir_convert_from_ssa, true);
|
||||
|
||||
if (!is_scalar) {
|
||||
OPT(nir_move_vec_src_uses_to_dest, true);
|
||||
OPT(nir_lower_vec_to_regs, NULL, NULL);
|
||||
}
|
||||
|
||||
OPT(nir_opt_dce);
|
||||
|
||||
if (OPT(nir_opt_rematerialize_compares))
|
||||
|
|
@ -2035,8 +1985,7 @@ brw_nir_apply_key(nir_shader *nir,
|
|||
OPT(brw_nir_limit_trig_input_range_workaround);
|
||||
|
||||
if (progress) {
|
||||
const bool is_scalar = compiler->scalar_stage[nir->info.stage];
|
||||
brw_nir_optimize(nir, is_scalar, compiler->devinfo);
|
||||
brw_nir_optimize(nir, compiler->devinfo);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -34,7 +34,6 @@ extern "C" {
|
|||
#endif
|
||||
|
||||
extern const struct nir_shader_compiler_options brw_scalar_nir_options;
|
||||
extern const struct nir_shader_compiler_options brw_vector_nir_options;
|
||||
|
||||
int type_size_vec4(const struct glsl_type *type, bool bindless);
|
||||
int type_size_dvec4(const struct glsl_type *type, bool bindless);
|
||||
|
|
@ -268,7 +267,7 @@ void brw_nir_analyze_ubo_ranges(const struct brw_compiler *compiler,
|
|||
nir_shader *nir,
|
||||
struct brw_ubo_range out_ranges[4]);
|
||||
|
||||
void brw_nir_optimize(nir_shader *nir, bool is_scalar,
|
||||
void brw_nir_optimize(nir_shader *nir,
|
||||
const struct intel_device_info *devinfo);
|
||||
|
||||
nir_shader *brw_nir_create_passthrough_tcs(void *mem_ctx,
|
||||
|
|
|
|||
|
|
@ -529,8 +529,7 @@ brw_nir_create_raygen_trampoline(const struct brw_compiler *compiler,
|
|||
|
||||
NIR_PASS_V(nir, brw_nir_lower_cs_intrinsics, devinfo, NULL);
|
||||
|
||||
const bool is_scalar = true;
|
||||
brw_nir_optimize(nir, is_scalar, devinfo);
|
||||
brw_nir_optimize(nir, devinfo);
|
||||
|
||||
return nir;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -26,7 +26,6 @@
|
|||
#include "brw_fs.h"
|
||||
#include "brw_nir.h"
|
||||
#include "brw_private.h"
|
||||
#include "brw_vec4_tes.h"
|
||||
#include "dev/intel_debug.h"
|
||||
#include "util/macros.h"
|
||||
#include "util/u_debug.h"
|
||||
|
|
@ -1310,9 +1309,7 @@ brw_compile_tes(const struct brw_compiler *compiler,
|
|||
const struct intel_vue_map *input_vue_map = params->input_vue_map;
|
||||
struct brw_tes_prog_data *prog_data = params->prog_data;
|
||||
|
||||
const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_EVAL];
|
||||
const bool debug_enabled = brw_should_print_shader(nir, DEBUG_TES);
|
||||
const unsigned *assembly;
|
||||
|
||||
prog_data->base.base.stage = MESA_SHADER_TESS_EVAL;
|
||||
prog_data->base.base.ray_queries = nir->info.ray_queries;
|
||||
|
|
@ -1395,55 +1392,35 @@ brw_compile_tes(const struct brw_compiler *compiler,
|
|||
MESA_SHADER_TESS_EVAL);
|
||||
}
|
||||
|
||||
if (is_scalar) {
|
||||
const unsigned dispatch_width = devinfo->ver >= 20 ? 16 : 8;
|
||||
fs_visitor v(compiler, ¶ms->base, &key->base,
|
||||
&prog_data->base.base, nir, dispatch_width,
|
||||
params->base.stats != NULL, debug_enabled);
|
||||
if (!v.run_tes()) {
|
||||
params->base.error_str =
|
||||
ralloc_strdup(params->base.mem_ctx, v.fail_msg);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
assert(v.payload().num_regs % reg_unit(devinfo) == 0);
|
||||
prog_data->base.base.dispatch_grf_start_reg = v.payload().num_regs / reg_unit(devinfo);
|
||||
|
||||
prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;
|
||||
|
||||
fs_generator g(compiler, ¶ms->base,
|
||||
&prog_data->base.base, false, MESA_SHADER_TESS_EVAL);
|
||||
if (unlikely(debug_enabled)) {
|
||||
g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
|
||||
"%s tessellation evaluation shader %s",
|
||||
nir->info.label ? nir->info.label
|
||||
: "unnamed",
|
||||
nir->info.name));
|
||||
}
|
||||
|
||||
g.generate_code(v.cfg, dispatch_width, v.shader_stats,
|
||||
v.performance_analysis.require(), params->base.stats);
|
||||
|
||||
g.add_const_data(nir->constant_data, nir->constant_data_size);
|
||||
|
||||
assembly = g.get_assembly();
|
||||
} else {
|
||||
brw::vec4_tes_visitor v(compiler, ¶ms->base, key, prog_data,
|
||||
nir, debug_enabled);
|
||||
if (!v.run()) {
|
||||
params->base.error_str =
|
||||
ralloc_strdup(params->base.mem_ctx, v.fail_msg);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (unlikely(debug_enabled))
|
||||
v.dump_instructions();
|
||||
|
||||
assembly = brw_vec4_generate_assembly(compiler, ¶ms->base, nir,
|
||||
&prog_data->base, v.cfg,
|
||||
v.performance_analysis.require(),
|
||||
debug_enabled);
|
||||
const unsigned dispatch_width = devinfo->ver >= 20 ? 16 : 8;
|
||||
fs_visitor v(compiler, ¶ms->base, &key->base,
|
||||
&prog_data->base.base, nir, dispatch_width,
|
||||
params->base.stats != NULL, debug_enabled);
|
||||
if (!v.run_tes()) {
|
||||
params->base.error_str =
|
||||
ralloc_strdup(params->base.mem_ctx, v.fail_msg);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return assembly;
|
||||
assert(v.payload().num_regs % reg_unit(devinfo) == 0);
|
||||
prog_data->base.base.dispatch_grf_start_reg = v.payload().num_regs / reg_unit(devinfo);
|
||||
|
||||
prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;
|
||||
|
||||
fs_generator g(compiler, ¶ms->base,
|
||||
&prog_data->base.base, false, MESA_SHADER_TESS_EVAL);
|
||||
if (unlikely(debug_enabled)) {
|
||||
g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
|
||||
"%s tessellation evaluation shader %s",
|
||||
nir->info.label ? nir->info.label
|
||||
: "unnamed",
|
||||
nir->info.name));
|
||||
}
|
||||
|
||||
g.generate_code(v.cfg, dispatch_width, v.shader_stats,
|
||||
v.performance_analysis.require(), params->base.stats);
|
||||
|
||||
g.add_const_data(nir->constant_data, nir->constant_data_size);
|
||||
|
||||
return g.get_assembly();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -134,7 +134,6 @@ brw_nir_no_indirect_mask(const struct brw_compiler *compiler,
|
|||
gl_shader_stage stage)
|
||||
{
|
||||
const struct intel_device_info *devinfo = compiler->devinfo;
|
||||
const bool is_scalar = compiler->scalar_stage[stage];
|
||||
nir_variable_mode indirect_mask = (nir_variable_mode) 0;
|
||||
|
||||
switch (stage) {
|
||||
|
|
@ -143,19 +142,14 @@ brw_nir_no_indirect_mask(const struct brw_compiler *compiler,
|
|||
indirect_mask |= nir_var_shader_in;
|
||||
break;
|
||||
|
||||
case MESA_SHADER_GEOMETRY:
|
||||
if (!is_scalar)
|
||||
indirect_mask |= nir_var_shader_in;
|
||||
break;
|
||||
|
||||
default:
|
||||
/* Everything else can handle indirect inputs */
|
||||
break;
|
||||
}
|
||||
|
||||
if (is_scalar && stage != MESA_SHADER_TESS_CTRL &&
|
||||
stage != MESA_SHADER_TASK &&
|
||||
stage != MESA_SHADER_MESH)
|
||||
if (stage != MESA_SHADER_TESS_CTRL &&
|
||||
stage != MESA_SHADER_TASK &&
|
||||
stage != MESA_SHADER_MESH)
|
||||
indirect_mask |= nir_var_shader_out;
|
||||
|
||||
/* On HSW+, we allow indirects in scalar shaders. They get implemented
|
||||
|
|
@ -168,7 +162,7 @@ brw_nir_no_indirect_mask(const struct brw_compiler *compiler,
|
|||
* indirects as scratch all the time, we may easily exceed this limit
|
||||
* without having any fallback.
|
||||
*/
|
||||
if (is_scalar && devinfo->verx10 <= 70)
|
||||
if (devinfo->verx10 <= 70)
|
||||
indirect_mask |= nir_var_function_temp;
|
||||
|
||||
return indirect_mask;
|
||||
|
|
|
|||
|
|
@ -468,10 +468,7 @@ get_features(const struct anv_physical_device *pdevice,
|
|||
.textureCompressionBC = true,
|
||||
.occlusionQueryPrecise = true,
|
||||
.pipelineStatisticsQuery = true,
|
||||
/* We can't do image stores in vec4 shaders */
|
||||
.vertexPipelineStoresAndAtomics =
|
||||
pdevice->compiler->scalar_stage[MESA_SHADER_VERTEX] &&
|
||||
pdevice->compiler->scalar_stage[MESA_SHADER_GEOMETRY],
|
||||
.vertexPipelineStoresAndAtomics = true,
|
||||
.fragmentStoresAndAtomics = true,
|
||||
.shaderTessellationAndGeometryPointSize = true,
|
||||
.shaderImageGatherExtended = true,
|
||||
|
|
@ -940,8 +937,7 @@ get_properties_1_1(const struct anv_physical_device *pdevice,
|
|||
p->subgroupSize = BRW_SUBGROUP_SIZE;
|
||||
VkShaderStageFlags scalar_stages = 0;
|
||||
for (unsigned stage = 0; stage < MESA_SHADER_STAGES; stage++) {
|
||||
if (pdevice->compiler->scalar_stage[stage])
|
||||
scalar_stages |= mesa_to_vk_shader_stage(stage);
|
||||
scalar_stages |= mesa_to_vk_shader_stage(stage);
|
||||
}
|
||||
if (pdevice->vk.supported_extensions.KHR_ray_tracing_pipeline) {
|
||||
scalar_stages |= VK_SHADER_STAGE_RAYGEN_BIT_KHR |
|
||||
|
|
|
|||
|
|
@ -130,10 +130,8 @@ anv_nir_compute_push_layout(nir_shader *nir,
|
|||
push_start = MIN2(push_start, push_end);
|
||||
push_start = ROUND_DOWN_TO(push_start, 32);
|
||||
|
||||
/* For vec4 our push data size needs to be aligned to a vec4 and for
|
||||
* scalar, it needs to be aligned to a DWORD.
|
||||
*/
|
||||
const unsigned alignment = compiler->scalar_stage[nir->info.stage] ? 4 : 16;
|
||||
/* For scalar, push data size needs to be aligned to a DWORD. */
|
||||
const unsigned alignment = 4;
|
||||
nir->num_uniforms = ALIGN(push_end - push_start, alignment);
|
||||
prog_data->nr_params = nir->num_uniforms / 4;
|
||||
prog_data->param = rzalloc_array(mem_ctx, uint32_t, prog_data->nr_params);
|
||||
|
|
@ -218,13 +216,7 @@ anv_nir_compute_push_layout(nir_shader *nir,
|
|||
if (push_ubo_ranges) {
|
||||
brw_nir_analyze_ubo_ranges(compiler, nir, prog_data->ubo_ranges);
|
||||
|
||||
/* The vec4 back-end pushes at most 32 regs while the scalar back-end
|
||||
* pushes up to 64. This is primarily because the scalar back-end has a
|
||||
* massively more competent register allocator and so the risk of
|
||||
* spilling due to UBO pushing isn't nearly as high.
|
||||
*/
|
||||
const unsigned max_push_regs =
|
||||
compiler->scalar_stage[nir->info.stage] ? 64 : 32;
|
||||
const unsigned max_push_regs = 64;
|
||||
|
||||
unsigned total_push_regs = push_constant_range.length;
|
||||
for (unsigned i = 0; i < 4; i++) {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue