intel/elk: Remove multi-polygon support

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27629>
This commit is contained in:
Caio Oliveira 2024-02-13 13:26:59 -08:00 committed by Marge Bot
parent fd3a815a5b
commit be73fa1434
9 changed files with 30 additions and 144 deletions

View file

@@ -297,8 +297,8 @@ iris_apply_elk_wm_prog_data(struct iris_compiled_shader *shader,
iris->flat_inputs = elk->flat_inputs;
iris->inputs = elk->inputs;
iris->computed_depth_mode = elk->computed_depth_mode;
iris->max_polygons = elk->max_polygons;
iris->dispatch_multi = elk->dispatch_multi;
iris->max_polygons = 1;
iris->dispatch_multi = 0;
iris->computed_stencil = elk->computed_stencil;
iris->early_fragment_tests = elk->early_fragment_tests;
iris->post_depth_coverage = elk->post_depth_coverage;

View file

@@ -89,10 +89,8 @@ intel_set_ps_dispatch_state(struct GENX(3DSTATE_PS) *ps,
}
assert(enable_8 || enable_16 || enable_32);
assert(!prog_data->dispatch_multi);
ps->_8PixelDispatchEnable = enable_8 ||
(GFX_VER == 12 && prog_data->dispatch_multi);
ps->_8PixelDispatchEnable = enable_8;
ps->_16PixelDispatchEnable = enable_16;
ps->_32PixelDispatchEnable = enable_32;
}

View file

@@ -833,18 +833,6 @@ struct elk_wm_prog_data {
uint8_t color_outputs_written;
uint8_t computed_depth_mode;
/**
* Number of polygons handled in parallel by the multi-polygon PS
* kernel.
*/
uint8_t max_polygons;
/**
* Dispatch width of the multi-polygon PS kernel, or 0 if no
* multi-polygon kernel was built.
*/
uint8_t dispatch_multi;
bool computed_stencil;
bool early_fragment_tests;
bool post_depth_coverage;
@@ -1791,7 +1779,7 @@ elk_cs_get_dispatch_info(const struct intel_device_info *devinfo,
*/
static inline bool
elk_stage_has_packed_dispatch(ASSERTED const struct intel_device_info *devinfo,
gl_shader_stage stage, unsigned max_polygons,
gl_shader_stage stage,
const struct elk_stage_prog_data *prog_data)
{
/* The code below makes assumptions about the hardware's thread dispatch
@@ -1814,8 +1802,7 @@ elk_stage_has_packed_dispatch(ASSERTED const struct intel_device_info *devinfo,
const struct elk_wm_prog_data *wm_prog_data =
(const struct elk_wm_prog_data *)prog_data;
return !wm_prog_data->persample_dispatch &&
wm_prog_data->uses_vmask &&
max_polygons < 2;
wm_prog_data->uses_vmask;
}
case MESA_SHADER_COMPUTE:
/* Compute shaders will be spawned with either a fully enabled dispatch

View file

@@ -1505,48 +1505,14 @@ elk_fs_visitor::assign_urb_setup()
* 3 Attr0.w a1-a0 a2-a0 N/A a0
* 4 Attr1.x a1-a0 a2-a0 N/A a0
* ...
*
* In multipolygon mode that no longer works since
* different channels may be processing polygons with
* different plane parameters, so each parameter above is
* represented as a dispatch_width-wide vector:
*
* elk_fs_reg::nr elk_fs_reg::offset Input Comp0 ... CompN
* 0 0 Attr0.x a1[0]-a0[0] ... a1[N]-a0[N]
* 0 4 * dispatch_width Attr0.x a2[0]-a0[0] ... a2[N]-a0[N]
* 0 8 * dispatch_width Attr0.x N/A ... N/A
* 0 12 * dispatch_width Attr0.x a0[0] ... a0[N]
* 1 0 Attr0.y a1[0]-a0[0] ... a1[N]-a0[N]
* ...
*
* Note that many of the components on a single row above
* are likely to be replicated multiple times (if, say, a
* single SIMD thread is only processing 2 different
* polygons), so plane parameters aren't actually stored
* in GRF memory with that layout to avoid wasting space.
* Instead we compose ATTR register regions with a 2D
* region that walks through the parameters of each
* polygon with the correct stride, reading the parameter
* corresponding to each channel directly from the PS
* thread payload.
*
* The latter layout corresponds to a param_width equal to
* dispatch_width, while the former (scalar parameter)
* layout has a param_width of 1.
*
* Gfx20+ represent plane parameters in a format similar
* to the above, except the parameters are packed in 12B
* and ordered like "a0, a1-a0, a2-a0" instead of the
* above vec4 representation with a missing component.
*/
const unsigned param_width = (max_polygons > 1 ? dispatch_width : 1);
const unsigned param_width = 1;
/* Size of a single scalar component of a plane parameter
* in bytes.
*/
const unsigned chan_sz = 4;
struct elk_reg reg;
assert(max_polygons > 0);
/* Calculate the base register on the thread payload of
* either the block of vertex setup data or the block of
@@ -1558,7 +1524,7 @@ elk_fs_visitor::assign_urb_setup()
const unsigned base = urb_start +
(per_prim ? 0 :
ALIGN(prog_data->num_per_primitive_inputs / 2,
reg_unit(devinfo)) * max_polygons);
reg_unit(devinfo)));
const unsigned idx = per_prim ? inst->src[i].nr :
inst->src[i].nr - prog_data->num_per_primitive_inputs;
@@ -1570,7 +1536,7 @@ elk_fs_visitor::assign_urb_setup()
* Earlier platforms and per-primitive block pack 2 logical
* input components per 32B register.
*/
const unsigned grf = base + idx / 2 * max_polygons;
const unsigned grf = base + idx / 2;
assert(inst->src[i].offset / param_width < REG_SIZE / 2);
const unsigned delta = (idx % 2) * (REG_SIZE / 2) +
inst->src[i].offset / (param_width * chan_sz) * chan_sz +
@@ -1594,13 +1560,13 @@ elk_fs_visitor::assign_urb_setup()
* but they may be replicated multiple times for multipolygon
* dispatch.
*/
this->first_non_payload_grf += prog_data->num_varying_inputs * 2 * max_polygons;
this->first_non_payload_grf += prog_data->num_varying_inputs * 2;
/* Unlike regular attributes, per-primitive attributes have all 4 channels
* in the same slot, so each GRF can store two slots.
*/
assert(prog_data->num_per_primitive_inputs % 2 == 0);
this->first_non_payload_grf += prog_data->num_per_primitive_inputs / 2 * max_polygons;
this->first_non_payload_grf += prog_data->num_per_primitive_inputs / 2;
}
void
@@ -2931,8 +2897,7 @@ elk_fs_visitor::eliminate_find_live_channel()
bool progress = false;
unsigned depth = 0;
if (!elk_stage_has_packed_dispatch(devinfo, stage, max_polygons,
stage_prog_data)) {
if (!elk_stage_has_packed_dispatch(devinfo, stage, stage_prog_data)) {
/* The optimization below assumes that channel zero is live on thread
* dispatch, which may not be the case if the fixed function dispatches
* threads sparsely.
@@ -4226,19 +4191,6 @@ get_fpu_lowered_simd_width(const elk_fs_visitor *shader,
/* Maximum execution size representable in the instruction controls. */
unsigned max_width = MIN2(32, inst->exec_size);
/* Number of channels per polygon handled by a multipolygon PS shader. */
const unsigned poly_width = shader->dispatch_width /
MAX2(1, shader->max_polygons);
/* Number of registers that will be read by an ATTR source if
* present for multipolygon PS shaders, since the PS vertex setup
* data for each polygon is stored in different contiguous GRFs.
*/
const unsigned attr_reg_count = (shader->stage != MESA_SHADER_FRAGMENT ||
shader->max_polygons < 2 ? 0 :
DIV_ROUND_UP(inst->exec_size,
poly_width) * reg_unit(devinfo));
/* According to the PRMs:
* "A. In Direct Addressing mode, a source cannot span more than 2
* adjacent GRF registers.
@@ -4251,8 +4203,7 @@ get_fpu_lowered_simd_width(const elk_fs_visitor *shader,
unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);
for (unsigned i = 0; i < inst->sources; i++)
reg_count = MAX3(reg_count, DIV_ROUND_UP(inst->size_read(i), REG_SIZE),
(inst->src[i].file == ATTR ? attr_reg_count : 0));
reg_count = MAX2(reg_count, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
/* Calculate the maximum execution size of the instruction based on the
* factor by which it goes over the hardware limit of 2 GRFs.
@@ -5200,8 +5151,7 @@ elk_fs_visitor::lower_find_live_channel()
return false;
bool packed_dispatch =
elk_stage_has_packed_dispatch(devinfo, stage, max_polygons,
stage_prog_data);
elk_stage_has_packed_dispatch(devinfo, stage, stage_prog_data);
bool vmask =
stage == MESA_SHADER_FRAGMENT &&
elk_wm_prog_data(stage_prog_data)->uses_vmask;
@@ -6505,7 +6455,6 @@ elk_nir_populate_wm_prog_data(nir_shader *shader,
prog_data->uses_omask = !key->ignore_sample_mask_out &&
(shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK));
prog_data->color_outputs_written = key->color_outputs_valid;
prog_data->max_polygons = 1;
prog_data->computed_depth_mode = computed_depth_mode(shader);
prog_data->computed_stencil =
shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
@@ -6681,7 +6630,7 @@ elk_compile_fs(const struct elk_compiler *compiler,
bool has_spilled = false;
v8 = std::make_unique<elk_fs_visitor>(compiler, &params->base, key,
prog_data, nir, 8, 1,
prog_data, nir, 8,
params->base.stats != NULL,
debug_enabled);
if (!v8->run_fs(allow_spilling, false /* do_rep_send */)) {
@@ -6716,7 +6665,7 @@ elk_compile_fs(const struct elk_compiler *compiler,
(INTEL_SIMD(FS, 16) || params->use_rep_send)) {
/* Try a SIMD16 compile */
v16 = std::make_unique<elk_fs_visitor>(compiler, &params->base, key,
prog_data, nir, 16, 1,
prog_data, nir, 16,
params->base.stats != NULL,
debug_enabled);
if (v8)
@@ -6749,7 +6698,7 @@ elk_compile_fs(const struct elk_compiler *compiler,
INTEL_SIMD(FS, 32)) {
/* Try a SIMD32 compile */
v32 = std::make_unique<elk_fs_visitor>(compiler, &params->base, key,
prog_data, nir, 32, 1,
prog_data, nir, 32,
params->base.stats != NULL,
debug_enabled);
if (v8)
@@ -6830,7 +6779,7 @@ elk_compile_fs(const struct elk_compiler *compiler,
if (simd8_cfg) {
prog_data->dispatch_8 = true;
g.generate_code(simd8_cfg, 8, v8->shader_stats,
v8->performance_analysis.require(), stats, 1);
v8->performance_analysis.require(), stats);
stats = stats ? stats + 1 : NULL;
max_dispatch_width = 8;
}
@@ -6839,7 +6788,7 @@ elk_compile_fs(const struct elk_compiler *compiler,
prog_data->dispatch_16 = true;
prog_data->prog_offset_16 = g.generate_code(
simd16_cfg, 16, v16->shader_stats,
v16->performance_analysis.require(), stats, 1);
v16->performance_analysis.require(), stats);
stats = stats ? stats + 1 : NULL;
max_dispatch_width = 16;
}
@@ -6848,7 +6797,7 @@ elk_compile_fs(const struct elk_compiler *compiler,
prog_data->dispatch_32 = true;
prog_data->prog_offset_32 = g.generate_code(
simd32_cfg, 32, v32->shader_stats,
v32->performance_analysis.require(), stats, 1);
v32->performance_analysis.require(), stats);
stats = stats ? stats + 1 : NULL;
max_dispatch_width = 32;
}
@@ -7146,7 +7095,6 @@ elk_fs_test_dispatch_packing(const fs_builder &bld)
elk_wm_prog_data(shader->stage_prog_data)->uses_vmask;
if (elk_stage_has_packed_dispatch(shader->devinfo, stage,
shader->max_polygons,
shader->stage_prog_data)) {
const fs_builder ubld = bld.exec_all().group(1, 0);
const elk_fs_reg tmp = component(bld.vgrf(ELK_REGISTER_TYPE_UD), 0);

View file

@@ -177,7 +177,6 @@ public:
struct elk_wm_prog_data *prog_data,
const nir_shader *shader,
unsigned dispatch_width,
unsigned num_polygons,
bool needs_register_pressure,
bool debug_enabled);
elk_fs_visitor(const struct elk_compiler *compiler,
@@ -400,7 +399,6 @@ public:
bool needs_register_pressure;
const unsigned dispatch_width; /**< 8, 16 or 32 */
const unsigned max_polygons;
unsigned max_dispatch_width;
/* The API selected subgroup size */
@@ -451,8 +449,7 @@ public:
int generate_code(const elk_cfg_t *cfg, int dispatch_width,
struct shader_stats shader_stats,
const elk::performance &perf,
struct elk_compile_stats *stats,
unsigned max_polygons = 0);
struct elk_compile_stats *stats);
void add_const_data(void *data, unsigned size);
const unsigned *get_assembly();

View file

@@ -660,8 +660,7 @@ instruction_requires_packed_data(elk_fs_inst *inst)
static bool
try_copy_propagate(const elk_compiler *compiler, elk_fs_inst *inst,
acp_entry *entry, int arg,
const elk::simple_allocator &alloc,
uint8_t max_polygons)
const elk::simple_allocator &alloc)
{
if (inst->src[arg].file != VGRF)
return false;
@@ -799,17 +798,6 @@ try_copy_propagate(const elk_compiler *compiler, elk_fs_inst *inst,
(reg_offset(inst->dst) % REG_SIZE) != (reg_offset(entry->src) % REG_SIZE))
return false;
/* The <8;8,0> regions used for FS attributes in multipolygon
* dispatch mode could violate regioning restrictions, don't copy
* propagate them in such cases.
*/
if (entry->src.file == ATTR && max_polygons > 1 &&
(has_dst_aligned_region_restriction(devinfo, inst, dst_type) ||
instruction_requires_packed_data(inst) ||
(inst->elk_is_3src(compiler) && arg == 2) ||
entry->dst.type != inst->src[arg].type))
return false;
/* Bail if the source FIXED_GRF region of the copy cannot be trivially
* composed with the source region of the instruction -- E.g. because the
* copy uses some extended stride greater than 4 not supported natively by
@@ -1245,8 +1233,7 @@ can_propagate_from(elk_fs_inst *inst)
static bool
opt_copy_propagation_local(const elk_compiler *compiler, linear_ctx *lin_ctx,
elk_bblock_t *block, struct acp &acp,
const elk::simple_allocator &alloc,
uint8_t max_polygons)
const elk::simple_allocator &alloc)
{
bool progress = false;
@@ -1266,8 +1253,7 @@ opt_copy_propagation_local(const elk_compiler *compiler, linear_ctx *lin_ctx,
break;
}
} else {
if (try_copy_propagate(compiler, inst, *iter, i, alloc,
max_polygons)) {
if (try_copy_propagate(compiler, inst, *iter, i, alloc)) {
instruction_progress = true;
break;
}
@@ -1373,8 +1359,7 @@ elk_fs_visitor::opt_copy_propagation()
*/
foreach_block (block, cfg) {
progress = opt_copy_propagation_local(compiler, lin_ctx, block,
out_acp[block->num], alloc,
max_polygons) || progress;
out_acp[block->num], alloc) || progress;
/* If the destination of an ACP entry exists only within this block,
* then there's no need to keep it for dataflow analysis. We can delete
@@ -1414,7 +1399,7 @@ elk_fs_visitor::opt_copy_propagation()
}
progress = opt_copy_propagation_local(compiler, lin_ctx, block,
in_acp, alloc, max_polygons) ||
in_acp, alloc) ||
progress;
}

View file

@@ -1517,8 +1517,7 @@ int
elk_fs_generator::generate_code(const elk_cfg_t *cfg, int dispatch_width,
struct shader_stats shader_stats,
const elk::performance &perf,
struct elk_compile_stats *stats,
unsigned max_polygons)
struct elk_compile_stats *stats)
{
/* align to 64 byte boundary. */
elk_realign(p, 64);
@@ -2273,7 +2272,6 @@ elk_fs_generator::generate_code(const elk_cfg_t *cfg, int dispatch_width,
before_size, after_size);
if (stats) {
stats->dispatch_width = dispatch_width;
stats->max_polygons = max_polygons;
stats->max_dispatch_width = dispatch_width;
stats->instructions = before_size / 16 - nop_count - sync_nop_count;
stats->sends = send_count;

View file

@@ -64,19 +64,7 @@ elk_fs_visitor::interp_reg(const fs_builder &bld, unsigned location,
const unsigned per_vertex_start = prog_data->num_per_primitive_inputs;
const unsigned regnr = per_vertex_start + (nr * 4) + channel;
if (max_polygons > 1) {
/* In multipolygon dispatch each plane parameter is a
* dispatch_width-wide SIMD vector (see comment in
* assign_urb_setup()), so we need to use offset() instead of
* component() to select the specified parameter.
*/
const elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD);
bld.MOV(tmp, offset(elk_fs_reg(ATTR, regnr, ELK_REGISTER_TYPE_UD),
dispatch_width, comp));
return retype(tmp, ELK_REGISTER_TYPE_F);
} else {
return component(elk_fs_reg(ATTR, regnr, ELK_REGISTER_TYPE_F), comp);
}
return component(elk_fs_reg(ATTR, regnr, ELK_REGISTER_TYPE_F), comp);
}
/* The register location here is relative to the start of the URB
@@ -99,19 +87,7 @@ elk_fs_visitor::per_primitive_reg(const fs_builder &bld, int location, unsigned
assert(regnr < prog_data->num_per_primitive_inputs);
if (max_polygons > 1) {
/* In multipolygon dispatch each primitive constant is a
* dispatch_width-wide SIMD vector (see comment in
* assign_urb_setup()), so we need to use offset() instead of
* component() to select the specified parameter.
*/
const elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD);
bld.MOV(tmp, offset(elk_fs_reg(ATTR, regnr, ELK_REGISTER_TYPE_UD),
dispatch_width, comp % 4));
return retype(tmp, ELK_REGISTER_TYPE_F);
} else {
return component(elk_fs_reg(ATTR, regnr, ELK_REGISTER_TYPE_F), comp % 4);
}
return component(elk_fs_reg(ATTR, regnr, ELK_REGISTER_TYPE_F), comp % 4);
}
/** Emits the interpolation for the varying inputs. */
@@ -878,7 +854,6 @@ elk_fs_visitor::elk_fs_visitor(const struct elk_compiler *compiler,
performance_analysis(this),
needs_register_pressure(needs_register_pressure),
dispatch_width(dispatch_width),
max_polygons(0),
api_subgroup_size(elk_nir_api_subgroup_size(shader, dispatch_width))
{
init();
@@ -889,7 +864,7 @@ elk_fs_visitor::elk_fs_visitor(const struct elk_compiler *compiler,
const elk_wm_prog_key *key,
struct elk_wm_prog_data *prog_data,
const nir_shader *shader,
unsigned dispatch_width, unsigned max_polygons,
unsigned dispatch_width,
bool needs_register_pressure,
bool debug_enabled)
: elk_backend_shader(compiler, params, shader, &prog_data->base,
@@ -899,7 +874,6 @@ elk_fs_visitor::elk_fs_visitor(const struct elk_compiler *compiler,
performance_analysis(this),
needs_register_pressure(needs_register_pressure),
dispatch_width(dispatch_width),
max_polygons(max_polygons),
api_subgroup_size(elk_nir_api_subgroup_size(shader, dispatch_width))
{
init();
@@ -924,7 +898,6 @@ elk_fs_visitor::elk_fs_visitor(const struct elk_compiler *compiler,
performance_analysis(this),
needs_register_pressure(needs_register_pressure),
dispatch_width(8),
max_polygons(0),
api_subgroup_size(elk_nir_api_subgroup_size(shader, dispatch_width))
{
init();

View file

@@ -1208,7 +1208,7 @@ vec4_visitor::eliminate_find_live_channel()
bool progress = false;
unsigned depth = 0;
if (!elk_stage_has_packed_dispatch(devinfo, stage, 0, stage_prog_data)) {
if (!elk_stage_has_packed_dispatch(devinfo, stage, stage_prog_data)) {
/* The optimization below assumes that channel zero is live on thread
* dispatch, which may not be the case if the fixed function dispatches
* threads sparsely.