diff --git a/src/gallium/drivers/iris/iris_program.c b/src/gallium/drivers/iris/iris_program.c index 3b315851b5a..1a09302a297 100644 --- a/src/gallium/drivers/iris/iris_program.c +++ b/src/gallium/drivers/iris/iris_program.c @@ -297,8 +297,8 @@ iris_apply_elk_wm_prog_data(struct iris_compiled_shader *shader, iris->flat_inputs = elk->flat_inputs; iris->inputs = elk->inputs; iris->computed_depth_mode = elk->computed_depth_mode; - iris->max_polygons = elk->max_polygons; - iris->dispatch_multi = elk->dispatch_multi; + iris->max_polygons = 1; + iris->dispatch_multi = 0; iris->computed_stencil = elk->computed_stencil; iris->early_fragment_tests = elk->early_fragment_tests; iris->post_depth_coverage = elk->post_depth_coverage; diff --git a/src/intel/common/intel_genX_state_elk.h b/src/intel/common/intel_genX_state_elk.h index cb84580f41b..d24909becc3 100644 --- a/src/intel/common/intel_genX_state_elk.h +++ b/src/intel/common/intel_genX_state_elk.h @@ -89,10 +89,8 @@ intel_set_ps_dispatch_state(struct GENX(3DSTATE_PS) *ps, } assert(enable_8 || enable_16 || enable_32); - assert(!prog_data->dispatch_multi); - ps->_8PixelDispatchEnable = enable_8 || - (GFX_VER == 12 && prog_data->dispatch_multi); + ps->_8PixelDispatchEnable = enable_8; ps->_16PixelDispatchEnable = enable_16; ps->_32PixelDispatchEnable = enable_32; } diff --git a/src/intel/compiler/elk/elk_compiler.h b/src/intel/compiler/elk/elk_compiler.h index 158d6247887..c6740862715 100644 --- a/src/intel/compiler/elk/elk_compiler.h +++ b/src/intel/compiler/elk/elk_compiler.h @@ -833,18 +833,6 @@ struct elk_wm_prog_data { uint8_t color_outputs_written; uint8_t computed_depth_mode; - /** - * Number of polygons handled in parallel by the multi-polygon PS - * kernel. - */ - uint8_t max_polygons; - - /** - * Dispatch width of the multi-polygon PS kernel, or 0 if no - * multi-polygon kernel was built. 
- */ - uint8_t dispatch_multi; - bool computed_stencil; bool early_fragment_tests; bool post_depth_coverage; @@ -1791,7 +1779,7 @@ elk_cs_get_dispatch_info(const struct intel_device_info *devinfo, */ static inline bool elk_stage_has_packed_dispatch(ASSERTED const struct intel_device_info *devinfo, - gl_shader_stage stage, unsigned max_polygons, + gl_shader_stage stage, const struct elk_stage_prog_data *prog_data) { /* The code below makes assumptions about the hardware's thread dispatch @@ -1814,8 +1802,7 @@ elk_stage_has_packed_dispatch(ASSERTED const struct intel_device_info *devinfo, const struct elk_wm_prog_data *wm_prog_data = (const struct elk_wm_prog_data *)prog_data; return !wm_prog_data->persample_dispatch && - wm_prog_data->uses_vmask && - max_polygons < 2; + wm_prog_data->uses_vmask; } case MESA_SHADER_COMPUTE: /* Compute shaders will be spawned with either a fully enabled dispatch diff --git a/src/intel/compiler/elk/elk_fs.cpp b/src/intel/compiler/elk/elk_fs.cpp index dd50f14ddc3..a1ae1eafc3a 100644 --- a/src/intel/compiler/elk/elk_fs.cpp +++ b/src/intel/compiler/elk/elk_fs.cpp @@ -1505,48 +1505,14 @@ elk_fs_visitor::assign_urb_setup() * 3 Attr0.w a1-a0 a2-a0 N/A a0 * 4 Attr1.x a1-a0 a2-a0 N/A a0 * ... - * - * In multipolygon mode that no longer works since - * different channels may be processing polygons with - * different plane parameters, so each parameter above is - * represented as a dispatch_width-wide vector: - * - * elk_fs_reg::nr elk_fs_reg::offset Input Comp0 ... CompN - * 0 0 Attr0.x a1[0]-a0[0] ... a1[N]-a0[N] - * 0 4 * dispatch_width Attr0.x a2[0]-a0[0] ... a2[N]-a0[N] - * 0 8 * dispatch_width Attr0.x N/A ... N/A - * 0 12 * dispatch_width Attr0.x a0[0] ... a0[N] - * 1 0 Attr0.y a1[0]-a0[0] ... a1[N]-a0[N] - * ... 
- * - * Note that many of the components on a single row above - * are likely to be replicated multiple times (if, say, a - * single SIMD thread is only processing 2 different - * polygons), so plane parameters aren't actually stored - * in GRF memory with that layout to avoid wasting space. - * Instead we compose ATTR register regions with a 2D - * region that walks through the parameters of each - * polygon with the correct stride, reading the parameter - * corresponding to each channel directly from the PS - * thread payload. - * - * The latter layout corresponds to a param_width equal to - * dispatch_width, while the former (scalar parameter) - * layout has a param_width of 1. - * - * Gfx20+ represent plane parameters in a format similar - * to the above, except the parameters are packed in 12B - * and ordered like "a0, a1-a0, a2-a0" instead of the - * above vec4 representation with a missing component. */ - const unsigned param_width = (max_polygons > 1 ? dispatch_width : 1); + const unsigned param_width = 1; /* Size of a single scalar component of a plane parameter * in bytes. */ const unsigned chan_sz = 4; struct elk_reg reg; - assert(max_polygons > 0); /* Calculate the base register on the thread payload of * either the block of vertex setup data or the block of @@ -1558,7 +1524,7 @@ elk_fs_visitor::assign_urb_setup() const unsigned base = urb_start + (per_prim ? 0 : ALIGN(prog_data->num_per_primitive_inputs / 2, - reg_unit(devinfo)) * max_polygons); + reg_unit(devinfo))); const unsigned idx = per_prim ? inst->src[i].nr : inst->src[i].nr - prog_data->num_per_primitive_inputs; @@ -1570,7 +1536,7 @@ elk_fs_visitor::assign_urb_setup() * Earlier platforms and per-primitive block pack 2 logical * input components per 32B register. 
*/ - const unsigned grf = base + idx / 2 * max_polygons; + const unsigned grf = base + idx / 2; assert(inst->src[i].offset / param_width < REG_SIZE / 2); const unsigned delta = (idx % 2) * (REG_SIZE / 2) + inst->src[i].offset / (param_width * chan_sz) * chan_sz + @@ -1594,13 +1560,13 @@ elk_fs_visitor::assign_urb_setup() * but they may be replicated multiple times for multipolygon * dispatch. */ - this->first_non_payload_grf += prog_data->num_varying_inputs * 2 * max_polygons; + this->first_non_payload_grf += prog_data->num_varying_inputs * 2; /* Unlike regular attributes, per-primitive attributes have all 4 channels * in the same slot, so each GRF can store two slots. */ assert(prog_data->num_per_primitive_inputs % 2 == 0); - this->first_non_payload_grf += prog_data->num_per_primitive_inputs / 2 * max_polygons; + this->first_non_payload_grf += prog_data->num_per_primitive_inputs / 2; } void @@ -2931,8 +2897,7 @@ elk_fs_visitor::eliminate_find_live_channel() bool progress = false; unsigned depth = 0; - if (!elk_stage_has_packed_dispatch(devinfo, stage, max_polygons, - stage_prog_data)) { + if (!elk_stage_has_packed_dispatch(devinfo, stage, stage_prog_data)) { /* The optimization below assumes that channel zero is live on thread * dispatch, which may not be the case if the fixed function dispatches * threads sparsely. @@ -4226,19 +4191,6 @@ get_fpu_lowered_simd_width(const elk_fs_visitor *shader, /* Maximum execution size representable in the instruction controls. */ unsigned max_width = MIN2(32, inst->exec_size); - /* Number of channels per polygon handled by a multipolygon PS shader. */ - const unsigned poly_width = shader->dispatch_width / - MAX2(1, shader->max_polygons); - - /* Number of registers that will be read by an ATTR source if - * present for multipolygon PS shaders, since the PS vertex setup - * data for each polygon is stored in different contiguous GRFs. 
- */ - const unsigned attr_reg_count = (shader->stage != MESA_SHADER_FRAGMENT || - shader->max_polygons < 2 ? 0 : - DIV_ROUND_UP(inst->exec_size, - poly_width) * reg_unit(devinfo)); - /* According to the PRMs: * "A. In Direct Addressing mode, a source cannot span more than 2 * adjacent GRF registers. @@ -4251,8 +4203,7 @@ get_fpu_lowered_simd_width(const elk_fs_visitor *shader, unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE); for (unsigned i = 0; i < inst->sources; i++) - reg_count = MAX3(reg_count, DIV_ROUND_UP(inst->size_read(i), REG_SIZE), - (inst->src[i].file == ATTR ? attr_reg_count : 0)); + reg_count = MAX2(reg_count, DIV_ROUND_UP(inst->size_read(i), REG_SIZE)); /* Calculate the maximum execution size of the instruction based on the * factor by which it goes over the hardware limit of 2 GRFs. @@ -5200,8 +5151,7 @@ elk_fs_visitor::lower_find_live_channel() return false; bool packed_dispatch = - elk_stage_has_packed_dispatch(devinfo, stage, max_polygons, - stage_prog_data); + elk_stage_has_packed_dispatch(devinfo, stage, stage_prog_data); bool vmask = stage == MESA_SHADER_FRAGMENT && elk_wm_prog_data(stage_prog_data)->uses_vmask; @@ -6505,7 +6455,6 @@ elk_nir_populate_wm_prog_data(nir_shader *shader, prog_data->uses_omask = !key->ignore_sample_mask_out && (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)); prog_data->color_outputs_written = key->color_outputs_valid; - prog_data->max_polygons = 1; prog_data->computed_depth_mode = computed_depth_mode(shader); prog_data->computed_stencil = shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL); @@ -6681,7 +6630,7 @@ elk_compile_fs(const struct elk_compiler *compiler, bool has_spilled = false; v8 = std::make_unique<elk_fs_visitor>(compiler, &params->base, key, - prog_data, nir, 8, 1, + prog_data, nir, 8, params->base.stats != NULL, debug_enabled); if (!v8->run_fs(allow_spilling, false /* do_rep_send */)) { @@ -6716,7 +6665,7 @@ elk_compile_fs(const struct elk_compiler *compiler,
(INTEL_SIMD(FS, 16) || params->use_rep_send)) { /* Try a SIMD16 compile */ v16 = std::make_unique<elk_fs_visitor>(compiler, &params->base, key, - prog_data, nir, 16, 1, + prog_data, nir, 16, params->base.stats != NULL, debug_enabled); if (v8) @@ -6749,7 +6698,7 @@ elk_compile_fs(const struct elk_compiler *compiler, INTEL_SIMD(FS, 32)) { /* Try a SIMD32 compile */ v32 = std::make_unique<elk_fs_visitor>(compiler, &params->base, key, - prog_data, nir, 32, 1, + prog_data, nir, 32, params->base.stats != NULL, debug_enabled); if (v8) @@ -6830,7 +6779,7 @@ elk_compile_fs(const struct elk_compiler *compiler, if (simd8_cfg) { prog_data->dispatch_8 = true; g.generate_code(simd8_cfg, 8, v8->shader_stats, - v8->performance_analysis.require(), stats, 1); + v8->performance_analysis.require(), stats); stats = stats ? stats + 1 : NULL; max_dispatch_width = 8; } @@ -6839,7 +6788,7 @@ elk_compile_fs(const struct elk_compiler *compiler, prog_data->dispatch_16 = true; prog_data->prog_offset_16 = g.generate_code( simd16_cfg, 16, v16->shader_stats, - v16->performance_analysis.require(), stats, 1); + v16->performance_analysis.require(), stats); stats = stats ? stats + 1 : NULL; max_dispatch_width = 16; } @@ -6848,7 +6797,7 @@ elk_compile_fs(const struct elk_compiler *compiler, prog_data->dispatch_32 = true; prog_data->prog_offset_32 = g.generate_code( simd32_cfg, 32, v32->shader_stats, - v32->performance_analysis.require(), stats, 1); + v32->performance_analysis.require(), stats); stats = stats ? 
stats + 1 : NULL; max_dispatch_width = 32; } @@ -7146,7 +7095,6 @@ elk_fs_test_dispatch_packing(const fs_builder &bld) elk_wm_prog_data(shader->stage_prog_data)->uses_vmask; if (elk_stage_has_packed_dispatch(shader->devinfo, stage, - shader->max_polygons, shader->stage_prog_data)) { const fs_builder ubld = bld.exec_all().group(1, 0); const elk_fs_reg tmp = component(bld.vgrf(ELK_REGISTER_TYPE_UD), 0); diff --git a/src/intel/compiler/elk/elk_fs.h b/src/intel/compiler/elk/elk_fs.h index b138556dda2..95abb621b33 100644 --- a/src/intel/compiler/elk/elk_fs.h +++ b/src/intel/compiler/elk/elk_fs.h @@ -177,7 +177,6 @@ public: struct elk_wm_prog_data *prog_data, const nir_shader *shader, unsigned dispatch_width, - unsigned num_polygons, bool needs_register_pressure, bool debug_enabled); elk_fs_visitor(const struct elk_compiler *compiler, @@ -400,7 +399,6 @@ public: bool needs_register_pressure; const unsigned dispatch_width; /**< 8, 16 or 32 */ - const unsigned max_polygons; unsigned max_dispatch_width; /* The API selected subgroup size */ @@ -451,8 +449,7 @@ public: int generate_code(const elk_cfg_t *cfg, int dispatch_width, struct shader_stats shader_stats, const elk::performance &perf, - struct elk_compile_stats *stats, - unsigned max_polygons = 0); + struct elk_compile_stats *stats); void add_const_data(void *data, unsigned size); const unsigned *get_assembly(); diff --git a/src/intel/compiler/elk/elk_fs_copy_propagation.cpp b/src/intel/compiler/elk/elk_fs_copy_propagation.cpp index 14f0051c2d3..9d388d8dae9 100644 --- a/src/intel/compiler/elk/elk_fs_copy_propagation.cpp +++ b/src/intel/compiler/elk/elk_fs_copy_propagation.cpp @@ -660,8 +660,7 @@ instruction_requires_packed_data(elk_fs_inst *inst) static bool try_copy_propagate(const elk_compiler *compiler, elk_fs_inst *inst, acp_entry *entry, int arg, - const elk::simple_allocator &alloc, - uint8_t max_polygons) + const elk::simple_allocator &alloc) { if (inst->src[arg].file != VGRF) return false; @@ -799,17 +798,6 @@ 
try_copy_propagate(const elk_compiler *compiler, elk_fs_inst *inst, (reg_offset(inst->dst) % REG_SIZE) != (reg_offset(entry->src) % REG_SIZE)) return false; - /* The <8;8,0> regions used for FS attributes in multipolygon - * dispatch mode could violate regioning restrictions, don't copy - * propagate them in such cases. - */ - if (entry->src.file == ATTR && max_polygons > 1 && - (has_dst_aligned_region_restriction(devinfo, inst, dst_type) || - instruction_requires_packed_data(inst) || - (inst->elk_is_3src(compiler) && arg == 2) || - entry->dst.type != inst->src[arg].type)) - return false; - /* Bail if the source FIXED_GRF region of the copy cannot be trivially * composed with the source region of the instruction -- E.g. because the * copy uses some extended stride greater than 4 not supported natively by @@ -1245,8 +1233,7 @@ can_propagate_from(elk_fs_inst *inst) static bool opt_copy_propagation_local(const elk_compiler *compiler, linear_ctx *lin_ctx, elk_bblock_t *block, struct acp &acp, - const elk::simple_allocator &alloc, - uint8_t max_polygons) + const elk::simple_allocator &alloc) { bool progress = false; @@ -1266,8 +1253,7 @@ opt_copy_propagation_local(const elk_compiler *compiler, linear_ctx *lin_ctx, break; } } else { - if (try_copy_propagate(compiler, inst, *iter, i, alloc, - max_polygons)) { + if (try_copy_propagate(compiler, inst, *iter, i, alloc)) { instruction_progress = true; break; } @@ -1373,8 +1359,7 @@ elk_fs_visitor::opt_copy_propagation() */ foreach_block (block, cfg) { progress = opt_copy_propagation_local(compiler, lin_ctx, block, - out_acp[block->num], alloc, - max_polygons) || progress; + out_acp[block->num], alloc) || progress; /* If the destination of an ACP entry exists only within this block, * then there's no need to keep it for dataflow analysis. 
We can delete @@ -1414,7 +1399,7 @@ elk_fs_visitor::opt_copy_propagation() } progress = opt_copy_propagation_local(compiler, lin_ctx, block, - in_acp, alloc, max_polygons) || + in_acp, alloc) || progress; } diff --git a/src/intel/compiler/elk/elk_fs_generator.cpp b/src/intel/compiler/elk/elk_fs_generator.cpp index 49611794c6a..ad59fb21ccb 100644 --- a/src/intel/compiler/elk/elk_fs_generator.cpp +++ b/src/intel/compiler/elk/elk_fs_generator.cpp @@ -1517,8 +1517,7 @@ int elk_fs_generator::generate_code(const elk_cfg_t *cfg, int dispatch_width, struct shader_stats shader_stats, const elk::performance &perf, - struct elk_compile_stats *stats, - unsigned max_polygons) + struct elk_compile_stats *stats) { /* align to 64 byte boundary. */ elk_realign(p, 64); @@ -2273,7 +2272,6 @@ elk_fs_generator::generate_code(const elk_cfg_t *cfg, int dispatch_width, before_size, after_size); if (stats) { stats->dispatch_width = dispatch_width; - stats->max_polygons = max_polygons; stats->max_dispatch_width = dispatch_width; stats->instructions = before_size / 16 - nop_count - sync_nop_count; stats->sends = send_count; diff --git a/src/intel/compiler/elk/elk_fs_visitor.cpp b/src/intel/compiler/elk/elk_fs_visitor.cpp index 1d0e73a9231..c834f029011 100644 --- a/src/intel/compiler/elk/elk_fs_visitor.cpp +++ b/src/intel/compiler/elk/elk_fs_visitor.cpp @@ -64,19 +64,7 @@ elk_fs_visitor::interp_reg(const fs_builder &bld, unsigned location, const unsigned per_vertex_start = prog_data->num_per_primitive_inputs; const unsigned regnr = per_vertex_start + (nr * 4) + channel; - if (max_polygons > 1) { - /* In multipolygon dispatch each plane parameter is a - * dispatch_width-wide SIMD vector (see comment in - * assign_urb_setup()), so we need to use offset() instead of - * component() to select the specified parameter. 
- */ - const elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD); - bld.MOV(tmp, offset(elk_fs_reg(ATTR, regnr, ELK_REGISTER_TYPE_UD), - dispatch_width, comp)); - return retype(tmp, ELK_REGISTER_TYPE_F); - } else { - return component(elk_fs_reg(ATTR, regnr, ELK_REGISTER_TYPE_F), comp); - } + return component(elk_fs_reg(ATTR, regnr, ELK_REGISTER_TYPE_F), comp); } /* The register location here is relative to the start of the URB @@ -99,19 +87,7 @@ elk_fs_visitor::per_primitive_reg(const fs_builder &bld, int location, unsigned assert(regnr < prog_data->num_per_primitive_inputs); - if (max_polygons > 1) { - /* In multipolygon dispatch each primitive constant is a - * dispatch_width-wide SIMD vector (see comment in - * assign_urb_setup()), so we need to use offset() instead of - * component() to select the specified parameter. - */ - const elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD); - bld.MOV(tmp, offset(elk_fs_reg(ATTR, regnr, ELK_REGISTER_TYPE_UD), - dispatch_width, comp % 4)); - return retype(tmp, ELK_REGISTER_TYPE_F); - } else { - return component(elk_fs_reg(ATTR, regnr, ELK_REGISTER_TYPE_F), comp % 4); - } + return component(elk_fs_reg(ATTR, regnr, ELK_REGISTER_TYPE_F), comp % 4); } /** Emits the interpolation for the varying inputs. 
*/ @@ -878,7 +854,6 @@ elk_fs_visitor::elk_fs_visitor(const struct elk_compiler *compiler, performance_analysis(this), needs_register_pressure(needs_register_pressure), dispatch_width(dispatch_width), - max_polygons(0), api_subgroup_size(elk_nir_api_subgroup_size(shader, dispatch_width)) { init(); @@ -889,7 +864,7 @@ elk_fs_visitor::elk_fs_visitor(const struct elk_compiler *compiler, const elk_wm_prog_key *key, struct elk_wm_prog_data *prog_data, const nir_shader *shader, - unsigned dispatch_width, unsigned max_polygons, + unsigned dispatch_width, bool needs_register_pressure, bool debug_enabled) : elk_backend_shader(compiler, params, shader, &prog_data->base, @@ -899,7 +874,6 @@ elk_fs_visitor::elk_fs_visitor(const struct elk_compiler *compiler, performance_analysis(this), needs_register_pressure(needs_register_pressure), dispatch_width(dispatch_width), - max_polygons(max_polygons), api_subgroup_size(elk_nir_api_subgroup_size(shader, dispatch_width)) { init(); @@ -924,7 +898,6 @@ elk_fs_visitor::elk_fs_visitor(const struct elk_compiler *compiler, performance_analysis(this), needs_register_pressure(needs_register_pressure), dispatch_width(8), - max_polygons(0), api_subgroup_size(elk_nir_api_subgroup_size(shader, dispatch_width)) { init(); diff --git a/src/intel/compiler/elk/elk_vec4.cpp b/src/intel/compiler/elk/elk_vec4.cpp index d6a24d23ccb..1d5492f35f3 100644 --- a/src/intel/compiler/elk/elk_vec4.cpp +++ b/src/intel/compiler/elk/elk_vec4.cpp @@ -1208,7 +1208,7 @@ vec4_visitor::eliminate_find_live_channel() bool progress = false; unsigned depth = 0; - if (!elk_stage_has_packed_dispatch(devinfo, stage, 0, stage_prog_data)) { + if (!elk_stage_has_packed_dispatch(devinfo, stage, stage_prog_data)) { /* The optimization below assumes that channel zero is live on thread * dispatch, which may not be the case if the fixed function dispatches * threads sparsely.