intel/elk: Remove Gfx9+ from passes

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27629>
Caio Oliveira, 2024-02-12 19:10:18 -08:00 (committed by Marge Bot)
parent 2b6b786feb
commit 241a03b8ec
8 changed files with 53 additions and 542 deletions


@@ -591,21 +591,6 @@ namespace {
       return constrained;
    }
 
-   /**
-    * Return whether the hardware will be able to prevent a bank conflict by
-    * optimizing out the read cycle of a source register.  The formula was
-    * found experimentally.
-    */
-   bool
-   is_conflict_optimized_out(const intel_device_info *devinfo,
-                             const elk_fs_inst *inst)
-   {
-      return devinfo->ver >= 9 &&
-             ((is_grf(inst->src[0]) && (reg_of(inst->src[0]) == reg_of(inst->src[1]) ||
-                                        reg_of(inst->src[0]) == reg_of(inst->src[2]))) ||
-              reg_of(inst->src[1]) == reg_of(inst->src[2]));
-   }
-
    /**
    * Return a matrix that allows reasonably efficient computation of the
    * cycle-count cost of bank conflicts incurred throughout the whole program
@@ -662,10 +647,9 @@ namespace {
                              REG_SIZE);
          /* Neglect same-atom conflicts (since they're either trivial or
-          * impossible to avoid without splitting the atom), and conflicts
-          * known to be optimized out by the hardware.
+          * impossible to avoid without splitting the atom).
           */
-         if (r != s && !is_conflict_optimized_out(v->devinfo, inst)) {
+         if (r != s) {
            /* Calculate the parity of the sources relative to the start of
             * their respective atoms.  If their parity is the same (and
             * none of the atoms straddle the 2KB mark), the instruction
@@ -911,10 +895,6 @@ elk_fs_visitor::opt_bank_conflicts()
 {
    assert(grf_used || !"Must be called after register allocation");
 
-   /* TODO: Re-work this pass for Gfx20+. */
-   if (devinfo->ver >= 20)
-      return false;
-
    /* No ternary instructions -- No bank conflicts. */
    if (devinfo->ver < 6)
       return false;
@@ -950,6 +930,5 @@ elk_has_bank_conflict(const struct elk_isa_info *isa, const elk_fs_inst *inst)
 {
    return elk_is_3src(isa, inst->opcode) &&
           is_grf(inst->src[1]) && is_grf(inst->src[2]) &&
-          bank_of(reg_of(inst->src[1])) == bank_of(reg_of(inst->src[2])) &&
-          !is_conflict_optimized_out(isa->devinfo, inst);
+          bank_of(reg_of(inst->src[1])) == bank_of(reg_of(inst->src[2]));
 }
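With the removed hardware workaround gone, the conflict test reduces to comparing the bank parity of the two later sources. A standalone sketch of that parity check follows; it is illustrative only, the real reg_of()/bank_of() helpers live in this pass, and the paired interleaving of GRF banks is an assumption based on the model the pass describes.

/* Illustrative sketch, not the pass itself: the GRF is assumed to be
 * interleaved between two banks in pairs of registers, so a register's
 * bank is bit 1 of its number.  A 3-source instruction conflicts when
 * its second and third sources hit the same bank in the same cycle.
 */
#include <stdio.h>

static unsigned
bank_of(unsigned reg)
{
   return (reg & 2) >> 1;   /* banks assumed to alternate every two registers */
}

int
main(void)
{
   /* r16 and r18 land in different banks; r16 and r20 share a bank. */
   printf("r16 vs r18: %s\n", bank_of(16) == bank_of(18) ? "conflict" : "ok");
   printf("r16 vs r20: %s\n", bank_of(16) == bank_of(20) ? "conflict" : "ok");
   return 0;
}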


@@ -993,120 +993,6 @@ get_alignment_for_imm(const struct imm *imm)
       return imm->size;
 }
 
-static bool
-representable_as_hf(float f, uint16_t *hf)
-{
-   union fi u;
-   uint16_t h = _mesa_float_to_half(f);
-   u.f = _mesa_half_to_float(h);
-
-   if (u.f == f) {
-      *hf = h;
-      return true;
-   }
-
-   return false;
-}
-
-static bool
-representable_as_w(int d, int16_t *w)
-{
-   int res = ((d & 0xffff8000) + 0x8000) & 0xffff7fff;
-   if (!res) {
-      *w = d;
-      return true;
-   }
-
-   return false;
-}
-
-static bool
-representable_as_uw(unsigned ud, uint16_t *uw)
-{
-   if (!(ud & 0xffff0000)) {
-      *uw = ud;
-      return true;
-   }
-
-   return false;
-}
-
-static bool
-supports_src_as_imm(const struct intel_device_info *devinfo, const elk_fs_inst *inst)
-{
-   if (devinfo->ver < 12)
-      return false;
-
-   switch (inst->opcode) {
-   case ELK_OPCODE_MAD:
-      /* Integer types can always mix sizes.  Floating point types can mix
-       * sizes on Gfx12.  On Gfx12.5, floating point sources must all be HF or
-       * all be F.
-       */
-      return devinfo->verx10 < 125 || inst->src[0].type != ELK_REGISTER_TYPE_F;
-   default:
-      return false;
-   }
-}
-
-static bool
-can_promote_src_as_imm(const struct intel_device_info *devinfo, elk_fs_inst *inst,
-                       unsigned src_idx)
-{
-   bool can_promote = false;
-
-   /* Experiment shows that we can only support src0 as immediate for MAD on
-    * Gfx12.  ADD3 can use src0 or src2 in Gfx12.5, but constant propagation
-    * only propagates into src0.  It's possible that src2 works for W or UW MAD
-    * on Gfx12.5.
-    */
-   if (src_idx != 0)
-      return false;
-
-   if (!supports_src_as_imm(devinfo, inst))
-      return false;
-
-   /* TODO - Fix the codepath below to use a bfloat16 immediate on XeHP,
-    * since HF/F mixed mode has been removed from the hardware.
-    */
-   switch (inst->src[src_idx].type) {
-   case ELK_REGISTER_TYPE_F: {
-      uint16_t hf;
-      if (representable_as_hf(inst->src[src_idx].f, &hf)) {
-         inst->src[src_idx] = retype(elk_imm_uw(hf), ELK_REGISTER_TYPE_HF);
-         can_promote = true;
-      }
-      break;
-   }
-   case ELK_REGISTER_TYPE_D: {
-      int16_t w;
-      if (representable_as_w(inst->src[src_idx].d, &w)) {
-         inst->src[src_idx] = elk_imm_w(w);
-         can_promote = true;
-      }
-      break;
-   }
-   case ELK_REGISTER_TYPE_UD: {
-      uint16_t uw;
-      if (representable_as_uw(inst->src[src_idx].ud, &uw)) {
-         inst->src[src_idx] = elk_imm_uw(uw);
-         can_promote = true;
-      }
-      break;
-   }
-   case ELK_REGISTER_TYPE_W:
-   case ELK_REGISTER_TYPE_UW:
-   case ELK_REGISTER_TYPE_HF:
-      can_promote = true;
-      break;
-   default:
-      break;
-   }
-
-   return can_promote;
-}
-
 static void
 add_candidate_immediate(struct table *table, elk_fs_inst *inst, unsigned ip,
                         unsigned i,
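The removed representable_as_w() packs its range check into one expression: a 32-bit value fits a signed word iff bits 31..15 are all copies of the sign bit, and adding 0x8000 to those masked high bits collapses both the all-zeros and the all-ones patterns to zero (modulo 2^32) once bit 15 is masked back out. A self-contained sketch of the same trick, with the mesa types trimmed and hypothetical test values:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Same bit trick as the removed helper: non-zero result means some bit
 * in 31..15 differs from the sign bit, i.e. the value needs > 16 bits. */
static int
representable_as_w(int32_t d, int16_t *w)
{
   uint32_t res = (((uint32_t)d & 0xffff8000u) + 0x8000u) & 0xffff7fffu;
   if (!res) {
      *w = (int16_t)d;
      return 1;
   }
   return 0;
}

int
main(void)
{
   int16_t w;
   assert(representable_as_w(32767, &w) && w == 32767);
   assert(representable_as_w(-32768, &w) && w == -32768);
   assert(!representable_as_w(40000, &w));    /* needs 17 bits */
   assert(!representable_as_w(-32769, &w));   /* just below INT16_MIN */
   puts("ok");
   return 0;
}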
@@ -1358,9 +1244,6 @@ elk_fs_visitor::opt_combine_constants()
          if (inst->src[i].file != IMM)
             continue;
 
-         if (can_promote_src_as_imm(devinfo, inst, i))
-            continue;
-
          add_candidate_immediate(&table, inst, ip, i, true, false, block,
                                  devinfo, const_ctx);
       }


@@ -64,7 +64,7 @@ elk_fs_visitor::lower_pack()
             const uint32_t half = _mesa_float_to_half(inst->src[i].f);
             ibld.MOV(subscript(dst, ELK_REGISTER_TYPE_UW, i),
                      elk_imm_uw(half));
-         } else if (i == 1 && devinfo->ver < 9) {
+         } else if (i == 1) {
            /* Pre-Skylake requires DWord aligned destinations */
            elk_fs_reg tmp = ibld.vgrf(ELK_REGISTER_TYPE_UD);
            ibld.F32TO16(subscript(tmp, ELK_REGISTER_TYPE_HF, 0),


@@ -145,9 +145,8 @@ namespace {
        * Work around both of the above and handle platforms that
        * don't support 64-bit types at all.
        */
-      if ((!devinfo->has_64bit_int ||
-           devinfo->platform == INTEL_PLATFORM_CHV ||
-           intel_device_info_is_9lp(devinfo)) && type_sz(t) > 4)
+      if ((!devinfo->has_64bit_int || devinfo->platform == INTEL_PLATFORM_CHV) &&
+          type_sz(t) > 4)
          return ELK_REGISTER_TYPE_UD;
       else if (has_dst_aligned_region_restriction(devinfo, inst))
          return elk_int_type(type_sz(t), false);
@@ -174,30 +173,19 @@ namespace {
           * integer DWord multiply, indirect addressing must not be
           * used."
           *
-          * For MTL (verx10 == 125), float64 is supported, but int64 is not.
-          * Therefore we need to lower cluster broadcast using 32-bit int ops.
-          *
-          * For gfx12.5+ platforms that support int64, the register regions
-          * used by cluster broadcast aren't supported by the 64-bit pipeline.
-          *
           * Work around the above and handle platforms that don't
           * support 64-bit types at all.
           */
-         if ((!has_64bit || devinfo->verx10 >= 125 ||
-              devinfo->platform == INTEL_PLATFORM_CHV ||
-              intel_device_info_is_9lp(devinfo)) && type_sz(t) > 4)
+         if ((!has_64bit || devinfo->platform == INTEL_PLATFORM_CHV) &&
+             type_sz(t) > 4)
            return ELK_REGISTER_TYPE_UD;
         else
            return elk_int_type(type_sz(t), false);
 
      case ELK_SHADER_OPCODE_BROADCAST:
      case ELK_SHADER_OPCODE_MOV_INDIRECT:
-         if (((devinfo->verx10 == 70 ||
-               devinfo->platform == INTEL_PLATFORM_CHV ||
-               intel_device_info_is_9lp(devinfo) ||
-               devinfo->verx10 >= 125) && type_sz(inst->src[0].type) > 4) ||
-             (devinfo->verx10 >= 125 &&
-              elk_reg_type_is_floating_point(inst->src[0].type)))
+         if ((devinfo->verx10 == 70 || devinfo->platform == INTEL_PLATFORM_CHV) &&
+             type_sz(inst->src[0].type) > 4)
            return elk_int_type(type_sz(t), false);
         else
            return t;


@@ -334,7 +334,6 @@ public:
       node_count = 0;
       first_payload_node = 0;
       first_mrf_hack_node = 0;
-      scratch_header_node = 0;
       grf127_send_hack_node = 0;
       first_vgrf_node = 0;
       last_vgrf_node = 0;
@@ -372,7 +371,6 @@ private:
    void set_spill_costs();
    int choose_spill_reg();
-   elk_fs_reg alloc_scratch_header();
    elk_fs_reg alloc_spill_reg(unsigned size, int ip);
    void spill_reg(unsigned spill_reg);
@@ -397,7 +395,6 @@ private:
    int node_count;
    int first_payload_node;
    int first_mrf_hack_node;
-   int scratch_header_node;
    int grf127_send_hack_node;
    int first_vgrf_node;
    int last_vgrf_node;
@@ -466,8 +463,7 @@ namespace {
    spill_max_size(const elk_backend_shader *s)
    {
       /* LSC is limited to SIMD16 sends */
-      if (s->devinfo->has_lsc)
-         return 2;
+      assert(!s->devinfo->has_lsc);
 
       /* FINISHME - On Gfx7+ it should be possible to avoid this limit
        *            altogether by spilling directly from the temporary GRF
@@ -487,8 +483,6 @@ namespace {
    unsigned
    spill_base_mrf(const elk_backend_shader *s)
    {
-      /* We don't use the MRF hack on Gfx9+ */
-      assert(s->devinfo->ver < 9);
-
       return ELK_MAX_MRF(s->devinfo->ver) - spill_max_size(s) - 1;
    }
 }
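For a concrete feel of spill_base_mrf(), a worked example. The MRF count is an assumption here: on the i965-era hardware this pass still targets, ELK_MAX_MRF() is taken to be 16 for Gfx7, so with a two-register SIMD16 spill payload the base lands at m13 and the spill_reg() loop below checks that m13..m15 are free.

/* Sketch only: assumes ELK_MAX_MRF(7) == 16 and spill_max_size() == 2. */
static unsigned
spill_base_mrf_example(void)
{
   const unsigned max_mrf = 16;       /* assumed ELK_MAX_MRF(7) */
   const unsigned spill_size = 2;     /* SIMD16 scratch payload, in MRFs */
   return max_mrf - spill_size - 1;   /* == 13: spills build in m13..m15 */
}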
@@ -520,10 +514,6 @@ elk_fs_reg_alloc::setup_live_interference(unsigned node,
          ra_add_node_interference(g, node, first_mrf_hack_node + i);
    }
 
-   /* Everything interferes with the scratch header */
-   if (scratch_header_node >= 0)
-      ra_add_node_interference(g, node, scratch_header_node);
-
    /* Add interference with every vgrf whose live range intersects this
    * node's.  We only need to look at nodes below this one as the reflexivity
    * of interference will take care of the rest.
@@ -643,7 +633,7 @@ elk_fs_reg_alloc::build_interference_graph(bool allow_spilling)
    node_count = 0;
    first_payload_node = node_count;
    node_count += payload_node_count;
-   if (devinfo->ver >= 7 && devinfo->ver < 9 && allow_spilling) {
+   if (devinfo->ver >= 7 && allow_spilling) {
       first_mrf_hack_node = node_count;
       node_count += ELK_MAX_GRF - GFX7_MRF_HACK_START;
    } else {
@@ -658,11 +648,6 @@ elk_fs_reg_alloc::build_interference_graph(bool allow_spilling)
    first_vgrf_node = node_count;
    node_count += fs->alloc.count;
    last_vgrf_node = node_count - 1;
-   if ((devinfo->ver >= 9 && devinfo->verx10 < 125) && allow_spilling) {
-      scratch_header_node = node_count++;
-   } else {
-      scratch_header_node = -1;
-   }
    first_spill_node = node_count;
 
    fs->calculate_payload_ranges(payload_node_count,
@@ -802,30 +787,7 @@ elk_fs_reg_alloc::emit_unspill(const fs_builder &bld,
    ++stats->fill_count;
 
    elk_fs_inst *unspill_inst;
-   if (devinfo->ver >= 9) {
-      elk_fs_reg header = this->scratch_header;
-      fs_builder ubld = bld.exec_all().group(1, 0);
-      assert(spill_offset % 16 == 0);
-      unspill_inst = ubld.MOV(component(header, 2),
-                              elk_imm_ud(spill_offset / 16));
-      _mesa_set_add(spill_insts, unspill_inst);
-
-      const unsigned bti = GFX8_BTI_STATELESS_NON_COHERENT;
-      elk_fs_reg srcs[] = { elk_imm_ud(0), header };
-      unspill_inst = bld.emit(ELK_SHADER_OPCODE_SEND, dst,
-                              srcs, ARRAY_SIZE(srcs));
-      unspill_inst->mlen = 1;
-      unspill_inst->header_size = 1;
-      unspill_inst->size_written = reg_size * REG_SIZE;
-      unspill_inst->send_has_side_effects = false;
-      unspill_inst->send_is_volatile = true;
-      unspill_inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
-      unspill_inst->desc =
-         elk_dp_desc(devinfo, bti,
-                     ELK_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
-                     ELK_DATAPORT_OWORD_BLOCK_DWORDS(reg_size * 8));
-   } else if (devinfo->ver >= 7 && spill_offset < (1 << 12) * REG_SIZE) {
+   if (devinfo->ver >= 7 && spill_offset < (1 << 12) * REG_SIZE) {
       /* The Gfx7 descriptor-based offset is 12 bits of HWORD units.
        * Because the Gfx7-style scratch block read is hardwired to BTI 255,
       * on Gfx9+ it would cause the DC to do an IA-coherent read, what
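The deleted Gfx9+ path above encodes its scratch access in two unit systems: the message header takes the offset in owords (16 bytes), while the oword-block-read descriptor takes a size in dwords, 8 per 32-byte GRF. A small sketch of that arithmetic, as an illustration rather than mesa code:

#include <assert.h>

#define REG_SIZE 32   /* bytes per GRF */

/* What went into header[2] and the descriptor size field above. */
static void
oword_block_read_params(unsigned spill_offset, unsigned reg_size,
                        unsigned *oword_offset, unsigned *dwords)
{
   assert(spill_offset % 16 == 0);      /* header wants oword alignment */
   *oword_offset = spill_offset / 16;   /* offset in 16-byte owords */
   *dwords = reg_size * REG_SIZE / 4;   /* i.e. reg_size * 8 dwords */
}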
@@ -974,19 +936,6 @@ elk_fs_reg_alloc::choose_spill_reg()
    return node - first_vgrf_node;
 }
 
-elk_fs_reg
-elk_fs_reg_alloc::alloc_scratch_header()
-{
-   int vgrf = fs->alloc.allocate(1);
-   assert(first_vgrf_node + vgrf == scratch_header_node);
-   ra_set_node_class(g, scratch_header_node,
-                     compiler->fs_reg_sets[rsi].classes[0]);
-   setup_live_interference(scratch_header_node, 0, INT_MAX);
-
-   return elk_fs_reg(VGRF, vgrf, ELK_REGISTER_TYPE_UD);
-}
-
 elk_fs_reg
 elk_fs_reg_alloc::alloc_spill_reg(unsigned size, int ip)
 {
@@ -1035,25 +984,13 @@ elk_fs_reg_alloc::spill_reg(unsigned spill_reg)
     * SIMD16 mode, because we'd stomp the FB writes.
     */
    if (!fs->spilled_any_registers) {
-      if (devinfo->verx10 >= 125) {
-         /* We will allocate a register on the fly */
-      } else if (devinfo->ver >= 9) {
-         this->scratch_header = alloc_scratch_header();
-         fs_builder ubld = fs_builder(fs, 8).exec_all().at(
-            fs->cfg->first_block(), fs->cfg->first_block()->start());
-         elk_fs_inst *inst = ubld.emit(ELK_SHADER_OPCODE_SCRATCH_HEADER,
-                                       this->scratch_header);
-         _mesa_set_add(spill_insts, inst);
-      } else {
-         bool mrf_used[ELK_MAX_MRF(devinfo->ver)];
-         get_used_mrfs(fs, mrf_used);
+      bool mrf_used[ELK_MAX_MRF(devinfo->ver)];
+      get_used_mrfs(fs, mrf_used);
 
-         for (int i = spill_base_mrf(fs); i < ELK_MAX_MRF(devinfo->ver); i++) {
-            if (mrf_used[i]) {
-               fs->fail("Register spilling not supported with m%d used", i);
-               return;
-            }
+      for (int i = spill_base_mrf(fs); i < ELK_MAX_MRF(devinfo->ver); i++) {
+         if (mrf_used[i]) {
+            fs->fail("Register spilling not supported with m%d used", i);
+            return;
          }
       }


@@ -349,10 +349,7 @@ namespace {
    case ELK_TCS_OPCODE_GET_PRIMITIVE_ID:
    case ELK_TES_OPCODE_GET_PRIMITIVE_ID:
    case ELK_SHADER_OPCODE_READ_SR_REG:
-      if (devinfo->ver >= 11) {
-         return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
-                               0, 10, 6 /* XXX */, 14, 0, 0);
-      } else if (devinfo->ver >= 8) {
+      if (devinfo->ver >= 8) {
          if (type_sz(info.tx) > 4)
            return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
                                  0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
@@ -373,10 +370,7 @@ namespace {
    case ELK_OPCODE_MUL:
    case ELK_SHADER_OPCODE_MOV_RELOC_IMM:
    case ELK_VEC4_OPCODE_MOV_FOR_SCRATCH:
-      if (devinfo->ver >= 11) {
-         return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
-                               0, 10, 6, 14, 0, 0);
-      } else if (devinfo->ver >= 8) {
+      if (devinfo->ver >= 8) {
         if (type_sz(info.tx) > 4)
            return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
                                  0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
@@ -407,10 +401,7 @@ namespace {
    case ELK_OPCODE_BFE:
    case ELK_OPCODE_BFI2:
    case ELK_OPCODE_CSEL:
-      if (devinfo->ver >= 11)
-         return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
-                               0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
-      else if (devinfo->ver >= 8)
+      if (devinfo->ver >= 8)
         return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
                               0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
      else if (devinfo->verx10 >= 75)
@@ -423,10 +414,7 @@ namespace {
       abort();
 
    case ELK_OPCODE_MAD:
-      if (devinfo->ver >= 11) {
-         return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
-                               0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
-      } else if (devinfo->ver >= 8) {
+      if (devinfo->ver >= 8) {
         if (type_sz(info.tx) > 4)
            return calculate_desc(info, EU_UNIT_FPU, 0, 4, 1, 0, 4,
                                  0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
@@ -457,10 +445,7 @@ namespace {
      }
 
   case ELK_OPCODE_F32TO16:
-      if (devinfo->ver >= 11)
-         return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
-                               0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
-      else if (devinfo->ver >= 8)
+      if (devinfo->ver >= 8)
         return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
                               0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
      else if (devinfo->verx10 >= 75)
@@ -619,11 +604,7 @@ namespace {
      abort();
 
   case ELK_FS_OPCODE_PACK_HALF_2x16_SPLIT:
-      if (devinfo->ver >= 11)
-         return calculate_desc(info, EU_UNIT_FPU, 20, 6, 0, 0, 6,
-                               0, 10 /* XXX */, 6 /* XXX */,
-                               14 /* XXX */, 0, 0);
-      else if (devinfo->ver >= 8)
+      if (devinfo->ver >= 8)
         return calculate_desc(info, EU_UNIT_FPU, 16, 6, 0, 0, 6,
                               0, 8 /* XXX */, 4 /* XXX */,
                               12 /* XXX */, 0, 0);
@@ -639,11 +620,7 @@ namespace {
      abort();
 
   case ELK_SHADER_OPCODE_MOV_INDIRECT:
-      if (devinfo->ver >= 11)
-         return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
-                               0, 10 /* XXX */, 6 /* XXX */,
-                               14 /* XXX */, 0, 0);
-      else if (devinfo->ver >= 8)
+      if (devinfo->ver >= 8)
         return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
                               0, 8 /* XXX */, 4 /* XXX */,
                               12 /* XXX */, 0, 0);
@@ -657,10 +634,7 @@ namespace {
                              18 /* XXX */, 0, 0);
 
   case ELK_SHADER_OPCODE_BROADCAST:
-      if (devinfo->ver >= 11)
-         return calculate_desc(info, EU_UNIT_FPU, 20 /* XXX */, 0, 0, 4, 0,
-                               0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
-      else if (devinfo->ver >= 8)
+      if (devinfo->ver >= 8)
         return calculate_desc(info, EU_UNIT_FPU, 18, 0, 0, 4, 0,
                               0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
      else if (devinfo->verx10 >= 75)
@@ -674,10 +648,7 @@ namespace {
   case ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL:
   case ELK_SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL:
-      if (devinfo->ver >= 11)
-         return calculate_desc(info, EU_UNIT_FPU, 2, 0, 0, 2, 0,
-                               0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
-      else if (devinfo->ver >= 8)
+      if (devinfo->ver >= 8)
         return calculate_desc(info, EU_UNIT_FPU, 2, 0, 0, 2, 0,
                               0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
      else if (devinfo->verx10 >= 75)
@@ -691,11 +662,7 @@ namespace {
   case ELK_SHADER_OPCODE_RND_MODE:
   case ELK_SHADER_OPCODE_FLOAT_CONTROL_MODE:
-      if (devinfo->ver >= 11)
-         return calculate_desc(info, EU_UNIT_FPU, 24 /* XXX */, 0, 0,
-                               4 /* XXX */, 0,
-                               0, 0, 0, 0, 0, 0);
-      else if (devinfo->ver >= 8)
+      if (devinfo->ver >= 8)
         return calculate_desc(info, EU_UNIT_FPU, 20 /* XXX */, 0, 0,
                               4 /* XXX */, 0,
                               0, 0, 0, 0, 0, 0);
@@ -711,12 +678,7 @@ namespace {
      abort();
 
   case ELK_SHADER_OPCODE_SHUFFLE:
-      if (devinfo->ver >= 11)
-         return calculate_desc(info, EU_UNIT_FPU, 44 /* XXX */, 0, 0,
-                               44 /* XXX */, 0,
-                               0, 10 /* XXX */, 6 /* XXX */,
-                               14 /* XXX */, 0, 0);
-      else if (devinfo->ver >= 8)
+      if (devinfo->ver >= 8)
         return calculate_desc(info, EU_UNIT_FPU, 42 /* XXX */, 0, 0,
                               42 /* XXX */, 0,
                               0, 8 /* XXX */, 4 /* XXX */,
@@ -735,12 +697,7 @@ namespace {
      abort();
 
   case ELK_SHADER_OPCODE_SEL_EXEC:
-      if (devinfo->ver >= 11)
-         return calculate_desc(info, EU_UNIT_FPU, 10 /* XXX */, 4 /* XXX */, 0,
-                               0, 4 /* XXX */,
-                               0, 10 /* XXX */, 6 /* XXX */,
-                               14 /* XXX */, 0, 0);
-      else if (devinfo->ver >= 8)
+      if (devinfo->ver >= 8)
         return calculate_desc(info, EU_UNIT_FPU, 8 /* XXX */, 4 /* XXX */, 0,
                               0, 4 /* XXX */,
                               0, 8 /* XXX */, 4 /* XXX */,
@@ -757,12 +714,7 @@ namespace {
                              18 /* XXX */, 0, 0);
 
   case ELK_SHADER_OPCODE_QUAD_SWIZZLE:
-      if (devinfo->ver >= 11)
-         return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
-                               0, 8 /* XXX */,
-                               0, 10 /* XXX */, 6 /* XXX */,
-                               14 /* XXX */, 0, 0);
-      else if (devinfo->ver >= 8)
+      if (devinfo->ver >= 8)
         return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
                               0, 8 /* XXX */,
                               0, 8 /* XXX */, 4 /* XXX */,
@@ -779,10 +731,7 @@ namespace {
                              18 /* XXX */, 0, 0);
 
   case ELK_FS_OPCODE_DDY_FINE:
-      if (devinfo->ver >= 11)
-         return calculate_desc(info, EU_UNIT_FPU, 0, 14, 0, 0, 4,
-                               0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
-      else if (devinfo->ver >= 8)
+      if (devinfo->ver >= 8)
         return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
                               0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
      else if (devinfo->verx10 >= 75)
@@ -793,11 +742,7 @@ namespace {
                              0, 14, 10 /* XXX */, 20 /* XXX */, 0, 0);
 
   case ELK_FS_OPCODE_LOAD_LIVE_CHANNELS:
-      if (devinfo->ver >= 11)
-         return calculate_desc(info, EU_UNIT_FPU, 2 /* XXX */, 0, 0,
-                               2 /* XXX */, 0,
-                               0, 0, 0, 10 /* XXX */, 0, 0);
-      else if (devinfo->ver >= 8)
+      if (devinfo->ver >= 8)
         return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 0,
                               0, 2 /* XXX */,
                               0, 0, 0, 8 /* XXX */, 0, 0);
@@ -1493,16 +1438,8 @@ namespace {
       *
       * In the meantime use values that roughly match the control flow
       * weights used elsewhere in the compiler back-end.
-       *
-       * Note that we provide slightly more pessimistic weights on
-       * Gfx12+ for SIMD32, since the effective warp size on that
-       * platform is 2x the SIMD width due to EU fusion, which increases
-       * the likelihood of divergent control flow in comparison to
-       * previous generations, giving narrower SIMD modes a performance
-       * advantage in several test-cases with non-uniform discard jumps.
       */
-      const float discard_weight = (dispatch_width > 16 || s->devinfo->ver < 12 ?
-                                    1.0 : 0.5);
+      const float discard_weight = 1.0;
      const float loop_weight = 10;
      unsigned halt_count = 0;
      unsigned elapsed = 0;
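As a rough illustration of how such weights can enter a cycle estimate (a sketch of the idea only, not this function's exact bookkeeping): cycles in a loop body are scaled up by loop_weight per nesting level, and with discard_weight back at 1.0 the cost of instructions that a discard jump may skip is counted in full.

/* Sketch: weight a block's cycle cost by control-flow context.  The real
 * pass tracks this while walking the CFG; the names here are hypothetical.
 */
static float
weighted_cycles(float cycles, unsigned loop_depth, int after_discard_jump,
                float loop_weight, float discard_weight)
{
   float w = after_discard_jump ? discard_weight : 1.0f;
   for (unsigned i = 0; i < loop_depth; i++)
      w *= loop_weight;   /* assume each loop body runs ~loop_weight times */
   return cycles * w;
}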


@@ -192,12 +192,9 @@ lower_fb_write_logical_send(const fs_builder &bld, elk_fs_inst *inst,
      assert(length == 0);
      length = 2;
-   } else if ((devinfo->verx10 <= 70 &&
-               prog_data->uses_kill) ||
-              (devinfo->ver < 11 &&
-               (color1.file != BAD_FILE || key->nr_color_regions > 1))) {
-      assert(devinfo->ver < 20);
-
+   } else if ((devinfo->verx10 <= 70 && prog_data->uses_kill) ||
+              color1.file != BAD_FILE ||
+              key->nr_color_regions > 1) {
      /* From the Sandy Bridge PRM, volume 4, page 198:
       *
       *     "Dispatched Pixel Enables. One bit per pixel indicating
@@ -220,9 +217,6 @@ lower_fb_write_logical_send(const fs_builder &bld, elk_fs_inst *inst,
         retype(elk_vec8_grf(2, 0), ELK_REGISTER_TYPE_UD),
      };
      ubld.LOAD_PAYLOAD(header, header_sources, 2, 0);
-
-      /* Gfx12 will require additional fix-ups if we ever hit this path. */
-      assert(devinfo->ver < 12);
   }
 
   uint32_t g00_bits = 0;
@@ -723,7 +717,6 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, elk_fs_inst *inst, elk_op
                                unsigned grad_components,
                                bool residency)
 {
-   const elk_compiler *compiler = bld.shader->compiler;
   const intel_device_info *devinfo = bld.shader->devinfo;
   const enum elk_reg_type payload_type =
      elk_reg_type_from_bit_size(payload_type_bit_size, ELK_REGISTER_TYPE_F);
@@ -802,26 +795,11 @@
       * address space but means we can do something more efficient in the
       * shader.
       */
-      if (compiler->use_bindless_sampler_offset) {
-         assert(devinfo->ver >= 11);
-         ubld1.OR(component(header, 3), sampler_handle, elk_imm_ud(1));
-      } else {
-         ubld1.MOV(component(header, 3), sampler_handle);
-      }
+      ubld1.MOV(component(header, 3), sampler_handle);
   } else if (is_high_sampler(devinfo, sampler)) {
      elk_fs_reg sampler_state_ptr =
         retype(elk_vec1_grf(0, 3), ELK_REGISTER_TYPE_UD);
-
-      /* Gfx11+ sampler message headers include bits in 4:0 which conflict
-       * with the ones included in g0.3 bits 4:0.  Mask them out.
-       */
-      if (devinfo->ver >= 11) {
-         sampler_state_ptr = ubld1.vgrf(ELK_REGISTER_TYPE_UD);
-         ubld1.AND(sampler_state_ptr,
-                   retype(elk_vec1_grf(0, 3), ELK_REGISTER_TYPE_UD),
-                   elk_imm_ud(INTEL_MASK(31, 5)));
-      }
 
      if (sampler.file == ELK_IMMEDIATE_VALUE) {
         assert(sampler.ud >= 16);
         const int sampler_state_size = 16; /* 16 bytes */
@@ -834,26 +812,9 @@
         ubld1.SHL(tmp, tmp, elk_imm_ud(4));
         ubld1.ADD(component(header, 3), sampler_state_ptr, tmp);
      }
-   } else if (devinfo->ver >= 11) {
-      /* Gfx11+ sampler message headers include bits in 4:0 which conflict
-       * with the ones included in g0.3 bits 4:0.  Mask them out.
-       */
-      ubld1.AND(component(header, 3),
-                retype(elk_vec1_grf(0, 3), ELK_REGISTER_TYPE_UD),
-                elk_imm_ud(INTEL_MASK(31, 5)));
   }
 
-   /* Change the opcode to account for LOD being zero before the
-    * switch-statement that emits sources based on the opcode.
-    */
-   if (devinfo->ver >= 9 && lod.is_zero()) {
-      if (op == ELK_SHADER_OPCODE_TXL)
-         op = ELK_SHADER_OPCODE_TXL_LZ;
-      else if (op == ELK_SHADER_OPCODE_TXF)
-         op = ELK_SHADER_OPCODE_TXF_LZ;
-   }
-
   /* On Xe2 and newer platforms, min_lod is the first parameter specifically
    * so that a bunch of other, possibly unused, parameters don't need to also
    * be included.
@@ -910,27 +871,15 @@
      break;
   case ELK_SHADER_OPCODE_TXF:
   case ELK_SHADER_OPCODE_TXF_LZ:
-      /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
-       * On Gfx9 they are u, v, lod, r
-       */
+      /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r. */
      bld.MOV(retype(sources[length++], payload_signed_type), coordinate);
-      if (devinfo->ver >= 9) {
-         if (coord_components >= 2) {
-            bld.MOV(retype(sources[length], payload_signed_type),
-                    offset(coordinate, bld, 1));
-         } else {
-            sources[length] = elk_imm_d(0);
-         }
-         length++;
-      }
 
      if (op != ELK_SHADER_OPCODE_TXF_LZ) {
         bld.MOV(retype(sources[length], payload_signed_type), lod);
         length++;
      }
 
-      for (unsigned i = devinfo->ver >= 9 ? 2 : 1; i < coord_components; i++)
+      for (unsigned i = 1; i < coord_components; i++)
        bld.MOV(retype(sources[length++], payload_signed_type),
                offset(coordinate, bld, i));
@@ -964,24 +913,9 @@
          * only valid data is in first two register. So with 16-bit
          * payload, we need to split 2-32bit register into 4-16-bit
          * payload.
-          *
-          * From the Gfx12HP BSpec: Render Engine - 3D and GPGPU Programs -
-          * Shared Functions - 3D Sampler - Messages - Message Format:
-          *
-          *   ld2dms_w   si  mcs0 mcs1 mcs2  mcs3  u  v  r
          */
-         if (devinfo->verx10 >= 125 && op == ELK_SHADER_OPCODE_TXF_CMS_W) {
-            elk_fs_reg tmp = offset(mcs, bld, i);
-            bld.MOV(retype(sources[length++], payload_unsigned_type),
-                    mcs.file == IMM ? mcs :
-                    subscript(tmp, payload_unsigned_type, 0));
-            bld.MOV(retype(sources[length++], payload_unsigned_type),
-                    mcs.file == IMM ? mcs :
-                    subscript(tmp, payload_unsigned_type, 1));
-         } else {
-            bld.MOV(retype(sources[length++], payload_unsigned_type),
-                    mcs.file == IMM ? mcs : offset(mcs, bld, i));
-         }
+         bld.MOV(retype(sources[length++], payload_unsigned_type),
+                 mcs.file == IMM ? mcs : offset(mcs, bld, i));
      }
   }
@@ -1021,29 +955,11 @@
   if (min_lod.file != BAD_FILE) {
      /* Account for all of the missing coordinate sources */
-      if (op == ELK_SHADER_OPCODE_TXD && devinfo->verx10 >= 125) {
-         /* On DG2 and newer platforms, sample_d can only be used with 1D and
-          * 2D surfaces, so the maximum number of gradient components is 2.
-          * In spite of this limitation, the Bspec lists a mysterious R
-          * component before the min_lod, so the maximum coordinate components
-          * is 3.
-          *
-          * See bspec 45942, "Enable new message layout for cube array"
-          */
-         length += 3 - coord_components;
-         length += (2 - grad_components) * 2;
-      } else {
-         length += 4 - coord_components;
-         if (op == ELK_SHADER_OPCODE_TXD)
-            length += (3 - grad_components) * 2;
-      }
+      length += 4 - coord_components;
+      if (op == ELK_SHADER_OPCODE_TXD)
+         length += (3 - grad_components) * 2;
 
      bld.MOV(sources[length++], min_lod);
-
-      /* Wa_14014595444: Populate MLOD as parameter 5 (twice). */
-      if (devinfo->verx10 == 125 && op == ELK_FS_OPCODE_TXB &&
-          !inst->shadow_compare)
-         bld.MOV(sources[length++], min_lod);
   }
 
   const elk_fs_reg src_payload =
@@ -1145,39 +1061,13 @@ get_sampler_msg_payload_type_bit_size(const intel_device_info *devinfo,
   assert(src_type_size == 2 || src_type_size == 4);
 
 #ifndef NDEBUG
-   /* Make sure all sources agree. On gfx12 this doesn't hold when sampling
-    * compressed multisampled surfaces. There the payload contains MCS data
-    * which is already in 16-bits unlike the other parameters that need forced
-    * conversion.
-    */
-   if (devinfo->verx10 < 125 ||
-       (op != ELK_SHADER_OPCODE_TXF_CMS_W &&
-        op != ELK_SHADER_OPCODE_TXF_CMS)) {
-      for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
-         assert(src[i].file == BAD_FILE ||
-                elk_reg_type_to_size(src[i].type) == src_type_size);
-      }
+   /* Make sure all sources agree. */
+   for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
+      assert(src[i].file == BAD_FILE ||
+             elk_reg_type_to_size(src[i].type) == src_type_size);
   }
 #endif
 
-   if (devinfo->verx10 < 125)
-      return src_type_size * 8;
-
-   /* Force conversion from 32-bit sources to 16-bit payload. From the XeHP Bspec:
-    * 3D and GPGPU Programs - Shared Functions - 3D Sampler - Messages - Message
-    * Format [GFX12:HAS:1209977870] *
-    *
-    *  ld2dms_w   SIMD8H and SIMD16H Only
-    *  ld_mcs     SIMD8H and SIMD16H Only
-    *  ld2dms     REMOVEDBY(GEN:HAS:1406788836)
-    */
-   if (op == ELK_SHADER_OPCODE_TXF_CMS_W ||
-       op == ELK_SHADER_OPCODE_TXF_CMS ||
-       op == ELK_SHADER_OPCODE_TXF_UMS ||
-       op == ELK_SHADER_OPCODE_TXF_MCS)
-      src_type_size = 2;
-
   return src_type_size * 8;
 }
@@ -1211,7 +1101,7 @@ lower_sampler_logical_send(const fs_builder &bld, elk_fs_inst *inst, elk_opcode
      get_sampler_msg_payload_type_bit_size(devinfo, op, inst->src);
 
   /* 16-bit payloads are available only on gfx11+ */
-   assert(msg_payload_type_bit_size != 16 || devinfo->ver >= 11);
+   assert(msg_payload_type_bit_size != 16);
 
   lower_sampler_logical_send_gfx7(bld, inst, op, coordinate,
                                   shadow_c, lod, lod2, min_lod,
@@ -1259,7 +1149,6 @@ emit_predicate_on_vector_mask(const fs_builder &bld, elk_fs_inst *inst)
   assert(inst->predicate == ELK_PREDICATE_NORMAL);
   assert(!inst->predicate_inverse);
   assert(inst->flag_subreg == 0);
-   assert(s.devinfo->ver < 20);
 
   /* Combine the vector mask with the existing predicate by using a
    * vertical predication mode.
    */
@@ -1345,7 +1234,7 @@ lower_surface_logical_send(const fs_builder &bld, elk_fs_inst *inst)
    * For all stateless A32 messages, we also need a header
    */
   elk_fs_reg header;
-   if ((devinfo->ver < 9 && is_typed_access) || is_stateless) {
+   if (is_typed_access || is_stateless) {
      fs_builder ubld = bld.exec_all().group(8, 0);
      header = ubld.vgrf(ELK_REGISTER_TYPE_UD);
      if (is_stateless) {
@@ -1515,71 +1404,6 @@ lower_surface_logical_send(const fs_builder &bld, elk_fs_inst *inst)
   inst->src[1] = payload;
 }
 
-static void
-lower_surface_block_logical_send(const fs_builder &bld, elk_fs_inst *inst)
-{
-   const intel_device_info *devinfo = bld.shader->devinfo;
-   assert(devinfo->ver >= 9);
-
-   /* Get the logical send arguments. */
-   const elk_fs_reg addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
-   const elk_fs_reg src = inst->src[SURFACE_LOGICAL_SRC_DATA];
-   const elk_fs_reg surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
-   const elk_fs_reg surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
-   const elk_fs_reg arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
-   assert(arg.file == IMM);
-   assert(inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == BAD_FILE);
-   assert(inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK].file == BAD_FILE);
-
-   const bool is_stateless =
-      surface.file == IMM && (surface.ud == ELK_BTI_STATELESS ||
-                              surface.ud == GFX8_BTI_STATELESS_NON_COHERENT);
-
-   const bool has_side_effects = inst->has_side_effects();
-
-   const bool align_16B =
-      inst->opcode != ELK_SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL;
-
-   const bool write = inst->opcode == ELK_SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL;
-
-   /* The address is stored in the header.  See MH_A32_GO and MH_BTS_GO. */
-   fs_builder ubld = bld.exec_all().group(8, 0);
-   elk_fs_reg header = ubld.vgrf(ELK_REGISTER_TYPE_UD);
-
-   if (is_stateless)
-      ubld.emit(ELK_SHADER_OPCODE_SCRATCH_HEADER, header);
-   else
-      ubld.MOV(header, elk_imm_d(0));
-
-   /* Address in OWord units when aligned to OWords. */
-   if (align_16B)
-      ubld.group(1, 0).SHR(component(header, 2), addr, elk_imm_ud(4));
-   else
-      ubld.group(1, 0).MOV(component(header, 2), addr);
-
-   elk_fs_reg data;
-   if (write) {
-      const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA);
-      data = retype(bld.move_to_vgrf(src, src_sz), ELK_REGISTER_TYPE_UD);
-   }
-
-   inst->opcode = ELK_SHADER_OPCODE_SEND;
-   inst->mlen = 1;
-   inst->header_size = 1;
-   inst->send_has_side_effects = has_side_effects;
-   inst->send_is_volatile = !has_side_effects;
-   inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
-
-   const uint32_t desc = elk_dp_oword_block_rw_desc(devinfo, align_16B,
-                                                    arg.ud, write);
-
-   setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
-
-   inst->resize_sources(2);
-   inst->src[1] = header;
-}
-
 static void
 emit_fragment_mask(const fs_builder &bld, elk_fs_inst *inst)
 {
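The address handling in the removed lower_surface_block_logical_send() is worth spelling out: per the MH_A32_GO-style header layout it references, DWord 2 of the header carries the offset in oword (16-byte) units for aligned block messages, but a plain byte address for the unaligned read variant. A minimal sketch, illustrative only:

/* What DWord 2 of the block-message header held in the removed path. */
static unsigned
block_header_dw2(unsigned byte_addr, int align_16B)
{
   return align_16B ? byte_addr >> 4   /* oword units, matching the SHR by 4 */
                    : byte_addr;       /* byte address for unaligned reads */
}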
@@ -1965,7 +1789,7 @@ lower_get_buffer_size(const fs_builder &bld, elk_fs_inst *inst)
   /* Since we can only execute this instruction on uniform bti/surface
    * handles, elk_fs_nir.cpp should already have limited this to SIMD8.
    */
-   assert(inst->exec_size == (devinfo->ver < 20 ? 8 : 16));
+   assert(inst->exec_size == 8);
 
   elk_fs_reg surface = inst->src[GET_BUFFER_SIZE_SRC_SURFACE];
   elk_fs_reg surface_handle = inst->src[GET_BUFFER_SIZE_SRC_SURFACE_HANDLE];
@@ -2087,11 +1911,6 @@ elk_fs_visitor::lower_logical_sends()
         lower_surface_logical_send(ibld, inst);
         break;
 
-      case ELK_SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
-      case ELK_SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL:
-         lower_surface_block_logical_send(ibld, inst);
-         break;
-
      case ELK_SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
      case ELK_SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
      case ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:


@@ -321,38 +321,6 @@ elk_nir_lower_cs_intrinsics(nir_shader *nir,
      }
   }
 
-   if (devinfo->verx10 >= 125 && prog_data &&
-       nir->info.stage == MESA_SHADER_COMPUTE &&
-       nir->info.cs.derivative_group != DERIVATIVE_GROUP_QUADS &&
-       !nir->info.workgroup_size_variable &&
-       util_is_power_of_two_nonzero(nir->info.workgroup_size[0]) &&
-       util_is_power_of_two_nonzero(nir->info.workgroup_size[1])) {
-      state.hw_generated_local_id = true;
-
-      /* TODO: more heuristics about 1D/SLM access vs. 2D access */
-      bool linear =
-         BITSET_TEST(nir->info.system_values_read,
-                     SYSTEM_VALUE_LOCAL_INVOCATION_INDEX) ||
-         (nir->info.workgroup_size[1] == 1 &&
-          nir->info.workgroup_size[2] == 1) ||
-         (nir->info.num_images == 0 && nir->info.num_textures == 0);
-      prog_data->walk_order =
-         linear ? INTEL_WALK_ORDER_XYZ : INTEL_WALK_ORDER_YXZ;
-
-      /* nir_lower_compute_system_values will replace any references to
-       * SYSTEM_VALUE_LOCAL_INVOCATION_ID vector components with zero for
-       * any dimension where the workgroup size is 1, so we can skip
-       * generating those.  However, the hardware can only generate
-       * X, XY, or XYZ - it can't skip earlier components.
-       */
-      prog_data->generate_local_id =
-         (nir->info.workgroup_size[0] > 1 ? WRITEMASK_X : 0) |
-         (nir->info.workgroup_size[1] > 1 ? WRITEMASK_XY : 0) |
-         (nir->info.workgroup_size[2] > 1 ? WRITEMASK_XYZ : 0);
-   }
-
   nir_foreach_function_impl(impl, nir) {
      state.impl = impl;
      lower_cs_intrinsics_convert_impl(&state);
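The removed writemask computation is subtle enough to deserve an example: hardware local-ID generation can emit X, XY, or XYZ but cannot skip leading components, so a workgroup of, say, (8, 1, 4) still needs XYZ generated even though Y is constant zero. A standalone sketch, with writemask values as in mesa's enums:

enum { WRITEMASK_X = 0x1, WRITEMASK_XY = 0x3, WRITEMASK_XYZ = 0x7 };

static unsigned
generate_local_id_mask(const unsigned size[3])
{
   /* Any higher dimension with size > 1 forces all lower ones too. */
   return (size[0] > 1 ? WRITEMASK_X   : 0) |
          (size[1] > 1 ? WRITEMASK_XY  : 0) |
          (size[2] > 1 ? WRITEMASK_XYZ : 0);
}
/* e.g. {8, 1, 4} -> WRITEMASK_XYZ, {8, 8, 1} -> WRITEMASK_XY */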