mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-05 18:18:06 +02:00
intel/elk: Remove Gfx9+ from passes
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27629>
This commit is contained in:
parent
2b6b786feb
commit
241a03b8ec
8 changed files with 53 additions and 542 deletions
|
|
@ -591,21 +591,6 @@ namespace {
|
|||
return constrained;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return whether the hardware will be able to prevent a bank conflict by
|
||||
* optimizing out the read cycle of a source register. The formula was
|
||||
* found experimentally.
|
||||
*/
|
||||
bool
|
||||
is_conflict_optimized_out(const intel_device_info *devinfo,
|
||||
const elk_fs_inst *inst)
|
||||
{
|
||||
return devinfo->ver >= 9 &&
|
||||
((is_grf(inst->src[0]) && (reg_of(inst->src[0]) == reg_of(inst->src[1]) ||
|
||||
reg_of(inst->src[0]) == reg_of(inst->src[2]))) ||
|
||||
reg_of(inst->src[1]) == reg_of(inst->src[2]));
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a matrix that allows reasonably efficient computation of the
|
||||
* cycle-count cost of bank conflicts incurred throughout the whole program
|
||||
|
|
@ -662,10 +647,9 @@ namespace {
|
|||
REG_SIZE);
|
||||
|
||||
/* Neglect same-atom conflicts (since they're either trivial or
|
||||
* impossible to avoid without splitting the atom), and conflicts
|
||||
* known to be optimized out by the hardware.
|
||||
* impossible to avoid without splitting the atom).
|
||||
*/
|
||||
if (r != s && !is_conflict_optimized_out(v->devinfo, inst)) {
|
||||
if (r != s) {
|
||||
/* Calculate the parity of the sources relative to the start of
|
||||
* their respective atoms. If their parity is the same (and
|
||||
* none of the atoms straddle the 2KB mark), the instruction
|
||||
|
|
@ -911,10 +895,6 @@ elk_fs_visitor::opt_bank_conflicts()
|
|||
{
|
||||
assert(grf_used || !"Must be called after register allocation");
|
||||
|
||||
/* TODO: Re-work this pass for Gfx20+. */
|
||||
if (devinfo->ver >= 20)
|
||||
return false;
|
||||
|
||||
/* No ternary instructions -- No bank conflicts. */
|
||||
if (devinfo->ver < 6)
|
||||
return false;
|
||||
|
|
@ -950,6 +930,5 @@ elk_has_bank_conflict(const struct elk_isa_info *isa, const elk_fs_inst *inst)
|
|||
{
|
||||
return elk_is_3src(isa, inst->opcode) &&
|
||||
is_grf(inst->src[1]) && is_grf(inst->src[2]) &&
|
||||
bank_of(reg_of(inst->src[1])) == bank_of(reg_of(inst->src[2])) &&
|
||||
!is_conflict_optimized_out(isa->devinfo, inst);
|
||||
bank_of(reg_of(inst->src[1])) == bank_of(reg_of(inst->src[2]));
|
||||
}
|
||||
|
|
|
|||
|
|
@ -993,120 +993,6 @@ get_alignment_for_imm(const struct imm *imm)
|
|||
return imm->size;
|
||||
}
|
||||
|
||||
static bool
|
||||
representable_as_hf(float f, uint16_t *hf)
|
||||
{
|
||||
union fi u;
|
||||
uint16_t h = _mesa_float_to_half(f);
|
||||
u.f = _mesa_half_to_float(h);
|
||||
|
||||
if (u.f == f) {
|
||||
*hf = h;
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool
|
||||
representable_as_w(int d, int16_t *w)
|
||||
{
|
||||
int res = ((d & 0xffff8000) + 0x8000) & 0xffff7fff;
|
||||
if (!res) {
|
||||
*w = d;
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool
|
||||
representable_as_uw(unsigned ud, uint16_t *uw)
|
||||
{
|
||||
if (!(ud & 0xffff0000)) {
|
||||
*uw = ud;
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool
|
||||
supports_src_as_imm(const struct intel_device_info *devinfo, const elk_fs_inst *inst)
|
||||
{
|
||||
if (devinfo->ver < 12)
|
||||
return false;
|
||||
|
||||
switch (inst->opcode) {
|
||||
case ELK_OPCODE_MAD:
|
||||
/* Integer types can always mix sizes. Floating point types can mix
|
||||
* sizes on Gfx12. On Gfx12.5, floating point sources must all be HF or
|
||||
* all be F.
|
||||
*/
|
||||
return devinfo->verx10 < 125 || inst->src[0].type != ELK_REGISTER_TYPE_F;
|
||||
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
can_promote_src_as_imm(const struct intel_device_info *devinfo, elk_fs_inst *inst,
|
||||
unsigned src_idx)
|
||||
{
|
||||
bool can_promote = false;
|
||||
|
||||
/* Experiment shows that we can only support src0 as immediate for MAD on
|
||||
* Gfx12. ADD3 can use src0 or src2 in Gfx12.5, but constant propagation
|
||||
* only propagates into src0. It's possible that src2 works for W or UW MAD
|
||||
* on Gfx12.5.
|
||||
*/
|
||||
if (src_idx != 0)
|
||||
return false;
|
||||
|
||||
if (!supports_src_as_imm(devinfo, inst))
|
||||
return false;
|
||||
|
||||
/* TODO - Fix the codepath below to use a bfloat16 immediate on XeHP,
|
||||
* since HF/F mixed mode has been removed from the hardware.
|
||||
*/
|
||||
switch (inst->src[src_idx].type) {
|
||||
case ELK_REGISTER_TYPE_F: {
|
||||
uint16_t hf;
|
||||
if (representable_as_hf(inst->src[src_idx].f, &hf)) {
|
||||
inst->src[src_idx] = retype(elk_imm_uw(hf), ELK_REGISTER_TYPE_HF);
|
||||
can_promote = true;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case ELK_REGISTER_TYPE_D: {
|
||||
int16_t w;
|
||||
if (representable_as_w(inst->src[src_idx].d, &w)) {
|
||||
inst->src[src_idx] = elk_imm_w(w);
|
||||
can_promote = true;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case ELK_REGISTER_TYPE_UD: {
|
||||
uint16_t uw;
|
||||
if (representable_as_uw(inst->src[src_idx].ud, &uw)) {
|
||||
inst->src[src_idx] = elk_imm_uw(uw);
|
||||
can_promote = true;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case ELK_REGISTER_TYPE_W:
|
||||
case ELK_REGISTER_TYPE_UW:
|
||||
case ELK_REGISTER_TYPE_HF:
|
||||
can_promote = true;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return can_promote;
|
||||
}
|
||||
|
||||
static void
|
||||
add_candidate_immediate(struct table *table, elk_fs_inst *inst, unsigned ip,
|
||||
unsigned i,
|
||||
|
|
@ -1358,9 +1244,6 @@ elk_fs_visitor::opt_combine_constants()
|
|||
if (inst->src[i].file != IMM)
|
||||
continue;
|
||||
|
||||
if (can_promote_src_as_imm(devinfo, inst, i))
|
||||
continue;
|
||||
|
||||
add_candidate_immediate(&table, inst, ip, i, true, false, block,
|
||||
devinfo, const_ctx);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -64,7 +64,7 @@ elk_fs_visitor::lower_pack()
|
|||
const uint32_t half = _mesa_float_to_half(inst->src[i].f);
|
||||
ibld.MOV(subscript(dst, ELK_REGISTER_TYPE_UW, i),
|
||||
elk_imm_uw(half));
|
||||
} else if (i == 1 && devinfo->ver < 9) {
|
||||
} else if (i == 1) {
|
||||
/* Pre-Skylake requires DWord aligned destinations */
|
||||
elk_fs_reg tmp = ibld.vgrf(ELK_REGISTER_TYPE_UD);
|
||||
ibld.F32TO16(subscript(tmp, ELK_REGISTER_TYPE_HF, 0),
|
||||
|
|
|
|||
|
|
@ -145,9 +145,8 @@ namespace {
|
|||
* Work around both of the above and handle platforms that
|
||||
* don't support 64-bit types at all.
|
||||
*/
|
||||
if ((!devinfo->has_64bit_int ||
|
||||
devinfo->platform == INTEL_PLATFORM_CHV ||
|
||||
intel_device_info_is_9lp(devinfo)) && type_sz(t) > 4)
|
||||
if ((!devinfo->has_64bit_int || devinfo->platform == INTEL_PLATFORM_CHV) &&
|
||||
type_sz(t) > 4)
|
||||
return ELK_REGISTER_TYPE_UD;
|
||||
else if (has_dst_aligned_region_restriction(devinfo, inst))
|
||||
return elk_int_type(type_sz(t), false);
|
||||
|
|
@ -174,30 +173,19 @@ namespace {
|
|||
* integer DWord multiply, indirect addressing must not be
|
||||
* used."
|
||||
*
|
||||
* For MTL (verx10 == 125), float64 is supported, but int64 is not.
|
||||
* Therefore we need to lower cluster broadcast using 32-bit int ops.
|
||||
*
|
||||
* For gfx12.5+ platforms that support int64, the register regions
|
||||
* used by cluster broadcast aren't supported by the 64-bit pipeline.
|
||||
*
|
||||
* Work around the above and handle platforms that don't
|
||||
* support 64-bit types at all.
|
||||
*/
|
||||
if ((!has_64bit || devinfo->verx10 >= 125 ||
|
||||
devinfo->platform == INTEL_PLATFORM_CHV ||
|
||||
intel_device_info_is_9lp(devinfo)) && type_sz(t) > 4)
|
||||
if ((!has_64bit || devinfo->platform == INTEL_PLATFORM_CHV) &&
|
||||
type_sz(t) > 4)
|
||||
return ELK_REGISTER_TYPE_UD;
|
||||
else
|
||||
return elk_int_type(type_sz(t), false);
|
||||
|
||||
case ELK_SHADER_OPCODE_BROADCAST:
|
||||
case ELK_SHADER_OPCODE_MOV_INDIRECT:
|
||||
if (((devinfo->verx10 == 70 ||
|
||||
devinfo->platform == INTEL_PLATFORM_CHV ||
|
||||
intel_device_info_is_9lp(devinfo) ||
|
||||
devinfo->verx10 >= 125) && type_sz(inst->src[0].type) > 4) ||
|
||||
(devinfo->verx10 >= 125 &&
|
||||
elk_reg_type_is_floating_point(inst->src[0].type)))
|
||||
if ((devinfo->verx10 == 70 || devinfo->platform == INTEL_PLATFORM_CHV) &&
|
||||
type_sz(inst->src[0].type) > 4)
|
||||
return elk_int_type(type_sz(t), false);
|
||||
else
|
||||
return t;
|
||||
|
|
|
|||
|
|
@ -334,7 +334,6 @@ public:
|
|||
node_count = 0;
|
||||
first_payload_node = 0;
|
||||
first_mrf_hack_node = 0;
|
||||
scratch_header_node = 0;
|
||||
grf127_send_hack_node = 0;
|
||||
first_vgrf_node = 0;
|
||||
last_vgrf_node = 0;
|
||||
|
|
@ -372,7 +371,6 @@ private:
|
|||
|
||||
void set_spill_costs();
|
||||
int choose_spill_reg();
|
||||
elk_fs_reg alloc_scratch_header();
|
||||
elk_fs_reg alloc_spill_reg(unsigned size, int ip);
|
||||
void spill_reg(unsigned spill_reg);
|
||||
|
||||
|
|
@ -397,7 +395,6 @@ private:
|
|||
int node_count;
|
||||
int first_payload_node;
|
||||
int first_mrf_hack_node;
|
||||
int scratch_header_node;
|
||||
int grf127_send_hack_node;
|
||||
int first_vgrf_node;
|
||||
int last_vgrf_node;
|
||||
|
|
@ -466,8 +463,7 @@ namespace {
|
|||
spill_max_size(const elk_backend_shader *s)
|
||||
{
|
||||
/* LSC is limited to SIMD16 sends */
|
||||
if (s->devinfo->has_lsc)
|
||||
return 2;
|
||||
assert(!s->devinfo->has_lsc);
|
||||
|
||||
/* FINISHME - On Gfx7+ it should be possible to avoid this limit
|
||||
* altogether by spilling directly from the temporary GRF
|
||||
|
|
@ -487,8 +483,6 @@ namespace {
|
|||
unsigned
|
||||
spill_base_mrf(const elk_backend_shader *s)
|
||||
{
|
||||
/* We don't use the MRF hack on Gfx9+ */
|
||||
assert(s->devinfo->ver < 9);
|
||||
return ELK_MAX_MRF(s->devinfo->ver) - spill_max_size(s) - 1;
|
||||
}
|
||||
}
|
||||
|
|
@ -520,10 +514,6 @@ elk_fs_reg_alloc::setup_live_interference(unsigned node,
|
|||
ra_add_node_interference(g, node, first_mrf_hack_node + i);
|
||||
}
|
||||
|
||||
/* Everything interferes with the scratch header */
|
||||
if (scratch_header_node >= 0)
|
||||
ra_add_node_interference(g, node, scratch_header_node);
|
||||
|
||||
/* Add interference with every vgrf whose live range intersects this
|
||||
* node's. We only need to look at nodes below this one as the reflexivity
|
||||
* of interference will take care of the rest.
|
||||
|
|
@ -643,7 +633,7 @@ elk_fs_reg_alloc::build_interference_graph(bool allow_spilling)
|
|||
node_count = 0;
|
||||
first_payload_node = node_count;
|
||||
node_count += payload_node_count;
|
||||
if (devinfo->ver >= 7 && devinfo->ver < 9 && allow_spilling) {
|
||||
if (devinfo->ver >= 7 && allow_spilling) {
|
||||
first_mrf_hack_node = node_count;
|
||||
node_count += ELK_MAX_GRF - GFX7_MRF_HACK_START;
|
||||
} else {
|
||||
|
|
@ -658,11 +648,6 @@ elk_fs_reg_alloc::build_interference_graph(bool allow_spilling)
|
|||
first_vgrf_node = node_count;
|
||||
node_count += fs->alloc.count;
|
||||
last_vgrf_node = node_count - 1;
|
||||
if ((devinfo->ver >= 9 && devinfo->verx10 < 125) && allow_spilling) {
|
||||
scratch_header_node = node_count++;
|
||||
} else {
|
||||
scratch_header_node = -1;
|
||||
}
|
||||
first_spill_node = node_count;
|
||||
|
||||
fs->calculate_payload_ranges(payload_node_count,
|
||||
|
|
@ -802,30 +787,7 @@ elk_fs_reg_alloc::emit_unspill(const fs_builder &bld,
|
|||
++stats->fill_count;
|
||||
|
||||
elk_fs_inst *unspill_inst;
|
||||
if (devinfo->ver >= 9) {
|
||||
elk_fs_reg header = this->scratch_header;
|
||||
fs_builder ubld = bld.exec_all().group(1, 0);
|
||||
assert(spill_offset % 16 == 0);
|
||||
unspill_inst = ubld.MOV(component(header, 2),
|
||||
elk_imm_ud(spill_offset / 16));
|
||||
_mesa_set_add(spill_insts, unspill_inst);
|
||||
|
||||
const unsigned bti = GFX8_BTI_STATELESS_NON_COHERENT;
|
||||
|
||||
elk_fs_reg srcs[] = { elk_imm_ud(0), header };
|
||||
unspill_inst = bld.emit(ELK_SHADER_OPCODE_SEND, dst,
|
||||
srcs, ARRAY_SIZE(srcs));
|
||||
unspill_inst->mlen = 1;
|
||||
unspill_inst->header_size = 1;
|
||||
unspill_inst->size_written = reg_size * REG_SIZE;
|
||||
unspill_inst->send_has_side_effects = false;
|
||||
unspill_inst->send_is_volatile = true;
|
||||
unspill_inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
|
||||
unspill_inst->desc =
|
||||
elk_dp_desc(devinfo, bti,
|
||||
ELK_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
|
||||
ELK_DATAPORT_OWORD_BLOCK_DWORDS(reg_size * 8));
|
||||
} else if (devinfo->ver >= 7 && spill_offset < (1 << 12) * REG_SIZE) {
|
||||
if (devinfo->ver >= 7 && spill_offset < (1 << 12) * REG_SIZE) {
|
||||
/* The Gfx7 descriptor-based offset is 12 bits of HWORD units.
|
||||
* Because the Gfx7-style scratch block read is hardwired to BTI 255,
|
||||
* on Gfx9+ it would cause the DC to do an IA-coherent read, what
|
||||
|
|
@ -974,19 +936,6 @@ elk_fs_reg_alloc::choose_spill_reg()
|
|||
return node - first_vgrf_node;
|
||||
}
|
||||
|
||||
elk_fs_reg
|
||||
elk_fs_reg_alloc::alloc_scratch_header()
|
||||
{
|
||||
int vgrf = fs->alloc.allocate(1);
|
||||
assert(first_vgrf_node + vgrf == scratch_header_node);
|
||||
ra_set_node_class(g, scratch_header_node,
|
||||
compiler->fs_reg_sets[rsi].classes[0]);
|
||||
|
||||
setup_live_interference(scratch_header_node, 0, INT_MAX);
|
||||
|
||||
return elk_fs_reg(VGRF, vgrf, ELK_REGISTER_TYPE_UD);
|
||||
}
|
||||
|
||||
elk_fs_reg
|
||||
elk_fs_reg_alloc::alloc_spill_reg(unsigned size, int ip)
|
||||
{
|
||||
|
|
@ -1035,25 +984,13 @@ elk_fs_reg_alloc::spill_reg(unsigned spill_reg)
|
|||
* SIMD16 mode, because we'd stomp the FB writes.
|
||||
*/
|
||||
if (!fs->spilled_any_registers) {
|
||||
if (devinfo->verx10 >= 125) {
|
||||
/* We will allocate a register on the fly */
|
||||
} else if (devinfo->ver >= 9) {
|
||||
this->scratch_header = alloc_scratch_header();
|
||||
fs_builder ubld = fs_builder(fs, 8).exec_all().at(
|
||||
fs->cfg->first_block(), fs->cfg->first_block()->start());
|
||||
bool mrf_used[ELK_MAX_MRF(devinfo->ver)];
|
||||
get_used_mrfs(fs, mrf_used);
|
||||
|
||||
elk_fs_inst *inst = ubld.emit(ELK_SHADER_OPCODE_SCRATCH_HEADER,
|
||||
this->scratch_header);
|
||||
_mesa_set_add(spill_insts, inst);
|
||||
} else {
|
||||
bool mrf_used[ELK_MAX_MRF(devinfo->ver)];
|
||||
get_used_mrfs(fs, mrf_used);
|
||||
|
||||
for (int i = spill_base_mrf(fs); i < ELK_MAX_MRF(devinfo->ver); i++) {
|
||||
if (mrf_used[i]) {
|
||||
fs->fail("Register spilling not supported with m%d used", i);
|
||||
return;
|
||||
}
|
||||
for (int i = spill_base_mrf(fs); i < ELK_MAX_MRF(devinfo->ver); i++) {
|
||||
if (mrf_used[i]) {
|
||||
fs->fail("Register spilling not supported with m%d used", i);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -349,10 +349,7 @@ namespace {
|
|||
case ELK_TCS_OPCODE_GET_PRIMITIVE_ID:
|
||||
case ELK_TES_OPCODE_GET_PRIMITIVE_ID:
|
||||
case ELK_SHADER_OPCODE_READ_SR_REG:
|
||||
if (devinfo->ver >= 11) {
|
||||
return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
|
||||
0, 10, 6 /* XXX */, 14, 0, 0);
|
||||
} else if (devinfo->ver >= 8) {
|
||||
if (devinfo->ver >= 8) {
|
||||
if (type_sz(info.tx) > 4)
|
||||
return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
|
||||
0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
|
||||
|
|
@ -373,10 +370,7 @@ namespace {
|
|||
case ELK_OPCODE_MUL:
|
||||
case ELK_SHADER_OPCODE_MOV_RELOC_IMM:
|
||||
case ELK_VEC4_OPCODE_MOV_FOR_SCRATCH:
|
||||
if (devinfo->ver >= 11) {
|
||||
return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
|
||||
0, 10, 6, 14, 0, 0);
|
||||
} else if (devinfo->ver >= 8) {
|
||||
if (devinfo->ver >= 8) {
|
||||
if (type_sz(info.tx) > 4)
|
||||
return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
|
||||
0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
|
||||
|
|
@ -407,10 +401,7 @@ namespace {
|
|||
case ELK_OPCODE_BFE:
|
||||
case ELK_OPCODE_BFI2:
|
||||
case ELK_OPCODE_CSEL:
|
||||
if (devinfo->ver >= 11)
|
||||
return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
|
||||
0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
|
||||
else if (devinfo->ver >= 8)
|
||||
if (devinfo->ver >= 8)
|
||||
return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
|
||||
0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
|
||||
else if (devinfo->verx10 >= 75)
|
||||
|
|
@ -423,10 +414,7 @@ namespace {
|
|||
abort();
|
||||
|
||||
case ELK_OPCODE_MAD:
|
||||
if (devinfo->ver >= 11) {
|
||||
return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
|
||||
0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
|
||||
} else if (devinfo->ver >= 8) {
|
||||
if (devinfo->ver >= 8) {
|
||||
if (type_sz(info.tx) > 4)
|
||||
return calculate_desc(info, EU_UNIT_FPU, 0, 4, 1, 0, 4,
|
||||
0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
|
||||
|
|
@ -457,10 +445,7 @@ namespace {
|
|||
}
|
||||
|
||||
case ELK_OPCODE_F32TO16:
|
||||
if (devinfo->ver >= 11)
|
||||
return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
|
||||
0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
|
||||
else if (devinfo->ver >= 8)
|
||||
if (devinfo->ver >= 8)
|
||||
return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
|
||||
0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
|
||||
else if (devinfo->verx10 >= 75)
|
||||
|
|
@ -619,11 +604,7 @@ namespace {
|
|||
abort();
|
||||
|
||||
case ELK_FS_OPCODE_PACK_HALF_2x16_SPLIT:
|
||||
if (devinfo->ver >= 11)
|
||||
return calculate_desc(info, EU_UNIT_FPU, 20, 6, 0, 0, 6,
|
||||
0, 10 /* XXX */, 6 /* XXX */,
|
||||
14 /* XXX */, 0, 0);
|
||||
else if (devinfo->ver >= 8)
|
||||
if (devinfo->ver >= 8)
|
||||
return calculate_desc(info, EU_UNIT_FPU, 16, 6, 0, 0, 6,
|
||||
0, 8 /* XXX */, 4 /* XXX */,
|
||||
12 /* XXX */, 0, 0);
|
||||
|
|
@ -639,11 +620,7 @@ namespace {
|
|||
abort();
|
||||
|
||||
case ELK_SHADER_OPCODE_MOV_INDIRECT:
|
||||
if (devinfo->ver >= 11)
|
||||
return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
|
||||
0, 10 /* XXX */, 6 /* XXX */,
|
||||
14 /* XXX */, 0, 0);
|
||||
else if (devinfo->ver >= 8)
|
||||
if (devinfo->ver >= 8)
|
||||
return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
|
||||
0, 8 /* XXX */, 4 /* XXX */,
|
||||
12 /* XXX */, 0, 0);
|
||||
|
|
@ -657,10 +634,7 @@ namespace {
|
|||
18 /* XXX */, 0, 0);
|
||||
|
||||
case ELK_SHADER_OPCODE_BROADCAST:
|
||||
if (devinfo->ver >= 11)
|
||||
return calculate_desc(info, EU_UNIT_FPU, 20 /* XXX */, 0, 0, 4, 0,
|
||||
0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
|
||||
else if (devinfo->ver >= 8)
|
||||
if (devinfo->ver >= 8)
|
||||
return calculate_desc(info, EU_UNIT_FPU, 18, 0, 0, 4, 0,
|
||||
0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
|
||||
else if (devinfo->verx10 >= 75)
|
||||
|
|
@ -674,10 +648,7 @@ namespace {
|
|||
|
||||
case ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL:
|
||||
case ELK_SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL:
|
||||
if (devinfo->ver >= 11)
|
||||
return calculate_desc(info, EU_UNIT_FPU, 2, 0, 0, 2, 0,
|
||||
0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
|
||||
else if (devinfo->ver >= 8)
|
||||
if (devinfo->ver >= 8)
|
||||
return calculate_desc(info, EU_UNIT_FPU, 2, 0, 0, 2, 0,
|
||||
0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
|
||||
else if (devinfo->verx10 >= 75)
|
||||
|
|
@ -691,11 +662,7 @@ namespace {
|
|||
|
||||
case ELK_SHADER_OPCODE_RND_MODE:
|
||||
case ELK_SHADER_OPCODE_FLOAT_CONTROL_MODE:
|
||||
if (devinfo->ver >= 11)
|
||||
return calculate_desc(info, EU_UNIT_FPU, 24 /* XXX */, 0, 0,
|
||||
4 /* XXX */, 0,
|
||||
0, 0, 0, 0, 0, 0);
|
||||
else if (devinfo->ver >= 8)
|
||||
if (devinfo->ver >= 8)
|
||||
return calculate_desc(info, EU_UNIT_FPU, 20 /* XXX */, 0, 0,
|
||||
4 /* XXX */, 0,
|
||||
0, 0, 0, 0, 0, 0);
|
||||
|
|
@ -711,12 +678,7 @@ namespace {
|
|||
abort();
|
||||
|
||||
case ELK_SHADER_OPCODE_SHUFFLE:
|
||||
if (devinfo->ver >= 11)
|
||||
return calculate_desc(info, EU_UNIT_FPU, 44 /* XXX */, 0, 0,
|
||||
44 /* XXX */, 0,
|
||||
0, 10 /* XXX */, 6 /* XXX */,
|
||||
14 /* XXX */, 0, 0);
|
||||
else if (devinfo->ver >= 8)
|
||||
if (devinfo->ver >= 8)
|
||||
return calculate_desc(info, EU_UNIT_FPU, 42 /* XXX */, 0, 0,
|
||||
42 /* XXX */, 0,
|
||||
0, 8 /* XXX */, 4 /* XXX */,
|
||||
|
|
@ -735,12 +697,7 @@ namespace {
|
|||
abort();
|
||||
|
||||
case ELK_SHADER_OPCODE_SEL_EXEC:
|
||||
if (devinfo->ver >= 11)
|
||||
return calculate_desc(info, EU_UNIT_FPU, 10 /* XXX */, 4 /* XXX */, 0,
|
||||
0, 4 /* XXX */,
|
||||
0, 10 /* XXX */, 6 /* XXX */,
|
||||
14 /* XXX */, 0, 0);
|
||||
else if (devinfo->ver >= 8)
|
||||
if (devinfo->ver >= 8)
|
||||
return calculate_desc(info, EU_UNIT_FPU, 8 /* XXX */, 4 /* XXX */, 0,
|
||||
0, 4 /* XXX */,
|
||||
0, 8 /* XXX */, 4 /* XXX */,
|
||||
|
|
@ -757,12 +714,7 @@ namespace {
|
|||
18 /* XXX */, 0, 0);
|
||||
|
||||
case ELK_SHADER_OPCODE_QUAD_SWIZZLE:
|
||||
if (devinfo->ver >= 11)
|
||||
return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
|
||||
0, 8 /* XXX */,
|
||||
0, 10 /* XXX */, 6 /* XXX */,
|
||||
14 /* XXX */, 0, 0);
|
||||
else if (devinfo->ver >= 8)
|
||||
if (devinfo->ver >= 8)
|
||||
return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
|
||||
0, 8 /* XXX */,
|
||||
0, 8 /* XXX */, 4 /* XXX */,
|
||||
|
|
@ -779,10 +731,7 @@ namespace {
|
|||
18 /* XXX */, 0, 0);
|
||||
|
||||
case ELK_FS_OPCODE_DDY_FINE:
|
||||
if (devinfo->ver >= 11)
|
||||
return calculate_desc(info, EU_UNIT_FPU, 0, 14, 0, 0, 4,
|
||||
0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
|
||||
else if (devinfo->ver >= 8)
|
||||
if (devinfo->ver >= 8)
|
||||
return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
|
||||
0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
|
||||
else if (devinfo->verx10 >= 75)
|
||||
|
|
@ -793,11 +742,7 @@ namespace {
|
|||
0, 14, 10 /* XXX */, 20 /* XXX */, 0, 0);
|
||||
|
||||
case ELK_FS_OPCODE_LOAD_LIVE_CHANNELS:
|
||||
if (devinfo->ver >= 11)
|
||||
return calculate_desc(info, EU_UNIT_FPU, 2 /* XXX */, 0, 0,
|
||||
2 /* XXX */, 0,
|
||||
0, 0, 0, 10 /* XXX */, 0, 0);
|
||||
else if (devinfo->ver >= 8)
|
||||
if (devinfo->ver >= 8)
|
||||
return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 0,
|
||||
0, 2 /* XXX */,
|
||||
0, 0, 0, 8 /* XXX */, 0, 0);
|
||||
|
|
@ -1493,16 +1438,8 @@ namespace {
|
|||
*
|
||||
* In the meantime use values that roughly match the control flow
|
||||
* weights used elsewhere in the compiler back-end.
|
||||
*
|
||||
* Note that we provide slightly more pessimistic weights on
|
||||
* Gfx12+ for SIMD32, since the effective warp size on that
|
||||
* platform is 2x the SIMD width due to EU fusion, which increases
|
||||
* the likelihood of divergent control flow in comparison to
|
||||
* previous generations, giving narrower SIMD modes a performance
|
||||
* advantage in several test-cases with non-uniform discard jumps.
|
||||
*/
|
||||
const float discard_weight = (dispatch_width > 16 || s->devinfo->ver < 12 ?
|
||||
1.0 : 0.5);
|
||||
const float discard_weight = 1.0;
|
||||
const float loop_weight = 10;
|
||||
unsigned halt_count = 0;
|
||||
unsigned elapsed = 0;
|
||||
|
|
|
|||
|
|
@ -192,12 +192,9 @@ lower_fb_write_logical_send(const fs_builder &bld, elk_fs_inst *inst,
|
|||
|
||||
assert(length == 0);
|
||||
length = 2;
|
||||
} else if ((devinfo->verx10 <= 70 &&
|
||||
prog_data->uses_kill) ||
|
||||
(devinfo->ver < 11 &&
|
||||
(color1.file != BAD_FILE || key->nr_color_regions > 1))) {
|
||||
assert(devinfo->ver < 20);
|
||||
|
||||
} else if ((devinfo->verx10 <= 70 && prog_data->uses_kill) ||
|
||||
color1.file != BAD_FILE ||
|
||||
key->nr_color_regions > 1) {
|
||||
/* From the Sandy Bridge PRM, volume 4, page 198:
|
||||
*
|
||||
* "Dispatched Pixel Enables. One bit per pixel indicating
|
||||
|
|
@ -220,9 +217,6 @@ lower_fb_write_logical_send(const fs_builder &bld, elk_fs_inst *inst,
|
|||
retype(elk_vec8_grf(2, 0), ELK_REGISTER_TYPE_UD),
|
||||
};
|
||||
ubld.LOAD_PAYLOAD(header, header_sources, 2, 0);
|
||||
|
||||
/* Gfx12 will require additional fix-ups if we ever hit this path. */
|
||||
assert(devinfo->ver < 12);
|
||||
}
|
||||
|
||||
uint32_t g00_bits = 0;
|
||||
|
|
@ -723,7 +717,6 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, elk_fs_inst *inst, elk_op
|
|||
unsigned grad_components,
|
||||
bool residency)
|
||||
{
|
||||
const elk_compiler *compiler = bld.shader->compiler;
|
||||
const intel_device_info *devinfo = bld.shader->devinfo;
|
||||
const enum elk_reg_type payload_type =
|
||||
elk_reg_type_from_bit_size(payload_type_bit_size, ELK_REGISTER_TYPE_F);
|
||||
|
|
@ -802,26 +795,11 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, elk_fs_inst *inst, elk_op
|
|||
* address space but means we can do something more efficient in the
|
||||
* shader.
|
||||
*/
|
||||
if (compiler->use_bindless_sampler_offset) {
|
||||
assert(devinfo->ver >= 11);
|
||||
ubld1.OR(component(header, 3), sampler_handle, elk_imm_ud(1));
|
||||
} else {
|
||||
ubld1.MOV(component(header, 3), sampler_handle);
|
||||
}
|
||||
ubld1.MOV(component(header, 3), sampler_handle);
|
||||
} else if (is_high_sampler(devinfo, sampler)) {
|
||||
elk_fs_reg sampler_state_ptr =
|
||||
retype(elk_vec1_grf(0, 3), ELK_REGISTER_TYPE_UD);
|
||||
|
||||
/* Gfx11+ sampler message headers include bits in 4:0 which conflict
|
||||
* with the ones included in g0.3 bits 4:0. Mask them out.
|
||||
*/
|
||||
if (devinfo->ver >= 11) {
|
||||
sampler_state_ptr = ubld1.vgrf(ELK_REGISTER_TYPE_UD);
|
||||
ubld1.AND(sampler_state_ptr,
|
||||
retype(elk_vec1_grf(0, 3), ELK_REGISTER_TYPE_UD),
|
||||
elk_imm_ud(INTEL_MASK(31, 5)));
|
||||
}
|
||||
|
||||
if (sampler.file == ELK_IMMEDIATE_VALUE) {
|
||||
assert(sampler.ud >= 16);
|
||||
const int sampler_state_size = 16; /* 16 bytes */
|
||||
|
|
@ -834,26 +812,9 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, elk_fs_inst *inst, elk_op
|
|||
ubld1.SHL(tmp, tmp, elk_imm_ud(4));
|
||||
ubld1.ADD(component(header, 3), sampler_state_ptr, tmp);
|
||||
}
|
||||
} else if (devinfo->ver >= 11) {
|
||||
/* Gfx11+ sampler message headers include bits in 4:0 which conflict
|
||||
* with the ones included in g0.3 bits 4:0. Mask them out.
|
||||
*/
|
||||
ubld1.AND(component(header, 3),
|
||||
retype(elk_vec1_grf(0, 3), ELK_REGISTER_TYPE_UD),
|
||||
elk_imm_ud(INTEL_MASK(31, 5)));
|
||||
}
|
||||
}
|
||||
|
||||
/* Change the opcode to account for LOD being zero before the
|
||||
* switch-statement that emits sources based on the opcode.
|
||||
*/
|
||||
if (devinfo->ver >= 9 && lod.is_zero()) {
|
||||
if (op == ELK_SHADER_OPCODE_TXL)
|
||||
op = ELK_SHADER_OPCODE_TXL_LZ;
|
||||
else if (op == ELK_SHADER_OPCODE_TXF)
|
||||
op = ELK_SHADER_OPCODE_TXF_LZ;
|
||||
}
|
||||
|
||||
/* On Xe2 and newer platforms, min_lod is the first parameter specifically
|
||||
* so that a bunch of other, possibly unused, parameters don't need to also
|
||||
* be included.
|
||||
|
|
@ -910,27 +871,15 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, elk_fs_inst *inst, elk_op
|
|||
break;
|
||||
case ELK_SHADER_OPCODE_TXF:
|
||||
case ELK_SHADER_OPCODE_TXF_LZ:
|
||||
/* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
|
||||
* On Gfx9 they are u, v, lod, r
|
||||
*/
|
||||
/* Unfortunately, the parameters for LD are intermixed: u, lod, v, r. */
|
||||
bld.MOV(retype(sources[length++], payload_signed_type), coordinate);
|
||||
|
||||
if (devinfo->ver >= 9) {
|
||||
if (coord_components >= 2) {
|
||||
bld.MOV(retype(sources[length], payload_signed_type),
|
||||
offset(coordinate, bld, 1));
|
||||
} else {
|
||||
sources[length] = elk_imm_d(0);
|
||||
}
|
||||
length++;
|
||||
}
|
||||
|
||||
if (op != ELK_SHADER_OPCODE_TXF_LZ) {
|
||||
bld.MOV(retype(sources[length], payload_signed_type), lod);
|
||||
length++;
|
||||
}
|
||||
|
||||
for (unsigned i = devinfo->ver >= 9 ? 2 : 1; i < coord_components; i++)
|
||||
for (unsigned i = 1; i < coord_components; i++)
|
||||
bld.MOV(retype(sources[length++], payload_signed_type),
|
||||
offset(coordinate, bld, i));
|
||||
|
||||
|
|
@ -964,24 +913,9 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, elk_fs_inst *inst, elk_op
|
|||
* only valid data is in first two register. So with 16-bit
|
||||
* payload, we need to split 2-32bit register into 4-16-bit
|
||||
* payload.
|
||||
*
|
||||
* From the Gfx12HP BSpec: Render Engine - 3D and GPGPU Programs -
|
||||
* Shared Functions - 3D Sampler - Messages - Message Format:
|
||||
*
|
||||
* ld2dms_w si mcs0 mcs1 mcs2 mcs3 u v r
|
||||
*/
|
||||
if (devinfo->verx10 >= 125 && op == ELK_SHADER_OPCODE_TXF_CMS_W) {
|
||||
elk_fs_reg tmp = offset(mcs, bld, i);
|
||||
bld.MOV(retype(sources[length++], payload_unsigned_type),
|
||||
mcs.file == IMM ? mcs :
|
||||
subscript(tmp, payload_unsigned_type, 0));
|
||||
bld.MOV(retype(sources[length++], payload_unsigned_type),
|
||||
mcs.file == IMM ? mcs :
|
||||
subscript(tmp, payload_unsigned_type, 1));
|
||||
} else {
|
||||
bld.MOV(retype(sources[length++], payload_unsigned_type),
|
||||
mcs.file == IMM ? mcs : offset(mcs, bld, i));
|
||||
}
|
||||
bld.MOV(retype(sources[length++], payload_unsigned_type),
|
||||
mcs.file == IMM ? mcs : offset(mcs, bld, i));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1021,29 +955,11 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, elk_fs_inst *inst, elk_op
|
|||
|
||||
if (min_lod.file != BAD_FILE) {
|
||||
/* Account for all of the missing coordinate sources */
|
||||
if (op == ELK_SHADER_OPCODE_TXD && devinfo->verx10 >= 125) {
|
||||
/* On DG2 and newer platforms, sample_d can only be used with 1D and
|
||||
* 2D surfaces, so the maximum number of gradient components is 2.
|
||||
* In spite of this limitation, the Bspec lists a mysterious R
|
||||
* component before the min_lod, so the maximum coordinate components
|
||||
* is 3.
|
||||
*
|
||||
* See bspec 45942, "Enable new message layout for cube array"
|
||||
*/
|
||||
length += 3 - coord_components;
|
||||
length += (2 - grad_components) * 2;
|
||||
} else {
|
||||
length += 4 - coord_components;
|
||||
if (op == ELK_SHADER_OPCODE_TXD)
|
||||
length += (3 - grad_components) * 2;
|
||||
}
|
||||
length += 4 - coord_components;
|
||||
if (op == ELK_SHADER_OPCODE_TXD)
|
||||
length += (3 - grad_components) * 2;
|
||||
|
||||
bld.MOV(sources[length++], min_lod);
|
||||
|
||||
/* Wa_14014595444: Populate MLOD as parameter 5 (twice). */
|
||||
if (devinfo->verx10 == 125 && op == ELK_FS_OPCODE_TXB &&
|
||||
!inst->shadow_compare)
|
||||
bld.MOV(sources[length++], min_lod);
|
||||
}
|
||||
|
||||
const elk_fs_reg src_payload =
|
||||
|
|
@ -1145,39 +1061,13 @@ get_sampler_msg_payload_type_bit_size(const intel_device_info *devinfo,
|
|||
assert(src_type_size == 2 || src_type_size == 4);
|
||||
|
||||
#ifndef NDEBUG
|
||||
/* Make sure all sources agree. On gfx12 this doesn't hold when sampling
|
||||
* compressed multisampled surfaces. There the payload contains MCS data
|
||||
* which is already in 16-bits unlike the other parameters that need forced
|
||||
* conversion.
|
||||
*/
|
||||
if (devinfo->verx10 < 125 ||
|
||||
(op != ELK_SHADER_OPCODE_TXF_CMS_W &&
|
||||
op != ELK_SHADER_OPCODE_TXF_CMS)) {
|
||||
for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
|
||||
assert(src[i].file == BAD_FILE ||
|
||||
elk_reg_type_to_size(src[i].type) == src_type_size);
|
||||
}
|
||||
/* Make sure all sources agree. */
|
||||
for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
|
||||
assert(src[i].file == BAD_FILE ||
|
||||
elk_reg_type_to_size(src[i].type) == src_type_size);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (devinfo->verx10 < 125)
|
||||
return src_type_size * 8;
|
||||
|
||||
/* Force conversion from 32-bit sources to 16-bit payload. From the XeHP Bspec:
|
||||
* 3D and GPGPU Programs - Shared Functions - 3D Sampler - Messages - Message
|
||||
* Format [GFX12:HAS:1209977870] *
|
||||
*
|
||||
* ld2dms_w SIMD8H and SIMD16H Only
|
||||
* ld_mcs SIMD8H and SIMD16H Only
|
||||
* ld2dms REMOVEDBY(GEN:HAS:1406788836)
|
||||
*/
|
||||
|
||||
if (op == ELK_SHADER_OPCODE_TXF_CMS_W ||
|
||||
op == ELK_SHADER_OPCODE_TXF_CMS ||
|
||||
op == ELK_SHADER_OPCODE_TXF_UMS ||
|
||||
op == ELK_SHADER_OPCODE_TXF_MCS)
|
||||
src_type_size = 2;
|
||||
|
||||
return src_type_size * 8;
|
||||
}
|
||||
|
||||
|
|
@ -1211,7 +1101,7 @@ lower_sampler_logical_send(const fs_builder &bld, elk_fs_inst *inst, elk_opcode
|
|||
get_sampler_msg_payload_type_bit_size(devinfo, op, inst->src);
|
||||
|
||||
/* 16-bit payloads are available only on gfx11+ */
|
||||
assert(msg_payload_type_bit_size != 16 || devinfo->ver >= 11);
|
||||
assert(msg_payload_type_bit_size != 16);
|
||||
|
||||
lower_sampler_logical_send_gfx7(bld, inst, op, coordinate,
|
||||
shadow_c, lod, lod2, min_lod,
|
||||
|
|
@ -1259,7 +1149,6 @@ emit_predicate_on_vector_mask(const fs_builder &bld, elk_fs_inst *inst)
|
|||
assert(inst->predicate == ELK_PREDICATE_NORMAL);
|
||||
assert(!inst->predicate_inverse);
|
||||
assert(inst->flag_subreg == 0);
|
||||
assert(s.devinfo->ver < 20);
|
||||
/* Combine the vector mask with the existing predicate by using a
|
||||
* vertical predication mode.
|
||||
*/
|
||||
|
|
@ -1345,7 +1234,7 @@ lower_surface_logical_send(const fs_builder &bld, elk_fs_inst *inst)
|
|||
* For all stateless A32 messages, we also need a header
|
||||
*/
|
||||
elk_fs_reg header;
|
||||
if ((devinfo->ver < 9 && is_typed_access) || is_stateless) {
|
||||
if (is_typed_access || is_stateless) {
|
||||
fs_builder ubld = bld.exec_all().group(8, 0);
|
||||
header = ubld.vgrf(ELK_REGISTER_TYPE_UD);
|
||||
if (is_stateless) {
|
||||
|
|
@ -1515,71 +1404,6 @@ lower_surface_logical_send(const fs_builder &bld, elk_fs_inst *inst)
|
|||
inst->src[1] = payload;
|
||||
}
|
||||
|
||||
static void
|
||||
lower_surface_block_logical_send(const fs_builder &bld, elk_fs_inst *inst)
|
||||
{
|
||||
const intel_device_info *devinfo = bld.shader->devinfo;
|
||||
assert(devinfo->ver >= 9);
|
||||
|
||||
/* Get the logical send arguments. */
|
||||
const elk_fs_reg addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
|
||||
const elk_fs_reg src = inst->src[SURFACE_LOGICAL_SRC_DATA];
|
||||
const elk_fs_reg surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
|
||||
const elk_fs_reg surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
|
||||
const elk_fs_reg arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
|
||||
assert(arg.file == IMM);
|
||||
assert(inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == BAD_FILE);
|
||||
assert(inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK].file == BAD_FILE);
|
||||
|
||||
const bool is_stateless =
|
||||
surface.file == IMM && (surface.ud == ELK_BTI_STATELESS ||
|
||||
surface.ud == GFX8_BTI_STATELESS_NON_COHERENT);
|
||||
|
||||
const bool has_side_effects = inst->has_side_effects();
|
||||
|
||||
const bool align_16B =
|
||||
inst->opcode != ELK_SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL;
|
||||
|
||||
const bool write = inst->opcode == ELK_SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL;
|
||||
|
||||
/* The address is stored in the header. See MH_A32_GO and MH_BTS_GO. */
|
||||
fs_builder ubld = bld.exec_all().group(8, 0);
|
||||
elk_fs_reg header = ubld.vgrf(ELK_REGISTER_TYPE_UD);
|
||||
|
||||
if (is_stateless)
|
||||
ubld.emit(ELK_SHADER_OPCODE_SCRATCH_HEADER, header);
|
||||
else
|
||||
ubld.MOV(header, elk_imm_d(0));
|
||||
|
||||
/* Address in OWord units when aligned to OWords. */
|
||||
if (align_16B)
|
||||
ubld.group(1, 0).SHR(component(header, 2), addr, elk_imm_ud(4));
|
||||
else
|
||||
ubld.group(1, 0).MOV(component(header, 2), addr);
|
||||
|
||||
elk_fs_reg data;
|
||||
if (write) {
|
||||
const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA);
|
||||
data = retype(bld.move_to_vgrf(src, src_sz), ELK_REGISTER_TYPE_UD);
|
||||
}
|
||||
|
||||
inst->opcode = ELK_SHADER_OPCODE_SEND;
|
||||
inst->mlen = 1;
|
||||
inst->header_size = 1;
|
||||
inst->send_has_side_effects = has_side_effects;
|
||||
inst->send_is_volatile = !has_side_effects;
|
||||
|
||||
inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
|
||||
|
||||
const uint32_t desc = elk_dp_oword_block_rw_desc(devinfo, align_16B,
|
||||
arg.ud, write);
|
||||
setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
|
||||
|
||||
inst->resize_sources(2);
|
||||
|
||||
inst->src[1] = header;
|
||||
}
|
||||
|
||||
static void
|
||||
emit_fragment_mask(const fs_builder &bld, elk_fs_inst *inst)
|
||||
{
|
||||
|
|
@ -1965,7 +1789,7 @@ lower_get_buffer_size(const fs_builder &bld, elk_fs_inst *inst)
|
|||
/* Since we can only execute this instruction on uniform bti/surface
|
||||
* handles, elk_fs_nir.cpp should already have limited this to SIMD8.
|
||||
*/
|
||||
assert(inst->exec_size == (devinfo->ver < 20 ? 8 : 16));
|
||||
assert(inst->exec_size == 8);
|
||||
|
||||
elk_fs_reg surface = inst->src[GET_BUFFER_SIZE_SRC_SURFACE];
|
||||
elk_fs_reg surface_handle = inst->src[GET_BUFFER_SIZE_SRC_SURFACE_HANDLE];
|
||||
|
|
@ -2087,11 +1911,6 @@ elk_fs_visitor::lower_logical_sends()
|
|||
lower_surface_logical_send(ibld, inst);
|
||||
break;
|
||||
|
||||
case ELK_SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
|
||||
case ELK_SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL:
|
||||
lower_surface_block_logical_send(ibld, inst);
|
||||
break;
|
||||
|
||||
case ELK_SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
|
||||
case ELK_SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
|
||||
case ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
|
||||
|
|
|
|||
|
|
@ -321,38 +321,6 @@ elk_nir_lower_cs_intrinsics(nir_shader *nir,
|
|||
}
|
||||
}
|
||||
|
||||
if (devinfo->verx10 >= 125 && prog_data &&
|
||||
nir->info.stage == MESA_SHADER_COMPUTE &&
|
||||
nir->info.cs.derivative_group != DERIVATIVE_GROUP_QUADS &&
|
||||
!nir->info.workgroup_size_variable &&
|
||||
util_is_power_of_two_nonzero(nir->info.workgroup_size[0]) &&
|
||||
util_is_power_of_two_nonzero(nir->info.workgroup_size[1])) {
|
||||
|
||||
state.hw_generated_local_id = true;
|
||||
|
||||
/* TODO: more heuristics about 1D/SLM access vs. 2D access */
|
||||
bool linear =
|
||||
BITSET_TEST(nir->info.system_values_read,
|
||||
SYSTEM_VALUE_LOCAL_INVOCATION_INDEX) ||
|
||||
(nir->info.workgroup_size[1] == 1 &&
|
||||
nir->info.workgroup_size[2] == 1) ||
|
||||
(nir->info.num_images == 0 && nir->info.num_textures == 0);
|
||||
|
||||
prog_data->walk_order =
|
||||
linear ? INTEL_WALK_ORDER_XYZ : INTEL_WALK_ORDER_YXZ;
|
||||
|
||||
/* nir_lower_compute_system_values will replace any references to
|
||||
* SYSTEM_VALUE_LOCAL_INVOCATION_ID vector components with zero for
|
||||
* any dimension where the workgroup size is 1, so we can skip
|
||||
* generating those. However, the hardware can only generate
|
||||
* X, XY, or XYZ - it can't skip earlier components.
|
||||
*/
|
||||
prog_data->generate_local_id =
|
||||
(nir->info.workgroup_size[0] > 1 ? WRITEMASK_X : 0) |
|
||||
(nir->info.workgroup_size[1] > 1 ? WRITEMASK_XY : 0) |
|
||||
(nir->info.workgroup_size[2] > 1 ? WRITEMASK_XYZ : 0);
|
||||
}
|
||||
|
||||
nir_foreach_function_impl(impl, nir) {
|
||||
state.impl = impl;
|
||||
lower_cs_intrinsics_convert_impl(&state);
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue