intel/brw: Add SHADER_OPCODE_SEND_GATHER

Starting in Xe3, there's a variant of SEND that takes the
register numbers from the ARF scalar register, and doesn't
require them to be contiguous.  The new opcode added here
represents that kind of SEND.

To keep the original sources reachable, we keep them around
in the IR and simply ignore them at generator time.  This
allows the software scoreboard to properly reason about the
dependencies without trying to decode the contents of the
ARF scalar register being used.

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Lionel Landwerlin <None>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32410>
This commit is contained in:
Caio Oliveira 2024-11-20 08:12:52 -08:00 committed by Marge Bot
parent 2fca22347c
commit 650ec7169d
8 changed files with 69 additions and 5 deletions

View file

@ -264,6 +264,19 @@ enum opcode {
*/
SHADER_OPCODE_SEND,
/**
* A variant of SEND that collects its sources to form an input.
*
* Source 0: Message descriptor ("desc").
* Source 1: Message extended descriptor ("ex_desc").
* Source 2: Before register allocation must be BAD_FILE,
* after that, the ARF scalar register containing
* the (physical) numbers of the payload sources.
* Source 3..n: Payload sources. For this opcode, they must each
* have the size of a physical GRF.
*/
SHADER_OPCODE_SEND_GATHER,
/**
* An "undefined" write which does nothing but indicates to liveness that
* we don't care about any values in the register which predate this

View file

@ -206,6 +206,7 @@ fs_inst::is_send_from_grf() const
{
switch (opcode) {
case SHADER_OPCODE_SEND:
case SHADER_OPCODE_SEND_GATHER:
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
@ -240,6 +241,7 @@ fs_inst::is_control_source(unsigned arg) const
return arg == 1 || arg == 2;
case SHADER_OPCODE_SEND:
case SHADER_OPCODE_SEND_GATHER:
return arg == 0 || arg == 1;
case SHADER_OPCODE_MEMORY_LOAD_LOGICAL:
@ -278,6 +280,9 @@ fs_inst::is_payload(unsigned arg) const
case SHADER_OPCODE_SEND:
return arg == 2 || arg == 3;
case SHADER_OPCODE_SEND_GATHER:
return arg >= 2;
default:
return false;
}
@ -609,6 +614,14 @@ fs_inst::size_read(const struct intel_device_info *devinfo, int arg) const
}
break;
case SHADER_OPCODE_SEND_GATHER:
if (arg >= 3) {
/* SEND_GATHER is Xe3+, so no need to pass devinfo around. */
const unsigned reg_unit = 2;
return REG_SIZE * reg_unit;
}
break;
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
if (arg == 0)

View file

@ -168,7 +168,13 @@ brw_generator::generate_send(fs_inst *inst,
struct brw_reg payload,
struct brw_reg payload2)
{
const bool gather = false;
const bool gather = inst->opcode == SHADER_OPCODE_SEND_GATHER;
if (gather) {
assert(payload.file == ARF);
assert(payload.nr == BRW_ARF_SCALAR);
assert(payload2.file == ARF);
assert(payload2.nr == BRW_ARF_NULL);
}
if (ex_desc.file == IMM && ex_desc.ud == 0) {
brw_send_indirect_message(p, inst->sfid, dst, payload, desc, inst->eot, gather);
@ -854,7 +860,14 @@ brw_generator::generate_code(const cfg_t *cfg, int dispatch_width,
brw_set_default_group(p, inst->group);
}
for (unsigned int i = 0; i < inst->sources; i++) {
/* For SEND_GATHER, the payload sources are represented inside the
* scalar register in src[2], so we can skip them.
*/
const unsigned num_sources =
inst->opcode == SHADER_OPCODE_SEND_GATHER ? 3 : inst->sources;
assert(num_sources <= ARRAY_SIZE(src));
for (unsigned int i = 0; i < num_sources; i++) {
src[i] = normalize_brw_reg_for_encoding(&inst->src[i]);
/* The accumulator result appears to get used for the
* conditional modifier generation. When negating a UD
@ -1147,6 +1160,7 @@ brw_generator::generate_code(const cfg_t *cfg, int dispatch_width,
break;
case SHADER_OPCODE_SEND:
case SHADER_OPCODE_SEND_GATHER:
generate_send(inst, dst, src[0], src[1], src[2],
inst->ex_mlen > 0 ? src[3] : brw_null_reg());
send_count++;

View file

@ -130,6 +130,13 @@ namespace {
if (inst->opcode == SHADER_OPCODE_SEND) {
ss = DIV_ROUND_UP(inst->size_read(devinfo, 2), REG_SIZE) +
DIV_ROUND_UP(inst->size_read(devinfo, 3), REG_SIZE);
} else if (inst->opcode == SHADER_OPCODE_SEND_GATHER) {
ss = inst->mlen;
/* If haven't lowered yet, count the sources. */
if (!ss) {
for (int i = 3; i < inst->sources; i++)
ss += DIV_ROUND_UP(inst->size_read(devinfo, i), REG_SIZE);
}
} else {
for (unsigned i = 0; i < inst->sources; i++)
ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(devinfo, i), REG_SIZE));
@ -597,6 +604,7 @@ namespace {
0, 0, 0, 0, 0, 0);
case SHADER_OPCODE_SEND:
case SHADER_OPCODE_SEND_GATHER:
switch (info.sfid) {
case GFX6_SFID_DATAPORT_CONSTANT_CACHE:
/* See FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD */

View file

@ -2629,15 +2629,22 @@ brw_lower_send_descriptors(fs_visitor &s)
bool progress = false;
foreach_block_and_inst (block, fs_inst, inst, s.cfg) {
if (inst->opcode != SHADER_OPCODE_SEND)
if (inst->opcode != SHADER_OPCODE_SEND &&
inst->opcode != SHADER_OPCODE_SEND_GATHER)
continue;
const brw_builder ubld = brw_builder(&s, block, inst).exec_all().group(1, 0);
/* Descriptor */
const unsigned rlen = inst->dst.is_null() ? 0 : inst->size_written / REG_SIZE;
unsigned mlen = inst->mlen;
if (inst->opcode == SHADER_OPCODE_SEND_GATHER) {
assert(inst->sources >= 3);
mlen = (inst->sources - 3) * reg_unit(devinfo);
}
uint32_t desc_imm = inst->desc |
brw_message_desc(devinfo, inst->mlen, rlen, inst->header_size);
brw_message_desc(devinfo, mlen, rlen, inst->header_size);
assert(inst->src[0].file != BAD_FILE);
assert(inst->src[1].file != BAD_FILE);

View file

@ -120,6 +120,8 @@ brw_instruction_name(const struct brw_isa_info *isa, enum opcode op)
case SHADER_OPCODE_SEND:
return "send";
case SHADER_OPCODE_SEND_GATHER:
return "send_gather";
case SHADER_OPCODE_UNDEF:
return "undef";

View file

@ -201,6 +201,7 @@ fs_inst::has_side_effects() const
{
switch (opcode) {
case SHADER_OPCODE_SEND:
case SHADER_OPCODE_SEND_GATHER:
return send_has_side_effects;
case BRW_OPCODE_SYNC:
@ -227,7 +228,8 @@ bool
fs_inst::is_volatile() const
{
return opcode == SHADER_OPCODE_MEMORY_LOAD_LOGICAL ||
(opcode == SHADER_OPCODE_SEND && send_is_volatile);
((opcode == SHADER_OPCODE_SEND ||
opcode == SHADER_OPCODE_SEND_GATHER) && send_is_volatile);
}
#ifndef NDEBUG

View file

@ -299,6 +299,11 @@ brw_validate(const fs_visitor &s)
fsv_assert(is_uniform(inst->src[0]) && is_uniform(inst->src[1]));
break;
case SHADER_OPCODE_SEND_GATHER:
fsv_assert(is_uniform(inst->src[0]) && is_uniform(inst->src[1]));
fsv_assert(devinfo->ver >= 30);
break;
case BRW_OPCODE_MOV:
fsv_assert(inst->sources == 1);
break;