brw/lower: Adjust source stride on DF is_scalar sources to MAD on Gfx9

This commit used to be "brw/emit: Allow scalar sources to 64-bit
3-source instructions". These instructions were fixed up in
brw_eu_emit. There seems to be some conflict with the <0,1,0> stride and
post-RA scheduling. The only difference between the passing code
generated by this commit and the failing code generated by the older
commit is some post-RA scheduling.

v2: Change the stride of a MAD even if the instruction isn't
lowered. MAD instructions that are already SIMD8 have to follow the same
rules. 🤦

v3: Pull the lowering out to its own pass. Update the comment in
brw_fs_validate. Suggested by Ken.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
This commit is contained in:
Ian Romanick 2024-02-20 22:23:07 -08:00
parent d5d7ae22ae
commit 007c92b2ac
4 changed files with 49 additions and 1 deletions

View file

@ -641,6 +641,7 @@ bool brw_fs_lower_logical_sends(fs_visitor &s);
bool brw_fs_lower_pack(fs_visitor &s);
bool brw_fs_lower_load_payload(fs_visitor &s);
bool brw_fs_lower_regioning(fs_visitor &s);
/* On Gfx9 only: for MAD instructions with a DF destination, clears is_scalar
 * on each scalar source and sets its stride to 1.  Returns true if any
 * source was changed.
 */
bool brw_lower_scalar_fp64_MAD(fs_visitor &s);
bool brw_fs_lower_scoreboard(fs_visitor &s);
bool brw_fs_lower_sends_overlapping_payload(fs_visitor &s);
bool brw_fs_lower_simd_width(fs_visitor &s);

View file

@ -8,6 +8,47 @@
using namespace brw;
/**
* Align16 3-source instructions cannot have scalar stride w/64-bit types.
*
* The Bspec says:
*
* Replicate Control. This field is only present in three-source
* instructions, for each of the three source operands. It controls
* replication of the starting channel to all channels in the execution
* size. ChanSel does not apply when Replicate Control is set. This is
* applicable to 32b datatypes and 16b datatype. 64b datatypes cannot use
* the replicate control.
*
* In practice, this can only happen on Gfx9 with DF sources to MAD. Since
* the source is_scalar, this can be fixed by just making the stride=1. Also
* clear is_scalar "just in case."
*/
bool
brw_lower_scalar_fp64_MAD(fs_visitor &s)
{
   /* This pass only acts on Gfx9; every other generation is left alone. */
   if (s.devinfo->ver != 9)
      return false;

   bool changed = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      const bool is_df_mad = inst->opcode == BRW_OPCODE_MAD &&
                             inst->dst.type == BRW_TYPE_DF;

      if (is_df_mad) {
         /* MAD has exactly three sources; visit each of them. */
         for (unsigned src_idx = 0; src_idx < 3; src_idx++) {
            auto &src = inst->src[src_idx];

            if (!src.is_scalar)
               continue;

            /* Give the source a stride of 1 and drop the scalar marking
             * so nothing later treats it as a scalar source again.
             */
            src.stride = 1;
            src.is_scalar = false;
            changed = true;
         }
      }
   }

   return changed;
}
/**
* Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
* or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.

View file

@ -96,6 +96,7 @@ brw_fs_optimize(fs_visitor &s)
OPT(brw_fs_lower_subgroup_ops);
OPT(brw_fs_lower_csel);
OPT(brw_fs_lower_simd_width);
OPT(brw_lower_scalar_fp64_MAD);
OPT(brw_fs_lower_barycentrics);
OPT(brw_fs_lower_logical_sends);

View file

@ -366,7 +366,12 @@ brw_fs_validate(const fs_visitor &s)
const unsigned stride_in_bytes = byte_stride(inst->src[i]);
const unsigned size_in_bytes = brw_type_size_bytes(inst->src[i].type);
if (stride_in_bytes == 0) {
fsv_assert_lte(size_in_bytes, 4);
/* If the source is_scalar, then the stride will be
* converted to <4;4,1> in brw_lower_scalar_fp64_MAD after
* SIMD splitting.
*/
if (!inst->src[i].is_scalar)
fsv_assert_lte(size_in_bytes, 4);
} else {
fsv_assert_eq(stride_in_bytes, size_in_bytes);
}