From 007c92b2acaf21ac95eb3aad6538031371df97bc Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Tue, 20 Feb 2024 22:23:07 -0800 Subject: [PATCH] brw/lower: Adjust source stride on DF is_scalar sources to MAD on Gfx9 This commit used to be "brw/emit: Allow scalar sources to 64-bit 3-source instructions". These instructions were fixed up in brw_eu_emit. There seems to be some conflict with the <0,1,0> stride and post-RA scheduling. The only difference between the passing code generated by this commit and the failing code generated by the older commit is some post-RA scheduling. v2: Change the stride of a MAD even if the instruction isn't lowered. MAD instructions that are already SIMD8 have to follow the same rules. :facepalm: v3: Pull the lowering out to its own pass. Update the comment in brw_fs_validate. Suggested by Ken. Reviewed-by: Kenneth Graunke Part-of: --- src/intel/compiler/brw_fs.h | 1 + src/intel/compiler/brw_fs_lower.cpp | 41 ++++++++++++++++++++++++++ src/intel/compiler/brw_fs_opt.cpp | 1 + src/intel/compiler/brw_fs_validate.cpp | 7 ++++- 4 files changed, 49 insertions(+), 1 deletion(-) diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index 6b41ba31233..2f4a9b22427 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -641,6 +641,7 @@ bool brw_fs_lower_logical_sends(fs_visitor &s); bool brw_fs_lower_pack(fs_visitor &s); bool brw_fs_lower_load_payload(fs_visitor &s); bool brw_fs_lower_regioning(fs_visitor &s); +bool brw_lower_scalar_fp64_MAD(fs_visitor &s); bool brw_fs_lower_scoreboard(fs_visitor &s); bool brw_fs_lower_sends_overlapping_payload(fs_visitor &s); bool brw_fs_lower_simd_width(fs_visitor &s); diff --git a/src/intel/compiler/brw_fs_lower.cpp b/src/intel/compiler/brw_fs_lower.cpp index 11648ce1a63..bc5a2d81140 100644 --- a/src/intel/compiler/brw_fs_lower.cpp +++ b/src/intel/compiler/brw_fs_lower.cpp @@ -8,6 +8,47 @@ using namespace brw; +/** + * Align16 3-source instructions cannot have scalar 
stride w/64-bit types. + * + * The Bspec says: + * + * Replicate Control. This field is only present in three-source + * instructions, for each of the three source operands. It controls + * replication of the starting channel to all channels in the execution + * size. ChanSel does not apply when Replicate Control is set. This is + * applicable to 32b datatypes and 16b datatype. 64b datatypes cannot use + * the replicate control. + * + * In practice, this can only happen on Gfx9 with DF sources to MAD, so other + * gens return false early. Since the source is_scalar, making the stride=1 + * fixes it; is_scalar is also cleared "just in case." Returns true on change. + */ +bool +brw_lower_scalar_fp64_MAD(fs_visitor &s) +{ + const intel_device_info *devinfo = s.devinfo; + bool progress = false; + + if (devinfo->ver != 9) + return false; + + foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) { + if (inst->opcode == BRW_OPCODE_MAD && + inst->dst.type == BRW_TYPE_DF) { + for (unsigned i = 0; i < 3; i++) { + if (inst->src[i].is_scalar) { + inst->src[i].is_scalar = false; + inst->src[i].stride = 1; + progress = true; + } + } + } + } + + return progress; +} + /** * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs. 
diff --git a/src/intel/compiler/brw_fs_opt.cpp b/src/intel/compiler/brw_fs_opt.cpp index 309eb4f99c7..4a72ed5476a 100644 --- a/src/intel/compiler/brw_fs_opt.cpp +++ b/src/intel/compiler/brw_fs_opt.cpp @@ -96,6 +96,7 @@ brw_fs_optimize(fs_visitor &s) OPT(brw_fs_lower_subgroup_ops); OPT(brw_fs_lower_csel); OPT(brw_fs_lower_simd_width); + OPT(brw_lower_scalar_fp64_MAD); OPT(brw_fs_lower_barycentrics); OPT(brw_fs_lower_logical_sends); diff --git a/src/intel/compiler/brw_fs_validate.cpp b/src/intel/compiler/brw_fs_validate.cpp index 698c9330c4f..7c579caba3e 100644 --- a/src/intel/compiler/brw_fs_validate.cpp +++ b/src/intel/compiler/brw_fs_validate.cpp @@ -366,7 +366,12 @@ brw_fs_validate(const fs_visitor &s) const unsigned stride_in_bytes = byte_stride(inst->src[i]); const unsigned size_in_bytes = brw_type_size_bytes(inst->src[i].type); if (stride_in_bytes == 0) { - fsv_assert_lte(size_in_bytes, 4); + /* If the source is_scalar, then the stride will be + * converted to <4;4,1> in brw_lower_scalar_fp64_MAD after + * SIMD splitting. + */ + if (!inst->src[i].is_scalar) + fsv_assert_lte(size_in_bytes, 4); } else { fsv_assert_eq(stride_in_bytes, size_in_bytes); }