From aa53665fda63484495d736ddd1d4542c66814e61 Mon Sep 17 00:00:00 2001 From: Lionel Landwerlin Date: Wed, 24 Mar 2021 09:56:42 +0200 Subject: [PATCH] intel/fs/copy_prop: check stride constraints with actual final type In some cases we will change the type of the destination register of an instruction. This is the type we should use to verify that we're allow to do the replacement. Otherwise we can hit restrictions on CHV and upcoming Xe-Hp for instance where the copy propagation transforms this : send(16) (mlen: 2) vgrf10:UD, 0u, 0u, vgrf35:D, null:UD mov(16) vgrf11:UW, vgrf10<2>:UW mov(16) vgrf12:UW, vgrf10+0.2<2>:UW mov(16) vgrf15:HF, |vgrf11|:HF mov(16) vgrf16:HF, |vgrf12|:HF mov(8) vgrf41<2>:UW, vgrf15+0.0:UW group0 mov(8) vgrf42<2>:UW, vgrf15+0.16:UW group8 mov(8) vgrf45<2>:UW, vgrf16+0.0:UW group0 mov(8) vgrf46<2>:UW, vgrf16+0.16:UW group8 into this : send(16) (mlen: 2) vgrf10:UD, 0u, 0u, vgrf35:D, null:UD mov(8) vgrf41<2>:HF, |vgrf10+0.0|<2>:HF group0 mov(8) vgrf42<2>:HF, |vgrf10+1.0|<2>:HF group8 mov(8) vgrf45<2>:HF, |vgrf10+0.2|<2>:HF group0 mov(8) vgrf46<2>:HF, |vgrf10+1.2|<2>:HF group8 Because of the floating point use, stride and offets should be the same. v2: Fix final destination type selection (Curro) v3: constify (Curro) Signed-off-by: Lionel Landwerlin Cc: Reviewed-by: Francisco Jerez Part-of: --- src/intel/compiler/brw_fs_copy_propagation.cpp | 14 ++++++++++---- src/intel/compiler/brw_ir_fs.h | 12 ++++++++++-- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp b/src/intel/compiler/brw_fs_copy_propagation.cpp index 343bac23889..6be8d8d8693 100644 --- a/src/intel/compiler/brw_fs_copy_propagation.cpp +++ b/src/intel/compiler/brw_fs_copy_propagation.cpp @@ -367,7 +367,8 @@ is_logic_op(enum opcode opcode) } static bool -can_take_stride(fs_inst *inst, unsigned arg, unsigned stride, +can_take_stride(fs_inst *inst, brw_reg_type dst_type, + unsigned arg, unsigned stride, const gen_device_info *devinfo) { if (stride > 4) @@ -377,9 +378,9 @@ can_take_stride(fs_inst *inst, unsigned arg, unsigned stride, * of the corresponding channel of the destination, and the provided stride * would break this restriction. */ - if (has_dst_aligned_region_restriction(devinfo, inst) && + if (has_dst_aligned_region_restriction(devinfo, inst, dst_type) && !(type_sz(inst->src[arg].type) * stride == - type_sz(inst->dst.type) * inst->dst.stride || + type_sz(dst_type) * inst->dst.stride || stride == 0)) return false; @@ -534,10 +535,15 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry) if (instruction_requires_packed_data(inst) && entry_stride != 1) return false; + const brw_reg_type dst_type = (has_source_modifiers && + entry->dst.type != inst->src[arg].type) ? + entry->dst.type : inst->dst.type; + /* Bail if the result of composing both strides would exceed the * hardware limit. */ - if (!can_take_stride(inst, arg, entry_stride * inst->src[arg].stride, + if (!can_take_stride(inst, dst_type, arg, + entry_stride * inst->src[arg].stride, devinfo)) return false; diff --git a/src/intel/compiler/brw_ir_fs.h b/src/intel/compiler/brw_ir_fs.h index 60f0c8bfa77..6becef748ce 100644 --- a/src/intel/compiler/brw_ir_fs.h +++ b/src/intel/compiler/brw_ir_fs.h @@ -553,7 +553,8 @@ is_unordered(const fs_inst *inst) */ static inline bool has_dst_aligned_region_restriction(const gen_device_info *devinfo, - const fs_inst *inst) + const fs_inst *inst, + brw_reg_type dst_type) { const brw_reg_type exec_type = get_exec_type(inst); /* Even though the hardware spec claims that "integer DWord multiply" @@ -567,13 +568,20 @@ has_dst_aligned_region_restriction(const gen_device_info *devinfo, (inst->opcode == BRW_OPCODE_MAD && MIN2(type_sz(inst->src[1].type), type_sz(inst->src[2].type)) >= 4)); - if (type_sz(inst->dst.type) > 4 || type_sz(exec_type) > 4 || + if (type_sz(dst_type) > 4 || type_sz(exec_type) > 4 || (type_sz(exec_type) == 4 && is_dword_multiply)) return devinfo->is_cherryview || gen_device_info_is_9lp(devinfo); else return false; } +static inline bool +has_dst_aligned_region_restriction(const gen_device_info *devinfo, + const fs_inst *inst) +{ + return has_dst_aligned_region_restriction(devinfo, inst, inst->dst.type); +} + /** * Return whether the LOAD_PAYLOAD instruction is a plain copy of bits from * the specified register file into a VGRF.