From e3f502e0074cc0b9d5a6807fa900b240cf7e0fc6 Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Tue, 13 Apr 2021 14:07:19 -0700 Subject: [PATCH] intel/fs: Allow copy propagation between MOVs of mixed sizes This eliminates some spurious, size-converting moves. For example, on Ice Lake this helps dEQP-VK.spirv_assembly.type.vec3.i8.bitwise_xor_frag: SIMD8 shader: 52 instructions. 1 loops. 4164 cycles. 0:0 spills:fills, 5 sends SIMD8 shader: 49 instructions. 1 loops. 4044 cycles. 0:0 spills:fills, 5 sends Unfortunately, this doesn't clean everything up. Here's a subset of the "before" assembly: send(8) g11<1>UW g2<0,1,0>UD 0x02106e02 dp data 1 MsgDesc: ( untyped surface read, Surface = 2, SIMD8, Mask = 0xe) mlen 1 rlen 1 { align1 1Q }; mov(8) g7<4>UB g11<8,8,1>UD { align1 1Q }; mov(8) g12<1>UB g7<32,8,4>UB { align1 1Q }; send(8) g13<1>UW g2<0,1,0>UD 0x02106e03 dp data 1 MsgDesc: ( untyped surface read, Surface = 3, SIMD8, Mask = 0xe) mlen 1 rlen 1 { align1 1Q }; mov(8) g15<1>UW g12<8,8,1>UB { align1 1Q }; mov(8) g8<4>UB g13<8,8,1>UD { align1 1Q }; mov(8) g14<1>UB g8<32,8,4>UB { align1 1Q }; mov(8) g16<1>UW g14<8,8,1>UB { align1 1Q }; xor(8) g17<1>UW g15<8,8,1>UW g16<8,8,1>UW { align1 1Q }; And here's the same subset of the "after" assembly: send(8) g11<1>UW g2<0,1,0>UD 0x02106e02 dp data 1 MsgDesc: ( untyped surface read, Surface = 2, SIMD8, Mask = 0xe) mlen 1 rlen 1 { align1 1Q }; mov(8) g7<4>UB g11<8,8,1>UD { align1 1Q }; send(8) g13<1>UW g2<0,1,0>UD 0x02106e03 dp data 1 MsgDesc: ( untyped surface read, Surface = 3, SIMD8, Mask = 0xe) mlen 1 rlen 1 { align1 1Q }; mov(8) g15<1>UW g7<32,8,4>UB { align1 1Q }; mov(8) g8<4>UB g13<8,8,1>UD { align1 1Q }; mov(8) g16<1>UW g8<32,8,4>UB { align1 1Q }; xor(8) g17<1>UW g15<8,8,1>UW g16<8,8,1>UW { align1 1Q }; There are a lot of regioning and type restrictions in fs_visitor::try_copy_propagate, and I'm a little nervious about messing with them too much. Reviewed-by: Lionel Landwerlin Suggested-by: Francisco Jerez Part-of: --- src/intel/compiler/brw_fs_copy_propagation.cpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp b/src/intel/compiler/brw_fs_copy_propagation.cpp index fecbcf75a8b..167b7e6a69f 100644 --- a/src/intel/compiler/brw_fs_copy_propagation.cpp +++ b/src/intel/compiler/brw_fs_copy_propagation.cpp @@ -52,6 +52,7 @@ struct acp_entry : public exec_node { unsigned size_read; enum opcode opcode; bool saturate; + bool is_partial_write; }; struct block_data { @@ -586,8 +587,11 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry) * destination of the copy, and simply replacing the sources would give a * program with different semantics. */ - if (type_sz(entry->dst.type) < type_sz(inst->src[arg].type)) + if ((type_sz(entry->dst.type) < type_sz(inst->src[arg].type) || + entry->is_partial_write) && + inst->opcode != BRW_OPCODE_MOV) { return false; + } /* Bail if the result of composing both strides cannot be expressed * as another stride. This avoids, for example, trying to transform @@ -680,7 +684,8 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry) /* Compute the first component of the copy that the instruction is * reading, and the base byte offset within that component. */ - assert(entry->dst.offset % REG_SIZE == 0 && entry->dst.stride == 1); + assert((entry->dst.offset % REG_SIZE == 0 || inst->opcode == BRW_OPCODE_MOV) && + entry->dst.stride == 1); const unsigned component = rel_offset / type_sz(entry->dst.type); const unsigned suboffset = rel_offset % type_sz(entry->dst.type); @@ -950,7 +955,9 @@ can_propagate_from(fs_inst *inst) (inst->src[0].file == FIXED_GRF && inst->src[0].is_contiguous())) && inst->src[0].type == inst->dst.type && - !inst->is_partial_write()) || + /* Subset of !is_partial_write() conditions. */ + !((inst->predicate && inst->opcode != BRW_OPCODE_SEL) || + !inst->dst.is_contiguous())) || is_identity_payload(FIXED_GRF, inst); } @@ -1012,6 +1019,7 @@ fs_visitor::opt_copy_propagation_local(void *copy_prop_ctx, bblock_t *block, entry->size_read += inst->size_read(i); entry->opcode = inst->opcode; entry->saturate = inst->saturate; + entry->is_partial_write = inst->is_partial_write(); acp[entry->dst.nr % ACP_HASH_SIZE].push_tail(entry); } else if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD && inst->dst.file == VGRF) {