From e3f502e0074cc0b9d5a6807fa900b240cf7e0fc6 Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Tue, 13 Apr 2021 14:07:19 -0700
Subject: [PATCH] intel/fs: Allow copy propagation between MOVs of mixed sizes

This eliminates some spurious, size-converting moves.  For example, on
Ice Lake this helps dEQP-VK.spirv_assembly.type.vec3.i8.bitwise_xor_frag:

SIMD8 shader: 52 instructions. 1 loops. 4164 cycles. 0:0 spills:fills, 5 sends
SIMD8 shader: 49 instructions. 1 loops. 4044 cycles. 0:0 spills:fills, 5 sends

Unfortunately, this doesn't clean everything up.  Here's a subset of the
"before" assembly:

send(8)         g11<1>UW        g2<0,1,0>UD     0x02106e02
                            dp data 1 MsgDesc: ( untyped surface read, Surface = 2, SIMD8, Mask = 0xe) mlen 1 rlen 1 { align1 1Q };
mov(8)          g7<4>UB         g11<8,8,1>UD                    { align1 1Q };
mov(8)          g12<1>UB        g7<32,8,4>UB                    { align1 1Q };
send(8)         g13<1>UW        g2<0,1,0>UD     0x02106e03
                            dp data 1 MsgDesc: ( untyped surface read, Surface = 3, SIMD8, Mask = 0xe) mlen 1 rlen 1 { align1 1Q };
mov(8)          g15<1>UW        g12<8,8,1>UB                    { align1 1Q };
mov(8)          g8<4>UB         g13<8,8,1>UD                    { align1 1Q };
mov(8)          g14<1>UB        g8<32,8,4>UB                    { align1 1Q };
mov(8)          g16<1>UW        g14<8,8,1>UB                    { align1 1Q };
xor(8)          g17<1>UW        g15<8,8,1>UW    g16<8,8,1>UW    { align1 1Q };

And here's the same subset of the "after" assembly:

send(8)         g11<1>UW        g2<0,1,0>UD     0x02106e02
                            dp data 1 MsgDesc: ( untyped surface read, Surface = 2, SIMD8, Mask = 0xe) mlen 1 rlen 1 { align1 1Q };
mov(8)          g7<4>UB         g11<8,8,1>UD                    { align1 1Q };
send(8)         g13<1>UW        g2<0,1,0>UD     0x02106e03
                            dp data 1 MsgDesc: ( untyped surface read, Surface = 3, SIMD8, Mask = 0xe) mlen 1 rlen 1 { align1 1Q };
mov(8)          g15<1>UW        g7<32,8,4>UB                    { align1 1Q };
mov(8)          g8<4>UB         g13<8,8,1>UD                    { align1 1Q };
mov(8)          g16<1>UW        g8<32,8,4>UB                    { align1 1Q };
xor(8)          g17<1>UW        g15<8,8,1>UW    g16<8,8,1>UW    { align1 1Q };

There are a lot of regioning and type restrictions in
fs_visitor::try_copy_propagate, and I'm a little nervious about messing
with them too much.

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Suggested-by: Francisco Jerez <currojerez@riseup.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9025>
---
 src/intel/compiler/brw_fs_copy_propagation.cpp | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp b/src/intel/compiler/brw_fs_copy_propagation.cpp
index fecbcf75a8b..167b7e6a69f 100644
--- a/src/intel/compiler/brw_fs_copy_propagation.cpp
+++ b/src/intel/compiler/brw_fs_copy_propagation.cpp
@@ -52,6 +52,7 @@ struct acp_entry : public exec_node {
    unsigned size_read;
    enum opcode opcode;
    bool saturate;
+   bool is_partial_write;
 };
 
 struct block_data {
@@ -586,8 +587,11 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
     * destination of the copy, and simply replacing the sources would give a
     * program with different semantics.
     */
-   if (type_sz(entry->dst.type) < type_sz(inst->src[arg].type))
+   if ((type_sz(entry->dst.type) < type_sz(inst->src[arg].type) ||
+        entry->is_partial_write) &&
+       inst->opcode != BRW_OPCODE_MOV) {
       return false;
+   }
 
    /* Bail if the result of composing both strides cannot be expressed
     * as another stride. This avoids, for example, trying to transform
@@ -680,7 +684,8 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
    /* Compute the first component of the copy that the instruction is
     * reading, and the base byte offset within that component.
     */
-   assert(entry->dst.offset % REG_SIZE == 0 && entry->dst.stride == 1);
+   assert((entry->dst.offset % REG_SIZE == 0 || inst->opcode == BRW_OPCODE_MOV) &&
+           entry->dst.stride == 1);
    const unsigned component = rel_offset / type_sz(entry->dst.type);
    const unsigned suboffset = rel_offset % type_sz(entry->dst.type);
 
@@ -950,7 +955,9 @@ can_propagate_from(fs_inst *inst)
             (inst->src[0].file == FIXED_GRF &&
              inst->src[0].is_contiguous())) &&
            inst->src[0].type == inst->dst.type &&
-           !inst->is_partial_write()) ||
+           /* Subset of !is_partial_write() conditions. */
+           !((inst->predicate && inst->opcode != BRW_OPCODE_SEL) ||
+             !inst->dst.is_contiguous())) ||
           is_identity_payload(FIXED_GRF, inst);
 }
 
@@ -1012,6 +1019,7 @@ fs_visitor::opt_copy_propagation_local(void *copy_prop_ctx, bblock_t *block,
             entry->size_read += inst->size_read(i);
          entry->opcode = inst->opcode;
          entry->saturate = inst->saturate;
+         entry->is_partial_write = inst->is_partial_write();
          acp[entry->dst.nr % ACP_HASH_SIZE].push_tail(entry);
       } else if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD &&
                  inst->dst.file == VGRF) {