2024-01-04 23:27:04 -08:00
|
|
|
/*
|
|
|
|
|
* Copyright © 2010 Intel Corporation
|
|
|
|
|
* SPDX-License-Identifier: MIT
|
|
|
|
|
*/
|
|
|
|
|
|
2025-02-05 14:25:15 -08:00
|
|
|
#include "brw_shader.h"
|
2025-01-15 08:20:46 -08:00
|
|
|
#include "brw_builder.h"
|
2024-01-04 23:27:04 -08:00
|
|
|
|
2024-02-20 22:23:07 -08:00
|
|
|
/**
|
|
|
|
|
* Align16 3-source instructions cannot have scalar stride w/64-bit types.
|
|
|
|
|
*
|
|
|
|
|
* The Bspec says:
|
|
|
|
|
*
|
|
|
|
|
* Replicate Control. This field is only present in three-source
|
|
|
|
|
* instructions, for each of the three source operands. It controls
|
|
|
|
|
* replication of the starting channel to all channels in the execution
|
|
|
|
|
* size. ChanSel does not apply when Replicate Control is set. This is
|
|
|
|
|
* applicable to 32b datatypes and 16b datatype. 64b datatypes cannot use
|
|
|
|
|
* the replicate control.
|
|
|
|
|
*
|
|
|
|
|
* In practice, this can only happen on Gfx9 with DF sources to MAD. Since
|
|
|
|
|
* the source is_scalar, this can be fixed by just making the stride=1. Also
|
|
|
|
|
* clear is_scalar "just in case."
|
|
|
|
|
*/
|
|
|
|
|
bool
|
2024-12-07 10:25:45 -08:00
|
|
|
brw_lower_scalar_fp64_MAD(brw_shader &s)
|
2024-02-20 22:23:07 -08:00
|
|
|
{
|
|
|
|
|
const intel_device_info *devinfo = s.devinfo;
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
|
|
|
|
if (devinfo->ver != 9)
|
|
|
|
|
return false;
|
|
|
|
|
|
2024-12-07 00:23:07 -08:00
|
|
|
foreach_block_and_inst_safe(block, brw_inst, inst, s.cfg) {
|
2024-02-20 22:23:07 -08:00
|
|
|
if (inst->opcode == BRW_OPCODE_MAD &&
|
|
|
|
|
inst->dst.type == BRW_TYPE_DF) {
|
|
|
|
|
for (unsigned i = 0; i < 3; i++) {
|
|
|
|
|
if (inst->src[i].is_scalar) {
|
|
|
|
|
inst->src[i].is_scalar = false;
|
|
|
|
|
inst->src[i].stride = 1;
|
|
|
|
|
progress = true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
}
|
|
|
|
|
|
2024-01-04 23:27:04 -08:00
|
|
|
/**
 * Expand SHADER_OPCODE_LOAD_PAYLOAD into the sequence of MOVs that actually
 * assemble the payload into the destination VGRF, then delete the
 * LOAD_PAYLOAD instruction itself.
 *
 * The first inst->header_size sources are copied as whole GRFs with
 * force_writemask_all (ubld), retyped to UD; adjacent header sources that
 * are immediates or contiguous back-to-back registers are coalesced into a
 * single double-width MOV.  The remaining sources are copied one logical
 * component at a time at the instruction's own execution size.
 */
bool
brw_lower_load_payload(brw_shader &s)
{
   bool progress = false;

   foreach_block_and_inst_safe (block, brw_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      assert(inst->dst.file == VGRF);
      assert(inst->saturate == false);
      /* Walking copy of the destination; advanced as each source lands. */
      brw_reg dst = inst->dst;

      const brw_builder ibld(inst);
      /* Header moves must write all channels regardless of control flow. */
      const brw_builder ubld = ibld.exec_all();

      for (uint8_t i = 0; i < inst->header_size;) {
         /* Number of header GRFs to initialize at once with a single MOV
          * instruction.
          */
         const unsigned n =
            (i + 1 < inst->header_size &&
             (inst->src[i].file == IMM ||
              (inst->src[i].is_contiguous() &&
               inst->src[i + 1].equals(byte_offset(inst->src[i], REG_SIZE))))) ?
            2 : 1;

         /* BAD_FILE sources are "don't care" holes; leave them unwritten. */
         if (inst->src[i].file != BAD_FILE)
            ubld.group(8 * n, 0).MOV(retype(dst, BRW_TYPE_UD),
                                     retype(inst->src[i], BRW_TYPE_UD));

         dst = byte_offset(dst, n * REG_SIZE);
         i += n;
      }

      /* Non-header sources: one MOV per source at the original exec size,
       * preserving each source's own type.
       */
      for (uint8_t i = inst->header_size; i < inst->sources; i++) {
         dst.type = inst->src[i].type;
         if (inst->src[i].file != BAD_FILE) {
            ibld.MOV(dst, inst->src[i]);
         }
         dst = offset(dst, ibld, 1);
      }

      inst->remove();
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS);

   return progress;
}
|
|
|
|
|
|
2024-05-21 13:40:34 -07:00
|
|
|
/**
 * Lower CSEL with unsupported types to CMP+SEL.
 *
 * Or, for unsigned ==/!= comparisons, simply change the types.
 */
bool
brw_lower_csel(brw_shader &s)
{
   const intel_device_info *devinfo = s.devinfo;
   bool progress = false;

   foreach_block_and_inst_safe(block, brw_inst, inst, s.cfg) {
      if (inst->opcode != BRW_OPCODE_CSEL)
         continue;

      bool supported = false;
      /* src[2] is the condition operand; its type decides support. */
      enum brw_reg_type orig_type = inst->src[2].type;
      enum brw_reg_type new_type = orig_type;

      switch (orig_type) {
      case BRW_TYPE_F:
         /* Gfx9 CSEL can only do F */
         supported = true;
         break;
      case BRW_TYPE_HF:
      case BRW_TYPE_W:
      case BRW_TYPE_D:
         /* Gfx11+ CSEL can do HF, W, and D. Note that we can't simply
          * retype integer ==/!= comparisons as float on earlier hardware
          * because it breaks for 0x8000000 and 0 (-0.0 == 0.0).
          */
         supported = devinfo->ver >= 11;
         break;
      case BRW_TYPE_UW:
      case BRW_TYPE_UD:
         /* CSEL doesn't support UW/UD but we can simply retype to use the
          * signed types when comparing with == or !=.
          */
         supported = devinfo->ver >= 11 &&
                     (inst->conditional_mod == BRW_CONDITIONAL_EQ ||
                      inst->conditional_mod == BRW_CONDITIONAL_NEQ);

         /* Bspec 47408, Gfx125+ CSEL does support both signed and unsigned
          * integer types, so the retype is only needed before that.
          */
         if (devinfo->verx10 < 125) {
            new_type = inst->src[2].type == BRW_TYPE_UD ?
                       BRW_TYPE_D : BRW_TYPE_W;
         }
         break;
      default:
         break;
      }

      if (!supported) {
         const brw_builder ibld(inst);

         /* CSEL: dst = src2 <op> 0 ? src0 : src1 */
         brw_reg zero = brw_imm_reg(orig_type);
         ibld.CMP(retype(brw_null_reg(), orig_type),
                  inst->src[2], zero, inst->conditional_mod);

         /* Reuse the CSEL in place as a predicated 2-source SEL. */
         inst = brw_transform_inst(s, inst, BRW_OPCODE_SEL, 2);
         inst->predicate = BRW_PREDICATE_NORMAL;
         inst->conditional_mod = BRW_CONDITIONAL_NONE;
         progress = true;
      } else if (new_type != orig_type) {
         /* Supported, but only after retyping unsigned to signed. */
         inst->src[0].type = new_type;
         inst->src[1].type = new_type;
         inst->src[2].type = new_type;
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS);

   return progress;
}
|
|
|
|
|
|
2024-01-04 23:27:04 -08:00
|
|
|
/**
 * Lower SHADER_OPCODE_USUB_SAT / SHADER_OPCODE_ISUB_SAT pseudo-ops into real
 * instruction sequences, working around the hardware's source-width negation
 * behavior described below.  The pseudo-op is removed after lowering.
 */
bool
brw_lower_sub_sat(brw_shader &s)
{
   bool progress = false;

   foreach_block_and_inst_safe(block, brw_inst, inst, s.cfg) {
      const brw_builder ibld(inst);

      if (inst->opcode == SHADER_OPCODE_USUB_SAT ||
          inst->opcode == SHADER_OPCODE_ISUB_SAT) {
         /* The fundamental problem is the hardware performs source negation
          * at the bit width of the source. If the source is 0x80000000D, the
          * negation is 0x80000000D. As a result, subtractSaturate(0,
          * 0x80000000) will produce 0x80000000 instead of 0x7fffffff. There
          * are at least three ways to resolve this:
          *
          * 1. Use the accumulator for the negated source. The accumulator is
          *    33 bits, so our source 0x80000000 is sign-extended to
          *    0x1800000000. The negation of which is 0x080000000. This
          *    doesn't help for 64-bit integers (which are already bigger than
          *    33 bits). There are also only 8 accumulators, so SIMD16 or
          *    SIMD32 instructions would have to be split into multiple SIMD8
          *    instructions.
          *
          * 2. Use slightly different math. For any n-bit value x, we know (x
          *    >> 1) != -(x >> 1). We can use this fact to only do
          *    subtractions involving (x >> 1). subtractSaturate(a, b) ==
          *    subtractSaturate(subtractSaturate(a, (b >> 1)), b - (b >> 1)).
          *
          * 3. For unsigned sources, it is sufficient to replace the
          *    subtractSaturate with (a > b) ? a - b : 0.
          *
          * It may also be possible to use the SUBB instruction. This
          * implicitly writes the accumulator, so it could only be used in the
          * same situations as #1 above. It is further limited by only
          * allowing UD sources.
          */
         if (inst->exec_size == 8 && inst->src[0].type != BRW_TYPE_Q &&
             inst->src[0].type != BRW_TYPE_UQ) {
            /* Strategy #1: stage src1 in the 33-bit accumulator so its
             * negation doesn't wrap at the source width.
             */
            brw_reg acc = retype(brw_acc_reg(inst->exec_size),
                                 inst->src[1].type);

            ibld.MOV(acc, inst->src[1]);
            brw_inst *add = ibld.ADD(inst->dst, acc, inst->src[0]);
            add->saturate = true;
            add->src[0].negate = true;
         } else if (inst->opcode == SHADER_OPCODE_ISUB_SAT) {
            /* tmp = src1 >> 1;
             * dst = add.sat(add.sat(src0, -tmp), -(src1 - tmp));
             */
            brw_inst *add;

            brw_reg tmp = ibld.vgrf(inst->src[0].type);
            ibld.SHR(tmp, inst->src[1], brw_imm_d(1));

            brw_reg s1_sub_t = ibld.ADD(inst->src[1], negate(tmp));
            brw_reg sat_s0_sub_t = ibld.ADD(inst->src[0], negate(tmp), &add);
            add->saturate = true;

            add = ibld.ADD(inst->dst, sat_s0_sub_t, negate(s1_sub_t));
            add->saturate = true;
         } else {
            /* a > b ? a - b : 0 */
            ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
                     BRW_CONDITIONAL_G);

            brw_inst *add = ibld.ADD(inst->dst, inst->src[0], inst->src[1]);
            add->src[1].negate = !add->src[1].negate;

            /* Flag result of the CMP picks between the difference and 0. */
            ibld.SEL(inst->dst, inst->dst, brw_imm_ud(0))
               ->predicate = BRW_PREDICATE_NORMAL;
         }

         inst->remove();
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS |
                            BRW_DEPENDENCY_VARIABLES);

   return progress;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Transform barycentric vectors into the interleaved form expected by the PLN
 * instruction and returned by the Gfx7+ PI shared function.
 *
 * For channels 0-15 in SIMD16 mode they are expected to be laid out as
 * follows in the register file:
 *
 *    rN+0: X[0-7]
 *    rN+1: Y[0-7]
 *    rN+2: X[8-15]
 *    rN+3: Y[8-15]
 *
 * There is no need to handle SIMD32 here -- This is expected to be run after
 * SIMD lowering, since SIMD lowering relies on vectors having the standard
 * component layout.
 */
bool
brw_lower_barycentrics(brw_shader &s)
{
   const intel_device_info *devinfo = s.devinfo;

   /* Only fragment shaders use barycentrics, and Gfx20+ doesn't need the
    * interleaved layout.
    */
   if (s.stage != MESA_SHADER_FRAGMENT || devinfo->ver >= 20)
      return false;

   bool progress = false;

   foreach_block_and_inst_safe(block, brw_inst, inst, s.cfg) {
      if (inst->exec_size < 16)
         continue;

      const brw_builder ibld(inst);
      const brw_builder ubld = ibld.exec_all().group(8, 0);

      switch (inst->opcode) {
      case BRW_OPCODE_PLN: {
         /* Interleave the PLN barycentric source (src[1]) into a fresh
          * temporary laid out as X0-7 / Y0-7 / X8-15 / Y8-15.
          */
         assert(inst->exec_size == 16);
         const brw_reg tmp = ibld.vgrf(inst->src[1].type, 2);
         brw_reg srcs[4];

         /* i % 2 picks the X/Y component, i / 2 picks the channel half. */
         for (unsigned i = 0; i < ARRAY_SIZE(srcs); i++)
            srcs[i] = horiz_offset(offset(inst->src[1], ibld, i % 2),
                                   8 * (i / 2));

         ubld.LOAD_PAYLOAD(tmp, srcs, ARRAY_SIZE(srcs), ARRAY_SIZE(srcs));

         inst->src[1] = tmp;
         progress = true;
         break;
      }
      case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
      case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
      case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: {
         /* The PI shared function returns the interleaved layout; redirect
          * the result to a temporary and de-interleave it afterwards with
          * per-half MOVs that inherit the original predication.
          */
         assert(inst->exec_size == 16);
         const brw_reg tmp = ibld.vgrf(inst->dst.type, 2);

         for (unsigned i = 0; i < 2; i++) {
            for (unsigned g = 0; g < inst->exec_size / 8; g++) {
               brw_inst *mov = ibld.after(inst).group(8, g)
                  .MOV(horiz_offset(offset(inst->dst, ibld, i),
                                    8 * g),
                       offset(tmp, ubld, 2 * g + i));
               mov->predicate = inst->predicate;
               mov->predicate_inverse = inst->predicate_inverse;
               mov->flag_subreg = inst->flag_subreg;
            }
         }

         inst->dst = tmp;
         progress = true;
         break;
      }
      default:
         break;
      }
   }

   if (progress)
      s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS |
                            BRW_DEPENDENCY_VARIABLES);

   return progress;
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Lower a derivative instruction as the floating-point difference of two
|
|
|
|
|
* swizzles of the source, specified as \p swz0 and \p swz1.
|
|
|
|
|
*/
|
|
|
|
|
static bool
|
2025-02-28 00:06:24 -08:00
|
|
|
lower_derivative(brw_shader &s, brw_inst *inst,
|
2024-01-04 23:27:04 -08:00
|
|
|
unsigned swz0, unsigned swz1)
|
|
|
|
|
{
|
2025-02-27 22:04:03 -08:00
|
|
|
const brw_builder ubld = brw_builder(inst).exec_all();
|
2024-06-18 23:42:59 -07:00
|
|
|
const brw_reg tmp0 = ubld.vgrf(inst->src[0].type);
|
|
|
|
|
const brw_reg tmp1 = ubld.vgrf(inst->src[0].type);
|
2024-01-04 23:27:04 -08:00
|
|
|
|
|
|
|
|
ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp0, inst->src[0], brw_imm_ud(swz0));
|
|
|
|
|
ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp1, inst->src[0], brw_imm_ud(swz1));
|
|
|
|
|
|
2025-08-09 16:23:01 -07:00
|
|
|
inst = brw_transform_inst(s, inst, BRW_OPCODE_ADD);
|
2024-01-04 23:27:04 -08:00
|
|
|
inst->src[0] = negate(tmp0);
|
|
|
|
|
inst->src[1] = tmp1;
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Lower derivative instructions on platforms where codegen cannot implement
|
|
|
|
|
* them efficiently (i.e. XeHP).
|
|
|
|
|
*/
|
|
|
|
|
bool
|
2024-12-07 10:25:45 -08:00
|
|
|
brw_lower_derivatives(brw_shader &s)
|
2024-01-04 23:27:04 -08:00
|
|
|
{
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
|
|
|
|
if (s.devinfo->verx10 < 125)
|
|
|
|
|
return false;
|
|
|
|
|
|
2025-08-09 16:23:01 -07:00
|
|
|
foreach_block_and_inst_safe(block, brw_inst, inst, s.cfg) {
|
2024-01-04 23:27:04 -08:00
|
|
|
if (inst->opcode == FS_OPCODE_DDX_COARSE)
|
2025-02-28 00:06:24 -08:00
|
|
|
progress |= lower_derivative(s, inst,
|
2024-01-04 23:27:04 -08:00
|
|
|
BRW_SWIZZLE_XXXX, BRW_SWIZZLE_YYYY);
|
|
|
|
|
|
|
|
|
|
else if (inst->opcode == FS_OPCODE_DDX_FINE)
|
2025-02-28 00:06:24 -08:00
|
|
|
progress |= lower_derivative(s, inst,
|
2024-01-04 23:27:04 -08:00
|
|
|
BRW_SWIZZLE_XXZZ, BRW_SWIZZLE_YYWW);
|
|
|
|
|
|
|
|
|
|
else if (inst->opcode == FS_OPCODE_DDY_COARSE)
|
2025-02-28 00:06:24 -08:00
|
|
|
progress |= lower_derivative(s, inst,
|
2024-01-04 23:27:04 -08:00
|
|
|
BRW_SWIZZLE_XXXX, BRW_SWIZZLE_ZZZZ);
|
|
|
|
|
|
|
|
|
|
else if (inst->opcode == FS_OPCODE_DDY_FINE)
|
2025-02-28 00:06:24 -08:00
|
|
|
progress |= lower_derivative(s, inst,
|
2024-01-04 23:27:04 -08:00
|
|
|
BRW_SWIZZLE_XYXY, BRW_SWIZZLE_ZWZW);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (progress)
|
2024-12-06 20:52:05 -08:00
|
|
|
s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS |
|
|
|
|
|
BRW_DEPENDENCY_VARIABLES);
|
2024-01-04 23:27:04 -08:00
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * Lower FIND_LIVE_CHANNEL / FIND_LAST_LIVE_CHANNEL / LOAD_LIVE_CHANNELS
 * pseudo-ops into reads of the architectural execution/dispatch mask
 * registers plus bit-scan arithmetic, then remove the pseudo-op.
 */
bool
brw_lower_find_live_channel(brw_shader &s)
{
   bool progress = false;

   /* With packed dispatch all dispatched channels are contiguous from
    * channel 0, which lets FIND_LIVE_CHANNEL skip the dispatch-mask AND.
    */
   bool packed_dispatch =
      brw_stage_has_packed_dispatch(s.devinfo, s.stage, s.max_polygons,
                                    s.prog_data);
   /* Fragment shaders may need VMask (sr0.3) instead of DMask (sr0.2). */
   bool vmask =
      s.stage == MESA_SHADER_FRAGMENT &&
      brw_wm_prog_data(s.prog_data)->uses_vmask;

   foreach_block_and_inst_safe(block, brw_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_FIND_LIVE_CHANNEL &&
          inst->opcode != SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL &&
          inst->opcode != SHADER_OPCODE_LOAD_LIVE_CHANNELS)
         continue;

      bool first = inst->opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL;

      /* Getting the first active channel index is easy on Gfx8: Just find
       * the first bit set in the execution mask. The register exists on
       * HSW already but it reads back as all ones when the current
       * instruction has execution masking disabled, so it's kind of
       * useless there.
       */
      const brw_builder ibld(inst);
      if (!inst->is_partial_write())
         ibld.emit_undef_for_dst(inst);

      const brw_builder ubld = brw_builder(inst).uniform();

      /* Read the execution mask (mask reg 0) into a uniform temporary. */
      brw_reg exec_mask = ubld.vgrf(BRW_TYPE_UD);
      ubld.UNDEF(exec_mask);
      ubld.emit(SHADER_OPCODE_READ_ARCH_REG, exec_mask,
                retype(brw_mask_reg(0),
                       BRW_TYPE_UD));

      /* ce0 doesn't consider the thread dispatch mask (DMask or VMask),
       * so combine the execution and dispatch masks to obtain the true mask.
       *
       * If we're looking for the first live channel, and we have packed
       * dispatch, we can skip this step, as we know all dispatched channels
       * will appear at the front of the mask.
       */
      if (!(first && packed_dispatch)) {
         brw_reg mask = ubld.vgrf(BRW_TYPE_UD);
         ubld.UNDEF(mask);
         ubld.emit(SHADER_OPCODE_READ_ARCH_REG, mask,
                   retype(brw_sr0_reg(vmask ? 3 : 2),
                          BRW_TYPE_UD));

         /* Quarter control has the effect of magically shifting the value of
          * ce0 so you'll get the first/last active channel relative to the
          * specified quarter control as result.
          */
         if (inst->group > 0)
            ubld.SHR(mask, mask, brw_imm_ud(ALIGN(inst->group, 8)));

         ubld.AND(mask, exec_mask, mask);
         exec_mask = mask;
      }

      switch (inst->opcode) {
      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
         /* First live channel = index of lowest set bit. */
         ubld.FBL(inst->dst, exec_mask);
         break;

      case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL: {
         /* Last live channel = 31 - leading-zero count. */
         brw_reg tmp = ubld.vgrf(BRW_TYPE_UD);
         ubld.UNDEF(tmp);
         ubld.LZD(tmp, exec_mask);
         ubld.ADD(inst->dst, negate(tmp), brw_imm_uw(31));
         break;
      }

      case SHADER_OPCODE_LOAD_LIVE_CHANNELS:
         ubld.MOV(inst->dst, exec_mask);
         break;

      default:
         UNREACHABLE("Impossible.");
      }

      inst->remove();
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS |
                            BRW_DEPENDENCY_VARIABLES);

   return progress;
}
|
|
|
|
|
|
|
|
|
|
/**
 * From the Skylake PRM Vol. 2a docs for sends:
 *
 *    "It is required that the second block of GRFs does not overlap with the
 *    first block."
 *
 * There are plenty of cases where we may accidentally violate this due to
 * having, for instance, both sources be the constant 0. This little pass
 * just adds a new vgrf for the second payload and copies it over.
 */
bool
brw_lower_sends_overlapping_payload(brw_shader &s)
{
   bool progress = false;

   foreach_block_and_inst_safe (block, brw_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_SEND)
         continue;

      brw_send_inst *send = inst->as_send();

      if (send->ex_mlen > 0 &&
          regions_overlap(send->src[SEND_SRC_PAYLOAD1],
                          send->mlen * REG_SIZE,
                          send->src[SEND_SRC_PAYLOAD2],
                          send->ex_mlen * REG_SIZE)) {
         /* Copy whichever payload is shorter, to minimize the MOVs. */
         const unsigned arg = send->mlen < send->ex_mlen ?
            SEND_SRC_PAYLOAD1 : SEND_SRC_PAYLOAD2;
         const unsigned len = MIN2(send->mlen, send->ex_mlen);

         brw_reg tmp = retype(brw_allocate_vgrf_units(s, len), BRW_TYPE_UD);

         /* Sadly, we've lost all notion of channels and bit sizes at this
          * point. Just WE_all it.
          */
         const brw_builder ibld = brw_builder(send).exec_all().group(16, 0);
         brw_reg copy_src = retype(send->src[arg], BRW_TYPE_UD);
         brw_reg copy_dst = tmp;
         /* Copy two GRFs per SIMD16 MOV; a trailing odd GRF gets SIMD8. */
         for (unsigned i = 0; i < len; i += 2) {
            if (len == i + 1) {
               /* Only one register left; do SIMD8 */
               ibld.group(8, 0).MOV(copy_dst, copy_src);
            } else {
               ibld.MOV(copy_dst, copy_src);
            }
            copy_src = offset(copy_src, ibld, 1);
            copy_dst = offset(copy_dst, ibld, 1);
         }
         send->src[arg] = tmp;
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS |
                            BRW_DEPENDENCY_VARIABLES);

   return progress;
}
|
|
|
|
|
|
|
|
|
|
/**
|
2024-02-21 21:21:20 -08:00
|
|
|
* Three source instruction must have a GRF destination register.
|
2024-01-04 23:27:04 -08:00
|
|
|
* ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
|
|
|
|
|
*/
|
|
|
|
|
bool
|
2024-12-07 10:25:45 -08:00
|
|
|
brw_lower_3src_null_dest(brw_shader &s)
|
2024-01-04 23:27:04 -08:00
|
|
|
{
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
2024-12-07 00:23:07 -08:00
|
|
|
foreach_block_and_inst_safe (block, brw_inst, inst, s.cfg) {
|
2024-01-04 23:27:04 -08:00
|
|
|
if (inst->is_3src(s.compiler) && inst->dst.is_null()) {
|
2025-01-31 12:50:20 -08:00
|
|
|
inst->dst = retype(brw_allocate_vgrf_units(s, s.dispatch_width / 8),
|
|
|
|
|
inst->dst.type);
|
2024-01-04 23:27:04 -08:00
|
|
|
progress = true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (progress)
|
2025-03-10 16:08:31 -07:00
|
|
|
s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTION_DATA_FLOW |
|
|
|
|
|
BRW_DEPENDENCY_INSTRUCTION_DETAIL |
|
2024-12-06 20:52:05 -08:00
|
|
|
BRW_DEPENDENCY_VARIABLES);
|
2024-01-04 23:27:04 -08:00
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
}
|
|
|
|
|
|
2024-03-21 10:58:20 -07:00
|
|
|
static bool
|
|
|
|
|
unsupported_64bit_type(const intel_device_info *devinfo,
|
|
|
|
|
enum brw_reg_type type)
|
|
|
|
|
{
|
2024-04-20 17:08:02 -07:00
|
|
|
return (!devinfo->has_64bit_float && type == BRW_TYPE_DF) ||
|
|
|
|
|
(!devinfo->has_64bit_int && (type == BRW_TYPE_UQ ||
|
|
|
|
|
type == BRW_TYPE_Q));
|
2024-03-21 10:58:20 -07:00
|
|
|
}
|
|
|
|
|
|
2025-02-28 01:03:50 -08:00
|
|
|
/**
 * Lower a MOV to or from BRW_TYPE_BF into a form the hardware supports.
 *
 * Returns true if the instruction was rewritten (in place or replaced),
 * false if the conversion is natively supported and needs no change.
 */
bool
brw_lower_bfloat_conversion(brw_shader &s, brw_inst *inst)
{
   assert(s.devinfo->has_bfloat16);
   assert(inst->dst.type == BRW_TYPE_BF || inst->src[0].type == BRW_TYPE_BF);

   if (inst->dst.type == inst->src[0].type) {
      /* Except for DPAS, instructions with only bfloat operands are
       * not supported, so just move the bits using UW.
       */
      inst->dst = retype(inst->dst, BRW_TYPE_UW);
      inst->src[0] = retype(inst->src[0], BRW_TYPE_UW);
      return true;

   } else if (inst->dst.type == BRW_TYPE_BF &&
              byte_stride(inst->dst) == 2) {
      /* Converting to packed BF is not supported natively. Using
       * ADD with -0.0f preserves NaN correctly. Note +0.0f would
       * not work since it doesn't preserve -0.0f!
       */
      assert(inst->src[0].type == BRW_TYPE_F);
      inst = brw_transform_inst(s, inst, BRW_OPCODE_ADD);
      inst->src[1] = brw_imm_f(-0.0f);
      return true;

   } else if (inst->dst.type == BRW_TYPE_F &&
              byte_stride(inst->src[0]) != 2) {
      /* Converting from a unpacked BF is not supported natively. */
      /* BF is the high half of an F, so shifting the raw UW left by 16
       * into a UD destination reconstructs the float bit pattern.
       */
      const brw_builder ibld(inst);
      ibld.SHL(retype(inst->dst, BRW_TYPE_UD),
               retype(inst->src[0], BRW_TYPE_UW),
               brw_imm_uw(16));
      inst->remove();
      return true;
   }

   /* Remaining combinations are handled natively by the hardware. */
   return false;
}
|
|
|
|
|
|
2024-03-18 22:52:35 -07:00
|
|
|
/**
 * Perform lowering to legalize the IR for various ALU restrictions.
 *
 * For example:
 *   - Splitting 64-bit MOV/SEL into 2x32-bit where needed
 */
bool
brw_lower_alu_restrictions(brw_shader &s)
{
   const intel_device_info *devinfo = s.devinfo;
   bool progress = false;

   foreach_block_and_inst_safe(block, brw_inst, inst, s.cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         if (unsupported_64bit_type(devinfo, inst->dst.type)) {
            /* Split the unsupported 64-bit MOV into two 32-bit MOVs of the
             * low and high halves.
             */
            assert(inst->dst.type == inst->src[0].type);
            assert(!inst->saturate);
            assert(!inst->src[0].abs);
            assert(!inst->src[0].negate);
            const brw_builder ibld(inst);

            enum brw_reg_type type = brw_type_with_size(inst->dst.type, 32);

            if (!inst->is_partial_write())
               ibld.emit_undef_for_dst(inst);

            ibld.MOV(subscript(inst->dst, type, 1),
                     subscript(inst->src[0], type, 1));
            ibld.MOV(subscript(inst->dst, type, 0),
                     subscript(inst->src[0], type, 0));

            inst->remove();
            progress = true;
         }

         /* NOTE(review): inst may have been remove()d just above; this
          * relies on the unlinked node's fields remaining readable — and
          * on a removed 64-bit MOV never also having a BF operand. Confirm.
          */
         if (inst->dst.type == BRW_TYPE_BF || inst->src[0].type == BRW_TYPE_BF)
            progress |= brw_lower_bfloat_conversion(s, inst);

         break;

      case BRW_OPCODE_MUL:
      case BRW_OPCODE_MAD: {
         /* BFloat16 restrictions:
          *
          * "Bfloat16 not in Src1 of 2-source instructions involving
          *  multiplier."
          *
          * and
          *
          * "Bfloat16 not allowed in Src2 of 3-source instructions
          *  involving multiplier."
          */
         brw_reg &last_src = inst->src[inst->sources - 1];
         if (last_src.type == BRW_TYPE_BF) {
            assert(devinfo->has_bfloat16);
            const brw_builder ibld = brw_builder(inst);

            /* Convert the offending BF source to F ahead of the multiply. */
            brw_reg src2_as_f = ibld.vgrf(BRW_TYPE_F);
            brw_inst *conv = ibld.MOV(src2_as_f, last_src);
            brw_lower_bfloat_conversion(s, conv);
            last_src = src2_as_f;

            progress = true;
         }
         break;
      }

      case BRW_OPCODE_SEL:
         if (unsupported_64bit_type(devinfo, inst->dst.type)) {
            /* Split the unsupported 64-bit SEL into two predicated 32-bit
             * SELs over the low and high halves.
             */
            assert(inst->dst.type == inst->src[0].type);
            assert(!inst->saturate);
            assert(!inst->src[0].abs && !inst->src[0].negate);
            assert(!inst->src[1].abs && !inst->src[1].negate);
            assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
            const brw_builder ibld(inst);

            enum brw_reg_type type = brw_type_with_size(inst->dst.type, 32);

            if (!inst->is_partial_write())
               ibld.emit_undef_for_dst(inst);

            set_predicate(inst->predicate,
                          ibld.SEL(subscript(inst->dst, type, 0),
                                   subscript(inst->src[0], type, 0),
                                   subscript(inst->src[1], type, 0)));
            set_predicate(inst->predicate,
                          ibld.SEL(subscript(inst->dst, type, 1),
                                   subscript(inst->src[0], type, 1),
                                   subscript(inst->src[1], type, 1)));

            inst->remove();
            progress = true;
         }
         break;

      case SHADER_OPCODE_SHUFFLE:
      case SHADER_OPCODE_MOV_INDIRECT:
      case SHADER_OPCODE_BROADCAST:
         /* Gen12.5 adds the following region restriction:
          *
          *    "Vx1 and VxH indirect addressing for Float, Half-Float,
          *    Double-Float and Quad-Word data must not be used."
          *
          * We require the source and destination types to match so stomp to
          * an unsigned integer type.
          */
         assert(inst->src[0].type == inst->dst.type);
         inst->src[0].type = inst->dst.type = brw_type_with_size(BRW_TYPE_UD,
            brw_type_size_bits(inst->src[0].type));
         break;

      default:
         break;
      }
   }

   if (progress) {
      s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS |
                            BRW_DEPENDENCY_VARIABLES);
   }

   return progress;
}
|
2024-04-04 16:03:34 -07:00
|
|
|
|
|
|
|
|
static void
|
2024-12-07 00:23:07 -08:00
|
|
|
brw_lower_vgrf_to_fixed_grf(const struct intel_device_info *devinfo, brw_inst *inst,
|
2024-12-06 11:37:57 -08:00
|
|
|
brw_reg *reg, bool compressed)
|
2024-04-04 16:03:34 -07:00
|
|
|
{
|
|
|
|
|
if (reg->file != VGRF)
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
struct brw_reg new_reg;
|
|
|
|
|
|
|
|
|
|
if (reg->stride == 0) {
|
|
|
|
|
new_reg = brw_vec1_grf(reg->nr, 0);
|
|
|
|
|
} else if (reg->stride > 4) {
|
|
|
|
|
assert(reg != &inst->dst);
|
2024-04-21 00:57:59 -07:00
|
|
|
assert(reg->stride * brw_type_size_bytes(reg->type) <= REG_SIZE);
|
2024-04-04 16:03:34 -07:00
|
|
|
new_reg = brw_vecn_grf(1, reg->nr, 0);
|
|
|
|
|
new_reg = stride(new_reg, reg->stride, 1, 0);
|
|
|
|
|
} else {
|
|
|
|
|
/* From the Haswell PRM:
|
|
|
|
|
*
|
|
|
|
|
* "VertStride must be used to cross GRF register boundaries. This
|
|
|
|
|
* rule implies that elements within a 'Width' cannot cross GRF
|
|
|
|
|
* boundaries."
|
|
|
|
|
*
|
|
|
|
|
* The maximum width value that could satisfy this restriction is:
|
|
|
|
|
*/
|
2024-04-21 00:57:59 -07:00
|
|
|
const unsigned reg_width =
|
|
|
|
|
REG_SIZE / (reg->stride * brw_type_size_bytes(reg->type));
|
2024-04-04 16:03:34 -07:00
|
|
|
|
|
|
|
|
/* Because the hardware can only split source regions at a whole
|
|
|
|
|
* multiple of width during decompression (i.e. vertically), clamp
|
|
|
|
|
* the value obtained above to the physical execution size of a
|
|
|
|
|
* single decompressed chunk of the instruction:
|
|
|
|
|
*/
|
|
|
|
|
const bool compressed = inst->dst.component_size(inst->exec_size) > REG_SIZE;
|
|
|
|
|
const unsigned phys_width = compressed ? inst->exec_size / 2 :
|
|
|
|
|
inst->exec_size;
|
|
|
|
|
|
|
|
|
|
/* XXX - The equation above is strictly speaking not correct on
|
|
|
|
|
* hardware that supports unbalanced GRF writes -- On Gfx9+
|
|
|
|
|
* each decompressed chunk of the instruction may have a
|
|
|
|
|
* different execution size when the number of components
|
|
|
|
|
* written to each destination GRF is not the same.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
const unsigned max_hw_width = 16;
|
|
|
|
|
|
|
|
|
|
const unsigned width = MIN3(reg_width, phys_width, max_hw_width);
|
|
|
|
|
new_reg = brw_vecn_grf(width, reg->nr, 0);
|
|
|
|
|
new_reg = stride(new_reg, width * reg->stride, width, reg->stride);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
new_reg = retype(new_reg, reg->type);
|
|
|
|
|
new_reg = byte_offset(new_reg, reg->offset);
|
|
|
|
|
new_reg.abs = reg->abs;
|
|
|
|
|
new_reg.negate = reg->negate;
|
brw: Basic infrastructure to store convergent values as scalars
In SIMD16 and SIMD32, storing convergent values in full 16- or
32-channel registers is wasteful. It wastes register space, and in most
cases on SIMD32, it wastes instructions. Our register allocator is not
clever enough to handle scalar allocations. It's fundamental unit of
allocation is SIMD8. Start treating convergent values as SIMD8.
Add a tracking bit in brw_reg to specify that a register represents a
convergent, scalar value. This has two implications:
1. All channels of the SIMD8 register must contain the same value. In
general, this means that writes to the register must be
force_writemask_all and exec_size = 8;
2. Reads of this register can (and should) use <0,1,0> stride. SIMD8
instructions that have restrictions on source stride can us <8,8,1>.
Values that are vectors (e.g., results of load_uniform or texture
operations) will be stored as multiple SIMD8 hardware registers.
v2: brw_fs_opt_copy_propagation_defs fix from Ken. Fix for Xe2.
v3: Eliminte offset_to_scalar(). Remove mention of vec4 backend in
brw_reg.h. Both suggested by Caio. The offset_to_scalar() change
necessitates some trickery in the fs_builder offset() function, but I
think this is an improvement overall. There is also some rework in
find_value_for_offset to account for the possibility that is_scalar
sources in LOAD_PAYLOAD might be <8;8,1> or <0;1,0>.
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-02-09 17:12:11 -08:00
|
|
|
new_reg.is_scalar = reg->is_scalar;
|
2024-04-04 16:03:34 -07:00
|
|
|
|
|
|
|
|
*reg = new_reg;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
2024-12-07 10:25:45 -08:00
|
|
|
brw_lower_vgrfs_to_fixed_grfs(brw_shader &s)
|
2024-04-04 16:03:34 -07:00
|
|
|
{
|
|
|
|
|
assert(s.grf_used || !"Must be called after register allocation");
|
|
|
|
|
|
2024-12-07 00:23:07 -08:00
|
|
|
foreach_block_and_inst(block, brw_inst, inst, s.cfg) {
|
2024-04-04 16:03:34 -07:00
|
|
|
/* If the instruction writes to more than one register, it needs to be
|
|
|
|
|
* explicitly marked as compressed on Gen <= 5. On Gen >= 6 the
|
|
|
|
|
* hardware figures out by itself what the right compression mode is,
|
|
|
|
|
* but we still need to know whether the instruction is compressed to
|
|
|
|
|
* set up the source register regions appropriately.
|
|
|
|
|
*
|
|
|
|
|
* XXX - This is wrong for instructions that write a single register but
|
|
|
|
|
* read more than one which should strictly speaking be treated as
|
|
|
|
|
* compressed. For instructions that don't write any registers it
|
|
|
|
|
* relies on the destination being a null register of the correct
|
|
|
|
|
* type and regioning so the instruction is considered compressed
|
|
|
|
|
* or not accordingly.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
const bool compressed =
|
|
|
|
|
inst->dst.component_size(inst->exec_size) > REG_SIZE;
|
|
|
|
|
|
2024-12-06 11:37:57 -08:00
|
|
|
brw_lower_vgrf_to_fixed_grf(s.devinfo, inst, &inst->dst, compressed);
|
2024-04-04 16:03:34 -07:00
|
|
|
for (int i = 0; i < inst->sources; i++) {
|
2024-12-06 11:37:57 -08:00
|
|
|
brw_lower_vgrf_to_fixed_grf(s.devinfo, inst, &inst->src[i], compressed);
|
2024-04-04 16:03:34 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2024-12-06 20:52:05 -08:00
|
|
|
s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTION_DATA_FLOW |
|
2025-03-10 16:08:31 -07:00
|
|
|
BRW_DEPENDENCY_INSTRUCTION_DETAIL |
|
2024-12-06 20:52:05 -08:00
|
|
|
BRW_DEPENDENCY_VARIABLES);
|
2024-04-04 16:03:34 -07:00
|
|
|
}
|
2024-02-24 01:24:03 -08:00
|
|
|
|
2024-11-20 08:12:52 -08:00
|
|
|
/**
 * Build a scalar <0;1,0> region of the given type pointing at sub-register
 * \p subnr of the s0 (scalar) architectural register file.
 */
static brw_reg
brw_s0(enum brw_reg_type type, unsigned subnr)
{
   return brw_make_reg(ARF, BRW_ARF_SCALAR, subnr, 0, 0, type,
                       BRW_VERTICAL_STRIDE_0, BRW_WIDTH_1,
                       BRW_HORIZONTAL_STRIDE_0, BRW_SWIZZLE_XYZW,
                       WRITEMASK_XYZW);
}
|
|
|
|
|
|
|
|
|
|
static bool
|
2025-08-20 15:43:08 -07:00
|
|
|
brw_lower_send_gather_inst(brw_shader &s, brw_send_inst *inst)
|
2024-11-20 08:12:52 -08:00
|
|
|
{
|
|
|
|
|
const intel_device_info *devinfo = s.devinfo;
|
|
|
|
|
assert(devinfo->ver >= 30);
|
|
|
|
|
|
|
|
|
|
const unsigned unit = reg_unit(devinfo);
|
|
|
|
|
assert(unit == 2);
|
|
|
|
|
|
|
|
|
|
assert(inst->opcode == SHADER_OPCODE_SEND_GATHER);
|
|
|
|
|
assert(inst->sources > 2);
|
|
|
|
|
assert(inst->src[2].file == BAD_FILE);
|
|
|
|
|
|
|
|
|
|
unsigned count = 0;
|
|
|
|
|
uint8_t regs[16] = {};
|
|
|
|
|
|
|
|
|
|
const unsigned num_payload_sources = inst->sources - 3;
|
|
|
|
|
assert(num_payload_sources > 0);
|
|
|
|
|
|
|
|
|
|
/* Limited by Src0.Length in the SEND instruction. */
|
|
|
|
|
assert(num_payload_sources < 16);
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 3; i < inst->sources; i++) {
|
|
|
|
|
assert(inst->src[i].file == FIXED_GRF);
|
|
|
|
|
assert(inst->src[i].nr % reg_unit(devinfo) == 0);
|
|
|
|
|
|
|
|
|
|
unsigned nr = phys_nr(devinfo, inst->src[i]);
|
|
|
|
|
assert(nr <= UINT8_MAX);
|
|
|
|
|
regs[count++] = nr;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Fill out ARF scalar register with the physical register numbers
|
|
|
|
|
* and use SEND_GATHER.
|
|
|
|
|
*/
|
2025-04-03 01:14:03 -07:00
|
|
|
brw_builder ubld = brw_builder(inst).uniform();
|
2024-11-20 08:12:52 -08:00
|
|
|
for (unsigned q = 0; q < DIV_ROUND_UP(count, 8); q++) {
|
|
|
|
|
uint64_t v = 0;
|
|
|
|
|
for (unsigned i = 0; i < 8; i++) {
|
|
|
|
|
const uint64_t reg = regs[(q * 8) + i];
|
|
|
|
|
v |= reg << (8 * i);
|
|
|
|
|
}
|
|
|
|
|
ubld.MOV(brw_s0(BRW_TYPE_UQ, q), brw_imm_uq(v));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
inst->src[2] = brw_s0(BRW_TYPE_UD, 0);
|
|
|
|
|
inst->mlen = count * unit;
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool
|
2024-12-07 10:25:45 -08:00
|
|
|
brw_lower_send_gather(brw_shader &s)
|
2024-11-20 08:12:52 -08:00
|
|
|
{
|
|
|
|
|
assert(s.devinfo->ver >= 30);
|
|
|
|
|
assert(s.grf_used || !"Must be called after register allocation");
|
|
|
|
|
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
2024-12-07 00:23:07 -08:00
|
|
|
foreach_block_and_inst(block, brw_inst, inst, s.cfg) {
|
2024-11-20 08:12:52 -08:00
|
|
|
if (inst->opcode == SHADER_OPCODE_SEND_GATHER)
|
2025-08-20 15:43:08 -07:00
|
|
|
progress |= brw_lower_send_gather_inst(s, inst->as_send());
|
2024-11-20 08:12:52 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (progress)
|
2025-03-10 16:08:31 -07:00
|
|
|
s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS |
|
2024-12-06 20:52:05 -08:00
|
|
|
BRW_DEPENDENCY_VARIABLES);
|
2024-11-20 08:12:52 -08:00
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
}
|
|
|
|
|
|
2024-02-24 01:24:03 -08:00
|
|
|
bool
|
2024-12-07 10:25:45 -08:00
|
|
|
brw_lower_load_subgroup_invocation(brw_shader &s)
|
2024-02-24 01:24:03 -08:00
|
|
|
{
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
2024-12-07 00:23:07 -08:00
|
|
|
foreach_block_and_inst_safe(block, brw_inst, inst, s.cfg) {
|
2024-02-24 01:24:03 -08:00
|
|
|
if (inst->opcode != SHADER_OPCODE_LOAD_SUBGROUP_INVOCATION)
|
|
|
|
|
continue;
|
|
|
|
|
|
2024-12-29 15:41:04 -08:00
|
|
|
const brw_builder abld =
|
2025-02-27 22:04:03 -08:00
|
|
|
brw_builder(inst).annotate("SubgroupInvocation");
|
2024-12-29 15:41:04 -08:00
|
|
|
const brw_builder ubld8 = abld.group(8, 0).exec_all();
|
intel/brw: Use CSE for LOAD_SUBGROUP_INVOCATION
Instead of emitting a single one at the top, and making reference to it,
emit the virtual instruction as needed and let CSE do its job.
Since load_subgroup_invocation now can appear not at the start of the
shader, use UNDEF in all cases to ensure that the liveness of the
destination doesn't extend to the first partial write done here (it was
being used only for SIMD > 8 before).
Note this option was considered in the past
6132992cdb858268af0e985727d80e4140be389c but at the time dismissed. The
difference now is that the lowering of the virtual instruction happens
earlier than the scheduling.
The motivation for this change is to allow passes other than the NIR
conversion to use this value. The alternative of storing a `brw_reg` in
the shader (instead of NIR state) gets complicated by passes like
compact_vgrfs, that move VGRFs around (and update the instructions).
This and maybe other passes would have to care about the brw_reg.
Fossil-db numbers, TGL
```
*** Shaders only in 'after' results are ignored:
steam-native/shadow_of_the_tomb_raider/c683ea5067ee157d/fs.32/0, steam-native/shadow_of_the_tomb_raider/f4df450c3cef40b4/fs.32/0, steam-native/shadow_of_the_tomb_raider/94b708fb8e3d9597/fs.32/0, steam-native/shadow_of_the_tomb_raider/19d44c328edabd30/fs.32/0, steam-native/shadow_of_the_tomb_raider/8a7dcbd5a74a19bf/fs.32/0, and 366 more
from 4 apps: steam-dxvk/alan_wake, steam-dxvk/batman_arkham_city_goty, steam-dxvk/batman_arkham_origins, steam-native/shadow_of_the_tomb_raider
*** Shaders only in 'before' results are ignored:
steam-dxvk/octopath_traveler/aaa3d10acb726906/fs.32/0, steam-dxvk/batman_arkham_origins/e6872ae23569c35f/fs.32/0, steam-dxvk/octopath_traveler/fd33a99fa5c271a8/fs.32/0, steam-dxvk/octopath_traveler/9a077cdc16f24520/fs.32/0, steam-dxvk/batman_arkham_city_goty/fac7b438ad52f622/fs.32/0, and 12 more
from 4 apps: steam-dxvk/batman_arkham_city_goty, steam-dxvk/batman_arkham_origins, steam-dxvk/octopath_traveler, steam-native/shadow_of_the_tomb_raider
Totals:
Instrs: 149752381 -> 149751337 (-0.00%); split: -0.00%, +0.00%
Cycle count: 11553609349 -> 11549970294 (-0.03%); split: -0.06%, +0.03%
Spill count: 42763 -> 42764 (+0.00%); split: -0.01%, +0.01%
Fill count: 75650 -> 75651 (+0.00%); split: -0.00%, +0.01%
Max live registers: 31725096 -> 31671792 (-0.17%)
Max dispatch width: 5546008 -> 5551672 (+0.10%); split: +0.11%, -0.00%
Totals from 52574 (8.34% of 630441) affected shaders:
Instrs: 9535159 -> 9534115 (-0.01%); split: -0.03%, +0.02%
Cycle count: 1006627109 -> 1002988054 (-0.36%); split: -0.65%, +0.29%
Spill count: 11588 -> 11589 (+0.01%); split: -0.03%, +0.03%
Fill count: 21057 -> 21058 (+0.00%); split: -0.01%, +0.02%
Max live registers: 1992493 -> 1939189 (-2.68%)
Max dispatch width: 559696 -> 565360 (+1.01%); split: +1.06%, -0.05%
```
and DG2
```
*** Shaders only in 'after' results are ignored:
steam-native/shadow_of_the_tomb_raider/1f95a9d3db21df85/fs.32/0, steam-native/shadow_of_the_tomb_raider/56b87c4a46613a2a/fs.32/0, steam-native/shadow_of_the_tomb_raider/a74b4137f85dbbd3/fs.32/0, steam-native/shadow_of_the_tomb_raider/e07e38d3f48e8402/fs.32/0, steam-native/shadow_of_the_tomb_raider/206336789c48996c/fs.32/0, and 268 more
from 4 apps: steam-dxvk/alan_wake, steam-dxvk/batman_arkham_city_goty, steam-dxvk/batman_arkham_origins, steam-native/shadow_of_the_tomb_raider
*** Shaders only in 'before' results are ignored:
steam-native/shadow_of_the_tomb_raider/0420d7c3a2ea99ec/fs.32/0, steam-native/shadow_of_the_tomb_raider/2ff39f8bf7d24abb/fs.32/0, steam-native/shadow_of_the_tomb_raider/92d7be2824bd9659/fs.32/0, steam-native/shadow_of_the_tomb_raider/f09ca6d2ecf18015/fs.32/0, steam-native/shadow_of_the_tomb_raider/490f8ffd59e52949/fs.32/0, and 205 more
from 3 apps: steam-dxvk/batman_arkham_city_goty, steam-dxvk/batman_arkham_origins, steam-native/shadow_of_the_tomb_raider
Totals:
Instrs: 151597619 -> 151599914 (+0.00%); split: -0.00%, +0.00%
Subgroup size: 7699776 -> 7699784 (+0.00%)
Cycle count: 12738501989 -> 12739841170 (+0.01%); split: -0.01%, +0.02%
Spill count: 61283 -> 61274 (-0.01%)
Fill count: 119886 -> 119849 (-0.03%)
Max live registers: 31810432 -> 31758920 (-0.16%)
Max dispatch width: 5540128 -> 5541136 (+0.02%); split: +0.08%, -0.06%
Totals from 49286 (7.81% of 631231) affected shaders:
Instrs: 8607753 -> 8610048 (+0.03%); split: -0.01%, +0.04%
Subgroup size: 857752 -> 857760 (+0.00%)
Cycle count: 305939495 -> 307278676 (+0.44%); split: -0.28%, +0.72%
Spill count: 6339 -> 6330 (-0.14%)
Fill count: 12571 -> 12534 (-0.29%)
Max live registers: 1788346 -> 1736834 (-2.88%)
Max dispatch width: 510920 -> 511928 (+0.20%); split: +0.85%, -0.66%
```
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30489>
2024-07-31 22:46:20 -07:00
|
|
|
ubld8.UNDEF(inst->dst);
|
2024-02-24 01:24:03 -08:00
|
|
|
|
|
|
|
|
if (inst->exec_size == 8) {
|
|
|
|
|
assert(inst->dst.type == BRW_TYPE_UD);
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg uw = retype(inst->dst, BRW_TYPE_UW);
|
2024-02-24 01:24:03 -08:00
|
|
|
ubld8.MOV(uw, brw_imm_v(0x76543210));
|
|
|
|
|
ubld8.MOV(inst->dst, uw);
|
|
|
|
|
} else {
|
|
|
|
|
assert(inst->dst.type == BRW_TYPE_UW);
|
|
|
|
|
ubld8.MOV(inst->dst, brw_imm_v(0x76543210));
|
|
|
|
|
ubld8.ADD(byte_offset(inst->dst, 16), inst->dst, brw_imm_uw(8u));
|
|
|
|
|
if (inst->exec_size > 16) {
|
2024-12-29 15:41:04 -08:00
|
|
|
const brw_builder ubld16 = abld.group(16, 0).exec_all();
|
2024-02-24 01:24:03 -08:00
|
|
|
ubld16.ADD(byte_offset(inst->dst, 32), inst->dst, brw_imm_uw(16u));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-02-27 23:23:26 -08:00
|
|
|
inst->remove();
|
2024-02-24 01:24:03 -08:00
|
|
|
progress = true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (progress)
|
2024-12-06 20:52:05 -08:00
|
|
|
s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS |
|
|
|
|
|
BRW_DEPENDENCY_VARIABLES);
|
2024-02-24 01:24:03 -08:00
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
}
|
2024-05-21 12:56:50 -07:00
|
|
|
|
|
|
|
|
/**
 * Lower byte-typed SHADER_OPCODE_MOV_INDIRECT (only applies to ver >= 20):
 * indirect addressing doesn't support B/UB source types there, so perform
 * the indirect move at UW granularity on a word-aligned offset and then
 * select the high or low byte of the word based on the offset's parity.
 *
 * The statement sequence below is order-sensitive: the parity check must
 * read the offset *before* it is word-aligned.
 */
bool
brw_lower_indirect_mov(brw_shader &s)
{
   bool progress = false;

   if (s.devinfo->ver < 20)
      return progress;

   foreach_block_and_inst_safe(block, brw_inst, inst, s.cfg) {
      if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT) {
         /* Only byte-sized source or destination needs this lowering. */
         if (brw_type_size_bytes(inst->src[0].type) > 1 &&
             brw_type_size_bytes(inst->dst.type) > 1) {
            continue;
         }

         assert(brw_type_size_bytes(inst->src[0].type) ==
                brw_type_size_bytes(inst->dst.type));

         const brw_builder ibld(inst);

         /* Extract unaligned part */
         uint16_t extra_offset = inst->src[0].offset & 0x1;
         brw_reg offset = ibld.ADD(inst->src[1], brw_imm_uw(extra_offset));

         /* Check if offset is odd or even so that we can choose either high or
          * low byte from the result.
          */
         brw_reg is_odd = ibld.AND(offset, brw_imm_ud(1));

         /* Make sure offset is word (2-bytes) aligned */
         offset = ibld.AND(offset, brw_imm_uw(~1));

         /* Indirect addressing(vx1 and vxh) not supported with UB/B datatype for
          * Src0, so change data type for src0 and dst to UW.
          */
         brw_reg dst = ibld.vgrf(BRW_TYPE_UW);

         /* Subtract unaligned offset from src0 offset since we already
          * accounted unaligned part in the indirect byte offset.
          */
         brw_reg start = retype(inst->src[0], BRW_TYPE_UW);
         start.offset &= ~extra_offset;

         /* Adjust length to account extra offset. */
         assert(inst->src[2].file == IMM);
         brw_reg length = brw_imm_ud(inst->src[2].ud + extra_offset);

         /* Re-emit the indirect move at word granularity. */
         ibld.emit(SHADER_OPCODE_MOV_INDIRECT, dst, start, offset, length);

         /* Select high byte if offset is odd otherwise select low byte. */
         brw_reg lo = ibld.AND(dst, brw_imm_uw(0xff));
         brw_reg hi = ibld.SHR(dst, brw_imm_uw(8));
         brw_reg result = ibld.vgrf(BRW_TYPE_UW);
         ibld.CSEL(result, hi, lo, is_odd, BRW_CONDITIONAL_NZ);

         /* Extra MOV needed here to convert back to the corresponding B type */
         ibld.MOV(inst->dst, result);

         inst->remove();
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS |
                            BRW_DEPENDENCY_VARIABLES);

   return progress;
}
|
2025-02-28 01:03:50 -08:00
|
|
|
|