2024-01-04 23:27:04 -08:00
|
|
|
/*
|
|
|
|
|
* Copyright © 2010 Intel Corporation
|
|
|
|
|
* SPDX-License-Identifier: MIT
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
#include "brw_fs.h"
|
|
|
|
|
#include "brw_fs_builder.h"
|
|
|
|
|
|
|
|
|
|
using namespace brw;
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
|
|
|
|
|
* or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
|
|
|
|
|
*/
|
|
|
|
|
bool
|
|
|
|
|
brw_fs_lower_constant_loads(fs_visitor &s)
|
|
|
|
|
{
|
|
|
|
|
unsigned index, pull_index;
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
|
|
|
|
foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
|
|
|
|
|
/* Set up the annotation tracking for new generated instructions. */
|
|
|
|
|
const fs_builder ibld(&s, block, inst);
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < inst->sources; i++) {
|
|
|
|
|
if (inst->src[i].file != UNIFORM)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
/* We'll handle this case later */
|
|
|
|
|
if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
if (!s.get_pull_locs(inst->src[i], &index, &pull_index))
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
assert(inst->src[i].stride == 0);
|
|
|
|
|
|
|
|
|
|
const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
|
|
|
|
|
const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0);
|
|
|
|
|
const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
|
|
|
|
|
const unsigned base = pull_index * 4;
|
|
|
|
|
|
|
|
|
|
fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS];
|
|
|
|
|
srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = brw_imm_ud(index);
|
|
|
|
|
srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = brw_imm_ud(base & ~(block_sz - 1));
|
|
|
|
|
srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = brw_imm_ud(block_sz);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, dst,
|
|
|
|
|
srcs, PULL_UNIFORM_CONSTANT_SRCS);
|
|
|
|
|
|
|
|
|
|
/* Rewrite the instruction to use the temporary VGRF. */
|
|
|
|
|
inst->src[i].file = VGRF;
|
|
|
|
|
inst->src[i].nr = dst.nr;
|
|
|
|
|
inst->src[i].offset = (base & (block_sz - 1)) +
|
|
|
|
|
inst->src[i].offset % 4;
|
|
|
|
|
|
|
|
|
|
progress = true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
|
|
|
|
|
inst->src[0].file == UNIFORM) {
|
|
|
|
|
|
|
|
|
|
if (!s.get_pull_locs(inst->src[0], &index, &pull_index))
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
s.VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst,
|
|
|
|
|
brw_imm_ud(index),
|
|
|
|
|
fs_reg() /* surface_handle */,
|
|
|
|
|
inst->src[1],
|
|
|
|
|
pull_index * 4, 4, 1);
|
|
|
|
|
inst->remove(block);
|
|
|
|
|
|
|
|
|
|
progress = true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
|
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool
|
|
|
|
|
brw_fs_lower_load_payload(fs_visitor &s)
|
|
|
|
|
{
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
|
|
|
|
foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
|
|
|
|
|
if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
|
|
|
|
|
continue;
|
|
|
|
|
|
2024-02-21 21:21:20 -08:00
|
|
|
assert(inst->dst.file == VGRF);
|
2024-01-04 23:27:04 -08:00
|
|
|
assert(inst->saturate == false);
|
|
|
|
|
fs_reg dst = inst->dst;
|
|
|
|
|
|
|
|
|
|
const fs_builder ibld(&s, block, inst);
|
|
|
|
|
const fs_builder ubld = ibld.exec_all();
|
|
|
|
|
|
|
|
|
|
for (uint8_t i = 0; i < inst->header_size;) {
|
|
|
|
|
/* Number of header GRFs to initialize at once with a single MOV
|
|
|
|
|
* instruction.
|
|
|
|
|
*/
|
|
|
|
|
const unsigned n =
|
|
|
|
|
(i + 1 < inst->header_size && inst->src[i].stride == 1 &&
|
|
|
|
|
inst->src[i + 1].equals(byte_offset(inst->src[i], REG_SIZE))) ?
|
|
|
|
|
2 : 1;
|
|
|
|
|
|
|
|
|
|
if (inst->src[i].file != BAD_FILE)
|
|
|
|
|
ubld.group(8 * n, 0).MOV(retype(dst, BRW_REGISTER_TYPE_UD),
|
|
|
|
|
retype(inst->src[i], BRW_REGISTER_TYPE_UD));
|
|
|
|
|
|
|
|
|
|
dst = byte_offset(dst, n * REG_SIZE);
|
|
|
|
|
i += n;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (uint8_t i = inst->header_size; i < inst->sources; i++) {
|
|
|
|
|
dst.type = inst->src[i].type;
|
|
|
|
|
if (inst->src[i].file != BAD_FILE) {
|
|
|
|
|
ibld.MOV(dst, inst->src[i]);
|
|
|
|
|
}
|
|
|
|
|
dst = offset(dst, ibld, 1);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
inst->remove(block);
|
|
|
|
|
progress = true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (progress)
|
|
|
|
|
s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
|
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool
|
|
|
|
|
brw_fs_lower_minmax(fs_visitor &s)
|
|
|
|
|
{
|
|
|
|
|
assert(s.devinfo->ver < 6);
|
|
|
|
|
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
|
|
|
|
foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
|
|
|
|
|
const fs_builder ibld(&s, block, inst);
|
|
|
|
|
|
|
|
|
|
if (inst->opcode == BRW_OPCODE_SEL &&
|
|
|
|
|
inst->predicate == BRW_PREDICATE_NONE) {
|
|
|
|
|
/* If src1 is an immediate value that is not NaN, then it can't be
|
|
|
|
|
* NaN. In that case, emit CMP because it is much better for cmod
|
|
|
|
|
* propagation. Likewise if src1 is not float. Gfx4 and Gfx5 don't
|
|
|
|
|
* support HF or DF, so it is not necessary to check for those.
|
|
|
|
|
*/
|
|
|
|
|
if (inst->src[1].type != BRW_REGISTER_TYPE_F ||
|
|
|
|
|
(inst->src[1].file == IMM && !isnan(inst->src[1].f))) {
|
|
|
|
|
ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
|
|
|
|
|
inst->conditional_mod);
|
|
|
|
|
} else {
|
|
|
|
|
ibld.CMPN(ibld.null_reg_d(), inst->src[0], inst->src[1],
|
|
|
|
|
inst->conditional_mod);
|
|
|
|
|
}
|
|
|
|
|
inst->predicate = BRW_PREDICATE_NORMAL;
|
|
|
|
|
inst->conditional_mod = BRW_CONDITIONAL_NONE;
|
|
|
|
|
|
|
|
|
|
progress = true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (progress)
|
|
|
|
|
s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
|
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool
|
|
|
|
|
brw_fs_lower_sub_sat(fs_visitor &s)
|
|
|
|
|
{
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
|
|
|
|
foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
|
|
|
|
|
const fs_builder ibld(&s, block, inst);
|
|
|
|
|
|
|
|
|
|
if (inst->opcode == SHADER_OPCODE_USUB_SAT ||
|
|
|
|
|
inst->opcode == SHADER_OPCODE_ISUB_SAT) {
|
|
|
|
|
/* The fundamental problem is the hardware performs source negation
|
|
|
|
|
* at the bit width of the source. If the source is 0x80000000D, the
|
|
|
|
|
* negation is 0x80000000D. As a result, subtractSaturate(0,
|
|
|
|
|
* 0x80000000) will produce 0x80000000 instead of 0x7fffffff. There
|
|
|
|
|
* are at least three ways to resolve this:
|
|
|
|
|
*
|
|
|
|
|
* 1. Use the accumulator for the negated source. The accumulator is
|
|
|
|
|
* 33 bits, so our source 0x80000000 is sign-extended to
|
|
|
|
|
* 0x1800000000. The negation of which is 0x080000000. This
|
|
|
|
|
* doesn't help for 64-bit integers (which are already bigger than
|
|
|
|
|
* 33 bits). There are also only 8 accumulators, so SIMD16 or
|
|
|
|
|
* SIMD32 instructions would have to be split into multiple SIMD8
|
|
|
|
|
* instructions.
|
|
|
|
|
*
|
|
|
|
|
* 2. Use slightly different math. For any n-bit value x, we know (x
|
|
|
|
|
* >> 1) != -(x >> 1). We can use this fact to only do
|
|
|
|
|
* subtractions involving (x >> 1). subtractSaturate(a, b) ==
|
|
|
|
|
* subtractSaturate(subtractSaturate(a, (b >> 1)), b - (b >> 1)).
|
|
|
|
|
*
|
|
|
|
|
* 3. For unsigned sources, it is sufficient to replace the
|
|
|
|
|
* subtractSaturate with (a > b) ? a - b : 0.
|
|
|
|
|
*
|
|
|
|
|
* It may also be possible to use the SUBB instruction. This
|
|
|
|
|
* implicitly writes the accumulator, so it could only be used in the
|
|
|
|
|
* same situations as #1 above. It is further limited by only
|
|
|
|
|
* allowing UD sources.
|
|
|
|
|
*/
|
|
|
|
|
if (inst->exec_size == 8 && inst->src[0].type != BRW_REGISTER_TYPE_Q &&
|
|
|
|
|
inst->src[0].type != BRW_REGISTER_TYPE_UQ) {
|
intel/brw: Use helper to create accumulator register
This ensure the region triple <V,W,H> is set correctly, in this case the
desired region is a sequential like <8,8,1>. Without the helper the
sequence we get is <0,1,0> -- which the generator currently partially
adjusts when emitting code, but is not sufficient when doing validation
earlier.
The code generated code is slightly modified. From crucible test
func.shader.subtractSaturate.uint in the fragment shader for SIMD8, the
diff looks like
```
mov(8) acc0<1>UD g21<8,8,1>UD { align1 1Q $0.dst };
-add.sat(8) g22<1>UD -acc0<0,1,0>UD g16<8,8,1>UD { align1 1Q @1 $0.dst };
+add.sat(8) g22<1>UD -acc0<8,8,1>UD g16<8,8,1>UD { align1 1Q @1 $0.dst };
```
Note that without the patch generator adjusted the hstride for acc0 used
as destination (see brw_set_dest), but kept the src region as is. For
the source, it is not clear to me why the <0,1,0> would work correctly
here since it is a scalar, but using <8,8,1> it is correct.
Fixes: 58907568ec5 ("intel/fs: Add SHADER_OPCODE_[IU]SUB_SAT pseudo-ops")
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28059>
2024-03-08 08:36:03 -08:00
|
|
|
fs_reg acc = retype(brw_acc_reg(inst->exec_size),
|
|
|
|
|
inst->src[1].type);
|
2024-01-04 23:27:04 -08:00
|
|
|
|
|
|
|
|
ibld.MOV(acc, inst->src[1]);
|
|
|
|
|
fs_inst *add = ibld.ADD(inst->dst, acc, inst->src[0]);
|
|
|
|
|
add->saturate = true;
|
|
|
|
|
add->src[0].negate = true;
|
|
|
|
|
} else if (inst->opcode == SHADER_OPCODE_ISUB_SAT) {
|
|
|
|
|
/* tmp = src1 >> 1;
|
|
|
|
|
* dst = add.sat(add.sat(src0, -tmp), -(src1 - tmp));
|
|
|
|
|
*/
|
|
|
|
|
fs_reg tmp1 = ibld.vgrf(inst->src[0].type);
|
|
|
|
|
fs_reg tmp2 = ibld.vgrf(inst->src[0].type);
|
|
|
|
|
fs_reg tmp3 = ibld.vgrf(inst->src[0].type);
|
|
|
|
|
fs_inst *add;
|
|
|
|
|
|
|
|
|
|
ibld.SHR(tmp1, inst->src[1], brw_imm_d(1));
|
|
|
|
|
|
|
|
|
|
add = ibld.ADD(tmp2, inst->src[1], tmp1);
|
|
|
|
|
add->src[1].negate = true;
|
|
|
|
|
|
|
|
|
|
add = ibld.ADD(tmp3, inst->src[0], tmp1);
|
|
|
|
|
add->src[1].negate = true;
|
|
|
|
|
add->saturate = true;
|
|
|
|
|
|
|
|
|
|
add = ibld.ADD(inst->dst, tmp3, tmp2);
|
|
|
|
|
add->src[1].negate = true;
|
|
|
|
|
add->saturate = true;
|
|
|
|
|
} else {
|
|
|
|
|
/* a > b ? a - b : 0 */
|
|
|
|
|
ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
|
|
|
|
|
BRW_CONDITIONAL_G);
|
|
|
|
|
|
|
|
|
|
fs_inst *add = ibld.ADD(inst->dst, inst->src[0], inst->src[1]);
|
|
|
|
|
add->src[1].negate = !add->src[1].negate;
|
|
|
|
|
|
|
|
|
|
ibld.SEL(inst->dst, inst->dst, brw_imm_ud(0))
|
|
|
|
|
->predicate = BRW_PREDICATE_NORMAL;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
inst->remove(block);
|
|
|
|
|
progress = true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (progress)
|
|
|
|
|
s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
|
|
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Transform barycentric vectors into the interleaved form expected by the PLN
|
|
|
|
|
* instruction and returned by the Gfx7+ PI shared function.
|
|
|
|
|
*
|
|
|
|
|
* For channels 0-15 in SIMD16 mode they are expected to be laid out as
|
|
|
|
|
* follows in the register file:
|
|
|
|
|
*
|
|
|
|
|
* rN+0: X[0-7]
|
|
|
|
|
* rN+1: Y[0-7]
|
|
|
|
|
* rN+2: X[8-15]
|
|
|
|
|
* rN+3: Y[8-15]
|
|
|
|
|
*
|
|
|
|
|
* There is no need to handle SIMD32 here -- This is expected to be run after
|
|
|
|
|
* SIMD lowering, since SIMD lowering relies on vectors having the standard
|
|
|
|
|
* component layout.
|
|
|
|
|
*/
|
|
|
|
|
bool
|
|
|
|
|
brw_fs_lower_barycentrics(fs_visitor &s)
|
|
|
|
|
{
|
|
|
|
|
const intel_device_info *devinfo = s.devinfo;
|
|
|
|
|
|
2024-02-15 13:19:08 -08:00
|
|
|
if (s.stage != MESA_SHADER_FRAGMENT || devinfo->ver >= 20)
|
2024-01-04 23:27:04 -08:00
|
|
|
return false;
|
|
|
|
|
|
2024-02-15 13:19:08 -08:00
|
|
|
bool progress = false;
|
|
|
|
|
|
2024-01-04 23:27:04 -08:00
|
|
|
foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
|
|
|
|
|
if (inst->exec_size < 16)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
const fs_builder ibld(&s, block, inst);
|
|
|
|
|
const fs_builder ubld = ibld.exec_all().group(8, 0);
|
|
|
|
|
|
|
|
|
|
switch (inst->opcode) {
|
|
|
|
|
case FS_OPCODE_LINTERP : {
|
|
|
|
|
assert(inst->exec_size == 16);
|
|
|
|
|
const fs_reg tmp = ibld.vgrf(inst->src[0].type, 2);
|
|
|
|
|
fs_reg srcs[4];
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(srcs); i++)
|
|
|
|
|
srcs[i] = horiz_offset(offset(inst->src[0], ibld, i % 2),
|
|
|
|
|
8 * (i / 2));
|
|
|
|
|
|
|
|
|
|
ubld.LOAD_PAYLOAD(tmp, srcs, ARRAY_SIZE(srcs), ARRAY_SIZE(srcs));
|
|
|
|
|
|
|
|
|
|
inst->src[0] = tmp;
|
|
|
|
|
progress = true;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
|
|
|
|
|
case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
|
|
|
|
|
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: {
|
|
|
|
|
assert(inst->exec_size == 16);
|
|
|
|
|
const fs_reg tmp = ibld.vgrf(inst->dst.type, 2);
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < 2; i++) {
|
|
|
|
|
for (unsigned g = 0; g < inst->exec_size / 8; g++) {
|
|
|
|
|
fs_inst *mov = ibld.at(block, inst->next).group(8, g)
|
|
|
|
|
.MOV(horiz_offset(offset(inst->dst, ibld, i),
|
|
|
|
|
8 * g),
|
|
|
|
|
offset(tmp, ubld, 2 * g + i));
|
|
|
|
|
mov->predicate = inst->predicate;
|
|
|
|
|
mov->predicate_inverse = inst->predicate_inverse;
|
|
|
|
|
mov->flag_subreg = inst->flag_subreg;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
inst->dst = tmp;
|
|
|
|
|
progress = true;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
default:
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (progress)
|
|
|
|
|
s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
|
|
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Lower a derivative instruction as the floating-point difference of two
|
|
|
|
|
* swizzles of the source, specified as \p swz0 and \p swz1.
|
|
|
|
|
*/
|
|
|
|
|
static bool
|
|
|
|
|
lower_derivative(fs_visitor &s, bblock_t *block, fs_inst *inst,
|
|
|
|
|
unsigned swz0, unsigned swz1)
|
|
|
|
|
{
|
|
|
|
|
const fs_builder ubld = fs_builder(&s, block, inst).exec_all();
|
|
|
|
|
const fs_reg tmp0 = ubld.vgrf(inst->src[0].type);
|
|
|
|
|
const fs_reg tmp1 = ubld.vgrf(inst->src[0].type);
|
|
|
|
|
|
|
|
|
|
ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp0, inst->src[0], brw_imm_ud(swz0));
|
|
|
|
|
ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp1, inst->src[0], brw_imm_ud(swz1));
|
|
|
|
|
|
|
|
|
|
inst->resize_sources(2);
|
|
|
|
|
inst->src[0] = negate(tmp0);
|
|
|
|
|
inst->src[1] = tmp1;
|
|
|
|
|
inst->opcode = BRW_OPCODE_ADD;
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Lower derivative instructions on platforms where codegen cannot implement
|
|
|
|
|
* them efficiently (i.e. XeHP).
|
|
|
|
|
*/
|
|
|
|
|
bool
|
|
|
|
|
brw_fs_lower_derivatives(fs_visitor &s)
|
|
|
|
|
{
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
|
|
|
|
if (s.devinfo->verx10 < 125)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
|
|
|
|
|
if (inst->opcode == FS_OPCODE_DDX_COARSE)
|
|
|
|
|
progress |= lower_derivative(s, block, inst,
|
|
|
|
|
BRW_SWIZZLE_XXXX, BRW_SWIZZLE_YYYY);
|
|
|
|
|
|
|
|
|
|
else if (inst->opcode == FS_OPCODE_DDX_FINE)
|
|
|
|
|
progress |= lower_derivative(s, block, inst,
|
|
|
|
|
BRW_SWIZZLE_XXZZ, BRW_SWIZZLE_YYWW);
|
|
|
|
|
|
|
|
|
|
else if (inst->opcode == FS_OPCODE_DDY_COARSE)
|
|
|
|
|
progress |= lower_derivative(s, block, inst,
|
|
|
|
|
BRW_SWIZZLE_XXXX, BRW_SWIZZLE_ZZZZ);
|
|
|
|
|
|
|
|
|
|
else if (inst->opcode == FS_OPCODE_DDY_FINE)
|
|
|
|
|
progress |= lower_derivative(s, block, inst,
|
|
|
|
|
BRW_SWIZZLE_XYXY, BRW_SWIZZLE_ZWZW);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (progress)
|
|
|
|
|
s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
|
|
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool
|
|
|
|
|
brw_fs_lower_find_live_channel(fs_visitor &s)
|
|
|
|
|
{
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
|
|
|
|
bool packed_dispatch =
|
|
|
|
|
brw_stage_has_packed_dispatch(s.devinfo, s.stage, s.max_polygons,
|
2024-02-19 23:07:04 -08:00
|
|
|
s.prog_data);
|
2024-01-04 23:27:04 -08:00
|
|
|
bool vmask =
|
|
|
|
|
s.stage == MESA_SHADER_FRAGMENT &&
|
2024-02-19 23:07:04 -08:00
|
|
|
brw_wm_prog_data(s.prog_data)->uses_vmask;
|
2024-01-04 23:27:04 -08:00
|
|
|
|
|
|
|
|
foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
|
|
|
|
|
if (inst->opcode != SHADER_OPCODE_FIND_LIVE_CHANNEL &&
|
2024-01-05 09:19:38 -08:00
|
|
|
inst->opcode != SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL &&
|
|
|
|
|
inst->opcode != SHADER_OPCODE_LOAD_LIVE_CHANNELS)
|
2024-01-04 23:27:04 -08:00
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
bool first = inst->opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL;
|
|
|
|
|
|
|
|
|
|
/* Getting the first active channel index is easy on Gfx8: Just find
|
|
|
|
|
* the first bit set in the execution mask. The register exists on
|
|
|
|
|
* HSW already but it reads back as all ones when the current
|
|
|
|
|
* instruction has execution masking disabled, so it's kind of
|
|
|
|
|
* useless there.
|
|
|
|
|
*/
|
|
|
|
|
fs_reg exec_mask(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD));
|
|
|
|
|
|
|
|
|
|
const fs_builder ibld(&s, block, inst);
|
|
|
|
|
if (!inst->is_partial_write())
|
|
|
|
|
ibld.emit_undef_for_dst(inst);
|
|
|
|
|
|
|
|
|
|
const fs_builder ubld = fs_builder(&s, block, inst).exec_all().group(1, 0);
|
|
|
|
|
|
|
|
|
|
/* ce0 doesn't consider the thread dispatch mask (DMask or VMask),
|
|
|
|
|
* so combine the execution and dispatch masks to obtain the true mask.
|
|
|
|
|
*
|
|
|
|
|
* If we're looking for the first live channel, and we have packed
|
|
|
|
|
* dispatch, we can skip this step, as we know all dispatched channels
|
|
|
|
|
* will appear at the front of the mask.
|
|
|
|
|
*/
|
|
|
|
|
if (!(first && packed_dispatch)) {
|
|
|
|
|
fs_reg mask = ubld.vgrf(BRW_REGISTER_TYPE_UD);
|
|
|
|
|
ubld.UNDEF(mask);
|
|
|
|
|
ubld.emit(SHADER_OPCODE_READ_SR_REG, mask, brw_imm_ud(vmask ? 3 : 2));
|
|
|
|
|
|
|
|
|
|
/* Quarter control has the effect of magically shifting the value of
|
|
|
|
|
* ce0 so you'll get the first/last active channel relative to the
|
|
|
|
|
* specified quarter control as result.
|
|
|
|
|
*/
|
|
|
|
|
if (inst->group > 0)
|
|
|
|
|
ubld.SHR(mask, mask, brw_imm_ud(ALIGN(inst->group, 8)));
|
|
|
|
|
|
|
|
|
|
ubld.AND(mask, exec_mask, mask);
|
|
|
|
|
exec_mask = mask;
|
|
|
|
|
}
|
|
|
|
|
|
2024-01-05 09:19:38 -08:00
|
|
|
switch (inst->opcode) {
|
|
|
|
|
case SHADER_OPCODE_FIND_LIVE_CHANNEL:
|
2024-01-04 23:27:04 -08:00
|
|
|
ubld.FBL(inst->dst, exec_mask);
|
2024-01-05 09:19:38 -08:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL: {
|
2024-01-04 23:27:04 -08:00
|
|
|
fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 1);
|
|
|
|
|
ubld.UNDEF(tmp);
|
|
|
|
|
ubld.LZD(tmp, exec_mask);
|
|
|
|
|
ubld.ADD(inst->dst, negate(tmp), brw_imm_uw(31));
|
2024-01-05 09:19:38 -08:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
case SHADER_OPCODE_LOAD_LIVE_CHANNELS:
|
|
|
|
|
ubld.MOV(inst->dst, exec_mask);
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
default:
|
|
|
|
|
unreachable("Impossible.");
|
2024-01-04 23:27:04 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
inst->remove(block);
|
|
|
|
|
progress = true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (progress)
|
|
|
|
|
s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
|
|
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* From the Skylake PRM Vol. 2a docs for sends:
|
|
|
|
|
*
|
|
|
|
|
* "It is required that the second block of GRFs does not overlap with the
|
|
|
|
|
* first block."
|
|
|
|
|
*
|
|
|
|
|
* There are plenty of cases where we may accidentally violate this due to
|
|
|
|
|
* having, for instance, both sources be the constant 0. This little pass
|
|
|
|
|
* just adds a new vgrf for the second payload and copies it over.
|
|
|
|
|
*/
|
|
|
|
|
bool
|
|
|
|
|
brw_fs_lower_sends_overlapping_payload(fs_visitor &s)
|
|
|
|
|
{
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
|
|
|
|
foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
|
|
|
|
|
if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 &&
|
|
|
|
|
regions_overlap(inst->src[2], inst->mlen * REG_SIZE,
|
|
|
|
|
inst->src[3], inst->ex_mlen * REG_SIZE)) {
|
2024-01-12 02:58:30 -08:00
|
|
|
const unsigned arg = inst->mlen < inst->ex_mlen ? 2 : 3;
|
|
|
|
|
const unsigned len = MIN2(inst->mlen, inst->ex_mlen);
|
|
|
|
|
|
|
|
|
|
fs_reg tmp = fs_reg(VGRF, s.alloc.allocate(len),
|
2024-01-04 23:27:04 -08:00
|
|
|
BRW_REGISTER_TYPE_UD);
|
|
|
|
|
/* Sadly, we've lost all notion of channels and bit sizes at this
|
|
|
|
|
* point. Just WE_all it.
|
|
|
|
|
*/
|
|
|
|
|
const fs_builder ibld = fs_builder(&s, block, inst).exec_all().group(16, 0);
|
2024-01-12 02:58:30 -08:00
|
|
|
fs_reg copy_src = retype(inst->src[arg], BRW_REGISTER_TYPE_UD);
|
2024-01-04 23:27:04 -08:00
|
|
|
fs_reg copy_dst = tmp;
|
2024-01-12 02:58:30 -08:00
|
|
|
for (unsigned i = 0; i < len; i += 2) {
|
|
|
|
|
if (len == i + 1) {
|
2024-01-04 23:27:04 -08:00
|
|
|
/* Only one register left; do SIMD8 */
|
|
|
|
|
ibld.group(8, 0).MOV(copy_dst, copy_src);
|
|
|
|
|
} else {
|
|
|
|
|
ibld.MOV(copy_dst, copy_src);
|
|
|
|
|
}
|
|
|
|
|
copy_src = offset(copy_src, ibld, 1);
|
|
|
|
|
copy_dst = offset(copy_dst, ibld, 1);
|
|
|
|
|
}
|
2024-01-12 02:58:30 -08:00
|
|
|
inst->src[arg] = tmp;
|
2024-01-04 23:27:04 -08:00
|
|
|
progress = true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (progress)
|
|
|
|
|
s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
|
|
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
2024-02-21 21:21:20 -08:00
|
|
|
* Three source instruction must have a GRF destination register.
|
2024-01-04 23:27:04 -08:00
|
|
|
* ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
|
|
|
|
|
*/
|
|
|
|
|
bool
|
|
|
|
|
brw_fs_lower_3src_null_dest(fs_visitor &s)
|
|
|
|
|
{
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
|
|
|
|
foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
|
|
|
|
|
if (inst->is_3src(s.compiler) && inst->dst.is_null()) {
|
|
|
|
|
inst->dst = fs_reg(VGRF, s.alloc.allocate(s.dispatch_width / 8),
|
|
|
|
|
inst->dst.type);
|
|
|
|
|
progress = true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (progress)
|
|
|
|
|
s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
|
|
|
|
|
DEPENDENCY_VARIABLES);
|
|
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
}
|
|
|
|
|
|