mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-21 11:30:11 +01:00
609 lines
20 KiB
C++
609 lines
20 KiB
C++
|
|
/*
|
||
|
|
* Copyright © 2010 Intel Corporation
|
||
|
|
* SPDX-License-Identifier: MIT
|
||
|
|
*/
|
||
|
|
|
||
|
|
#include "brw_fs.h"
|
||
|
|
#include "brw_fs_builder.h"
|
||
|
|
|
||
|
|
using namespace brw;
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
|
||
|
|
* or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
|
||
|
|
*/
|
||
|
|
bool
|
||
|
|
brw_fs_lower_constant_loads(fs_visitor &s)
|
||
|
|
{
|
||
|
|
unsigned index, pull_index;
|
||
|
|
bool progress = false;
|
||
|
|
|
||
|
|
foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
|
||
|
|
/* Set up the annotation tracking for new generated instructions. */
|
||
|
|
const fs_builder ibld(&s, block, inst);
|
||
|
|
|
||
|
|
for (int i = 0; i < inst->sources; i++) {
|
||
|
|
if (inst->src[i].file != UNIFORM)
|
||
|
|
continue;
|
||
|
|
|
||
|
|
/* We'll handle this case later */
|
||
|
|
if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)
|
||
|
|
continue;
|
||
|
|
|
||
|
|
if (!s.get_pull_locs(inst->src[i], &index, &pull_index))
|
||
|
|
continue;
|
||
|
|
|
||
|
|
assert(inst->src[i].stride == 0);
|
||
|
|
|
||
|
|
const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
|
||
|
|
const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0);
|
||
|
|
const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
|
||
|
|
const unsigned base = pull_index * 4;
|
||
|
|
|
||
|
|
fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS];
|
||
|
|
srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = brw_imm_ud(index);
|
||
|
|
srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = brw_imm_ud(base & ~(block_sz - 1));
|
||
|
|
srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = brw_imm_ud(block_sz);
|
||
|
|
|
||
|
|
|
||
|
|
ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, dst,
|
||
|
|
srcs, PULL_UNIFORM_CONSTANT_SRCS);
|
||
|
|
|
||
|
|
/* Rewrite the instruction to use the temporary VGRF. */
|
||
|
|
inst->src[i].file = VGRF;
|
||
|
|
inst->src[i].nr = dst.nr;
|
||
|
|
inst->src[i].offset = (base & (block_sz - 1)) +
|
||
|
|
inst->src[i].offset % 4;
|
||
|
|
|
||
|
|
progress = true;
|
||
|
|
}
|
||
|
|
|
||
|
|
if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
|
||
|
|
inst->src[0].file == UNIFORM) {
|
||
|
|
|
||
|
|
if (!s.get_pull_locs(inst->src[0], &index, &pull_index))
|
||
|
|
continue;
|
||
|
|
|
||
|
|
s.VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst,
|
||
|
|
brw_imm_ud(index),
|
||
|
|
fs_reg() /* surface_handle */,
|
||
|
|
inst->src[1],
|
||
|
|
pull_index * 4, 4, 1);
|
||
|
|
inst->remove(block);
|
||
|
|
|
||
|
|
progress = true;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
||
|
|
|
||
|
|
return progress;
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
 * Lower SHADER_OPCODE_LOAD_PAYLOAD virtual instructions into the series of
 * MOVs that actually assemble the destination payload, including the header
 * copies and the interleaved COMPR4 MRF layout used by gen <= 5 fb writes.
 *
 * Returns true if any LOAD_PAYLOAD was lowered (and removed).
 */
bool
brw_fs_lower_load_payload(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      assert(inst->dst.file == MRF || inst->dst.file == VGRF);
      assert(inst->saturate == false);
      fs_reg dst = inst->dst;

      /* Get rid of COMPR4. We'll add it back in if we need it */
      if (dst.file == MRF)
         dst.nr = dst.nr & ~BRW_MRF_COMPR4;

      const fs_builder ibld(&s, block, inst);
      const fs_builder ubld = ibld.exec_all();

      /* Copy the header sources first, coalescing adjacent GRFs where
       * possible.
       */
      for (uint8_t i = 0; i < inst->header_size;) {
         /* Number of header GRFs to initialize at once with a single MOV
          * instruction.  Two consecutive sources that form one contiguous
          * region can be copied with a single wider MOV.
          */
         const unsigned n =
            (i + 1 < inst->header_size && inst->src[i].stride == 1 &&
             inst->src[i + 1].equals(byte_offset(inst->src[i], REG_SIZE))) ?
            2 : 1;

         /* BAD_FILE sources are holes that are left uninitialized. */
         if (inst->src[i].file != BAD_FILE)
            ubld.group(8 * n, 0).MOV(retype(dst, BRW_REGISTER_TYPE_UD),
                                     retype(inst->src[i], BRW_REGISTER_TYPE_UD));

         dst = byte_offset(dst, n * REG_SIZE);
         i += n;
      }

      if (inst->dst.file == MRF && (inst->dst.nr & BRW_MRF_COMPR4) &&
          inst->exec_size > 8) {
         /* In this case, the payload portion of the LOAD_PAYLOAD isn't
          * a straightforward copy.  Instead, the result of the
          * LOAD_PAYLOAD is treated as interleaved and the first four
          * non-header sources are unpacked as:
          *
          * m + 0: r0
          * m + 1: g0
          * m + 2: b0
          * m + 3: a0
          * m + 4: r1
          * m + 5: g1
          * m + 6: b1
          * m + 7: a1
          *
          * This is used for gen <= 5 fb writes.
          */
         assert(inst->exec_size == 16);
         assert(inst->header_size + 4 <= inst->sources);
         for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
            if (inst->src[i].file != BAD_FILE) {
               if (s.devinfo->has_compr4) {
                  /* A single COMPR4 MOV writes both interleaved halves. */
                  fs_reg compr4_dst = retype(dst, inst->src[i].type);
                  compr4_dst.nr |= BRW_MRF_COMPR4;
                  ibld.MOV(compr4_dst, inst->src[i]);
               } else {
                  /* Platform doesn't have COMPR4.  We have to fake it */
                  fs_reg mov_dst = retype(dst, inst->src[i].type);
                  ibld.quarter(0).MOV(mov_dst, quarter(inst->src[i], 0));
                  /* Second half lands 4 MRFs further down. */
                  mov_dst.nr += 4;
                  ibld.quarter(1).MOV(mov_dst, quarter(inst->src[i], 1));
               }
            }

            dst.nr++;
         }

         /* The loop above only ever incremented us through the first set
          * of 4 registers.  However, thanks to the magic of COMPR4, we
          * actually wrote to the first 8 registers, so we need to take
          * that into account now.
          */
         dst.nr += 4;

         /* The COMPR4 code took care of the first 4 sources.  We'll let
          * the regular path handle any remaining sources.  Yes, we are
          * modifying the instruction but we're about to delete it so
          * this really doesn't hurt anything.
          */
         inst->header_size += 4;
      }

      /* Copy the remaining (non-header) sources one register-sized chunk
       * at a time at the original execution size.
       */
      for (uint8_t i = inst->header_size; i < inst->sources; i++) {
         dst.type = inst->src[i].type;
         if (inst->src[i].file != BAD_FILE) {
            ibld.MOV(dst, inst->src[i]);
         }
         dst = offset(dst, ibld, 1);
      }

      inst->remove(block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}
|
||
|
|
|
||
|
|
bool
|
||
|
|
brw_fs_lower_minmax(fs_visitor &s)
|
||
|
|
{
|
||
|
|
assert(s.devinfo->ver < 6);
|
||
|
|
|
||
|
|
bool progress = false;
|
||
|
|
|
||
|
|
foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
|
||
|
|
const fs_builder ibld(&s, block, inst);
|
||
|
|
|
||
|
|
if (inst->opcode == BRW_OPCODE_SEL &&
|
||
|
|
inst->predicate == BRW_PREDICATE_NONE) {
|
||
|
|
/* If src1 is an immediate value that is not NaN, then it can't be
|
||
|
|
* NaN. In that case, emit CMP because it is much better for cmod
|
||
|
|
* propagation. Likewise if src1 is not float. Gfx4 and Gfx5 don't
|
||
|
|
* support HF or DF, so it is not necessary to check for those.
|
||
|
|
*/
|
||
|
|
if (inst->src[1].type != BRW_REGISTER_TYPE_F ||
|
||
|
|
(inst->src[1].file == IMM && !isnan(inst->src[1].f))) {
|
||
|
|
ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
|
||
|
|
inst->conditional_mod);
|
||
|
|
} else {
|
||
|
|
ibld.CMPN(ibld.null_reg_d(), inst->src[0], inst->src[1],
|
||
|
|
inst->conditional_mod);
|
||
|
|
}
|
||
|
|
inst->predicate = BRW_PREDICATE_NORMAL;
|
||
|
|
inst->conditional_mod = BRW_CONDITIONAL_NONE;
|
||
|
|
|
||
|
|
progress = true;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if (progress)
|
||
|
|
s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
||
|
|
|
||
|
|
return progress;
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
 * Lower SHADER_OPCODE_USUB_SAT and SHADER_OPCODE_ISUB_SAT into instruction
 * sequences the hardware can execute correctly, working around the fact
 * that source negation happens at the bit width of the source.
 *
 * Returns true if any subtract-saturate was lowered (and removed).
 */
bool
brw_fs_lower_sub_sat(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      const fs_builder ibld(&s, block, inst);

      if (inst->opcode == SHADER_OPCODE_USUB_SAT ||
          inst->opcode == SHADER_OPCODE_ISUB_SAT) {
         /* The fundamental problem is the hardware performs source negation
          * at the bit width of the source.  If the source is 0x80000000, the
          * negation is 0x80000000.  As a result, subtractSaturate(0,
          * 0x80000000) will produce 0x80000000 instead of 0x7fffffff.  There
          * are at least three ways to resolve this:
          *
          * 1. Use the accumulator for the negated source.  The accumulator is
          *    33 bits, so our source 0x80000000 is sign-extended to
          *    0x1800000000.  The negation of which is 0x080000000.  This
          *    doesn't help for 64-bit integers (which are already bigger than
          *    33 bits).  There are also only 8 accumulators, so SIMD16 or
          *    SIMD32 instructions would have to be split into multiple SIMD8
          *    instructions.
          *
          * 2. Use slightly different math.  For any n-bit value x, we know (x
          *    >> 1) != -(x >> 1).  We can use this fact to only do
          *    subtractions involving (x >> 1).  subtractSaturate(a, b) ==
          *    subtractSaturate(subtractSaturate(a, (b >> 1)), b - (b >> 1)).
          *
          * 3. For unsigned sources, it is sufficient to replace the
          *    subtractSaturate with (a > b) ? a - b : 0.
          *
          * It may also be possible to use the SUBB instruction.  This
          * implicitly writes the accumulator, so it could only be used in the
          * same situations as #1 above.  It is further limited by only
          * allowing UD sources.
          */
         if (inst->exec_size == 8 && inst->src[0].type != BRW_REGISTER_TYPE_Q &&
             inst->src[0].type != BRW_REGISTER_TYPE_UQ) {
            /* Strategy #1: negate src1 through the wider accumulator. */
            fs_reg acc(ARF, BRW_ARF_ACCUMULATOR, inst->src[1].type);

            ibld.MOV(acc, inst->src[1]);
            fs_inst *add = ibld.ADD(inst->dst, acc, inst->src[0]);
            add->saturate = true;
            add->src[0].negate = true;
         } else if (inst->opcode == SHADER_OPCODE_ISUB_SAT) {
            /* Strategy #2:
             *
             * tmp = src1 >> 1;
             * dst = add.sat(add.sat(src0, -tmp), -(src1 - tmp));
             */
            fs_reg tmp1 = ibld.vgrf(inst->src[0].type);
            fs_reg tmp2 = ibld.vgrf(inst->src[0].type);
            fs_reg tmp3 = ibld.vgrf(inst->src[0].type);
            fs_inst *add;

            /* tmp1 = src1 >> 1 */
            ibld.SHR(tmp1, inst->src[1], brw_imm_d(1));

            /* tmp2 = src1 - tmp1 */
            add = ibld.ADD(tmp2, inst->src[1], tmp1);
            add->src[1].negate = true;

            /* tmp3 = add.sat(src0, -tmp1) */
            add = ibld.ADD(tmp3, inst->src[0], tmp1);
            add->src[1].negate = true;
            add->saturate = true;

            /* dst = add.sat(tmp3, -tmp2) */
            add = ibld.ADD(inst->dst, tmp3, tmp2);
            add->src[1].negate = true;
            add->saturate = true;
         } else {
            /* Strategy #3 (unsigned): a > b ? a - b : 0 */
            ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
                     BRW_CONDITIONAL_G);

            fs_inst *add = ibld.ADD(inst->dst, inst->src[0], inst->src[1]);
            add->src[1].negate = !add->src[1].negate;

            /* Select 0 wherever a <= b, using the flag written by CMP. */
            ibld.SEL(inst->dst, inst->dst, brw_imm_ud(0))
               ->predicate = BRW_PREDICATE_NORMAL;
         }

         inst->remove(block);
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}
|
||
|
|
|
||
|
|
/**
 * Transform barycentric vectors into the interleaved form expected by the PLN
 * instruction and returned by the Gfx7+ PI shared function.
 *
 * For channels 0-15 in SIMD16 mode they are expected to be laid out as
 * follows in the register file:
 *
 *    rN+0: X[0-7]
 *    rN+1: Y[0-7]
 *    rN+2: X[8-15]
 *    rN+3: Y[8-15]
 *
 * There is no need to handle SIMD32 here -- This is expected to be run after
 * SIMD lowering, since SIMD lowering relies on vectors having the standard
 * component layout.
 */
bool
brw_fs_lower_barycentrics(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   /* Only platforms in this range use the interleaved layout. */
   const bool has_interleaved_layout = devinfo->has_pln ||
      (devinfo->ver >= 7 && devinfo->ver < 20);
   bool progress = false;

   if (s.stage != MESA_SHADER_FRAGMENT || !has_interleaved_layout)
      return false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->exec_size < 16)
         continue;

      const fs_builder ibld(&s, block, inst);
      const fs_builder ubld = ibld.exec_all().group(8, 0);

      switch (inst->opcode) {
      case FS_OPCODE_LINTERP : {
         /* Repack the standard-layout barycentric source into an
          * interleaved temporary before it is consumed.
          */
         assert(inst->exec_size == 16);
         const fs_reg tmp = ibld.vgrf(inst->src[0].type, 2);
         fs_reg srcs[4];

         /* srcs[i] selects component i%2 of the second half (i/2) of the
          * original vector.
          */
         for (unsigned i = 0; i < ARRAY_SIZE(srcs); i++)
            srcs[i] = horiz_offset(offset(inst->src[0], ibld, i % 2),
                                   8 * (i / 2));

         ubld.LOAD_PAYLOAD(tmp, srcs, ARRAY_SIZE(srcs), ARRAY_SIZE(srcs));

         inst->src[0] = tmp;
         progress = true;
         break;
      }
      case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
      case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
      case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: {
         /* These produce an interleaved result; retarget them at a
          * temporary and unpack it back into the standard layout with MOVs
          * inserted after the instruction.
          */
         assert(inst->exec_size == 16);
         const fs_reg tmp = ibld.vgrf(inst->dst.type, 2);

         for (unsigned i = 0; i < 2; i++) {
            for (unsigned g = 0; g < inst->exec_size / 8; g++) {
               /* Propagate the original predication onto each unpack MOV so
                * the copies happen for exactly the same channels.
                */
               fs_inst *mov = ibld.at(block, inst->next).group(8, g)
                                  .MOV(horiz_offset(offset(inst->dst, ibld, i),
                                                    8 * g),
                                       offset(tmp, ubld, 2 * g + i));
               mov->predicate = inst->predicate;
               mov->predicate_inverse = inst->predicate_inverse;
               mov->flag_subreg = inst->flag_subreg;
            }
         }

         inst->dst = tmp;
         progress = true;
         break;
      }
      default:
         break;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Lower a derivative instruction as the floating-point difference of two
|
||
|
|
* swizzles of the source, specified as \p swz0 and \p swz1.
|
||
|
|
*/
|
||
|
|
static bool
|
||
|
|
lower_derivative(fs_visitor &s, bblock_t *block, fs_inst *inst,
|
||
|
|
unsigned swz0, unsigned swz1)
|
||
|
|
{
|
||
|
|
const fs_builder ubld = fs_builder(&s, block, inst).exec_all();
|
||
|
|
const fs_reg tmp0 = ubld.vgrf(inst->src[0].type);
|
||
|
|
const fs_reg tmp1 = ubld.vgrf(inst->src[0].type);
|
||
|
|
|
||
|
|
ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp0, inst->src[0], brw_imm_ud(swz0));
|
||
|
|
ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp1, inst->src[0], brw_imm_ud(swz1));
|
||
|
|
|
||
|
|
inst->resize_sources(2);
|
||
|
|
inst->src[0] = negate(tmp0);
|
||
|
|
inst->src[1] = tmp1;
|
||
|
|
inst->opcode = BRW_OPCODE_ADD;
|
||
|
|
|
||
|
|
return true;
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Lower derivative instructions on platforms where codegen cannot implement
|
||
|
|
* them efficiently (i.e. XeHP).
|
||
|
|
*/
|
||
|
|
bool
|
||
|
|
brw_fs_lower_derivatives(fs_visitor &s)
|
||
|
|
{
|
||
|
|
bool progress = false;
|
||
|
|
|
||
|
|
if (s.devinfo->verx10 < 125)
|
||
|
|
return false;
|
||
|
|
|
||
|
|
foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
|
||
|
|
if (inst->opcode == FS_OPCODE_DDX_COARSE)
|
||
|
|
progress |= lower_derivative(s, block, inst,
|
||
|
|
BRW_SWIZZLE_XXXX, BRW_SWIZZLE_YYYY);
|
||
|
|
|
||
|
|
else if (inst->opcode == FS_OPCODE_DDX_FINE)
|
||
|
|
progress |= lower_derivative(s, block, inst,
|
||
|
|
BRW_SWIZZLE_XXZZ, BRW_SWIZZLE_YYWW);
|
||
|
|
|
||
|
|
else if (inst->opcode == FS_OPCODE_DDY_COARSE)
|
||
|
|
progress |= lower_derivative(s, block, inst,
|
||
|
|
BRW_SWIZZLE_XXXX, BRW_SWIZZLE_ZZZZ);
|
||
|
|
|
||
|
|
else if (inst->opcode == FS_OPCODE_DDY_FINE)
|
||
|
|
progress |= lower_derivative(s, block, inst,
|
||
|
|
BRW_SWIZZLE_XYXY, BRW_SWIZZLE_ZWZW);
|
||
|
|
}
|
||
|
|
|
||
|
|
if (progress)
|
||
|
|
s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
|
||
|
|
|
||
|
|
return progress;
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
 * Lower SHADER_OPCODE_FIND_LIVE_CHANNEL and
 * SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL into bit-scan operations on the
 * combined execution/dispatch mask.
 *
 * Returns true if any such instruction was lowered (and removed).
 */
bool
brw_fs_lower_find_live_channel(fs_visitor &s)
{
   bool progress = false;

   if (s.devinfo->ver < 8)
      return false;

   bool packed_dispatch =
      brw_stage_has_packed_dispatch(s.devinfo, s.stage, s.max_polygons,
                                    s.stage_prog_data);
   /* Fragment shaders may need VMask instead of DMask. */
   bool vmask =
      s.stage == MESA_SHADER_FRAGMENT &&
      brw_wm_prog_data(s.stage_prog_data)->uses_vmask;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_FIND_LIVE_CHANNEL &&
          inst->opcode != SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL)
         continue;

      bool first = inst->opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL;

      /* Getting the first active channel index is easy on Gfx8: Just find
       * the first bit set in the execution mask.  The register exists on
       * HSW already but it reads back as all ones when the current
       * instruction has execution masking disabled, so it's kind of
       * useless there.
       */
      fs_reg exec_mask(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD));

      const fs_builder ibld(&s, block, inst);
      if (!inst->is_partial_write())
         ibld.emit_undef_for_dst(inst);

      /* The result is a single scalar, so operate SIMD1 with all channels
       * enabled.
       */
      const fs_builder ubld = fs_builder(&s, block, inst).exec_all().group(1, 0);

      /* ce0 doesn't consider the thread dispatch mask (DMask or VMask),
       * so combine the execution and dispatch masks to obtain the true mask.
       *
       * If we're looking for the first live channel, and we have packed
       * dispatch, we can skip this step, as we know all dispatched channels
       * will appear at the front of the mask.
       */
      if (!(first && packed_dispatch)) {
         fs_reg mask = ubld.vgrf(BRW_REGISTER_TYPE_UD);
         ubld.UNDEF(mask);
         ubld.emit(SHADER_OPCODE_READ_SR_REG, mask, brw_imm_ud(vmask ? 3 : 2));

         /* Quarter control has the effect of magically shifting the value of
          * ce0 so you'll get the first/last active channel relative to the
          * specified quarter control as result.
          */
         if (inst->group > 0)
            ubld.SHR(mask, mask, brw_imm_ud(ALIGN(inst->group, 8)));

         ubld.AND(mask, exec_mask, mask);
         exec_mask = mask;
      }

      if (first) {
         /* First live channel = index of the lowest set bit. */
         ubld.FBL(inst->dst, exec_mask);
      } else {
         /* Last live channel = 31 - number of leading zeros. */
         fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 1);
         ubld.UNDEF(tmp);
         ubld.LZD(tmp, exec_mask);
         ubld.ADD(inst->dst, negate(tmp), brw_imm_uw(31));
      }

      inst->remove(block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}
|
||
|
|
|
||
|
|
/**
 * From the Skylake PRM Vol. 2a docs for sends:
 *
 *    "It is required that the second block of GRFs does not overlap with the
 *    first block."
 *
 * There are plenty of cases where we may accidentally violate this due to
 * having, for instance, both sources be the constant 0.  This little pass
 * just adds a new vgrf for the second payload and copies it over.
 *
 * Returns true if any SEND's second payload was copied to a fresh VGRF.
 */
bool
brw_fs_lower_sends_overlapping_payload(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
      /* src[2] is the first payload block, src[3] the second (extended)
       * one; only split sends with a non-empty second block can overlap.
       */
      if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 &&
          regions_overlap(inst->src[2], inst->mlen * REG_SIZE,
                          inst->src[3], inst->ex_mlen * REG_SIZE)) {
         fs_reg tmp = fs_reg(VGRF, s.alloc.allocate(inst->ex_mlen),
                             BRW_REGISTER_TYPE_UD);
         /* Sadly, we've lost all notion of channels and bit sizes at this
          * point.  Just WE_all it.
          */
         const fs_builder ibld = fs_builder(&s, block, inst).exec_all().group(16, 0);
         fs_reg copy_src = retype(inst->src[3], BRW_REGISTER_TYPE_UD);
         fs_reg copy_dst = tmp;
         /* Copy two GRFs per SIMD16 MOV; a trailing odd register is copied
          * with a single SIMD8 MOV.
          */
         for (unsigned i = 0; i < inst->ex_mlen; i += 2) {
            if (inst->ex_mlen == i + 1) {
               /* Only one register left; do SIMD8 */
               ibld.group(8, 0).MOV(copy_dst, copy_src);
            } else {
               ibld.MOV(copy_dst, copy_src);
            }
            copy_src = offset(copy_src, ibld, 1);
            copy_dst = offset(copy_dst, ibld, 1);
         }
         /* Point the send at the non-overlapping copy. */
         inst->src[3] = tmp;
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Three source instruction must have a GRF/MRF destination register.
|
||
|
|
* ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
|
||
|
|
*/
|
||
|
|
bool
|
||
|
|
brw_fs_lower_3src_null_dest(fs_visitor &s)
|
||
|
|
{
|
||
|
|
bool progress = false;
|
||
|
|
|
||
|
|
foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
|
||
|
|
if (inst->is_3src(s.compiler) && inst->dst.is_null()) {
|
||
|
|
inst->dst = fs_reg(VGRF, s.alloc.allocate(s.dispatch_width / 8),
|
||
|
|
inst->dst.type);
|
||
|
|
progress = true;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if (progress)
|
||
|
|
s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
|
||
|
|
DEPENDENCY_VARIABLES);
|
||
|
|
|
||
|
|
return progress;
|
||
|
|
}
|
||
|
|
|