/*
 * Copyright 2024 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include <stdint.h>

#include "util/half_float.h"

#include "brw_fs.h"
#include "brw_fs_builder.h"

using namespace brw;

struct brw_reduction_info {
   brw_reg identity;
   enum opcode op;
   brw_conditional_mod cond_mod;
};
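
/* Pick the instruction used to combine two channels for a given reduction
 * op: a plain ALU opcode for ADD/MUL/AND/OR/XOR, or a SEL with a
 * conditional modifier for MIN/MAX.  Also returns the operation's identity
 * value, which is used to initialize inactive channels.
 */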
static brw_reduction_info
brw_get_reduction_info(brw_reduce_op red_op, brw_reg_type type)
{
   struct brw_reduction_info info;

   info.op = BRW_OPCODE_SEL;
   info.cond_mod = BRW_CONDITIONAL_NONE;

   switch (red_op) {
   case BRW_REDUCE_OP_ADD: info.op = BRW_OPCODE_ADD; break;
   case BRW_REDUCE_OP_MUL: info.op = BRW_OPCODE_MUL; break;
   case BRW_REDUCE_OP_AND: info.op = BRW_OPCODE_AND; break;
   case BRW_REDUCE_OP_OR:  info.op = BRW_OPCODE_OR;  break;
   case BRW_REDUCE_OP_XOR: info.op = BRW_OPCODE_XOR; break;
   case BRW_REDUCE_OP_MIN: info.cond_mod = BRW_CONDITIONAL_L;  break;
   case BRW_REDUCE_OP_MAX: info.cond_mod = BRW_CONDITIONAL_GE; break;
   default:
      unreachable("invalid reduce op");
   }
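
   /* ADD/XOR/OR share the identity 0 and AND uses ~0.  These all-zeros and
    * all-ones bit patterns represent the identity at every size and
    * signedness, so a retyped 64-bit immediate works for any type.
    */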
   switch (red_op) {
   case BRW_REDUCE_OP_ADD:
   case BRW_REDUCE_OP_XOR:
   case BRW_REDUCE_OP_OR:
      info.identity = retype(brw_imm_u64(0), type);
      return info;
   case BRW_REDUCE_OP_AND:
      info.identity = retype(brw_imm_u64(~0ull), type);
      return info;
   default:
      /* Continue below. */
      break;
   }

   brw_reg id;
   const unsigned size = brw_type_size_bytes(type);

   switch (red_op) {
   case BRW_REDUCE_OP_MUL: {
      if (brw_type_is_int(type)) {
         id = size < 4  ? brw_imm_uw(1) :
              size == 4 ? brw_imm_ud(1) :
                          brw_imm_u64(1);
      } else {
         assert(brw_type_is_float(type));
         id = size == 2 ? brw_imm_uw(_mesa_float_to_half(1.0)) :
              size == 4 ? brw_imm_f(1.0) :
                          brw_imm_df(1.0);
      }
      break;
   }

   case BRW_REDUCE_OP_MIN: {
      if (brw_type_is_uint(type)) {
         id = brw_imm_u64(~0ull);
      } else if (brw_type_is_sint(type)) {
         id = size == 1 ? brw_imm_w(INT8_MAX) :
              size == 2 ? brw_imm_w(INT16_MAX) :
              size == 4 ? brw_imm_d(INT32_MAX) :
                          brw_imm_q(INT64_MAX);
      } else {
         assert(brw_type_is_float(type));
         id = size == 2 ? brw_imm_uw(_mesa_float_to_half(INFINITY)) :
              size == 4 ? brw_imm_f(INFINITY) :
                          brw_imm_df(INFINITY);
      }
      break;
   }

   case BRW_REDUCE_OP_MAX: {
      if (brw_type_is_uint(type)) {
         id = brw_imm_u64(0);
      } else if (brw_type_is_sint(type)) {
         id = size == 1 ? brw_imm_w(INT8_MIN) :
              size == 2 ? brw_imm_w(INT16_MIN) :
              size == 4 ? brw_imm_d(INT32_MIN) :
                          brw_imm_q(INT64_MIN);
      } else {
         assert(brw_type_is_float(type));
         id = size == 2 ? brw_imm_uw(_mesa_float_to_half(-INFINITY)) :
              size == 4 ? brw_imm_f(-INFINITY) :
                          brw_imm_df(-INFINITY);
      }
      break;
   }

   default:
      unreachable("invalid reduce op");
   }

   /* For some of the cases above (e.g. all bits zero, all bits one, only
    * the lowest bit set) either the size or the signedness of the type was
    * ignored, so adjust the final type now.
    *
    * B/UB types can't be used with immediates, so we used W/UW above and
    * do so here as well.
    */
   if (type == BRW_TYPE_UB)
      type = BRW_TYPE_UW;
   else if (type == BRW_TYPE_B)
      type = BRW_TYPE_W;

   info.identity = retype(id, type);

   return info;
}
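
/* Emit a single combining step of a scan: right = op(left, right), where
 * "left" and "right" are strided, offset views of the scan temporary.
 * Q/UQ MIN/MAX (a SEL with a conditional modifier) can't always be done
 * directly, so on platforms where 64-bit integer instructions are
 * unavailable (or, on Xe2+, avoided) it is emulated with comparisons and
 * predicated MOVs on the 32-bit halves.
 */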
static void
brw_emit_scan_step(const fs_builder &bld, enum opcode opcode, brw_conditional_mod mod,
                   const brw_reg &tmp,
                   unsigned left_offset, unsigned left_stride,
                   unsigned right_offset, unsigned right_stride)
{
   brw_reg left, right;
   left = horiz_stride(horiz_offset(tmp, left_offset), left_stride);
   right = horiz_stride(horiz_offset(tmp, right_offset), right_stride);

   if ((tmp.type == BRW_TYPE_Q || tmp.type == BRW_TYPE_UQ) &&
       (!bld.shader->devinfo->has_64bit_int || bld.shader->devinfo->ver >= 20)) {
      switch (opcode) {
      case BRW_OPCODE_MUL:
         /* This will get lowered by integer MUL lowering. */
         set_condmod(mod, bld.emit(opcode, right, left, right));
         break;

      case BRW_OPCODE_SEL: {
         /* In order for the comparisons to work out right, we need our
          * comparison on the high words to be strict: when the high words
          * compare equal, the result has to be decided by the low words
          * instead.
          */
         assert(mod == BRW_CONDITIONAL_L || mod == BRW_CONDITIONAL_GE);
         if (mod == BRW_CONDITIONAL_GE)
            mod = BRW_CONDITIONAL_G;

         /* We treat the bottom 32 bits as unsigned regardless of
          * whether or not the integer as a whole is signed.
          */
         brw_reg right_low = subscript(right, BRW_TYPE_UD, 0);
         brw_reg left_low = subscript(left, BRW_TYPE_UD, 0);

         /* The upper bits get the same sign as the 64-bit type. */
         brw_reg_type type32 = brw_type_with_size(tmp.type, 32);
         brw_reg right_high = subscript(right, type32, 1);
         brw_reg left_high = subscript(left, type32, 1);

         /* Build up our comparison:
          *
          *   l_hi < r_hi || (l_hi == r_hi && l_low < r_low)
          */
         bld.CMP(bld.null_reg_ud(), retype(left_low, BRW_TYPE_UD),
                 retype(right_low, BRW_TYPE_UD), mod);
         set_predicate(BRW_PREDICATE_NORMAL,
                       bld.CMP(bld.null_reg_ud(), left_high, right_high,
                               BRW_CONDITIONAL_EQ));
         set_predicate_inv(BRW_PREDICATE_NORMAL, true,
                           bld.CMP(bld.null_reg_ud(), left_high, right_high, mod));

         /* We could use selects here or we could use predicated MOVs
          * because the destination and second source (if it were a SEL)
          * are the same.
          */
         set_predicate(BRW_PREDICATE_NORMAL, bld.MOV(right_low, left_low));
         set_predicate(BRW_PREDICATE_NORMAL, bld.MOV(right_high, left_high));
         break;
      }

      default:
         unreachable("Unsupported 64-bit scan op");
      }
   } else {
      set_condmod(mod, bld.emit(opcode, right, left, right));
   }
}
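
/* Emit an in-place inclusive scan of tmp, combining channels across
 * progressively larger power-of-two strides and restarting at every
 * cluster_size boundary.
 *
 * For example, an ADD scan of v0..v7 with cluster_size == 8 proceeds as:
 *
 *   step 1: v1 += v0;  v3 += v2;  v5 += v4;  v7 += v6
 *   step 2: v2 += v1;  v3 += v1;  v6 += v5;  v7 += v5
 *   step 3: v4 += v3;  v5 += v3;  v6 += v3;  v7 += v3
 *
 * leaving vN = v0 + ... + vN in each channel.
 */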
static void
brw_emit_scan(const fs_builder &bld, enum opcode opcode, const brw_reg &tmp,
              unsigned cluster_size, brw_conditional_mod mod)
{
   unsigned dispatch_width = bld.dispatch_width();
   assert(dispatch_width >= 8);

   /* The instruction splitting code isn't advanced enough to split
    * these, so we need to handle that ourselves.
    */
   if (dispatch_width * brw_type_size_bytes(tmp.type) > 2 * REG_SIZE) {
      const unsigned half_width = dispatch_width / 2;
      const fs_builder ubld = bld.exec_all().group(half_width, 0);
      brw_reg left = tmp;
      brw_reg right = horiz_offset(tmp, half_width);
      brw_emit_scan(ubld, opcode, left, cluster_size, mod);
      brw_emit_scan(ubld, opcode, right, cluster_size, mod);
      if (cluster_size > half_width) {
         brw_emit_scan_step(ubld, opcode, mod, tmp,
                            half_width - 1, 0, half_width, 1);
      }
      return;
   }

   if (cluster_size > 1) {
      const fs_builder ubld = bld.exec_all().group(dispatch_width / 2, 0);
      brw_emit_scan_step(ubld, opcode, mod, tmp, 0, 2, 1, 2);
   }

   if (cluster_size > 2) {
      if (brw_type_size_bytes(tmp.type) <= 4) {
         const fs_builder ubld =
            bld.exec_all().group(dispatch_width / 4, 0);
         brw_emit_scan_step(ubld, opcode, mod, tmp, 1, 4, 2, 4);
         brw_emit_scan_step(ubld, opcode, mod, tmp, 1, 4, 3, 4);
      } else {
         /* For 64-bit types, we have to do things differently because
          * the code above would land us with destination strides that
          * the hardware can't handle.  Fortunately, we'll only be
          * 8-wide in that case and it's the same number of
          * instructions.
          */
         const fs_builder ubld = bld.exec_all().group(2, 0);
         for (unsigned i = 0; i < dispatch_width; i += 4)
            brw_emit_scan_step(ubld, opcode, mod, tmp, i + 1, 0, i + 2, 1);
      }
   }
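
   /* Each pass below doubles the span that has been scanned: the last
    * channel of every even-numbered i-wide block is combined into all
    * channels of the odd-numbered block that follows it.
    */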
   for (unsigned i = 4;
        i < MIN2(cluster_size, dispatch_width);
        i *= 2) {
      const fs_builder ubld = bld.exec_all().group(i, 0);
      brw_emit_scan_step(ubld, opcode, mod, tmp, i - 1, 0, i, 1);

      if (dispatch_width > i * 2)
         brw_emit_scan_step(ubld, opcode, mod, tmp, i * 3 - 1, 0, i * 3, 1);

      if (dispatch_width > i * 4) {
         brw_emit_scan_step(ubld, opcode, mod, tmp, i * 5 - 1, 0, i * 5, 1);
         brw_emit_scan_step(ubld, opcode, mod, tmp, i * 7 - 1, 0, i * 7, 1);
      }
   }
}
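
/* Lower SHADER_OPCODE_REDUCE: run an inclusive scan over a temporary whose
 * inactive channels hold the operation's identity, then broadcast the last
 * channel of each cluster to the destination.
 */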
static bool
brw_lower_reduce(fs_visitor &s, bblock_t *block, fs_inst *inst)
{
   const fs_builder bld(&s, block, inst);

   assert(inst->dst.type == inst->src[0].type);
   brw_reg dst = inst->dst;
   brw_reg src = inst->src[0];

   assert(inst->src[1].file == IMM);
   enum brw_reduce_op op = (enum brw_reduce_op)inst->src[1].ud;

   assert(inst->src[2].file == IMM);
   unsigned cluster_size = inst->src[2].ud;

   assert(cluster_size > 0);
   assert(cluster_size <= s.dispatch_width);

   struct brw_reduction_info info = brw_get_reduction_info(op, src.type);

   /* Set up a register for all of our scratching around and initialize it
    * to the reduction operation's identity value.
    */
   brw_reg scan = bld.vgrf(src.type);
   bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, info.identity);

   brw_emit_scan(bld, info.op, scan, cluster_size, info.cond_mod);

   if (cluster_size * brw_type_size_bytes(src.type) >= REG_SIZE * 2) {
      /* In this case, the CLUSTER_BROADCAST instruction isn't needed
       * because the distance between clusters is at least 2 GRFs.  We
       * don't need its weird striding and can just do regular MOVs
       * instead.
       */
      assert((cluster_size * brw_type_size_bytes(src.type)) % (REG_SIZE * 2) == 0);
      const unsigned groups =
         (s.dispatch_width * brw_type_size_bytes(src.type)) / (REG_SIZE * 2);
      const unsigned group_size = s.dispatch_width / groups;
      for (unsigned i = 0; i < groups; i++) {
         const unsigned cluster = (i * group_size) / cluster_size;
         const unsigned comp = cluster * cluster_size + (cluster_size - 1);
         bld.group(group_size, i).MOV(horiz_offset(dst, i * group_size),
                                      component(scan, comp));
      }
   } else {
      bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, dst, scan,
               brw_imm_ud(cluster_size - 1), brw_imm_ud(cluster_size));
   }

   inst->remove(block);
   return true;
}
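
/* Lower SHADER_OPCODE_INCLUSIVE_SCAN and SHADER_OPCODE_EXCLUSIVE_SCAN.  An
 * exclusive scan is turned into an inclusive one by first shifting every
 * channel up by one and placing the identity in channel 0.
 */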
static bool
brw_lower_scan(fs_visitor &s, bblock_t *block, fs_inst *inst)
{
   const fs_builder bld(&s, block, inst);

   assert(inst->dst.type == inst->src[0].type);
   brw_reg dst = inst->dst;
   brw_reg src = inst->src[0];

   assert(inst->src[1].file == IMM);
   enum brw_reduce_op op = (enum brw_reduce_op)inst->src[1].ud;

   struct brw_reduction_info info = brw_get_reduction_info(op, src.type);

   /* Set up a register for all of our scratching around and initialize it
    * to the reduction operation's identity value.
    */
   brw_reg scan = bld.vgrf(src.type);
   const fs_builder ubld = bld.exec_all();
   ubld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, info.identity);

   if (inst->opcode == SHADER_OPCODE_EXCLUSIVE_SCAN) {
      /* Exclusive scan is a bit harder because we have to do an annoying
       * shift of the contents before we can begin.  To make things worse,
       * we can't do this with a normal stride; we have to use indirects.
       */
      brw_reg shifted = bld.vgrf(src.type);
      brw_reg idx = bld.vgrf(BRW_TYPE_W);

      ubld.ADD(idx, bld.LOAD_SUBGROUP_INVOCATION(), brw_imm_w(-1));
      ubld.emit(SHADER_OPCODE_SHUFFLE, shifted, scan, idx);
      ubld.group(1, 0).MOV(horiz_offset(shifted, 0), info.identity);
      scan = shifted;
   }

   brw_emit_scan(bld, info.op, scan, s.dispatch_width, info.cond_mod);

   bld.MOV(dst, scan);

   inst->remove(block);
   return true;
}
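
/* Lower the subgroup reduce/scan virtual opcodes into sequences of real
 * ALU, shuffle, and broadcast instructions.
 */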
bool
brw_fs_lower_subgroup_ops(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      switch (inst->opcode) {
      case SHADER_OPCODE_REDUCE:
         progress |= brw_lower_reduce(s, block, inst);
         break;

      case SHADER_OPCODE_INCLUSIVE_SCAN:
      case SHADER_OPCODE_EXCLUSIVE_SCAN:
         progress |= brw_lower_scan(s, block, inst);
         break;

      default:
         /* Nothing to do. */
         break;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}