intel/fs: Implement reduce and scan operations
Acked-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
parent 4150920b95
commit 2292b20b29
2 changed files with 162 additions and 0 deletions
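These intrinsics implement subgroup arithmetic: a reduce combines one value from every invocation in a cluster and hands every invocation the result, an inclusive scan gives invocation i the combination of values 0..i, and an exclusive scan the combination of values 0..i-1 (the identity for invocation 0). A minimal host-side reference of those semantics in plain C, assuming an iadd reduction (illustrative only, not part of this commit):

   /* Reference semantics for n lane values v[0..n-1]:
    *   incl[i] = v[0] + ... + v[i]
    *   excl[i] = v[0] + ... + v[i-1]   (identity for lane 0)
    */
   static void scan_iadd_ref(const int *v, int *incl, int *excl, unsigned n)
   {
      int acc = 0;                      /* 0 is the iadd identity */
      for (unsigned i = 0; i < n; i++) {
         excl[i] = acc;                 /* before combining lane i */
         acc += v[i];
         incl[i] = acc;                 /* after combining lane i */
      }
   }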
@@ -3722,6 +3722,71 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
   }
}

static fs_reg
brw_nir_reduction_op_identity(const fs_builder &bld,
                              nir_op op, brw_reg_type type)
{
   nir_const_value value = nir_alu_binop_identity(op, type_sz(type) * 8);
   switch (type_sz(type)) {
   case 2:
      assert(type != BRW_REGISTER_TYPE_HF);
      return retype(brw_imm_uw(value.u16[0]), type);
   case 4:
      return retype(brw_imm_ud(value.u32[0]), type);
   case 8:
      if (type == BRW_REGISTER_TYPE_DF)
         return setup_imm_df(bld, value.f64[0]);
      else
         return retype(brw_imm_u64(value.u64[0]), type);
   default:
      unreachable("Invalid type size");
   }
}
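For reference, a sketch of the identity values in play here (x op identity == x), assuming two's-complement integers and IEEE floats; the authoritative table is whatever nir_alu_binop_identity() returns:

   /* iadd -> 0          imul -> 1          iand -> ~0
    * ior  -> 0          ixor -> 0
    * imin -> INT_MAX    imax -> INT_MIN
    * umin -> UINT_MAX   umax -> 0
    * fmin -> +INFINITY  fmax -> -INFINITY  fadd/fmul -> 0.0 / 1.0
    */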

static opcode
brw_op_for_nir_reduction_op(nir_op op)
{
   switch (op) {
   case nir_op_iadd: return BRW_OPCODE_ADD;
   case nir_op_fadd: return BRW_OPCODE_ADD;
   case nir_op_imul: return BRW_OPCODE_MUL;
   case nir_op_fmul: return BRW_OPCODE_MUL;
   case nir_op_imin: return BRW_OPCODE_SEL;
   case nir_op_umin: return BRW_OPCODE_SEL;
   case nir_op_fmin: return BRW_OPCODE_SEL;
   case nir_op_imax: return BRW_OPCODE_SEL;
   case nir_op_umax: return BRW_OPCODE_SEL;
   case nir_op_fmax: return BRW_OPCODE_SEL;
   case nir_op_iand: return BRW_OPCODE_AND;
   case nir_op_ior:  return BRW_OPCODE_OR;
   case nir_op_ixor: return BRW_OPCODE_XOR;
   default:
      unreachable("Invalid reduction operation");
   }
}

static brw_conditional_mod
brw_cond_mod_for_nir_reduction_op(nir_op op)
{
   switch (op) {
   case nir_op_iadd: return BRW_CONDITIONAL_NONE;
   case nir_op_fadd: return BRW_CONDITIONAL_NONE;
   case nir_op_imul: return BRW_CONDITIONAL_NONE;
   case nir_op_fmul: return BRW_CONDITIONAL_NONE;
   case nir_op_imin: return BRW_CONDITIONAL_L;
   case nir_op_umin: return BRW_CONDITIONAL_L;
   case nir_op_fmin: return BRW_CONDITIONAL_L;
   case nir_op_imax: return BRW_CONDITIONAL_GE;
   case nir_op_umax: return BRW_CONDITIONAL_GE;
   case nir_op_fmax: return BRW_CONDITIONAL_GE;
   case nir_op_iand: return BRW_CONDITIONAL_NONE;
   case nir_op_ior:  return BRW_CONDITIONAL_NONE;
   case nir_op_ixor: return BRW_CONDITIONAL_NONE;
   default:
      unreachable("Invalid reduction operation");
   }
}
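The reason min/max map to SEL above: SEL with a conditional modifier writes whichever source satisfies the comparison, so SEL.L picks the smaller source and SEL.GE the larger. A scalar model in C (hypothetical helper names, for illustration only):

   /* SEL.L  dst, a, b  =>  dst = (a <  b) ? a : b;   i.e. min(a, b) */
   static int sel_l(int a, int b)  { return (a < b)  ? a : b; }
   /* SEL.GE dst, a, b  =>  dst = (a >= b) ? a : b;   i.e. max(a, b) */
   static int sel_ge(int a, int b) { return (a >= b) ? a : b; }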

void
fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
{
@@ -4523,6 +4588,95 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
      break;
   }

   case nir_intrinsic_reduce: {
      fs_reg src = get_nir_src(instr->src[0]);
      nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
      unsigned cluster_size = nir_intrinsic_cluster_size(instr);
      if (cluster_size == 0 || cluster_size > dispatch_width)
         cluster_size = dispatch_width;

      /* Figure out the source type */
      src.type = brw_type_for_nir_type(devinfo,
         (nir_alu_type)(nir_op_infos[redop].input_types[0] |
                        nir_src_bit_size(instr->src[0])));

      fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);
      opcode brw_op = brw_op_for_nir_reduction_op(redop);
      brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);

      /* Set up a register for all of our scratching around and initialize
       * it to the reduction operation's identity value.
       */
      fs_reg scan = bld.vgrf(src.type);
      bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);

      bld.emit_scan(brw_op, scan, cluster_size, cond_mod);

      dest.type = src.type;
      if (cluster_size * type_sz(src.type) >= REG_SIZE * 2) {
         /* The CLUSTER_BROADCAST instruction isn't needed here because the
          * distance between clusters is at least 2 GRFs, so we can skip its
          * weird striding and just do regular MOVs.
          */
         assert((cluster_size * type_sz(src.type)) % (REG_SIZE * 2) == 0);
         const unsigned groups =
            (dispatch_width * type_sz(src.type)) / (REG_SIZE * 2);
         const unsigned group_size = dispatch_width / groups;
         for (unsigned i = 0; i < groups; i++) {
            const unsigned cluster = (i * group_size) / cluster_size;
            const unsigned comp = cluster * cluster_size + (cluster_size - 1);
            bld.group(group_size, i).MOV(horiz_offset(dest, i * group_size),
                                         component(scan, comp));
         }
      } else {
         bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, dest, scan,
                  brw_imm_ud(cluster_size - 1), brw_imm_ud(cluster_size));
      }
      break;
   }
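To make the size check concrete, a worked example assuming REG_SIZE == 32 (one 32-byte GRF) and a SIMD16 dispatch with a 32-bit type:

   /* cluster_size == 16:  16 * 4 == 64 >= REG_SIZE * 2  ->  MOV path
    *    groups     == (16 * 4) / 64 == 1
    *    group_size == 16 / 1       == 16
    *    i == 0:  cluster == 0,  comp == 0 * 16 + 15 == 15
    *    -> one SIMD16 MOV broadcasting component(scan, 15) into dest
    * cluster_size == 4:   4 * 4 == 16 < 64  ->  CLUSTER_BROADCAST path
    */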

   case nir_intrinsic_inclusive_scan:
   case nir_intrinsic_exclusive_scan: {
      fs_reg src = get_nir_src(instr->src[0]);
      nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);

      /* Figure out the source type */
      src.type = brw_type_for_nir_type(devinfo,
         (nir_alu_type)(nir_op_infos[redop].input_types[0] |
                        nir_src_bit_size(instr->src[0])));

      fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);
      opcode brw_op = brw_op_for_nir_reduction_op(redop);
      brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);

      /* Set up a register for all of our scratching around and initialize
       * it to the reduction operation's identity value.
       */
      fs_reg scan = bld.vgrf(src.type);
      const fs_builder allbld = bld.exec_all();
      allbld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);

      if (instr->intrinsic == nir_intrinsic_exclusive_scan) {
         /* Exclusive scan is a bit harder because we have to do an annoying
          * shift of the contents before we can begin.  To make things worse,
          * we can't do this with a normal stride; we have to use indirects.
          */
         fs_reg shifted = bld.vgrf(src.type);
         fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
         allbld.ADD(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
                    brw_imm_w(-1));
         allbld.emit(SHADER_OPCODE_SHUFFLE, shifted, scan, idx);
         allbld.group(1, 0).MOV(component(shifted, 0), identity);
         scan = shifted;
      }

      bld.emit_scan(brw_op, scan, dispatch_width, cond_mod);

      bld.MOV(retype(dest, src.type), scan);
      break;
   }
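The shift-then-scan trick above, restated as a host-side C sketch (assuming an iadd scan and at most 64 lanes; illustrative only): each lane reads its left neighbor via the SHUFFLE with idx = subgroup_invocation - 1, lane 0 is overwritten with the identity, and an ordinary inclusive scan of the shifted data then yields the exclusive result.

   static void exclusive_via_inclusive(const int *v, int *out, unsigned n)
   {
      /* shifted[i] = v[i - 1], shifted[0] = identity -- mirrors the
       * SHUFFLE plus the group(1, 0) MOV of the identity into lane 0. */
      int shifted[64];
      shifted[0] = 0;
      for (unsigned i = 1; i < n; i++)
         shifted[i] = v[i - 1];

      /* An inclusive scan of the shifted values == exclusive scan of v. */
      int acc = 0;
      for (unsigned i = 0; i < n; i++) {
         acc += shifted[i];
         out[i] = acc;
      }
   }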

   default:
      unreachable("unknown intrinsic");
   }
}
@@ -589,6 +589,14 @@ brw_imm_df(double df)
   return imm;
}

static inline struct brw_reg
brw_imm_u64(uint64_t u64)
{
   struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_UQ);
   imm.u64 = u64;
   return imm;
}
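Hypothetical usage, for illustration: this new helper is what lets the 8-byte case of brw_nir_reduction_op_identity above materialize a 64-bit integer identity, e.g.:

   retype(brw_imm_u64(~0ull), BRW_REGISTER_TYPE_Q);   /* iand identity as int64 */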

static inline struct brw_reg
brw_imm_f(float f)
{