diff --git a/src/nouveau/compiler/nak/from_nir.rs b/src/nouveau/compiler/nak/from_nir.rs
index acf0b8e6398..61e94ed1f91 100644
--- a/src/nouveau/compiler/nak/from_nir.rs
+++ b/src/nouveau/compiler/nak/from_nir.rs
@@ -3449,6 +3449,31 @@ impl<'a> ShaderFromNir<'a> {
                 });
                 self.set_dst(&intrin.def, dst.into());
             }
+            nir_intrinsic_reduce => {
+                assert!(srcs[0].bit_size() == 32);
+                assert!(srcs[0].num_components() == 1);
+                let src = self.get_src(&srcs[0]);
+                let dst = b.alloc_ssa(RegFile::UGPR);
+
+                let op = match intrin.reduction_op() {
+                    nir_op_iand => ReduxOp::And,
+                    nir_op_ior => ReduxOp::Or,
+                    nir_op_ixor => ReduxOp::Xor,
+                    nir_op_iadd => ReduxOp::Sum,
+                    nir_op_imin => ReduxOp::Min(IntCmpType::I32),
+                    nir_op_imax => ReduxOp::Max(IntCmpType::I32),
+                    nir_op_umin => ReduxOp::Min(IntCmpType::U32),
+                    nir_op_umax => ReduxOp::Max(IntCmpType::U32),
+                    _ => panic!("Unknown reduction op"),
+                };
+
+                b.push_op(OpRedux {
+                    dst: dst.into(),
+                    src,
+                    op,
+                });
+                self.set_dst(&intrin.def, dst.into());
+            }
             nir_intrinsic_shared_atomic => {
                 let bit_size = intrin.def.bit_size();
                 let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
diff --git a/src/nouveau/compiler/nak_nir.c b/src/nouveau/compiler/nak_nir.c
index b1f3e1167c8..5d6f43d2669 100644
--- a/src/nouveau/compiler/nak_nir.c
+++ b/src/nouveau/compiler/nak_nir.c
@@ -998,7 +998,7 @@ nak_postprocess_nir(nir_shader *nir,
    nir_divergence_analysis(nir);
    if (nir->info.stage == MESA_SHADER_FRAGMENT)
       OPT(nir, nir_opt_tex_skip_helpers, true);
-   OPT(nir, nak_nir_lower_scan_reduce);
+   OPT(nir, nak_nir_lower_scan_reduce, nak);
 
    nak_optimize_nir(nir, nak);
 
diff --git a/src/nouveau/compiler/nak_nir_lower_scan_reduce.c b/src/nouveau/compiler/nak_nir_lower_scan_reduce.c
index 26544ef117c..ddd0a9d7b81 100644
--- a/src/nouveau/compiler/nak_nir_lower_scan_reduce.c
+++ b/src/nouveau/compiler/nak_nir_lower_scan_reduce.c
@@ -193,8 +193,10 @@ build_scan_reduce(nir_builder *b, nir_intrinsic_op op, nir_op red_op,
 static bool
 nak_nir_lower_scan_reduce_intrin(nir_builder *b,
                                  nir_intrinsic_instr *intrin,
-                                 UNUSED void *_data)
+                                 void *_nak)
 {
+   const struct nak_compiler *nak = (const struct nak_compiler *)_nak;
+
    switch (intrin->intrinsic) {
    case nir_intrinsic_exclusive_scan:
    case nir_intrinsic_inclusive_scan:
@@ -221,6 +223,17 @@ nak_nir_lower_scan_reduce_intrin(nir_builder *b,
       /* Simple case where we're not actually doing any reducing at all. */
       assert(intrin->intrinsic == nir_intrinsic_reduce);
       data = intrin->src[0].ssa;
+   } else if (intrin->intrinsic == nir_intrinsic_reduce &&
+              nak->sm >= 80 &&
+              red_op != nir_op_imul &&
+              nir_op_infos[red_op].output_type != nir_type_float &&
+              intrin->src[0].ssa->bit_size == 32 &&
+              cluster_size == 32 &&
+              !intrin->instr.block->divergent) {
+      /* TODO: We could probably also use REDUX for the non-uniform case if we
+       * were allowed to write uregs from non-uniform control flow.
+       */
+      return false;
    } else if (intrin->src[0].ssa->bit_size == 1) {
       data = build_scan_bool(b, intrin->intrinsic, red_op,
                              intrin->src[0].ssa, cluster_size);
@@ -252,8 +265,8 @@ nak_nir_lower_scan_reduce_intrin(nir_builder *b,
 }
 
 bool
-nak_nir_lower_scan_reduce(nir_shader *nir)
+nak_nir_lower_scan_reduce(nir_shader *nir, const struct nak_compiler *nak)
 {
    return nir_shader_intrinsics_pass(nir, nak_nir_lower_scan_reduce_intrin,
-                                     nir_metadata_none, NULL);
+                                     nir_metadata_none, (void *)nak);
 }
diff --git a/src/nouveau/compiler/nak_private.h b/src/nouveau/compiler/nak_private.h
index 63c080d262d..d1cf3e3f46c 100644
--- a/src/nouveau/compiler/nak_private.h
+++ b/src/nouveau/compiler/nak_private.h
@@ -198,7 +198,7 @@ static_assert(sizeof(struct nak_nir_tex_flags) == 4,
       _u; \
    })
 
-bool nak_nir_lower_scan_reduce(nir_shader *shader);
+bool nak_nir_lower_scan_reduce(nir_shader *shader, const struct nak_compiler *nak);
 bool nak_nir_lower_tex(nir_shader *nir, const struct nak_compiler *nak);
 bool nak_nir_lower_gs_intrinsics(nir_shader *shader);
 bool nak_nir_lower_algebraic_late(nir_shader *nir, const struct nak_compiler *nak);
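For context: on SM80+, REDUX performs a whole-warp 32-bit integer reduction in a single instruction and writes the result to a uniform register, which is why the new OpRedux path above is gated on cluster_size == 32, a non-float, non-imul 32-bit op, and uniform control flow. As a rough illustration only (a hypothetical CPU-side model, not code from this patch), the shuffle-style butterfly that the lowering pass otherwise builds looks something like this:

// Hypothetical CPU-side model of a 5-step butterfly reduction over a
// 32-lane warp (here with a generic binary op). Each step combines a
// lane with its partner at lane ^ stride; after log2(32) = 5 steps,
// every lane holds the full reduction. REDUX collapses all of this
// into one instruction whose result lands directly in a UGPR.
fn butterfly_reduce_u32(mut lanes: [u32; 32], op: impl Fn(u32, u32) -> u32) -> u32 {
    let mut stride = 1;
    while stride < 32 {
        let prev = lanes;
        for lane in 0..32 {
            // Models a shuffle-xor: read the partner lane's value.
            lanes[lane] = op(prev[lane], prev[lane ^ stride]);
        }
        stride *= 2;
    }
    // All 32 lanes now agree, so any lane's value is the answer; this
    // invariant is what lets a single uniform result stand in for the
    // per-lane one.
    lanes[0]
}

fn main() {
    let lanes: [u32; 32] = std::array::from_fn(|i| (i as u32) + 1);
    // 1 + 2 + ... + 32 == 528
    assert_eq!(butterfly_reduce_u32(lanes, u32::wrapping_add), 528);
}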