diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h
index 1256b77eaf3..fc72f511f1d 100644
--- a/src/intel/compiler/brw_eu_defines.h
+++ b/src/intel/compiler/brw_eu_defines.h
@@ -412,6 +412,15 @@ enum opcode {
     */
    SHADER_OPCODE_REDUCE,
 
+   /* Combine values of previous channels using an operation. Inclusive scan
+    * also includes the channel's own value; exclusive scan leaves it out.
+    *
+    * Source 0: Value.
+    * Source 1: Immediate with brw_reduce_op.
+    */
+   SHADER_OPCODE_INCLUSIVE_SCAN,
+   SHADER_OPCODE_EXCLUSIVE_SCAN,
+
    /* Select between src0 and src1 based on channel enables.
     *
     * This instruction copies src0 into the enabled channels of the
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index f4128b8cde3..50d459551d2 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -319,6 +319,8 @@ fs_inst::can_do_source_mods(const struct intel_device_info *devinfo) const
    case SHADER_OPCODE_INT_QUOTIENT:
    case SHADER_OPCODE_INT_REMAINDER:
    case SHADER_OPCODE_REDUCE:
+   case SHADER_OPCODE_INCLUSIVE_SCAN:
+   case SHADER_OPCODE_EXCLUSIVE_SCAN:
       return false;
    default:
       return true;
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index e901cbd819f..f94ac31ba59 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4680,35 +4680,6 @@ fs_nir_emit_bs_intrinsic(nir_to_brw_state &ntb,
    }
 }
 
-static brw_reg
-brw_nir_reduction_op_identity(const fs_builder &bld,
-                              nir_op op, brw_reg_type type)
-{
-   nir_const_value value =
-      nir_alu_binop_identity(op, brw_type_size_bits(type));
-
-   switch (brw_type_size_bytes(type)) {
-   case 1:
-      if (type == BRW_TYPE_UB) {
-         return brw_imm_uw(value.u8);
-      } else {
-         assert(type == BRW_TYPE_B);
-         return brw_imm_w(value.i8);
-      }
-   case 2:
-      return retype(brw_imm_uw(value.u16), type);
-   case 4:
-      return retype(brw_imm_ud(value.u32), type);
-   case 8:
-      if (type == BRW_TYPE_DF)
-         return brw_imm_df(value.f64);
-      else
-         return retype(brw_imm_u64(value.u64), type);
-   default:
-      unreachable("Invalid type size");
-   }
-}
-
 static brw_reduce_op
 brw_reduce_op_for_nir_reduction_op(nir_op op)
 {
@@ -4731,50 +4702,6 @@ brw_reduce_op_for_nir_reduction_op(nir_op op)
    }
 }
 
-static opcode
-brw_op_for_nir_reduction_op(nir_op op)
-{
-   switch (op) {
-   case nir_op_iadd: return BRW_OPCODE_ADD;
-   case nir_op_fadd: return BRW_OPCODE_ADD;
-   case nir_op_imul: return BRW_OPCODE_MUL;
-   case nir_op_fmul: return BRW_OPCODE_MUL;
-   case nir_op_imin: return BRW_OPCODE_SEL;
-   case nir_op_umin: return BRW_OPCODE_SEL;
-   case nir_op_fmin: return BRW_OPCODE_SEL;
-   case nir_op_imax: return BRW_OPCODE_SEL;
-   case nir_op_umax: return BRW_OPCODE_SEL;
-   case nir_op_fmax: return BRW_OPCODE_SEL;
-   case nir_op_iand: return BRW_OPCODE_AND;
-   case nir_op_ior: return BRW_OPCODE_OR;
-   case nir_op_ixor: return BRW_OPCODE_XOR;
-   default:
-      unreachable("Invalid reduction operation");
-   }
-}
-
-static brw_conditional_mod
-brw_cond_mod_for_nir_reduction_op(nir_op op)
-{
-   switch (op) {
-   case nir_op_iadd: return BRW_CONDITIONAL_NONE;
-   case nir_op_fadd: return BRW_CONDITIONAL_NONE;
-   case nir_op_imul: return BRW_CONDITIONAL_NONE;
-   case nir_op_fmul: return BRW_CONDITIONAL_NONE;
-   case nir_op_imin: return BRW_CONDITIONAL_L;
-   case nir_op_umin: return BRW_CONDITIONAL_L;
-   case nir_op_fmin: return BRW_CONDITIONAL_L;
-   case nir_op_imax: return BRW_CONDITIONAL_GE;
-   case nir_op_umax: return BRW_CONDITIONAL_GE;
-   case nir_op_fmax: return BRW_CONDITIONAL_GE;
-   case nir_op_iand: return BRW_CONDITIONAL_NONE;
-   case nir_op_ior: return BRW_CONDITIONAL_NONE;
-   case nir_op_ixor: return BRW_CONDITIONAL_NONE;
-   default:
-      unreachable("Invalid reduction operation");
-   }
-}
-
 struct rebuild_resource {
    unsigned idx;
    std::vector array;
@@ -7074,40 +7001,18 @@ fs_nir_emit_intrinsic(nir_to_brw_state &ntb,
    case nir_intrinsic_inclusive_scan:
    case nir_intrinsic_exclusive_scan: {
       brw_reg src = get_nir_src(ntb, instr->src[0]);
-      nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
+      nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
+      enum brw_reduce_op brw_op = brw_reduce_op_for_nir_reduction_op(op);
 
       /* Figure out the source type */
       src.type = brw_type_for_nir_type(devinfo,
-         (nir_alu_type)(nir_op_infos[redop].input_types[0] |
+         (nir_alu_type)(nir_op_infos[op].input_types[0] |
                         nir_src_bit_size(instr->src[0])));
 
-      brw_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);
-      opcode brw_op = brw_op_for_nir_reduction_op(redop);
-      brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
+      enum opcode opcode = instr->intrinsic == nir_intrinsic_exclusive_scan ?
+         SHADER_OPCODE_EXCLUSIVE_SCAN : SHADER_OPCODE_INCLUSIVE_SCAN;
 
-      /* Set up a register for all of our scratching around and initialize it
-       * to reduction operation's identity value.
-       */
-      brw_reg scan = bld.vgrf(src.type);
-      const fs_builder allbld = bld.exec_all();
-      allbld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
-
-      if (instr->intrinsic == nir_intrinsic_exclusive_scan) {
-         /* Exclusive scan is a bit harder because we have to do an annoying
-          * shift of the contents before we can begin. To make things worse,
-          * we can't do this with a normal stride; we have to use indirects.
-          */
-         brw_reg shifted = bld.vgrf(src.type);
-         brw_reg idx = bld.vgrf(BRW_TYPE_W);
-         allbld.ADD(idx, bld.LOAD_SUBGROUP_INVOCATION(), brw_imm_w(-1));
-         allbld.emit(SHADER_OPCODE_SHUFFLE, shifted, scan, idx);
-         allbld.group(1, 0).MOV(horiz_offset(shifted, 0), identity);
-         scan = shifted;
-      }
-
-      bld.emit_scan(brw_op, scan, s.dispatch_width, cond_mod);
-
-      bld.MOV(retype(dest, src.type), scan);
+      bld.emit(opcode, retype(dest, src.type), src, brw_imm_ud(brw_op));
       break;
    }
 
diff --git a/src/intel/compiler/brw_fs_validate.cpp b/src/intel/compiler/brw_fs_validate.cpp
index 0fe0dfdc73d..966377c41b8 100644
--- a/src/intel/compiler/brw_fs_validate.cpp
+++ b/src/intel/compiler/brw_fs_validate.cpp
@@ -232,6 +232,8 @@ brw_validate_instruction_phase(const fs_visitor &s, fs_inst *inst)
    case SHADER_OPCODE_URB_READ_LOGICAL:
    case SHADER_OPCODE_URB_WRITE_LOGICAL:
    case SHADER_OPCODE_REDUCE:
+   case SHADER_OPCODE_INCLUSIVE_SCAN:
+   case SHADER_OPCODE_EXCLUSIVE_SCAN:
       invalid_from = BRW_SHADER_PHASE_AFTER_EARLY_LOWERING;
       break;
 
diff --git a/src/intel/compiler/brw_lower_subgroup_ops.cpp b/src/intel/compiler/brw_lower_subgroup_ops.cpp
index 4d427b5ede0..cedb2e17759 100644
--- a/src/intel/compiler/brw_lower_subgroup_ops.cpp
+++ b/src/intel/compiler/brw_lower_subgroup_ops.cpp
@@ -173,6 +173,49 @@ brw_lower_reduce(fs_visitor &s, bblock_t *block, fs_inst *inst)
    return true;
 }
 
+static bool
+brw_lower_scan(fs_visitor &s, bblock_t *block, fs_inst *inst)
+{
+   const fs_builder bld(&s, block, inst);
+
+   assert(inst->dst.type == inst->src[0].type);
+   brw_reg dst = inst->dst;
+   brw_reg src = inst->src[0];
+
+   assert(inst->src[1].file == IMM);
+   enum brw_reduce_op op = (enum brw_reduce_op)inst->src[1].ud;
+
+   struct brw_reduction_info info = brw_get_reduction_info(op, src.type);
+
+   /* Set up a register for all of our scratching around and initialize it
+    * to the reduction operation's identity value.
+    */
+   brw_reg scan = bld.vgrf(src.type);
+   const fs_builder ubld = bld.exec_all();
+   ubld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, info.identity);
+
+   if (inst->opcode == SHADER_OPCODE_EXCLUSIVE_SCAN) {
+      /* Exclusive scan is a bit harder because we have to do an annoying
+       * shift of the contents before we can begin. To make things worse,
+       * we can't do this with a normal stride; we have to use indirects.
+       */
+      brw_reg shifted = bld.vgrf(src.type);
+      brw_reg idx = bld.vgrf(BRW_TYPE_W);
+
+      ubld.ADD(idx, bld.LOAD_SUBGROUP_INVOCATION(), brw_imm_w(-1));
+      ubld.emit(SHADER_OPCODE_SHUFFLE, shifted, scan, idx);
+      ubld.group(1, 0).MOV(horiz_offset(shifted, 0), info.identity);
+      scan = shifted;
+   }
+
+   bld.emit_scan(info.op, scan, s.dispatch_width, info.cond_mod);
+
+   bld.MOV(dst, scan);
+
+   inst->remove(block);
+   return true;
+}
+
 bool
 brw_fs_lower_subgroup_ops(fs_visitor &s)
 {
@@ -184,6 +227,11 @@ brw_fs_lower_subgroup_ops(fs_visitor &s)
          progress |= brw_lower_reduce(s, block, inst);
          break;
 
+      case SHADER_OPCODE_INCLUSIVE_SCAN:
+      case SHADER_OPCODE_EXCLUSIVE_SCAN:
+         progress |= brw_lower_scan(s, block, inst);
+         break;
+
       default:
          /* Nothing to do. */
          break;
diff --git a/src/intel/compiler/brw_print.cpp b/src/intel/compiler/brw_print.cpp
index 2b445d9991a..1cfdee24a5f 100644
--- a/src/intel/compiler/brw_print.cpp
+++ b/src/intel/compiler/brw_print.cpp
@@ -293,6 +293,10 @@ brw_instruction_name(const struct brw_isa_info *isa, enum opcode op)
       return "memory_atomic";
    case SHADER_OPCODE_REDUCE:
       return "reduce";
+   case SHADER_OPCODE_INCLUSIVE_SCAN:
+      return "inclusive_scan";
+   case SHADER_OPCODE_EXCLUSIVE_SCAN:
+      return "exclusive_scan";
    }
 
    unreachable("not reached");