/*
 * Copyright © 2023 Collabora, Ltd.
 * Copyright © 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "util/u_math.h"
#include "nir.h"
#include "nir_builder.h"

/**
 * \file nir_lower_subgroups.c
 */
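
/* Emit a single-component 32-bit copy of a 64-bit subgroup intrinsic that
 * operates on either the low (component == 0) or high (component == 1) half
 * of the original 64-bit source.
 */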
static nir_intrinsic_instr *
lower_subgroups_64bit_split_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin,
                                      unsigned int component)
{
   nir_def *comp;
   if (component == 0)
      comp = nir_unpack_64_2x32_split_x(b, intrin->src[0].ssa);
   else
      comp = nir_unpack_64_2x32_split_y(b, intrin->src[0].ssa);

   nir_intrinsic_instr *intr = nir_intrinsic_instr_create(b->shader, intrin->intrinsic);
   nir_def_init(&intr->instr, &intr->def, 1, 32);
   intr->const_index[0] = intrin->const_index[0];
   intr->const_index[1] = intrin->const_index[1];
   intr->src[0] = nir_src_for_ssa(comp);
   if (nir_intrinsic_infos[intrin->intrinsic].num_srcs == 2)
      intr->src[1] = nir_src_for_ssa(intrin->src[1].ssa);

   intr->num_components = 1;
   nir_builder_instr_insert(b, &intr->instr);
   return intr;
}

static nir_def *
lower_subgroup_op_to_32bit(nir_builder *b, nir_intrinsic_instr *intrin)
{
   assert(intrin->src[0].ssa->bit_size == 64);
   nir_intrinsic_instr *intr_x = lower_subgroups_64bit_split_intrinsic(b, intrin, 0);
   nir_intrinsic_instr *intr_y = lower_subgroups_64bit_split_intrinsic(b, intrin, 1);
   return nir_pack_64_2x32_split(b, &intr_x->def, &intr_y->def);
}

/* Return a mask which is 1 for threads up to the run-time subgroup size, i.e.
 * 1 for the entire subgroup. SPIR-V requires us to return 0 for indices at or
 * above the subgroup size for the masks, but gt_mask and ge_mask make them 1
 * so we have to "and" with this mask.
 */
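/* For example, with ballot_bit_size == 32 and a run-time subgroup size of 8,
 * the single-component result below is ~0u >> (32 - 8) == 0x000000ff.
 */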
static nir_def *
build_subgroup_mask(nir_builder *b,
                    const nir_lower_subgroups_options *options)
{
   nir_def *subgroup_size = nir_load_subgroup_size(b);

   /* First compute the result assuming one ballot component. */
   nir_def *result =
      nir_ushr(b, nir_imm_intN_t(b, ~0ull, options->ballot_bit_size),
               nir_isub_imm(b, options->ballot_bit_size,
                            subgroup_size));

   /* Since the subgroup size and ballot bitsize are both powers of two, there
    * are two possible cases to consider:
    *
    * (1) The subgroup size is less than the ballot bitsize. We need to return
    * "result" in the first component and 0 in every other component.
    * (2) The subgroup size is a multiple of the ballot bitsize. We need to
    * return ~0 if the index in the vector is less than the subgroup size
    * divided by the ballot bitsize and 0 otherwise. For example, with a
    * target ballot type of 4 x uint32 and subgroup_size = 64 we'd need to
    * return { ~0, ~0, 0, 0 }.
    *
    * In case (2) it turns out that "result" will be ~0, because
    * "ballot_bit_size - subgroup_size" is also a multiple of
    * "ballot_bit_size" and since nir_ushr masks the shift value it will be
    * shifted by 0. This means that the first component can just be "result"
    * in all cases. The other components will also get the correct value in
    * case (1) if we just use the rule in case (2), so we'll get the correct
    * result if we just follow (2) and then replace the first component with
    * "result".
    */
   nir_const_value min_idx[4];
   for (unsigned i = 0; i < options->ballot_components; i++)
      min_idx[i] = nir_const_value_for_int(i * options->ballot_bit_size, 32);
   nir_def *min_idx_val = nir_build_imm(b, options->ballot_components, 32, min_idx);

   nir_def *result_extended =
      nir_pad_vector_imm_int(b, result, ~0ull, options->ballot_components);

   return nir_bcsel(b, nir_ult(b, min_idx_val, subgroup_size),
                    result_extended, nir_imm_intN_t(b, 0, options->ballot_bit_size));
}

/* Return a ballot-mask-sized value which represents "val" sign-extended and
 * then shifted left by "shift". Only particular values for "val" are
 * supported, see below.
 *
 * This function assumes that `val << shift` will never span a ballot_bit_size
 * word and that the high bit of val can be extended across the entire result.
 * This is trivially satisfied for 0, 1, ~0, and ~1. However, it may also be
 * fine for other values if the shift is guaranteed to be sufficiently
 * aligned. One example is 0xf when the shift is known to be a multiple of 4.
 */
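/* This is how the eq/ge/gt subgroup masks are built below: val == 1 yields
 * the eq_mask, ~0ull the ge_mask, and ~1ull the gt_mask, each shifted to the
 * invocation's own position.
 */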
static nir_def *
build_ballot_imm_ishl(nir_builder *b, int64_t val, nir_def *shift,
                      const nir_lower_subgroups_options *options)
{
   /* First compute the result assuming one ballot component. */
   nir_def *result =
      nir_ishl(b, nir_imm_intN_t(b, val, options->ballot_bit_size), shift);

   if (options->ballot_components == 1)
      return result;

   /* Fix up the result when there is > 1 component. The idea is that nir_ishl
    * masks out the high bits of the shift value already, so in case there's
    * more than one component the component which 1 would be shifted into
    * already has the right value and all we have to do is fixup the other
    * components. Components below it should always be 0, and components above
    * it must be either 0 or ~0 because of the assumptions above. For example,
    * if the target ballot size is 2 x uint32, and we're shifting 1 by 33, then
    * we'll feed 33 into ishl, which will mask it off to get 1, so we'll
    * compute a single-component result of 2, which is correct for the second
    * component, but the first component needs to be 0, which we get by
    * comparing the high bits of the shift with 0 and selecting the original
    * answer or 0 for the first component (and something similar with the
    * second component). This idea is generalized here for any component count.
    */
   nir_const_value min_shift[4];
   for (unsigned i = 0; i < options->ballot_components; i++)
      min_shift[i] = nir_const_value_for_int(i * options->ballot_bit_size, 32);
   nir_def *min_shift_val = nir_build_imm(b, options->ballot_components, 32, min_shift);

   nir_const_value max_shift[4];
   for (unsigned i = 0; i < options->ballot_components; i++)
      max_shift[i] = nir_const_value_for_int((i + 1) * options->ballot_bit_size, 32);
   nir_def *max_shift_val = nir_build_imm(b, options->ballot_components, 32, max_shift);

   return nir_bcsel(b, nir_ult(b, shift, max_shift_val),
                    nir_bcsel(b, nir_ult(b, shift, min_shift_val),
                              nir_imm_intN_t(b, val >> 63, result->bit_size),
                              result),
                    nir_imm_intN_t(b, 0, result->bit_size));
}

static nir_def *
ballot_type_to_uint(nir_builder *b, nir_def *value,
                    const nir_lower_subgroups_options *options)
{
   /* Allow internally generated ballots to pass through */
   if (value->num_components == options->ballot_components &&
       value->bit_size == options->ballot_bit_size)
      return value;

   /* Only the new-style SPIR-V subgroup instructions take a ballot result as
    * an argument, so we only use this on uvec4 types.
    */
   assert(value->num_components == 4 && value->bit_size == 32);

   return nir_extract_bits(b, &value, 1, 0, options->ballot_components,
                           options->ballot_bit_size);
}

static nir_def *
uint_to_ballot_type(nir_builder *b, nir_def *value,
                    unsigned num_components, unsigned bit_size)
{
   assert(util_is_power_of_two_nonzero(num_components));
   assert(util_is_power_of_two_nonzero(value->num_components));

   unsigned total_bits = bit_size * num_components;

   /* If the source doesn't have enough bits, zero-pad */
   if (total_bits > value->bit_size * value->num_components)
      value = nir_pad_vector_imm_int(b, value, 0, total_bits / value->bit_size);

   value = nir_bitcast_vector(b, value, bit_size);

   /* If the source has too many components, truncate. This can happen if,
    * for instance, we're implementing GL_ARB_shader_ballot or
    * VK_EXT_shader_subgroup_ballot which have 64-bit ballot values on an
    * architecture with a native 128-bit uvec4 ballot. This comes up in Zink
    * for OpenGL on Vulkan. It's the job of the driver calling this lowering
    * pass to ensure that it has restricted subgroup sizes sufficiently that
    * we have enough ballot bits.
    */
   if (value->num_components > num_components)
      value = nir_trim_vector(b, value, num_components);

   return value;
}

static nir_def *
lower_subgroup_op_to_scalar(nir_builder *b, nir_intrinsic_instr *intrin)
{
   /* This is safe to call on scalar things but it would be silly */
   assert(intrin->def.num_components > 1);

   nir_def *value = intrin->src[0].ssa;
   nir_def *reads[NIR_MAX_VEC_COMPONENTS];

   for (unsigned i = 0; i < intrin->num_components; i++) {
      nir_intrinsic_instr *chan_intrin =
         nir_intrinsic_instr_create(b->shader, intrin->intrinsic);
      nir_def_init(&chan_intrin->instr, &chan_intrin->def, 1,
                   intrin->def.bit_size);
      chan_intrin->num_components = 1;

      /* value */
      chan_intrin->src[0] = nir_src_for_ssa(nir_channel(b, value, i));
      /* invocation */
      if (nir_intrinsic_infos[intrin->intrinsic].num_srcs > 1) {
         assert(nir_intrinsic_infos[intrin->intrinsic].num_srcs == 2);
         chan_intrin->src[1] = nir_src_for_ssa(intrin->src[1].ssa);
      }

      chan_intrin->const_index[0] = intrin->const_index[0];
      chan_intrin->const_index[1] = intrin->const_index[1];

      nir_builder_instr_insert(b, &chan_intrin->instr);
      reads[i] = &chan_intrin->def;
   }

   return nir_vec(b, reads, intrin->num_components);
}

static nir_def *
lower_vote_eq_to_scalar(nir_builder *b, nir_intrinsic_instr *intrin)
{
   nir_def *value = intrin->src[0].ssa;

   nir_def *result = NULL;
   for (unsigned i = 0; i < intrin->num_components; i++) {
      nir_def *chan = nir_channel(b, value, i);

      if (intrin->intrinsic == nir_intrinsic_vote_feq) {
         chan = nir_vote_feq(b, intrin->def.bit_size, chan);
      } else {
         chan = nir_vote_ieq(b, intrin->def.bit_size, chan);
      }

      if (result) {
         result = nir_iand(b, result, chan);
      } else {
         result = chan;
      }
   }

   return result;
}
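
/* vote_{f,i}eq is true iff every invocation's value matches the first active
 * invocation's value, so read that value with read_first_invocation, compare,
 * and vote_all on the comparison result.
 */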
static nir_def *
lower_vote_eq(nir_builder *b, nir_intrinsic_instr *intrin)
{
   nir_def *value = intrin->src[0].ssa;

   /* We have to implicitly lower to scalar */
   nir_def *all_eq = NULL;
   for (unsigned i = 0; i < intrin->num_components; i++) {
      nir_def *rfi = nir_read_first_invocation(b, nir_channel(b, value, i));

      nir_def *is_eq;
      if (intrin->intrinsic == nir_intrinsic_vote_feq) {
         is_eq = nir_feq(b, rfi, nir_channel(b, value, i));
      } else {
         is_eq = nir_ieq(b, rfi, nir_channel(b, value, i));
      }

      if (all_eq == NULL) {
         all_eq = is_eq;
      } else {
         all_eq = nir_iand(b, all_eq, is_eq);
      }
   }

   return nir_vote_all(b, 1, all_eq);
}

static nir_def *
lower_shuffle_to_swizzle(nir_builder *b, nir_intrinsic_instr *intrin)
{
   unsigned mask = nir_src_as_uint(intrin->src[1]);

   if (mask >= 32)
      return NULL;

   return nir_masked_swizzle_amd(b, intrin->src[0].ssa,
                                 .swizzle_mask = (mask << 10) | 0x1f,
                                 .fetch_inactive = true);
}

/* Lowers "specialized" shuffles to a generic nir_intrinsic_shuffle. */

static nir_def *
lower_to_shuffle(nir_builder *b, nir_intrinsic_instr *intrin,
                 const nir_lower_subgroups_options *options)
{
   if (intrin->intrinsic == nir_intrinsic_shuffle_xor &&
       options->lower_shuffle_to_swizzle_amd &&
       nir_src_is_const(intrin->src[1])) {

      nir_def *result = lower_shuffle_to_swizzle(b, intrin);
      if (result)
         return result;
   }

   nir_def *index = nir_load_subgroup_invocation(b);
   switch (intrin->intrinsic) {
   case nir_intrinsic_shuffle_xor:
      index = nir_ixor(b, index, intrin->src[1].ssa);
      break;
   case nir_intrinsic_shuffle_up:
      index = nir_isub(b, index, intrin->src[1].ssa);
      break;
   case nir_intrinsic_shuffle_down:
      index = nir_iadd(b, index, intrin->src[1].ssa);
      break;
   case nir_intrinsic_quad_broadcast:
      index = nir_ior(b, nir_iand_imm(b, index, ~0x3),
                      intrin->src[1].ssa);
      break;
   case nir_intrinsic_quad_swap_horizontal:
      /* For Quad operations, subgroups are divided into quads where
       * (invocation % 4) is the index to a square arranged as follows:
       *
       *    +---+---+
       *    | 0 | 1 |
       *    +---+---+
       *    | 2 | 3 |
       *    +---+---+
       */
      index = nir_ixor(b, index, nir_imm_int(b, 0x1));
      break;
   case nir_intrinsic_quad_swap_vertical:
      index = nir_ixor(b, index, nir_imm_int(b, 0x2));
      break;
   case nir_intrinsic_quad_swap_diagonal:
      index = nir_ixor(b, index, nir_imm_int(b, 0x3));
      break;
   case nir_intrinsic_rotate: {
      nir_def *delta = intrin->src[1].ssa;
      nir_def *local_id = nir_load_subgroup_invocation(b);
      const unsigned cluster_size = nir_intrinsic_cluster_size(intrin);

      nir_def *rotation_group_mask =
         cluster_size > 0 ? nir_imm_int(b, (int)(cluster_size - 1)) : nir_iadd_imm(b, nir_load_subgroup_size(b), -1);

      index = nir_iand(b, nir_iadd(b, local_id, delta),
                       rotation_group_mask);
      if (cluster_size > 0) {
         index = nir_iadd(b, index,
                          nir_iand(b, local_id, nir_inot(b, rotation_group_mask)));
      }
      break;
   }
   default:
      unreachable("Invalid intrinsic");
   }

   return nir_shuffle(b, intrin->src[0].ssa, index);
}

static const struct glsl_type *
glsl_type_for_ssa(nir_def *def)
{
   const struct glsl_type *comp_type = def->bit_size == 1 ? glsl_bool_type() : glsl_uintN_t_type(def->bit_size);
   return glsl_replace_vector_type(comp_type, def->num_components);
}

/* Lower nir_intrinsic_shuffle to a waterfall loop + nir_read_invocation.
 */
static nir_def *
lower_shuffle(nir_builder *b, nir_intrinsic_instr *intrin)
{
   nir_def *val = intrin->src[0].ssa;
   nir_def *id = intrin->src[1].ssa;

   /* The loop is something like:
    *
    * while (true) {
    *    first_id = readFirstInvocation(gl_SubgroupInvocationID);
    *    first_val = readFirstInvocation(val);
    *    first_result = readInvocation(val, readFirstInvocation(id));
    *    if (id == first_id)
    *       result = first_val;
    *    if (elect()) {
    *       if (id > gl_SubgroupInvocationID) {
    *          result = first_result;
    *       }
    *       break;
    *    }
    * }
    *
    * The idea is to guarantee, on each iteration of the loop, that anything
    * reading from first_id gets the correct value, so that we can then kill
    * it off by breaking out of the loop. Before doing that we also have to
    * ensure that the first_id invocation gets the correct value. It won't
    * already have the correct value only if the invocation it's reading from
    * hasn't already been killed off, that is, if that invocation is later
    * than its own ID. Invocations where id <= gl_SubgroupInvocationID will be
    * assigned their result in the first if, and invocations where id >
    * gl_SubgroupInvocationID will be assigned their result in the second if.
    *
    * We do this more complicated loop rather than looping over all id's
    * explicitly because at this point we don't know the "actual" subgroup
    * size and at the moment there's no way to get at it, which means we may
    * loop over always-inactive invocations.
    */

   nir_def *subgroup_id = nir_load_subgroup_invocation(b);

   nir_variable *result =
      nir_local_variable_create(b->impl, glsl_type_for_ssa(val), "result");

   nir_loop *loop = nir_push_loop(b);
   {
      nir_def *first_id = nir_read_first_invocation(b, subgroup_id);
      nir_def *first_val = nir_read_first_invocation(b, val);
      nir_def *first_result =
         nir_read_invocation(b, val, nir_read_first_invocation(b, id));

      nir_if *nif = nir_push_if(b, nir_ieq(b, id, first_id));
      {
         nir_store_var(b, result, first_val, BITFIELD_MASK(val->num_components));
      }
      nir_pop_if(b, nif);

      nir_if *nif2 = nir_push_if(b, nir_elect(b, 1));
      {
         nir_if *nif3 = nir_push_if(b, nir_ult(b, subgroup_id, id));
         {
            nir_store_var(b, result, first_result, BITFIELD_MASK(val->num_components));
         }
         nir_pop_if(b, nif3);

         nir_jump(b, nir_jump_break);
      }
      nir_pop_if(b, nif2);
   }
   nir_pop_loop(b, loop);

   return nir_load_var(b, result);
}

static nir_def *
lower_boolean_shuffle(nir_builder *b, nir_intrinsic_instr *intrin,
                      const nir_lower_subgroups_options *options)
{
   assert(options->ballot_components == 1 && options->subgroup_size);
   nir_def *ballot = nir_ballot_relaxed(b, 1, options->ballot_bit_size, intrin->src[0].ssa);

   nir_def *index = NULL;

   /* If the shuffle amount isn't constant, it might be divergent but
    * inverse_ballot requires a uniform source, so take a different path.
    * rotate allows us to assume the delta is uniform, unlike shuffle_up/down.
    */
   switch (intrin->intrinsic) {
   case nir_intrinsic_shuffle_up:
      if (nir_src_is_const(intrin->src[1]))
         ballot = nir_ishl(b, ballot, intrin->src[1].ssa);
      else
         index = nir_isub(b, nir_load_subgroup_invocation(b), intrin->src[1].ssa);
      break;
   case nir_intrinsic_shuffle_down:
      if (nir_src_is_const(intrin->src[1]))
         ballot = nir_ushr(b, ballot, intrin->src[1].ssa);
      else
         index = nir_iadd(b, nir_load_subgroup_invocation(b), intrin->src[1].ssa);
      break;
   case nir_intrinsic_shuffle_xor:
      index = nir_ixor(b, nir_load_subgroup_invocation(b), intrin->src[1].ssa);
      break;
   case nir_intrinsic_rotate: {
      nir_def *delta = nir_as_uniform(b, intrin->src[1].ssa);
      uint32_t cluster_size = nir_intrinsic_cluster_size(intrin);
      cluster_size = cluster_size ? cluster_size : options->subgroup_size;
      cluster_size = MIN2(cluster_size, options->subgroup_size);
      if (cluster_size == 1) {
         return intrin->src[0].ssa;
      } else if (cluster_size == 2) {
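         /* A rotate by an odd delta within clusters of 2 swaps adjacent bit
          * pairs: "lo" moves the odd bits down into the even positions and
          * "hi" moves the even bits up into the odd positions.
          */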
         delta = nir_iand_imm(b, delta, cluster_size - 1);
         nir_def *lo = nir_iand_imm(b, nir_ushr_imm(b, ballot, 1), 0x5555555555555555ull);
         nir_def *hi = nir_iand_imm(b, nir_ishl_imm(b, ballot, 1), 0xaaaaaaaaaaaaaaaaull);
         ballot = nir_bcsel(b, nir_ine_imm(b, delta, 0), nir_ior(b, hi, lo), ballot);
      } else if (cluster_size == ballot->bit_size) {
         ballot = nir_uror(b, ballot, delta);
      } else if (cluster_size == 32) {
         nir_def *unpacked = nir_unpack_64_2x32(b, ballot);
         unpacked = nir_uror(b, unpacked, delta);
         ballot = nir_pack_64_2x32(b, unpacked);
      } else {
         delta = nir_iand_imm(b, delta, cluster_size - 1);
         nir_def *delta_rev = nir_isub_imm(b, cluster_size, delta);
         nir_def *mask = nir_mask(b, delta_rev, ballot->bit_size);
         for (uint32_t i = cluster_size; i < ballot->bit_size; i *= 2) {
            mask = nir_ior(b, nir_ishl_imm(b, mask, i), mask);
         }
         nir_def *lo = nir_iand(b, nir_ushr(b, ballot, delta), mask);
         nir_def *hi = nir_iand(b, nir_ishl(b, ballot, delta_rev), nir_inot(b, mask));
         ballot = nir_ior(b, lo, hi);
      }
      break;
   }
   case nir_intrinsic_shuffle:
      index = intrin->src[1].ssa;
      break;
   case nir_intrinsic_read_invocation:
      index = nir_as_uniform(b, intrin->src[1].ssa);
      break;
   default:
      unreachable("not a boolean shuffle");
   }

   if (index) {
      nir_def *mask = nir_ishl(b, nir_imm_intN_t(b, 1, ballot->bit_size), index);
      return nir_ine_imm(b, nir_iand(b, ballot, mask), 0);
   } else {
      return nir_inverse_ballot(b, 1, ballot);
   }
}
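
/* Horizontal bit count: add up the per-component popcounts of a multi-word
 * ballot vector.
 */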
static nir_def *
vec_bit_count(nir_builder *b, nir_def *value)
{
   nir_def *vec_result = nir_bit_count(b, value);
   nir_def *result = nir_channel(b, vec_result, 0);
   for (unsigned i = 1; i < value->num_components; i++)
      result = nir_iadd(b, result, nir_channel(b, vec_result, i));
   return result;
}

/* produce a bitmask of 111...000...111... alternating between "size"
 * 1's and "size" 0's (the LSB is 1).
 */
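/* e.g. reduce_mask(2, 32) == 0x33333333 and reduce_mask(8, 32) == 0x00ff00ff. */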
static uint64_t
reduce_mask(unsigned size, unsigned ballot_bit_size)
{
   uint64_t mask = 0;
   for (unsigned i = 0; i < ballot_bit_size; i += 2 * size) {
      mask |= ((1ull << size) - 1) << i;
   }

   return mask;
}

/* operate on a uniform per-thread bitmask provided by ballot() to perform the
 * desired Boolean reduction. Assumes that the identity of the operation is
 * false (so, no iand).
 */
static nir_def *
lower_boolean_reduce_internal(nir_builder *b, nir_def *src,
                              unsigned cluster_size, nir_op op,
                              const nir_lower_subgroups_options *options)
{
   for (unsigned size = 1; size < cluster_size; size *= 2) {
      nir_def *shifted = nir_ushr_imm(b, src, size);
      src = nir_build_alu2(b, op, shifted, src);
      uint64_t mask = reduce_mask(size, options->ballot_bit_size);
      src = nir_iand_imm(b, src, mask);
      shifted = nir_ishl_imm(b, src, size);
      src = nir_ior(b, src, shifted);
   }

   return src;
}

/* operate on a uniform per-thread bitmask provided by ballot() to perform the
 * desired Boolean inclusive scan. Assumes that the identity of the operation
 * is false (so, no iand).
 */
static nir_def *
lower_boolean_scan_internal(nir_builder *b, nir_def *src,
                            nir_op op,
                            const nir_lower_subgroups_options *options)
{
   if (op == nir_op_ior) {
      /* We want to return a bitmask with all 1's starting at the first 1 in
       * src. -src is equivalent to ~src + 1. While src | ~src returns all
       * 1's, src | (~src + 1) returns all 1's except for the bits changed by
       * the increment. Any 1's before the least significant 0 of ~src are
       * turned into 0 (zeroing those bits after or'ing) and the least
       * significant 0 of ~src is turned into 1 (not doing anything). So the
       * final output is what we want.
       */
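      /* e.g. src = 0b00101000: -src = ...11011000, so src | -src = ...11111000. */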
      return nir_ior(b, src, nir_ineg(b, src));
   } else {
      assert(op == nir_op_ixor);
      for (unsigned shift = 1; shift < options->ballot_bit_size; shift *= 2) {
         src = nir_ixor(b, src, nir_ishl_imm(b, src, shift));
      }
      return src;
   }
}

static nir_def *
lower_boolean_reduce(nir_builder *b, nir_intrinsic_instr *intrin,
                     const nir_lower_subgroups_options *options)
{
   assert(intrin->num_components == 1);
   assert(options->ballot_components == 1);

   unsigned cluster_size =
      intrin->intrinsic == nir_intrinsic_reduce ? nir_intrinsic_cluster_size(intrin) : 0;
   nir_op op = nir_intrinsic_reduction_op(intrin);

   /* For certain cluster sizes, reductions of iand and ior can be implemented
    * more efficiently.
    */
   if (intrin->intrinsic == nir_intrinsic_reduce) {
      if (cluster_size == 0) {
         if (op == nir_op_iand)
            return nir_vote_all(b, 1, intrin->src[0].ssa);
         else if (op == nir_op_ior)
            return nir_vote_any(b, 1, intrin->src[0].ssa);
         else if (op == nir_op_ixor)
            return nir_i2b(b, nir_iand_imm(b, vec_bit_count(b, nir_ballot(b,
                                                                          options->ballot_components,
                                                                          options->ballot_bit_size,
                                                                          intrin->src[0].ssa)),
                                           1));
         else
            unreachable("bad boolean reduction op");
      }

      if (cluster_size == 4) {
         if (op == nir_op_iand)
            return nir_quad_vote_all(b, 1, intrin->src[0].ssa);
         else if (op == nir_op_ior)
            return nir_quad_vote_any(b, 1, intrin->src[0].ssa);
      }
   }

   nir_def *src = intrin->src[0].ssa;

   /* Apply DeMorgan's law to implement "and" reductions, since all the
    * lower_boolean_*_internal() functions assume an identity of 0 to make the
    * generated code shorter.
    */
   nir_op new_op = (op == nir_op_iand) ? nir_op_ior : op;
   if (op == nir_op_iand) {
      src = nir_inot(b, src);
   }

   nir_def *val = nir_ballot(b, options->ballot_components, options->ballot_bit_size, src);

   switch (intrin->intrinsic) {
   case nir_intrinsic_reduce:
      val = lower_boolean_reduce_internal(b, val, cluster_size, new_op, options);
      break;
   case nir_intrinsic_inclusive_scan:
      val = lower_boolean_scan_internal(b, val, new_op, options);
      break;
   case nir_intrinsic_exclusive_scan:
      val = lower_boolean_scan_internal(b, val, new_op, options);
      val = nir_ishl_imm(b, val, 1);
      break;
   default:
      unreachable("bad intrinsic");
   }

   if (op == nir_op_iand) {
      val = nir_inot(b, val);
   }

   return nir_inverse_ballot(b, 1, val);
}

static nir_def *
build_identity(nir_builder *b, unsigned bit_size, nir_op op)
{
   nir_const_value ident_const = nir_alu_binop_identity(op, bit_size);
   return nir_build_imm(b, 1, bit_size, &ident_const);
}

/* Implementation of scan/reduce that assumes a full subgroup */
static nir_def *
build_scan_full(nir_builder *b, nir_intrinsic_op op, nir_op red_op,
                nir_def *data, unsigned cluster_size)
{
   switch (op) {
   case nir_intrinsic_exclusive_scan:
   case nir_intrinsic_inclusive_scan: {
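      /* Hillis-Steele scan: after the step with offset i, each channel holds
       * the reduction of up to 2*i consecutive channels ending at itself.
       */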
      for (unsigned i = 1; i < cluster_size; i *= 2) {
         nir_def *idx = nir_load_subgroup_invocation(b);
         nir_def *has_buddy = nir_ige_imm(b, idx, i);

         nir_def *buddy_data = nir_shuffle_up(b, data, nir_imm_int(b, i));
         nir_def *accum = nir_build_alu2(b, red_op, data, buddy_data);
         data = nir_bcsel(b, has_buddy, accum, data);
      }

      if (op == nir_intrinsic_exclusive_scan) {
         /* For exclusive scans, we need to shift one more time and fill in the
          * bottom channel with identity.
          */
         nir_def *idx = nir_load_subgroup_invocation(b);
         nir_def *has_buddy = nir_ige_imm(b, idx, 1);

         nir_def *buddy_data = nir_shuffle_up(b, data, nir_imm_int(b, 1));
         nir_def *identity = build_identity(b, data->bit_size, red_op);
         data = nir_bcsel(b, has_buddy, buddy_data, identity);
      }

      return data;
   }

   case nir_intrinsic_reduce: {
      for (unsigned i = 1; i < cluster_size; i *= 2) {
         nir_def *buddy_data = nir_shuffle_xor(b, data, nir_imm_int(b, i));
         data = nir_build_alu2(b, red_op, data, buddy_data);
      }
      return data;
   }

   default:
      unreachable("Unsupported scan/reduce op");
   }
}

/* Fully generic implementation of scan/reduce that takes a mask */
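/* This is pointer-jumping: after k loop iterations each channel has folded in
 * at least min(2^k, available) of its nearest lower active channels, so
 * log2(max_mask_bits) iterations always suffice.
 */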
static nir_def *
build_scan_reduce(nir_builder *b, nir_intrinsic_op op, nir_op red_op,
                  nir_def *data, nir_def *mask, unsigned max_mask_bits,
                  const nir_lower_subgroups_options *options)
{
   nir_def *lt_mask = nir_load_subgroup_lt_mask(b, options->ballot_components,
                                                options->ballot_bit_size);

   /* Mask of all channels whose values we need to accumulate. Our own value
    * is already in accum, if inclusive, thanks to the initialization above.
    * We only need to consider lower indexed invocations.
    */
   nir_def *remaining = nir_iand(b, mask, lt_mask);

   for (unsigned i = 1; i < max_mask_bits; i *= 2) {
      /* At each step, our buddy channel is the first channel we have yet to
       * take into account in the accumulator.
       */
      nir_def *has_buddy = nir_bany_inequal(b, remaining, nir_imm_int(b, 0));
      nir_def *buddy = nir_ballot_find_msb(b, 32, remaining);

      /* Accumulate with our buddy channel, if any */
      nir_def *buddy_data = nir_shuffle(b, data, buddy);
      nir_def *accum = nir_build_alu2(b, red_op, data, buddy_data);
      data = nir_bcsel(b, has_buddy, accum, data);

      /* We just took into account everything in our buddy's accumulator from
       * the previous step. The only things remaining are whatever channels
       * were remaining for our buddy.
       */
      nir_def *buddy_remaining = nir_shuffle(b, remaining, buddy);
      remaining = nir_bcsel(b, has_buddy, buddy_remaining, nir_imm_int(b, 0));
   }

   switch (op) {
   case nir_intrinsic_exclusive_scan: {
      /* For exclusive scans, we need to shift one more time and fill in the
       * bottom channel with identity.
       *
       * Some of this will get CSE'd with the first step but that's okay. The
       * code is cleaner this way.
       */
      nir_def *lower = nir_iand(b, mask, lt_mask);
      nir_def *has_buddy = nir_bany_inequal(b, lower, nir_imm_int(b, 0));
      nir_def *buddy = nir_ballot_find_msb(b, 32, lower);

      nir_def *buddy_data = nir_shuffle(b, data, buddy);
      nir_def *identity = build_identity(b, data->bit_size, red_op);
      return nir_bcsel(b, has_buddy, buddy_data, identity);
   }

   case nir_intrinsic_inclusive_scan:
      return data;

   case nir_intrinsic_reduce: {
      /* For reductions, we need to take the top value of the scan */
      nir_def *idx = nir_ballot_find_msb(b, 32, mask);
      return nir_shuffle(b, data, idx);
   }

   default:
      unreachable("Unsupported scan/reduce op");
   }
}
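
/* Build a ballot mask covering the invocation's own cluster: e.g. with
 * cluster_size == 4, invocation 6 belongs to the cluster starting at 4 and
 * gets 0xf << 4 == 0xf0.
 */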
static nir_def *
build_cluster_mask(nir_builder *b, unsigned cluster_size,
                   const nir_lower_subgroups_options *options)
{
   nir_def *idx = nir_load_subgroup_invocation(b);
   nir_def *cluster = nir_iand_imm(b, idx, ~(uint64_t)(cluster_size - 1));

   if (cluster_size <= options->ballot_bit_size) {
      return build_ballot_imm_ishl(b, BITFIELD_MASK(cluster_size), cluster,
                                   options);
   }

   /* Since the cluster size and the ballot bit size are both powers of 2,
    * cluster size will be a multiple of the ballot bit size. Therefore, each
    * ballot component will be either all ones or all zeros. Build a vec for
    * which each component holds the value of `cluster` for which the mask
    * should be all ones.
    */
   nir_const_value cluster_sel_const[4];
   assert(ARRAY_SIZE(cluster_sel_const) >= options->ballot_components);

   for (unsigned i = 0; i < options->ballot_components; i++) {
      unsigned cluster_val =
         ROUND_DOWN_TO(i * options->ballot_bit_size, cluster_size);
      cluster_sel_const[i] =
         nir_const_value_for_uint(cluster_val, options->ballot_bit_size);
   }

   nir_def *cluster_sel =
      nir_build_imm(b, options->ballot_components, options->ballot_bit_size,
                    cluster_sel_const);
   nir_def *ones = nir_imm_intN_t(b, -1, options->ballot_bit_size);
   nir_def *zeros = nir_imm_intN_t(b, 0, options->ballot_bit_size);
   return nir_bcsel(b, nir_ieq(b, cluster, cluster_sel), ones, zeros);
}

static nir_def *
lower_scan_reduce(nir_builder *b, nir_intrinsic_instr *intrin,
                  const nir_lower_subgroups_options *options)
{
   const nir_op red_op = nir_intrinsic_reduction_op(intrin);
   unsigned subgroup_size = options->subgroup_size;

   /* Grab the cluster size */
   unsigned cluster_size = subgroup_size;
   if (nir_intrinsic_has_cluster_size(intrin)) {
      cluster_size = nir_intrinsic_cluster_size(intrin);
      if (cluster_size == 0 || cluster_size > subgroup_size)
         cluster_size = subgroup_size;
   }

   /* Check if all invocations are active. If so, we use the fast path. */
   nir_def *mask = nir_ballot(b, options->ballot_components,
                              options->ballot_bit_size, nir_imm_true(b));

   nir_def *full, *partial;
   nir_push_if(b, nir_ball_iequal(b, mask, build_subgroup_mask(b, options)));
   {
      full = build_scan_full(b, intrin->intrinsic, red_op,
                             intrin->src[0].ssa, cluster_size);
   }
   nir_push_else(b, NULL);
   {
      /* Mask according to the cluster size */
      if (cluster_size < subgroup_size) {
         nir_def *cluster_mask = build_cluster_mask(b, cluster_size, options);
         mask = nir_iand(b, mask, cluster_mask);
      }

      partial = build_scan_reduce(b, intrin->intrinsic, red_op,
                                  intrin->src[0].ssa, mask, cluster_size,
                                  options);
   }
   nir_pop_if(b, NULL);
   return nir_if_phi(b, full, partial);
}

static bool
lower_subgroups_filter(const nir_instr *instr, const void *_options)
{
   return instr->type == nir_instr_type_intrinsic;
}

static nir_def *
build_subgroup_eq_mask(nir_builder *b,
                       const nir_lower_subgroups_options *options)
{
   nir_def *subgroup_idx = nir_load_subgroup_invocation(b);

   return build_ballot_imm_ishl(b, 1, subgroup_idx, options);
}

static nir_def *
build_subgroup_ge_mask(nir_builder *b,
                       const nir_lower_subgroups_options *options)
{
   nir_def *subgroup_idx = nir_load_subgroup_invocation(b);

   return build_ballot_imm_ishl(b, ~0ull, subgroup_idx, options);
}

static nir_def *
build_subgroup_gt_mask(nir_builder *b,
                       const nir_lower_subgroups_options *options)
{
   nir_def *subgroup_idx = nir_load_subgroup_invocation(b);

   return build_ballot_imm_ishl(b, ~1ull, subgroup_idx, options);
}

static nir_def *
build_subgroup_quad_mask(nir_builder *b,
                         const nir_lower_subgroups_options *options)
{
   nir_def *subgroup_idx = nir_load_subgroup_invocation(b);
   nir_def *quad_first_idx = nir_iand_imm(b, subgroup_idx, ~0x3);

   return build_ballot_imm_ishl(b, 0xf, quad_first_idx, options);
}

static nir_def *
build_quad_vote_any(nir_builder *b, nir_def *src,
                    const nir_lower_subgroups_options *options)
{
   nir_def *ballot = nir_ballot(b, options->ballot_components,
                                options->ballot_bit_size,
                                src);
   nir_def *mask = build_subgroup_quad_mask(b, options);

   return nir_ine_imm(b, nir_iand(b, ballot, mask), 0);
}
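
/* Horizontal find_lsb across the components of a multi-word ballot vector:
 * returns the index of the lowest set bit overall, or -1 if none is set.
 */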
static nir_def *
vec_find_lsb(nir_builder *b, nir_def *value)
{
   nir_def *vec_result = nir_find_lsb(b, value);
   nir_def *result = nir_imm_int(b, -1);
   for (int i = value->num_components - 1; i >= 0; i--) {
      nir_def *channel = nir_channel(b, vec_result, i);
      /* result = channel >= 0 ? (i * bitsize + channel) : result */
      result = nir_bcsel(b, nir_ige_imm(b, channel, 0),
                         nir_iadd_imm(b, channel, i * value->bit_size),
                         result);
   }
   return result;
}

static nir_def *
vec_find_msb(nir_builder *b, nir_def *value)
{
   nir_def *vec_result = nir_ufind_msb(b, value);
   nir_def *result = nir_imm_int(b, -1);
   for (unsigned i = 0; i < value->num_components; i++) {
      nir_def *channel = nir_channel(b, vec_result, i);
      /* result = channel >= 0 ? (i * bitsize + channel) : result */
      result = nir_bcsel(b, nir_ige_imm(b, channel, 0),
                         nir_iadd_imm(b, channel, i * value->bit_size),
                         result);
   }
   return result;
}
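
/* Lower a quad_broadcast whose index is not compile-time constant by emitting
 * all four constant-index quad_broadcasts and selecting among them with a
 * chain of bcsels.
 */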
static nir_def *
lower_dynamic_quad_broadcast(nir_builder *b, nir_intrinsic_instr *intrin,
                             const nir_lower_subgroups_options *options)
{
   if (!options->lower_quad_broadcast_dynamic_to_const)
      return lower_to_shuffle(b, intrin, options);

   nir_def *dst = NULL;

   for (unsigned i = 0; i < 4; ++i) {
      nir_def *qbcst = nir_quad_broadcast(b, intrin->src[0].ssa,
                                          nir_imm_int(b, i));

      if (i)
         dst = nir_bcsel(b, nir_ieq_imm(b, intrin->src[1].ssa, i),
                         qbcst, dst);
      else
         dst = qbcst;
   }

   return dst;
}

static nir_def *
lower_first_invocation_to_ballot(nir_builder *b, nir_intrinsic_instr *intrin,
                                 const nir_lower_subgroups_options *options)
{
   return nir_ballot_find_lsb(b, 32, nir_ballot(b, 4, 32, nir_imm_true(b)));
}

static nir_def *
lower_read_first_invocation(nir_builder *b, nir_intrinsic_instr *intrin)
{
   return nir_read_invocation(b, intrin->src[0].ssa, nir_first_invocation(b));
}

static nir_def *
lower_read_invocation_to_cond(nir_builder *b, nir_intrinsic_instr *intrin)
{
   return nir_read_invocation_cond_ir3(b, intrin->def.bit_size,
                                       intrin->src[0].ssa,
                                       nir_ieq(b, intrin->src[1].ssa,
                                               nir_load_subgroup_invocation(b)));
}

static nir_def *
lower_subgroups_instr(nir_builder *b, nir_instr *instr, void *_options)
{
   const nir_lower_subgroups_options *options = _options;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   switch (intrin->intrinsic) {
   case nir_intrinsic_vote_any:
   case nir_intrinsic_vote_all:
      if (options->lower_vote_trivial)
         return intrin->src[0].ssa;
      break;

   case nir_intrinsic_vote_feq:
   case nir_intrinsic_vote_ieq:
      if (options->lower_vote_trivial)
         return nir_imm_true(b);

      if (nir_src_bit_size(intrin->src[0]) == 1) {
         if (options->lower_vote_bool_eq)
            return lower_vote_eq(b, intrin);
      } else {
         if (options->lower_vote_eq)
            return lower_vote_eq(b, intrin);
      }

      if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_vote_eq_to_scalar(b, intrin);
      break;

   case nir_intrinsic_load_subgroup_size:
      if (options->subgroup_size)
         return nir_imm_int(b, options->subgroup_size);
      break;

   case nir_intrinsic_first_invocation:
      if (options->subgroup_size == 1)
         return nir_imm_int(b, 0);

      if (options->lower_first_invocation_to_ballot)
         return lower_first_invocation_to_ballot(b, intrin, options);

      break;

   case nir_intrinsic_read_invocation:
      if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin);

      if (options->lower_boolean_shuffle && intrin->src[0].ssa->bit_size == 1)
         return lower_boolean_shuffle(b, intrin, options);

      if (options->lower_read_invocation_to_cond)
         return lower_read_invocation_to_cond(b, intrin);

      break;

   case nir_intrinsic_read_first_invocation:
      if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin);

      if (options->lower_read_first_invocation)
         return lower_read_first_invocation(b, intrin);
      break;

   case nir_intrinsic_load_subgroup_eq_mask:
   case nir_intrinsic_load_subgroup_ge_mask:
   case nir_intrinsic_load_subgroup_gt_mask:
   case nir_intrinsic_load_subgroup_le_mask:
   case nir_intrinsic_load_subgroup_lt_mask: {
      if (!options->lower_subgroup_masks)
         return NULL;

      nir_def *val;
      switch (intrin->intrinsic) {
      case nir_intrinsic_load_subgroup_eq_mask:
         val = build_subgroup_eq_mask(b, options);
         break;
      case nir_intrinsic_load_subgroup_ge_mask:
         val = nir_iand(b, build_subgroup_ge_mask(b, options),
                        build_subgroup_mask(b, options));
         break;
      case nir_intrinsic_load_subgroup_gt_mask:
         val = nir_iand(b, build_subgroup_gt_mask(b, options),
                        build_subgroup_mask(b, options));
         break;
      case nir_intrinsic_load_subgroup_le_mask:
         val = nir_inot(b, build_subgroup_gt_mask(b, options));
         break;
      case nir_intrinsic_load_subgroup_lt_mask:
         val = nir_inot(b, build_subgroup_ge_mask(b, options));
         break;
      default:
         unreachable("you seriously can't tell this is unreachable?");
      }

      return uint_to_ballot_type(b, val,
                                 intrin->def.num_components,
                                 intrin->def.bit_size);
   }

   case nir_intrinsic_ballot: {
      if (intrin->def.num_components == options->ballot_components &&
          intrin->def.bit_size == options->ballot_bit_size)
         return NULL;

      nir_def *ballot =
         nir_ballot(b, options->ballot_components, options->ballot_bit_size,
                    intrin->src[0].ssa);

      return uint_to_ballot_type(b, ballot,
                                 intrin->def.num_components,
                                 intrin->def.bit_size);
   }
nir/spirv: Add inverse_ballot intrinsic
This is actually a no-op on AMD, so we really don't want to lower it to
something more complicated. There may be a more efficient way to do
this on Intel too. In addition, in the future we'll want to use this for
lowering boolean reduce operations, where the inverse ballot will
operate on the backend's "natural" ballot type as indicated by
options->ballot_bit_size, instead of uvec4 as produced by SPIR-V. In
total, there are now three possible lowerings we may have to perform:
- inverse_ballot with source type of uvec4 from SPIR-V to inverse_ballot
with natural source type, when the backend supports inverse_ballot
natively.
- inverse_ballot with source type of uvec4 from SPIR-V to arithmetic,
when the backend doesn't support inverse_ballot.
- inverse_ballot with natural source type from reduce operation, when
the backend doesn't support inverse_ballot.
Previously we just did the second lowering unconditionally in vtn, but
it's just a combination of the first and third. We add support here for
the first and third lowerings in nir_lower_subgroups, instead of simply
moving the second lowering, to avoid unnecessary churn.
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25123>
   case nir_intrinsic_inverse_ballot:
      if (options->lower_inverse_ballot) {
         return nir_ballot_bitfield_extract(b, 1, intrin->src[0].ssa,
                                            nir_load_subgroup_invocation(b));
      } else if (intrin->src[0].ssa->num_components != options->ballot_components ||
                 intrin->src[0].ssa->bit_size != options->ballot_bit_size) {
         return nir_inverse_ballot(b, 1, ballot_type_to_uint(b, intrin->src[0].ssa, options));
      }
      break;
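
   /* The arithmetic fallback above reads back each invocation's own bit:
    * inverse_ballot(bits) == ((bits >> gl_SubgroupInvocationID) & 1) != 0,
    * which is exactly what ballot_bitfield_extract computes below,
    * including the component select for multi-word ballots.
    */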

   case nir_intrinsic_ballot_bitfield_extract:
   case nir_intrinsic_ballot_bit_count_reduce:
   case nir_intrinsic_ballot_find_lsb:
   case nir_intrinsic_ballot_find_msb: {
      nir_def *int_val = ballot_type_to_uint(b, intrin->src[0].ssa,
                                             options);

      if (intrin->intrinsic != nir_intrinsic_ballot_bitfield_extract &&
          intrin->intrinsic != nir_intrinsic_ballot_find_lsb) {
         /* For OpGroupNonUniformBallotFindMSB, the SPIR-V Spec says:
          *
          *    "Find the most significant bit set to 1 in Value, considering
          *    only the bits in Value required to represent all bits of the
          *    group’s invocations.  If none of the considered bits is set to
          *    1, the result is undefined."
          *
          * It has similar text for the other three.  This means that, in case
          * the subgroup size is less than 32, we have to mask off the unused
          * bits.  If the subgroup size is fixed and greater than or equal to
          * 32, the mask will be 0xffffffff and nir_opt_algebraic will delete
          * the iand.
          *
          * We only have to worry about this for BitCount and FindMSB because
          * FindLSB counts from the bottom and BitfieldExtract selects
          * individual bits.  In either case, if run outside the range of
          * valid bits, we hit the undefined results case and we can return
          * anything we want.
          */
         int_val = nir_iand(b, int_val, build_subgroup_mask(b, options));
      }

      switch (intrin->intrinsic) {
      case nir_intrinsic_ballot_bitfield_extract: {
         nir_def *idx = intrin->src[1].ssa;
         if (int_val->num_components > 1) {
            /* idx will be truncated by nir_ushr, so we just need to select
             * the right component using the bits of idx that are truncated
             * in the shift.
             */
            int_val =
               nir_vector_extract(b, int_val,
                                  nir_udiv_imm(b, idx, int_val->bit_size));
         }

         return nir_test_mask(b, nir_ushr(b, int_val, idx), 1);
      }
      case nir_intrinsic_ballot_bit_count_reduce:
         return vec_bit_count(b, int_val);
      case nir_intrinsic_ballot_find_lsb:
         return vec_find_lsb(b, int_val);
      case nir_intrinsic_ballot_find_msb:
         return vec_find_msb(b, int_val);
      default:
         unreachable("not a ballot bit-query intrinsic");
      }
   }
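
   /* An example of the component-select math above, assuming a uvec4
    * ballot (bit_size == 32) and idx == 70: nir_udiv_imm picks component
    * 70 / 32 == 2, and nir_ushr then shifts by 70 & 31 == 6, because NIR
    * shifts only consume log2(bit_size) bits of the shift count.
    */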

   case nir_intrinsic_ballot_bit_count_exclusive:
   case nir_intrinsic_ballot_bit_count_inclusive: {
      nir_def *int_val = ballot_type_to_uint(b, intrin->src[0].ssa,
                                             options);
      if (options->lower_ballot_bit_count_to_mbcnt_amd) {
         nir_def *acc;
         if (intrin->intrinsic == nir_intrinsic_ballot_bit_count_exclusive) {
            acc = nir_imm_int(b, 0);
         } else {
            acc = nir_iand_imm(b, nir_u2u32(b, int_val), 0x1);
            int_val = nir_ushr_imm(b, int_val, 1);
         }
         return nir_mbcnt_amd(b, int_val, acc);
      }

      nir_def *mask;
      if (intrin->intrinsic == nir_intrinsic_ballot_bit_count_inclusive) {
         mask = nir_inot(b, build_subgroup_gt_mask(b, options));
      } else {
         mask = nir_inot(b, build_subgroup_ge_mask(b, options));
      }

      return vec_bit_count(b, nir_iand(b, int_val, mask));
   }
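
   /* The generic path above is a prefix popcount:
    *
    *    exclusive(id) = bit_count(ballot & lt_mask(id))
    *    inclusive(id) = bit_count(ballot & le_mask(id))
    *
    * mbcnt_amd(v, acc) yields acc + bit_count(v & lt_mask(id)), so the
    * inclusive form shifts the ballot right by one, turning the
    * strictly-below count into an at-or-below count, and seeds the
    * accumulator with the bit that was shifted out (invocation 0's bit).
    */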

   case nir_intrinsic_elect: {
      if (!options->lower_elect)
         return NULL;

      return nir_ieq(b, nir_load_subgroup_invocation(b), nir_first_invocation(b));
   }
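
   /* Note that elect compares against nir_first_invocation() rather than a
    * literal 0: under divergent control flow the lowest-numbered invocation
    * may be inactive, and elect must pick the first *active* one.
    */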

   case nir_intrinsic_shuffle:
      if (options->lower_shuffle &&
          (!options->lower_boolean_shuffle || intrin->src[0].ssa->bit_size != 1))
         return lower_shuffle(b, intrin);
      else if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin);
      else if (options->lower_boolean_shuffle && intrin->src[0].ssa->bit_size == 1)
         return lower_boolean_shuffle(b, intrin, options);
      else if (options->lower_shuffle_to_32bit && intrin->src[0].ssa->bit_size == 64)
         return lower_subgroup_op_to_32bit(b, intrin);
      break;
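
   /* The bit_size guard on the first branch above makes the dispatch
    * unambiguous: a 1-bit shuffle must reach lower_boolean_shuffle() even
    * when lower_shuffle is also set, so the generic path steps aside for it.
    */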

   case nir_intrinsic_shuffle_xor:
   case nir_intrinsic_shuffle_up:
   case nir_intrinsic_shuffle_down:
      if (options->lower_relative_shuffle &&
          (!options->lower_boolean_shuffle || intrin->src[0].ssa->bit_size != 1))
         return lower_to_shuffle(b, intrin, options);
      else if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin);
      else if (options->lower_boolean_shuffle && intrin->src[0].ssa->bit_size == 1)
         return lower_boolean_shuffle(b, intrin, options);
      else if (options->lower_shuffle_to_32bit && intrin->src[0].ssa->bit_size == 64)
         return lower_subgroup_op_to_32bit(b, intrin);
      break;
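
   /* lower_to_shuffle() rewrites the relative variants as a plain shuffle
    * with a computed index.  A sketch of the usual mapping (the exact index
    * math lives in the helper earlier in this file):
    *
    *    shuffle_xor(v, d)  -> shuffle(v, id ^ d)
    *    shuffle_up(v, d)   -> shuffle(v, id - d)
    *    shuffle_down(v, d) -> shuffle(v, id + d)
    */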

   case nir_intrinsic_quad_broadcast:
   case nir_intrinsic_quad_swap_horizontal:
   case nir_intrinsic_quad_swap_vertical:
   case nir_intrinsic_quad_swap_diagonal:
      if (options->lower_quad ||
          (options->lower_quad_broadcast_dynamic &&
           intrin->intrinsic == nir_intrinsic_quad_broadcast &&
           !nir_src_is_const(intrin->src[1])))
         return lower_dynamic_quad_broadcast(b, intrin, options);
      else if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin);
      break;
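
   /* Within a quad (invocations 4k..4k+3) the swaps are fixed lane
    * permutations, so all four ops reduce to a broadcast-style index
    * computation (illustrative only):
    *
    *    horizontal: id ^ 1      vertical: id ^ 2      diagonal: id ^ 3
    *    broadcast(lane): (id & ~3) | lane
    */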

   case nir_intrinsic_quad_vote_any:
      if (options->lower_quad_vote)
         return build_quad_vote_any(b, intrin->src[0].ssa, options);
      break;
   case nir_intrinsic_quad_vote_all:
      if (options->lower_quad_vote) {
         nir_def *not_src = nir_inot(b, intrin->src[0].ssa);
         nir_def *any_not = build_quad_vote_any(b, not_src, options);
         return nir_inot(b, any_not);
      }
      break;
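
   /* quad_vote_all falls out of De Morgan's law: all(x) == !any(!x), so a
    * single build_quad_vote_any() helper covers both votes.
    */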

   case nir_intrinsic_reduce: {
      nir_def *ret = NULL;
      /* A cluster size greater than the subgroup size is implementation
       * defined, so it is safe to treat it as a whole-subgroup reduction
       * (cluster size 0).
       */
      if (options->subgroup_size &&
          nir_intrinsic_cluster_size(intrin) >= options->subgroup_size) {
         nir_intrinsic_set_cluster_size(intrin, 0);
         ret = NIR_LOWER_INSTR_PROGRESS;
      }
      if (nir_intrinsic_cluster_size(intrin) == 1)
         return intrin->src[0].ssa;
      if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin);
      if (intrin->def.bit_size == 1 &&
          (options->lower_boolean_reduce || options->lower_reduce))
         return lower_boolean_reduce(b, intrin, options);
      if (options->lower_reduce)
         return lower_scan_reduce(b, intrin, options);
      return ret;
   }
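
   /* Cluster-size handling in short: size 0 (or anything at or above the
    * subgroup size) means "whole subgroup", and size-1 clusters reduce each
    * invocation with itself, so the operation is the identity and the
    * source is returned unchanged.
    */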

   case nir_intrinsic_inclusive_scan:
   case nir_intrinsic_exclusive_scan:
      if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin);
      if (intrin->def.bit_size == 1 &&
          (options->lower_boolean_reduce || options->lower_reduce))
         return lower_boolean_reduce(b, intrin, options);
      if (options->lower_reduce)
         return lower_scan_reduce(b, intrin, options);
      break;
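
   /* For reference: exclusive_scan at invocation i combines the values of
    * invocations [0, i), while inclusive_scan covers [0, i]; the two differ
    * by one application of the op with the invocation's own value.
    */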

   case nir_intrinsic_rotate:
      if (options->lower_rotate_to_shuffle &&
          (!options->lower_boolean_shuffle || intrin->src[0].ssa->bit_size != 1))
         return lower_to_shuffle(b, intrin, options);
      else if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin);
      else if (options->lower_boolean_shuffle && intrin->src[0].ssa->bit_size == 1)
         return lower_boolean_shuffle(b, intrin, options);
      else if (options->lower_shuffle_to_32bit && intrin->src[0].ssa->bit_size == 64)
         return lower_subgroup_op_to_32bit(b, intrin);
      break;
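
   /* As a shuffle, a rotate reads from ((id + delta) % width), where width
    * is the cluster size if one is given and the subgroup size otherwise
    * (a sketch; the exact index math lives in lower_to_shuffle()).
    */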

   case nir_intrinsic_masked_swizzle_amd:
      if (options->lower_to_scalar && intrin->num_components > 1) {
         return lower_subgroup_op_to_scalar(b, intrin);
      } else if (options->lower_shuffle_to_32bit && intrin->src[0].ssa->bit_size == 64) {
         return lower_subgroup_op_to_32bit(b, intrin);
      }
      break;
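
   /* masked_swizzle_amd permutes lanes roughly as
    * new_id = ((id & and_mask) | or_mask) ^ xor_mask within 32-lane groups,
    * so only scalarization and 64-bit splitting are needed here; the
    * swizzle itself stays native.
    */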

   default:
      break;
   }

   return NULL;
}

bool
nir_lower_subgroups(nir_shader *shader,
                    const nir_lower_subgroups_options *options)
{
   void *filter = options->filter ? options->filter : lower_subgroups_filter;
   return nir_shader_lower_instructions(shader, filter,
                                        lower_subgroups_instr,
                                        (void *)options);
}
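
/* Usage sketch (illustrative; the option values below are assumptions a
 * driver would replace with its real hardware limits):
 *
 *    const nir_lower_subgroups_options opts = {
 *       .subgroup_size = 32,
 *       .ballot_bit_size = 32,
 *       .ballot_components = 1,
 *       .lower_to_scalar = true,
 *       .lower_subgroup_masks = true,
 *       .lower_quad = true,
 *    };
 *    bool progress = nir_lower_subgroups(shader, &opts);
 */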