dxil: Remove custom SSBO lowering

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23173>
Authored by Jesse Natalie, 2023-05-22 15:15:09 -07:00; committed by Marge Bot
parent 16aeaad73e
commit 92dcaf7deb
5 changed files with 45 additions and 234 deletions

@@ -1256,10 +1256,6 @@ store("uniform_ir3", [], indices=[BASE])
# vec4's.
intrinsic("copy_ubo_to_uniform_ir3", [1, 1], indices=[BASE, RANGE])
# DXIL specific intrinsics
# src[] = { value, mask, index, offset }.
intrinsic("store_ssbo_masked_dxil", [1, 1, 1, 1])
# Intrinsics used by the Midgard/Bifrost blend pipeline. These are defined
# within a blend shader to read/write the raw value from the tile buffer,
# without applying any format conversion in the process. If the shader needs

@@ -909,7 +909,7 @@ clc_spirv_to_dxil(struct clc_libclc *lib,
NIR_PASS_V(nir, dxil_nir_lower_deref_ssbo);
NIR_PASS_V(nir, dxil_nir_split_unaligned_loads_stores, nir_var_all & ~(nir_var_mem_constant | nir_var_mem_ubo));
NIR_PASS_V(nir, dxil_nir_split_unaligned_loads_stores, nir_var_mem_shared | nir_var_function_temp);
assert(nir->info.cs.ptr_size == 64);
NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ssbo,

@@ -67,188 +67,6 @@ load_comps_to_vec(nir_builder *b, unsigned src_bit_size,
return nir_vec(b, dst_comps, num_dst_comps);
}
static bool
lower_load_ssbo(nir_builder *b, nir_intrinsic_instr *intr, unsigned min_bit_size)
{
assert(intr->dest.is_ssa);
assert(intr->src[0].is_ssa);
assert(intr->src[1].is_ssa);
b->cursor = nir_before_instr(&intr->instr);
unsigned src_bit_size = nir_dest_bit_size(intr->dest);
unsigned store_bit_size = CLAMP(src_bit_size, min_bit_size, 32);
unsigned offset_mask = store_bit_size / 8 - 1;
nir_ssa_def *buffer = intr->src[0].ssa;
nir_ssa_def *offset = nir_iand(b, intr->src[1].ssa, nir_imm_int(b, ~offset_mask));
enum gl_access_qualifier access = nir_intrinsic_access(intr);
unsigned num_components = nir_dest_num_components(intr->dest);
unsigned num_bits = num_components * src_bit_size;
nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS];
unsigned comp_idx = 0;
/* We need to split loads in 4-component chunks because that's the optimal
* granularity of bufferLoad(). Minimum alignment is 2-byte.
*/
for (unsigned i = 0; i < num_bits; i += 4 * store_bit_size) {
/* For each 4-component chunk (or smaller) we generate a N-bit ssbo vec load. */
unsigned subload_num_bits = MIN2(num_bits - i, 4 * store_bit_size);
/* The number of components to store depends on the number of bytes. */
nir_ssa_def *result =
nir_load_ssbo(b, DIV_ROUND_UP(subload_num_bits, store_bit_size), store_bit_size,
buffer, nir_iadd(b, offset, nir_imm_int(b, i / 8)),
.align_mul = store_bit_size / 8,
.align_offset = 0,
.access = access);
/* If we have an unaligned load we need to adjust the result value so
* we can always extract the LSB.
*/
if (nir_intrinsic_align(intr) < store_bit_size / 8) {
nir_ssa_def *shift = nir_imul(b, nir_iand(b, intr->src[1].ssa, nir_imm_int(b, offset_mask)),
nir_imm_int(b, 8));
result = nir_ushr(b, result, shift);
}
/* And now comes the pack/unpack step to match the original type. */
nir_ssa_def *temp_vec = nir_extract_bits(b, &result, 1, 0, subload_num_bits / src_bit_size, src_bit_size);
for (unsigned comp = 0; comp < subload_num_bits / src_bit_size; ++comp, ++comp_idx)
comps[comp_idx] = nir_channel(b, temp_vec, comp);
}
assert(comp_idx == num_components);
nir_ssa_def *result = nir_vec(b, comps, num_components);
nir_ssa_def_rewrite_uses(&intr->dest.ssa, result);
nir_instr_remove(&intr->instr);
return true;
}
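As a rough illustration of the chunking loop in the removed lower_load_ssbo above, the standalone sketch below (hypothetical, not part of this change; MIN2 and DIV_ROUND_UP are redefined locally rather than taken from util/macros.h) prints the sub-loads that loop would emit for a given vector load:

#include <stdio.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Mirrors the `for (i = 0; i < num_bits; i += 4 * store_bit_size)` loop:
 * each iteration becomes one load_ssbo of up to four store_bit_size words. */
static void
print_ssbo_load_chunks(unsigned num_components, unsigned src_bit_size,
                       unsigned store_bit_size)
{
   unsigned num_bits = num_components * src_bit_size;
   for (unsigned i = 0; i < num_bits; i += 4 * store_bit_size) {
      unsigned subload_num_bits = MIN2(num_bits - i, 4 * store_bit_size);
      printf("load %u x %u-bit at byte offset +%u\n",
             DIV_ROUND_UP(subload_num_bits, store_bit_size),
             store_bit_size, i / 8);
   }
}

int main(void)
{
   /* e.g. a 64-bit vec3 load with 32-bit buffer accesses:
    * prints "load 4 x 32-bit at +0" then "load 2 x 32-bit at +16". */
   print_ssbo_load_chunks(3, 64, 32);
   return 0;
}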
static bool
lower_store_ssbo(nir_builder *b, nir_intrinsic_instr *intr, unsigned min_bit_size)
{
b->cursor = nir_before_instr(&intr->instr);
assert(intr->src[0].is_ssa);
assert(intr->src[1].is_ssa);
assert(intr->src[2].is_ssa);
nir_ssa_def *val = intr->src[0].ssa;
nir_ssa_def *buffer = intr->src[1].ssa;
unsigned src_bit_size = val->bit_size;
unsigned store_bit_size = CLAMP(src_bit_size, min_bit_size, 32);
unsigned masked_store_bit_size = 32;
unsigned num_components = val->num_components;
unsigned num_bits = num_components * src_bit_size;
unsigned offset_mask = store_bit_size / 8 - 1;
unsigned masked_store_offset_mask = masked_store_bit_size / 8 - 1;
nir_ssa_def *offset = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, ~offset_mask));
nir_ssa_def *masked_offset = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, ~masked_store_offset_mask));
nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS] = { 0 };
unsigned comp_idx = 0;
unsigned write_mask = nir_intrinsic_write_mask(intr);
for (unsigned i = 0; i < num_components; i++)
if (write_mask & (1 << i))
comps[i] = nir_channel(b, val, i);
/* We split stores in 4-component chunks because that's the optimal granularity
* of bufferStore(). Minimum alignment is 2-byte. */
unsigned bit_offset = 0;
while (true) {
/* Skip over holes in the write mask */
while (comp_idx < num_components && comps[comp_idx] == NULL) {
comp_idx++;
bit_offset += src_bit_size;
}
if (comp_idx >= num_components)
break;
/* For each 4-component chunk (or smaller) we generate a ssbo vec
* store. If a component is skipped by the write mask, do a smaller
* sub-store
*/
unsigned num_src_comps_stored = 0, substore_num_bits = 0;
while(num_src_comps_stored + comp_idx < num_components &&
substore_num_bits + bit_offset < num_bits &&
substore_num_bits < 4 * store_bit_size &&
comps[comp_idx + num_src_comps_stored]) {
++num_src_comps_stored;
substore_num_bits += src_bit_size;
}
bool force_masked = false;
if (substore_num_bits > store_bit_size &&
substore_num_bits % store_bit_size != 0) {
/* Split this into two, one unmasked store of the first bits,
* and then the second loop iteration will handle a masked store
* for the rest. */
assert(num_src_comps_stored == 3);
if (store_bit_size == 16) {
assert(substore_num_bits < 32);
/* If we're already doing atomics to store, just do one
* 32bit masked store instead of a 16bit store and a masked
* store for the other 8 bits. */
force_masked = true;
} else {
--num_src_comps_stored;
substore_num_bits = store_bit_size;
}
}
nir_intrinsic_instr *store;
if (substore_num_bits < store_bit_size || force_masked) {
nir_ssa_def *store_vec = load_comps_to_vec(b, src_bit_size, &comps[comp_idx],
num_src_comps_stored, masked_store_bit_size);
nir_ssa_def *mask = nir_imm_intN_t(b, (1 << substore_num_bits) - 1, masked_store_bit_size);
/* If we have small alignments we need to place them correctly in the component. */
if (nir_intrinsic_align(intr) <= masked_store_bit_size / 8) {
nir_ssa_def *pos = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, masked_store_offset_mask));
nir_ssa_def *shift = nir_imul_imm(b, pos, 8);
store_vec = nir_ishl(b, store_vec, shift);
mask = nir_ishl(b, mask, shift);
}
nir_ssa_def *local_offset = nir_iadd(b, masked_offset, nir_imm_int(b, bit_offset / 8));
store = nir_intrinsic_instr_create(b->shader,
nir_intrinsic_store_ssbo_masked_dxil);
store->src[0] = nir_src_for_ssa(store_vec);
store->src[1] = nir_src_for_ssa(nir_inot(b, mask));
store->src[2] = nir_src_for_ssa(buffer);
store->src[3] = nir_src_for_ssa(local_offset);
} else {
nir_ssa_def *local_offset = nir_iadd(b, offset, nir_imm_int(b, bit_offset / 8));
nir_ssa_def *store_vec = load_comps_to_vec(b, src_bit_size, &comps[comp_idx],
num_src_comps_stored, store_bit_size);
store = nir_intrinsic_instr_create(b->shader,
nir_intrinsic_store_ssbo);
store->src[0] = nir_src_for_ssa(store_vec);
store->src[1] = nir_src_for_ssa(buffer);
store->src[2] = nir_src_for_ssa(local_offset);
nir_intrinsic_set_align(store, store_bit_size / 8, 0);
}
/* The number of components to store depends on the number of bits. */
store->num_components = DIV_ROUND_UP(substore_num_bits, store_bit_size);
nir_builder_instr_insert(b, &store->instr);
comp_idx += num_src_comps_stored;
bit_offset += substore_num_bits;
if (nir_intrinsic_has_write_mask(store))
nir_intrinsic_set_write_mask(store, (1 << store->num_components) - 1);
}
nir_instr_remove(&intr->instr);
return true;
}
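The write-mask handling in the removed lower_store_ssbo above can be summarized by the simplified sketch below (hypothetical, not part of this change; it ignores the 4-component and bit-size splitting): contiguous enabled components are gathered into one store, and each hole in the mask starts a new one.

#include <stdio.h>

/* Simplified model of the outer while (true) loop in lower_store_ssbo:
 * skip masked-out components, then gather a contiguous run into one store. */
static void
print_store_runs(unsigned num_components, unsigned write_mask)
{
   unsigned comp = 0;
   while (comp < num_components) {
      while (comp < num_components && !(write_mask & (1u << comp)))
         comp++;                                   /* skip holes in the mask */
      unsigned start = comp;
      while (comp < num_components && (write_mask & (1u << comp)))
         comp++;                                   /* gather a contiguous run */
      if (comp > start)
         printf("store components [%u..%u]\n", start, comp - 1);
   }
}

int main(void)
{
   /* write mask 0b1011 on a vec4: prints [0..1] and [3..3]. */
   print_store_runs(4, 0xb);
   return 0;
}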
static bool
lower_32b_offset_load(nir_builder *b, nir_intrinsic_instr *intr, nir_variable *var)
{
@@ -872,18 +690,12 @@ dxil_nir_lower_loads_stores_to_dxil(nir_shader *nir,
case nir_intrinsic_load_scratch:
progress |= lower_32b_offset_load(&b, intr, scratch_var);
break;
case nir_intrinsic_load_ssbo:
progress |= lower_load_ssbo(&b, intr, options->use_16bit_ssbo ? 16 : 32);
break;
case nir_intrinsic_store_shared:
progress |= lower_32b_offset_store(&b, intr, shared_var);
break;
case nir_intrinsic_store_scratch:
progress |= lower_32b_offset_store(&b, intr, scratch_var);
break;
case nir_intrinsic_store_ssbo:
progress |= lower_store_ssbo(&b, intr, options->use_16bit_ssbo ? 16 : 32);
break;
case nir_intrinsic_shared_atomic:
case nir_intrinsic_shared_atomic_swap:
progress |= lower_shared_atomic(&b, intr, shared_var);

@@ -3531,32 +3531,6 @@ emit_store_ssbo(struct ntd_context *ctx, nir_intrinsic_instr *intr)
emit_bufferstore_call(ctx, handle, coord, value, write_mask, overload);
}
static bool
emit_store_ssbo_masked(struct ntd_context *ctx, nir_intrinsic_instr *intr)
{
const struct dxil_value *value =
get_src(ctx, &intr->src[0], 0, nir_type_uint);
const struct dxil_value *mask =
get_src(ctx, &intr->src[1], 0, nir_type_uint);
const struct dxil_value* handle = get_resource_handle(ctx, &intr->src[2], DXIL_RESOURCE_CLASS_UAV, DXIL_RESOURCE_KIND_RAW_BUFFER);
const struct dxil_value *offset =
get_src(ctx, &intr->src[3], 0, nir_type_uint);
if (!value || !mask || !handle || !offset)
return false;
const struct dxil_value *int32_undef = get_int32_undef(&ctx->mod);
if (!int32_undef)
return false;
const struct dxil_value *coord[3] = {
offset, int32_undef, int32_undef
};
return
emit_atomic_binop(ctx, handle, DXIL_ATOMIC_AND, coord, mask) != NULL &&
emit_atomic_binop(ctx, handle, DXIL_ATOMIC_OR, coord, value) != NULL;
}
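For reference, the removed emit_store_ssbo_masked above implemented the intrinsic as two DXIL buffer atomics. On a single 32-bit buffer word, that is equivalent to the read-modify-write below (a minimal sketch, not part of the change, and ignoring atomicity; note the lowering passed the inverted write mask as src[1], so the mask arriving here already has the stored bytes cleared):

#include <stdint.h>

/* Equivalent effect of the ATOMIC_AND + ATOMIC_OR pair on one 32-bit word. */
static uint32_t
apply_masked_store(uint32_t word, uint32_t inverted_mask, uint32_t value)
{
   word &= inverted_mask;  /* DXIL_ATOMIC_AND with ~write_mask */
   word |= value;          /* DXIL_ATOMIC_OR with the shifted value */
   return word;
}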
static bool
emit_load_ubo_vec4(struct ntd_context *ctx, nir_intrinsic_instr *intr)
{
@@ -4833,8 +4807,6 @@ emit_intrinsic(struct ntd_context *ctx, nir_intrinsic_instr *intr)
return emit_load_ssbo(ctx, intr);
case nir_intrinsic_store_ssbo:
return emit_store_ssbo(ctx, intr);
case nir_intrinsic_store_ssbo_masked_dxil:
return emit_store_ssbo_masked(ctx, intr);
case nir_intrinsic_load_deref:
return emit_load_deref(ctx, intr);
case nir_intrinsic_store_deref:
@@ -6217,18 +6189,49 @@ lower_mem_access_bit_sizes_cb(nir_intrinsic_op intrin,
const void *cb_data)
{
const struct lower_mem_bit_sizes_data *data = cb_data;
unsigned max_bit_size = data->nir_options->lower_int64_options ? 32 : 64;
unsigned max_bit_size = 32;
unsigned min_bit_size = data->dxil_options->lower_int16 ? 32 : 16;
unsigned closest_bit_size = MAX2(min_bit_size, MIN2(max_bit_size, bit_size_in));
/* UBO loads can be done at whatever (supported) bit size, but require 16 byte
* alignment and can load up to 16 bytes per instruction. However this pass requires
* loading 16 bytes of data to get 16-byte alignment. We're going to run lower_ubo_vec4
* which can deal with unaligned vec4s, so for this pass let's just deal with bit size
* and total size restrictions. */
if (intrin == nir_intrinsic_load_ubo) {
/* UBO loads can be done at whatever (supported) bit size, but require 16 byte
* alignment and can load up to 16 bytes per instruction. However this pass requires
* loading 16 bytes of data to get 16-byte alignment. We're going to run lower_ubo_vec4
* which can deal with unaligned vec4s, so for this pass let's just deal with bit size
* and total size restrictions. */
return (nir_mem_access_size_align) {
.align = closest_bit_size / 8,
.bit_size = closest_bit_size,
.num_components = DIV_ROUND_UP(MIN2(bytes, 16) * 8, closest_bit_size),
};
}
assert(intrin == nir_intrinsic_load_ssbo || intrin == nir_intrinsic_store_ssbo);
uint32_t align = nir_combined_align(align_mul, align_offset);
if (align < min_bit_size / 8) {
/* Unaligned load/store, use the minimum bit size, up to 4 components */
unsigned ideal_num_components = intrin == nir_intrinsic_load_ssbo ?
DIV_ROUND_UP(bytes * 8, min_bit_size) :
(bytes * 8 / min_bit_size);
return (nir_mem_access_size_align) {
.align = min_bit_size / 8,
.bit_size = min_bit_size,
.num_components = MIN2(4, ideal_num_components),
};
}
/* Increase/decrease bit size to try to get closer to the requested byte size/align */
unsigned bit_size = closest_bit_size;
unsigned target = MIN2(bytes, align);
while (target < bit_size / 8 && bit_size > min_bit_size)
bit_size /= 2;
while (target > bit_size / 8 * 4 && bit_size < max_bit_size)
bit_size *= 2;
/* This is the best we can do */
return (nir_mem_access_size_align) {
.align = closest_bit_size / 8,
.bit_size = closest_bit_size,
.num_components = DIV_ROUND_UP(MIN2(bytes, 16) * 8, closest_bit_size),
.align = bit_size / 8,
.bit_size = bit_size,
.num_components = MIN2(4, DIV_ROUND_UP(bytes * 8, bit_size)),
};
}
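A standalone sketch (hypothetical helper, not part of this change) of the aligned SSBO branch of the callback above. It assumes lower_int16 is off, so the minimum bit size is 16, and that 64-bit accesses are already lowered, so the maximum is 32:

#include <stdio.h>

static void
pick_ssbo_access(unsigned bytes, unsigned align, unsigned bit_size_in)
{
   const unsigned min_bit_size = 16, max_bit_size = 32;
   /* closest_bit_size: clamp the requested bit size into [min, max] */
   unsigned bit_size = bit_size_in < min_bit_size ? min_bit_size :
                       bit_size_in > max_bit_size ? max_bit_size : bit_size_in;
   unsigned target = bytes < align ? bytes : align;
   /* Shrink while one word is still larger than the target size/alignment... */
   while (target < bit_size / 8 && bit_size > min_bit_size)
      bit_size /= 2;
   /* ...and grow while even four words cannot cover the target. */
   while (target > bit_size / 8 * 4 && bit_size < max_bit_size)
      bit_size *= 2;
   unsigned comps = (bytes * 8 + bit_size - 1) / bit_size;
   if (comps > 4)
      comps = 4;
   printf("%u bytes @ align %u -> %u x %u-bit\n", bytes, align, comps, bit_size);
}

int main(void)
{
   pick_ssbo_access(12, 4, 32);  /* vec3 of 32-bit, 4-byte aligned -> 3 x 32-bit */
   pick_ssbo_access(16, 2, 32);  /* 16 bytes, 2-byte aligned       -> 4 x 16-bit */
   return 0;
}

Accesses with less than 2-byte alignment take the earlier branch instead; for stores, the may_lower_unaligned_stores_to_atomics flag set below presumably lets the common pass cover the cases the removed store_ssbo_masked_dxil path used to handle.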
@@ -6540,12 +6543,12 @@ nir_to_dxil(struct nir_shader *s, const struct nir_to_dxil_options *opts,
* might be too opaque for the pass to see that they're next to each other. */
optimize_nir(s, opts);
/* Vectorize UBO accesses aggressively. This can help increase alignment to enable us to do better
/* Vectorize UBO/SSBO accesses aggressively. This can help increase alignment to enable us to do better
* chunking of loads and stores after lowering bit sizes. Ignore load/store size limitations here, we'll
* address them with lower_mem_access_bit_sizes */
nir_load_store_vectorize_options vectorize_opts = {
.callback = vectorize_filter,
.modes = nir_var_mem_ubo,
.modes = nir_var_mem_ubo | nir_var_mem_ssbo,
};
NIR_PASS_V(s, nir_opt_load_store_vectorize, &vectorize_opts);
@@ -6553,8 +6556,9 @@ nir_to_dxil(struct nir_shader *s, const struct nir_to_dxil_options *opts,
* a single load/store op. */
struct lower_mem_bit_sizes_data mem_size_data = { s->options, opts };
nir_lower_mem_access_bit_sizes_options mem_size_options = {
.modes = nir_var_mem_ubo,
.modes = nir_var_mem_ubo | nir_var_mem_ssbo,
.callback = lower_mem_access_bit_sizes_cb,
.may_lower_unaligned_stores_to_atomics = true,
.cb_data = &mem_size_data
};
NIR_PASS_V(s, nir_lower_mem_access_bit_sizes, &mem_size_options);

@@ -1048,7 +1048,6 @@ dxil_spirv_nir_passes(nir_shader *nir,
}
NIR_PASS_V(nir, nir_opt_deref);
NIR_PASS_V(nir, dxil_nir_split_unaligned_loads_stores, nir_var_mem_ssbo);
if (conf->inferred_read_only_images_as_srvs) {
const nir_opt_access_options opt_access_options = {