dxil: Delete load_ubo_dxil intrinsic

Instead of splitting unaligned UBO loads while still using derefs
and then lowering load_ubo to load_ubo_dxil in lower_loads_stores_to_dxil,
use lower_mem_access_bit_sizes and lower_ubo_vec4 to handle the load size
and alignment restrictions and convert to load_ubo_vec4, which has the
same semantics as load_ubo_dxil.

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/3842
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23173>
Jesse Natalie 2023-05-22 10:03:36 -07:00 committed by Marge Bot
parent 42877c8b63
commit ecfbc16f61
6 changed files with 70 additions and 120 deletions
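
As context for the change above: load_ubo_vec4, like the deleted load_ubo_dxil, addresses the UBO in 16-byte vec4 slots (the granularity of DXIL's CBufferLoadLegacy) rather than in bytes. A minimal standalone sketch of that addressing, not code from this commit and with made-up helper names:

#include <stdint.h>
#include <stdio.h>

/* Map a byte offset within a UBO to the 16-byte slot index and the 32-bit
 * component that a load_ubo_vec4 / load_ubo_dxil style load reads. */
struct ubo_vec4_addr {
   uint32_t slot; /* 16-byte slot index passed to the intrinsic */
   uint32_t comp; /* 32-bit component within the returned vec4 */
};

static struct ubo_vec4_addr
byte_offset_to_vec4_addr(uint32_t byte_offset)
{
   struct ubo_vec4_addr a;
   a.slot = byte_offset >> 4;       /* offset / 16, as in the deleted lowering */
   a.comp = (byte_offset >> 2) & 3; /* which dword of the slot */
   return a;
}

int main(void)
{
   /* A 32-bit scalar load at byte offset 20 reads component 1 of slot 1. */
   struct ubo_vec4_addr a = byte_offset_to_vec4_addr(20);
   printf("slot=%u comp=%u\n", a.slot, a.comp);
   return 0;
}

nir_lower_ubo_vec4 performs this conversion on real NIR, including selecting components out of the loaded vec4 and handling loads that straddle two slots.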


@@ -1259,8 +1259,6 @@ intrinsic("copy_ubo_to_uniform_ir3", [1, 1], indices=[BASE, RANGE])
# DXIL specific intrinsics
# src[] = { value, mask, index, offset }.
intrinsic("store_ssbo_masked_dxil", [1, 1, 1, 1])
# src[] = { index, 16-byte-based-offset }
load("ubo_dxil", [1, 1], [], [CAN_ELIMINATE, CAN_REORDER])
# Intrinsics used by the Midgard/Bifrost blend pipeline. These are defined
# within a blend shader to read/write the raw value from the tile buffer,


@@ -909,7 +909,7 @@ clc_spirv_to_dxil(struct clc_libclc *lib,
NIR_PASS_V(nir, dxil_nir_lower_deref_ssbo);
NIR_PASS_V(nir, dxil_nir_split_unaligned_loads_stores, nir_var_all & ~nir_var_mem_constant);
NIR_PASS_V(nir, dxil_nir_split_unaligned_loads_stores, nir_var_all & ~(nir_var_mem_constant | nir_var_mem_ubo));
assert(nir->info.cs.ptr_size == 64);
NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ssbo,
@@ -969,7 +969,6 @@ clc_spirv_to_dxil(struct clc_libclc *lib,
}
NIR_PASS_V(nir, clc_nir_lower_kernel_input_loads, inputs_var);
NIR_PASS_V(nir, dxil_nir_split_unaligned_loads_stores, nir_var_mem_ubo);
NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ubo,
nir_address_format_32bit_index_offset);
NIR_PASS_V(nir, clc_nir_lower_system_values, work_properties_var);


@@ -67,89 +67,6 @@ load_comps_to_vec(nir_builder *b, unsigned src_bit_size,
return nir_vec(b, dst_comps, num_dst_comps);
}
static nir_ssa_def *
ubo_load_select_32b_comps(nir_builder *b, nir_ssa_def *vec32,
nir_ssa_def *offset, unsigned alignment)
{
assert(alignment >= 16 || alignment == 8 ||
alignment == 4 || alignment == 2 ||
alignment == 1);
assert(vec32->num_components == 4);
if (alignment > 8)
return vec32;
nir_ssa_def *comps[4];
nir_ssa_def *cond;
for (unsigned i = 0; i < 4; i++)
comps[i] = nir_channel(b, vec32, i);
/* If we have 8-byte alignment or less, select which half of the vec4 should
* be used.
*/
cond = nir_ine(b, nir_iand(b, offset, nir_imm_int(b, 0x8)),
nir_imm_int(b, 0));
comps[0] = nir_bcsel(b, cond, comps[2], comps[0]);
comps[1] = nir_bcsel(b, cond, comps[3], comps[1]);
if (alignment == 8)
return nir_vec(b, comps, 2);
/* With 4-byte alignment or less, select which 32-bit component should be
* used and return it. The sub-32-bit split is handled in nir_extract_bits().
*/
cond = nir_ine(b, nir_iand(b, offset, nir_imm_int(b, 0x4)),
nir_imm_int(b, 0));
return nir_bcsel(b, cond, comps[1], comps[0]);
}
nir_ssa_def *
build_load_ubo_dxil(nir_builder *b, nir_ssa_def *buffer,
nir_ssa_def *offset, unsigned num_components,
unsigned bit_size, unsigned alignment)
{
nir_ssa_def *idx = nir_ushr(b, offset, nir_imm_int(b, 4));
nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS];
unsigned num_bits = num_components * bit_size;
unsigned comp_idx = 0;
/* We need to split loads in 16byte chunks because that's the
* granularity of cBufferLoadLegacy().
*/
for (unsigned i = 0; i < num_bits; i += (16 * 8)) {
/* For each 16byte chunk (or smaller) we generate a 32bit ubo vec
* load.
*/
unsigned subload_num_bits = MIN2(num_bits - i, 16 * 8);
nir_ssa_def *vec32 =
nir_load_ubo_dxil(b, 4, 32, buffer, nir_iadd(b, idx, nir_imm_int(b, i / (16 * 8))));
/* First re-arrange the vec32 to account for intra 16-byte offset. */
assert(subload_num_bits / 8 <= alignment);
vec32 = ubo_load_select_32b_comps(b, vec32, offset, alignment);
/* If we have 2 bytes or less to load we need to adjust the u32 value so
* we can always extract the LSB.
*/
if (alignment <= 2) {
nir_ssa_def *shift = nir_imul(b, nir_iand(b, offset,
nir_imm_int(b, 3)),
nir_imm_int(b, 8));
vec32 = nir_ushr(b, vec32, shift);
}
/* And now comes the pack/unpack step to match the original type. */
nir_ssa_def *temp_vec = nir_extract_bits(b, &vec32, 1, 0, subload_num_bits / bit_size, bit_size);
for (unsigned comp = 0; comp < subload_num_bits / bit_size; ++comp, ++comp_idx)
comps[comp_idx] = nir_channel(b, temp_vec, comp);
}
assert(comp_idx == num_components);
return nir_vec(b, comps, num_components);
}
static bool
lower_load_ssbo(nir_builder *b, nir_intrinsic_instr *intr, unsigned min_bit_size)
{
@@ -888,26 +805,6 @@ dxil_nir_lower_var_bit_size(nir_shader *shader, nir_variable_mode modes,
return true;
}
static bool
lower_load_ubo(nir_builder *b, nir_intrinsic_instr *intr)
{
assert(intr->dest.is_ssa);
assert(intr->src[0].is_ssa);
assert(intr->src[1].is_ssa);
b->cursor = nir_before_instr(&intr->instr);
nir_ssa_def *result =
build_load_ubo_dxil(b, intr->src[0].ssa, intr->src[1].ssa,
nir_dest_num_components(intr->dest),
nir_dest_bit_size(intr->dest),
nir_intrinsic_align(intr));
nir_ssa_def_rewrite_uses(&intr->dest.ssa, result);
nir_instr_remove(&intr->instr);
return true;
}
static bool
lower_shared_atomic(nir_builder *b, nir_intrinsic_instr *intr, nir_variable *var)
{
@@ -978,9 +875,6 @@ dxil_nir_lower_loads_stores_to_dxil(nir_shader *nir,
case nir_intrinsic_load_ssbo:
progress |= lower_load_ssbo(&b, intr, options->use_16bit_ssbo ? 16 : 32);
break;
case nir_intrinsic_load_ubo:
progress |= lower_load_ubo(&b, intr);
break;
case nir_intrinsic_store_shared:
progress |= lower_32b_offset_store(&b, intr, shared_var);
break;
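
For comparison with the new pass-based path, the deleted build_load_ubo_dxil and ubo_load_select_32b_comps above did this selection by hand: load the containing 16-byte slot as a vec4 of 32-bit words, pick the right word, then shift sub-32-bit values down to the LSBs. A rough standalone restatement of that scalar math (illustrative only; the function name is made up and the two nir_bcsel steps are collapsed into a single index):

#include <stdint.h>
#include <stdio.h>

/* Given the vec4 of 32-bit words loaded for the 16-byte slot containing
 * byte_offset, extract a value of bit_size (8, 16 or 32) at that offset. */
static uint32_t
extract_from_slot(const uint32_t vec32[4], uint32_t byte_offset, unsigned bit_size)
{
   /* The deleted code picked a half with (offset & 0x8) and a word within
    * that half with (offset & 0x4); both steps collapse to this index. */
   uint32_t word = vec32[(byte_offset >> 2) & 3];
   /* For sub-32-bit loads, shift the wanted bits into the LSBs, like the
    * (offset & 3) * 8 shift in the deleted code. */
   uint32_t value = word >> ((byte_offset & 3) * 8);
   return bit_size == 32 ? value : value & ((1u << bit_size) - 1);
}

int main(void)
{
   const uint32_t slot[4] = { 0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c };
   /* The 16-bit value at byte offset 6 of this slot is 0x0706. */
   printf("0x%04x\n", extract_from_slot(slot, 6, 16));
   return 0;
}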


@@ -58,11 +58,6 @@ bool dxil_nir_split_typed_samplers(nir_shader *shader);
bool dxil_nir_lower_sysval_to_load_input(nir_shader *s, nir_variable **sysval_vars);
bool dxil_nir_lower_vs_vertex_conversion(nir_shader *s, enum pipe_format target_formats[]);
nir_ssa_def *
build_load_ubo_dxil(nir_builder *b, nir_ssa_def *buffer,
nir_ssa_def *offset, unsigned num_components,
unsigned bit_size, unsigned alignment);
uint64_t
dxil_sort_by_driver_location(nir_shader* s, nir_variable_mode modes);


@@ -4843,7 +4843,6 @@ emit_intrinsic(struct ntd_context *ctx, nir_intrinsic_instr *intr)
return emit_atomic_deref(ctx, intr);
case nir_intrinsic_deref_atomic_swap:
return emit_atomic_deref_swap(ctx, intr);
case nir_intrinsic_load_ubo_dxil:
case nir_intrinsic_load_ubo_vec4:
return emit_load_ubo_vec4(ctx, intr);
case nir_intrinsic_load_primitive_id:
@@ -6188,6 +6187,48 @@ lower_bit_size_callback(const nir_instr* instr, void *data)
return ret;
}
static bool
vectorize_filter(
unsigned align_mul,
unsigned align_offset,
unsigned bit_size,
unsigned num_components,
nir_intrinsic_instr *low, nir_intrinsic_instr *high,
void *data)
{
return util_is_power_of_two_nonzero(num_components);
}
struct lower_mem_bit_sizes_data {
const nir_shader_compiler_options *nir_options;
const struct nir_to_dxil_options *dxil_options;
};
static nir_mem_access_size_align
lower_mem_access_bit_sizes_cb(nir_intrinsic_op intrin,
uint8_t bytes,
uint8_t bit_size_in,
uint32_t align_mul,
uint32_t align_offset,
bool offset_is_const,
const void *cb_data)
{
const struct lower_mem_bit_sizes_data *data = cb_data;
unsigned max_bit_size = data->nir_options->lower_int64_options ? 32 : 64;
unsigned min_bit_size = data->dxil_options->lower_int16 ? 32 : 16;
unsigned closest_bit_size = MAX2(min_bit_size, MIN2(max_bit_size, bit_size_in));
/* UBO loads can be done at whatever (supported) bit size, but require 16-byte
* alignment and can load up to 16 bytes per instruction. However, this pass requires
* loading 16 bytes of data to get 16-byte alignment. We're going to run lower_ubo_vec4,
* which can deal with unaligned vec4s, so for this pass let's just deal with bit size
* and total size restrictions. */
return (nir_mem_access_size_align) {
.align = closest_bit_size / 8,
.bit_size = closest_bit_size,
.num_components = DIV_ROUND_UP(MIN2(bytes, 16) * 8, closest_bit_size),
};
}
static void
optimize_nir(struct nir_shader *s, const struct nir_to_dxil_options *opts)
{
@@ -6222,6 +6263,7 @@ optimize_nir(struct nir_shader *s, const struct nir_to_dxil_options *opts)
NIR_PASS(progress, s, nir_lower_64bit_phis);
NIR_PASS(progress, s, nir_lower_phis_to_scalar, true);
NIR_PASS(progress, s, nir_opt_loop_unroll);
NIR_PASS(progress, s, nir_lower_pack);
NIR_PASS_V(s, nir_lower_system_values);
} while (progress);
@@ -6488,10 +6530,34 @@ nir_to_dxil(struct nir_shader *s, const struct nir_to_dxil_options *opts,
NIR_PASS_V(s, nir_lower_flrp, 16 | 32 | 64, true);
NIR_PASS_V(s, nir_lower_io, nir_var_shader_in | nir_var_shader_out, type_size_vec4, nir_lower_io_lower_64bit_to_32);
NIR_PASS_V(s, dxil_nir_ensure_position_writes);
NIR_PASS_V(s, nir_lower_pack);
NIR_PASS_V(s, dxil_nir_lower_system_values);
NIR_PASS_V(s, nir_lower_io_to_scalar, nir_var_shader_in | nir_var_system_value | nir_var_shader_out);
/* Do a round of optimization to try to vectorize loads/stores. Otherwise the addresses used for loads
* might be too opaque for the pass to see that they're next to each other. */
optimize_nir(s, opts);
/* Vectorize UBO accesses aggressively. This can help increase alignment to enable us to do better
* chunking of loads and stores after lowering bit sizes. Ignore load/store size limitations here;
* we'll address them with lower_mem_access_bit_sizes. */
nir_load_store_vectorize_options vectorize_opts = {
.callback = vectorize_filter,
.modes = nir_var_mem_ubo,
};
NIR_PASS_V(s, nir_opt_load_store_vectorize, &vectorize_opts);
/* Now that they're bloated to the max, address bit size restrictions and overall size limitations for
* a single load/store op. */
struct lower_mem_bit_sizes_data mem_size_data = { s->options, opts };
nir_lower_mem_access_bit_sizes_options mem_size_options = {
.modes = nir_var_mem_ubo,
.callback = lower_mem_access_bit_sizes_cb,
.cb_data = &mem_size_data
};
NIR_PASS_V(s, nir_lower_mem_access_bit_sizes, &mem_size_options);
/* Lastly, convert byte-addressed UBO loads to vec4-addressed. This pass can also deal with selecting sub-
* components from the load and with vec-straddling loads. */
NIR_PASS_V(s, nir_lower_ubo_vec4);
if (opts->shader_model_max < SHADER_MODEL_6_6) {
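
To make the new lower_mem_access_bit_sizes_cb above concrete, here is a small standalone rerun of its arithmetic (illustrative only; MIN2/MAX2/DIV_ROUND_UP are redefined locally and report() is a made-up helper). The examples assume lower_int16 is set and 64-bit ops are lowered, so the minimum and maximum bit size are both 32:

#include <stdio.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))
#define MAX2(a, b) ((a) > (b) ? (a) : (b))
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Same clamping the callback does: clamp the access bit size into the
 * supported range, then emit at most 16 bytes per load. */
static void
report(unsigned bytes, unsigned bit_size_in, unsigned min_bits, unsigned max_bits)
{
   unsigned closest = MAX2(min_bits, MIN2(max_bits, bit_size_in));
   unsigned comps = DIV_ROUND_UP(MIN2(bytes, 16) * 8, closest);
   printf("%u bytes of %u-bit data -> %ux%u-bit load, %u-byte align\n",
          bytes, bit_size_in, comps, closest, closest / 8);
}

int main(void)
{
   report(16, 16, 32, 32); /* a 16-byte 16-bit vector becomes one 4x32-bit load */
   report(12, 32, 32, 32); /* 12 bytes of dwords stay a 3x32-bit load */
   report(32, 32, 32, 32); /* larger accesses are capped at 16 bytes (4x32) each */
   return 0;
}

nir_lower_ubo_vec4 then handles the remaining alignment work when it converts these loads to vec4 addressing.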


@@ -1048,9 +1048,7 @@ dxil_spirv_nir_passes(nir_shader *nir,
}
NIR_PASS_V(nir, nir_opt_deref);
NIR_PASS_V(nir, dxil_nir_split_unaligned_loads_stores,
nir_var_mem_ubo | nir_var_mem_push_const |
nir_var_mem_ssbo);
NIR_PASS_V(nir, dxil_nir_split_unaligned_loads_stores, nir_var_mem_ssbo);
if (conf->inferred_read_only_images_as_srvs) {
const nir_opt_access_options opt_access_options = {