microsoft/compiler: Use lower_mem_access_bit_sizes for scratch/shared

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37778>
Authored by Jesse Natalie on 2025-10-10 11:13:36 -07:00; committed by Marge Bot
parent f7cbc7b1c5
commit b3242516ad
5 changed files with 115 additions and 235 deletions


@@ -117,8 +117,7 @@ compile_nir(struct d3d12_context *ctx, struct d3d12_shader_selector *sel,
NIR_PASS(_, nir, d3d12_lower_state_vars, shader);
const struct dxil_nir_lower_loads_stores_options loads_stores_options = {};
NIR_PASS(_, nir, dxil_nir_lower_loads_stores_to_dxil, &loads_stores_options);
NIR_PASS(_, nir, dxil_nir_scratch_and_shared_to_dxil);
if (key->stage == MESA_SHADER_FRAGMENT && key->fs.multisample_disabled)
NIR_PASS(_, nir, d3d12_disable_multisampling);


@@ -1109,9 +1109,6 @@ clc_spirv_to_dxil(struct clc_libclc *lib,
NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ubo,
nir_address_format_32bit_index_offset);
NIR_PASS(_, nir, clc_nir_lower_system_values, work_properties_var);
const struct dxil_nir_lower_loads_stores_options loads_stores_options = {
.use_16bit_ssbo = false,
};
/* Now that function-declared local vars have been sized, append args */
for (unsigned i = 0; i < out_dxil->kernel->num_args; i++) {
@@ -1140,7 +1137,7 @@ clc_spirv_to_dxil(struct clc_libclc *lib,
nir->info.shared_size += size;
}
NIR_PASS(_, nir, dxil_nir_lower_loads_stores_to_dxil, &loads_stores_options);
NIR_PASS(_, nir, dxil_nir_scratch_and_shared_to_dxil);
NIR_PASS(_, nir, dxil_nir_opt_alu_deref_srcs);
NIR_PASS(_, nir, nir_lower_fp16_casts, nir_lower_fp16_all);
NIR_PASS(_, nir, nir_lower_convert_alu_types, NULL);


@@ -39,166 +39,6 @@ cl_type_size_align(const struct glsl_type *type, unsigned *size,
*align = glsl_get_cl_alignment(type);
}
static nir_def *
load_comps_to_vec(nir_builder *b, unsigned src_bit_size,
nir_def **src_comps, unsigned num_src_comps,
unsigned dst_bit_size)
{
if (src_bit_size == dst_bit_size)
return nir_vec(b, src_comps, num_src_comps);
else if (src_bit_size > dst_bit_size)
return nir_extract_bits(b, src_comps, num_src_comps, 0, src_bit_size * num_src_comps / dst_bit_size, dst_bit_size);
unsigned num_dst_comps = DIV_ROUND_UP(num_src_comps * src_bit_size, dst_bit_size);
unsigned comps_per_dst = dst_bit_size / src_bit_size;
nir_def *dst_comps[4];
for (unsigned i = 0; i < num_dst_comps; i++) {
unsigned src_offs = i * comps_per_dst;
dst_comps[i] = nir_u2uN(b, src_comps[src_offs], dst_bit_size);
for (unsigned j = 1; j < comps_per_dst && src_offs + j < num_src_comps; j++) {
nir_def *tmp = nir_ishl_imm(b, nir_u2uN(b, src_comps[src_offs + j], dst_bit_size),
j * src_bit_size);
dst_comps[i] = nir_ior(b, dst_comps[i], tmp);
}
}
return nir_vec(b, dst_comps, num_dst_comps);
}
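/*
 * A minimal standalone sketch, in plain C, of the packing load_comps_to_vec
 * does when src_bit_size < dst_bit_size: each 32-bit destination word
 * collects dst_bit_size / src_bit_size source components, with component j
 * placed at bit j * src_bit_size (the nir_ishl_imm/nir_ior chain above).
 * The helper below is hypothetical and only illustrates the 16-to-32-bit
 * case.
 */
#include <stdint.h>

static void
pack_u16_comps_to_u32(const uint16_t *src, unsigned num_src, uint32_t *dst)
{
   unsigned num_dst = (num_src + 1) / 2;   /* DIV_ROUND_UP(num_src * 16, 32) */
   for (unsigned i = 0; i < num_dst; i++) {
      dst[i] = src[i * 2];                              /* bits [15:0]  */
      if (i * 2 + 1 < num_src)
         dst[i] |= (uint32_t)src[i * 2 + 1] << 16;      /* bits [31:16] */
   }
}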
static bool
lower_32b_offset_load(nir_builder *b, nir_intrinsic_instr *intr, nir_variable *var)
{
unsigned bit_size = intr->def.bit_size;
unsigned num_components = intr->def.num_components;
unsigned num_bits = num_components * bit_size;
b->cursor = nir_before_instr(&intr->instr);
nir_def *offset = intr->src[0].ssa;
if (intr->intrinsic == nir_intrinsic_load_shared)
offset = nir_iadd_imm(b, offset, nir_intrinsic_base(intr));
else
offset = nir_u2u32(b, offset);
nir_def *index = nir_ushr_imm(b, offset, 2);
nir_def *comps[NIR_MAX_VEC_COMPONENTS];
nir_def *comps_32bit[NIR_MAX_VEC_COMPONENTS * 2];
/* We need to split loads in 32-bit accesses because the buffer
* is an i32 array and DXIL does not support type casts.
*/
unsigned num_32bit_comps = DIV_ROUND_UP(num_bits, 32);
for (unsigned i = 0; i < num_32bit_comps; i++)
comps_32bit[i] = nir_load_array_var(b, var, nir_iadd_imm(b, index, i));
unsigned num_comps_per_pass = MIN2(num_32bit_comps, 4);
for (unsigned i = 0; i < num_32bit_comps; i += num_comps_per_pass) {
unsigned num_vec32_comps = MIN2(num_32bit_comps - i, 4);
unsigned num_dest_comps = num_vec32_comps * 32 / bit_size;
nir_def *vec32 = nir_vec(b, &comps_32bit[i], num_vec32_comps);
/* If we have 16 bits or less to load we need to adjust the u32 value so
* we can always extract the LSB.
*/
if (num_bits <= 16) {
nir_def *shift =
nir_imul_imm(b, nir_iand_imm(b, offset, 3), 8);
vec32 = nir_ushr(b, vec32, shift);
}
/* And now comes the pack/unpack step to match the original type. */
unsigned dest_index = i * 32 / bit_size;
nir_def *temp_vec = nir_extract_bits(b, &vec32, 1, 0, num_dest_comps, bit_size);
for (unsigned comp = 0; comp < num_dest_comps; ++comp, ++dest_index)
comps[dest_index] = nir_channel(b, temp_vec, comp);
}
nir_def *result = nir_vec(b, comps, num_components);
nir_def_replace(&intr->def, result);
return true;
}
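/*
 * Standalone illustration (a hypothetical plain-C helper, assuming the same
 * i32-array backing store): a sub-32-bit load at byte offset `off` reads the
 * containing word at off / 4 and shifts by (off & 3) * 8 to bring the wanted
 * bytes down to the LSB, which is what the nir_ushr above does for
 * num_bits <= 16.
 */
#include <stdint.h>

static uint16_t
load_u16_from_word_array(const uint32_t *words, uint32_t off)
{
   uint32_t shift = (off & 3u) * 8u;   /* 0 or 16 for a 2-byte-aligned load */
   return (uint16_t)(words[off >> 2] >> shift);
}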
static void
lower_masked_store_vec32(nir_builder *b, nir_def *offset, nir_def *index,
nir_def *vec32, unsigned num_bits, nir_variable *var, unsigned alignment)
{
nir_def *mask = nir_imm_int(b, (1 << num_bits) - 1);
/* If we have small alignments, we need to place them correctly in the u32 component. */
if (alignment <= 2) {
nir_def *shift =
nir_imul_imm(b, nir_iand_imm(b, offset, 3), 8);
vec32 = nir_ishl(b, vec32, shift);
mask = nir_ishl(b, mask, shift);
}
if (var->data.mode == nir_var_mem_shared) {
/* Use the dedicated masked intrinsic */
nir_deref_instr *deref = nir_build_deref_array(b, nir_build_deref_var(b, var), index);
nir_deref_atomic(b, 32, &deref->def, nir_inot(b, mask), .atomic_op = nir_atomic_op_iand);
nir_deref_atomic(b, 32, &deref->def, vec32, .atomic_op = nir_atomic_op_ior);
} else {
/* For scratch, since we don't need atomics, just generate the read-modify-write in NIR */
nir_def *load = nir_load_array_var(b, var, index);
nir_def *new_val = nir_ior(b, vec32,
nir_iand(b,
nir_inot(b, mask),
load));
nir_store_array_var(b, var, index, new_val, 1);
}
}
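/*
 * Standalone illustration (a hypothetical plain-C helper) of the masked
 * read-modify-write above, for the non-atomic scratch path: shift the value
 * and a mask of its bits into place within the 32-bit word, then keep the
 * untouched bits of the old word. The shared path does the same thing with
 * an atomic iand of ~mask followed by an atomic ior.
 */
#include <stdint.h>

static void
store_u16_to_word_array(uint32_t *words, uint32_t off, uint16_t value)
{
   uint32_t shift = (off & 3u) * 8u;
   uint32_t mask = 0xffffu << shift;
   words[off >> 2] = (words[off >> 2] & ~mask) | ((uint32_t)value << shift);
}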
static bool
lower_32b_offset_store(nir_builder *b, nir_intrinsic_instr *intr, nir_variable *var)
{
unsigned num_components = nir_src_num_components(intr->src[0]);
unsigned bit_size = nir_src_bit_size(intr->src[0]);
unsigned num_bits = num_components * bit_size;
b->cursor = nir_before_instr(&intr->instr);
nir_def *offset = intr->src[1].ssa;
if (intr->intrinsic == nir_intrinsic_store_shared)
offset = nir_iadd_imm(b, offset, nir_intrinsic_base(intr));
else
offset = nir_u2u32(b, offset);
nir_def *comps[NIR_MAX_VEC_COMPONENTS];
unsigned comp_idx = 0;
for (unsigned i = 0; i < num_components; i++)
comps[i] = nir_channel(b, intr->src[0].ssa, i);
unsigned step = MAX2(bit_size, 32);
for (unsigned i = 0; i < num_bits; i += step) {
/* For each 4byte chunk (or smaller) we generate a 32bit scalar store. */
unsigned substore_num_bits = MIN2(num_bits - i, step);
nir_def *local_offset = nir_iadd_imm(b, offset, i / 8);
nir_def *vec32 = load_comps_to_vec(b, bit_size, &comps[comp_idx],
substore_num_bits / bit_size, 32);
nir_def *index = nir_ushr_imm(b, local_offset, 2);
/* For anything less than 32bits we need to use the masked version of the
* intrinsic to preserve data living in the same 32bit slot. */
if (substore_num_bits < 32) {
lower_masked_store_vec32(b, local_offset, index, vec32, num_bits, var, nir_intrinsic_align(intr));
} else {
for (unsigned i = 0; i < vec32->num_components; ++i)
nir_store_array_var(b, var, nir_iadd_imm(b, index, i), nir_channel(b, vec32, i), 1);
}
comp_idx += substore_num_bits / bit_size;
}
nir_instr_remove(&intr->instr);
return true;
}
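/*
 * Standalone illustration (a hypothetical plain-C helper) of the chunking
 * loop above: a store of num_bytes bytes at a 4-byte-aligned offset becomes
 * one 32-bit word store per 4-byte chunk; any sub-word remainder would
 * instead go through the masked path sketched after lower_masked_store_vec32.
 */
#include <stdint.h>
#include <string.h>

static void
store_bytes_to_word_array(uint32_t *words, uint32_t off, const uint8_t *src,
                          uint32_t num_bytes)
{
   for (uint32_t i = 0; i + 4 <= num_bytes; i += 4) {
      uint32_t word;
      memcpy(&word, src + i, sizeof(word));
      words[(off + i) >> 2] = word;
   }
}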
#define CONSTANT_LOCATION_UNVISITED 0
#define CONSTANT_LOCATION_VALID 1
#define CONSTANT_LOCATION_INVALID 2
@@ -654,83 +494,134 @@ dxil_nir_remove_oob_array_accesses(nir_shader *shader)
NULL);
}
static bool
lower_shared_atomic(nir_builder *b, nir_intrinsic_instr *intr, nir_variable *var)
/*
 * This pass operates only on 32-bit scalars, so this callback instructs
 * nir_lower_mem_access_bit_sizes to turn all shared access into 32-bit
 * scalars. We don't want to use 8-bit accesses, since the resulting
 * pack/unpack would be challenging for some drivers to optimize.
 * Larger-than-32-bit access, however, requires nontrivial tracking to
 * extract/insert the pieces. Since nir_lower_mem_access_bit_sizes already
 * has that code, we use it in this pass instead of NIH'ing it here.
 */
static nir_mem_access_size_align
mem_access_cb(nir_intrinsic_op intrin, uint8_t bytes, uint8_t bit_size,
uint32_t align, uint32_t align_offset, bool offset_is_const,
enum gl_access_qualifier access, const void *cb_data)
{
return (nir_mem_access_size_align){
.num_components = 1,
.bit_size = 32,
.align = 4,
.shift = nir_mem_access_shift_method_scalar,
};
}
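/*
 * A small standalone sketch (a hypothetical plain-C helper) of the invariant
 * this callback establishes: since every shared access is rewritten to a
 * single 32-bit scalar with 4-byte alignment, the byte offsets that reach
 * lower_shared_to_var below are always multiples of 4, so dividing by 4
 * (nir_udiv_aligned_4) is exact and yields a word index into the uint array
 * backing shared memory.
 */
#include <assert.h>
#include <stdint.h>

static uint32_t
shared_word_index(uint32_t byte_offset)
{
   assert(byte_offset % 4 == 0);   /* guaranteed by .bit_size = 32, .align = 4 */
   return byte_offset / 4;
}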
/*
* Thanks to nir_lower_mem_access_bit_sizes, we can lower shared intrinsics 1:1
* to word-based array access.
*/
static bool
lower_shared_to_var(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
nir_variable *shared = data;
b->cursor = nir_before_instr(&intr->instr);
nir_def *offset =
nir_iadd_imm(b, intr->src[0].ssa, nir_intrinsic_base(intr));
nir_def *index = nir_ushr_imm(b, offset, 2);
switch (intr->intrinsic) {
case nir_intrinsic_store_shared: {
nir_def *index = nir_udiv_aligned_4(b, intr->src[1].ssa);
nir_def *value = intr->src[0].ssa;
nir_deref_instr *deref = nir_build_deref_array(b, nir_build_deref_var(b, var), index);
nir_def *result;
if (intr->intrinsic == nir_intrinsic_shared_atomic_swap)
result = nir_deref_atomic_swap(b, 32, &deref->def, intr->src[1].ssa, intr->src[2].ssa,
.atomic_op = nir_intrinsic_atomic_op(intr));
else
result = nir_deref_atomic(b, 32, &deref->def, intr->src[1].ssa,
.atomic_op = nir_intrinsic_atomic_op(intr));
index = nir_u2uN(b, index, nir_get_ptr_bitsize(b->shader));
nir_store_array_var(b, shared, index, value, nir_component_mask(1));
break;
}
case nir_intrinsic_load_shared: {
nir_def *index = nir_udiv_aligned_4(b, intr->src[0].ssa);
index = nir_u2uN(b, index, nir_get_ptr_bitsize(b->shader));
nir_def_rewrite_uses(&intr->def, nir_load_array_var(b, shared, index));
break;
}
case nir_intrinsic_shared_atomic:
case nir_intrinsic_shared_atomic_swap: {
nir_def *index = nir_udiv_aligned_4(b, intr->src[0].ssa);
index = nir_u2uN(b, index, nir_get_ptr_bitsize(b->shader));
nir_deref_instr *deref = nir_build_deref_array(b, nir_build_deref_var(b, shared), index);
if (intr->intrinsic == nir_intrinsic_shared_atomic_swap)
nir_def_rewrite_uses(&intr->def, nir_deref_atomic_swap(b, 32, &deref->def, intr->src[1].ssa, intr->src[2].ssa,
.atomic_op = nir_intrinsic_atomic_op(intr)));
else
nir_def_rewrite_uses(&intr->def, nir_deref_atomic(b, 32, &deref->def, intr->src[1].ssa,
.atomic_op = nir_intrinsic_atomic_op(intr)));
break;
}
default:
return false;
}
nir_instr_remove(&intr->instr);
return true;
}
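/*
 * Standalone sketch (hypothetical plain-C helpers, with shared memory
 * modeled as a flat uint32_t array) of the 1:1 mapping above: each lowered
 * shared intrinsic becomes a single word-indexed access to the backing
 * array; atomics keep their atomicity, shown here with a GCC/Clang builtin.
 */
#include <stdint.h>

static uint32_t shared_words[1024];   /* stand-in for the "shared" variable */

static uint32_t
example_load_shared(uint32_t byte_offset)
{
   return shared_words[byte_offset / 4];            /* load_shared  */
}

static void
example_store_shared(uint32_t byte_offset, uint32_t value)
{
   shared_words[byte_offset / 4] = value;           /* store_shared */
}

static uint32_t
example_shared_atomic_iadd(uint32_t byte_offset, uint32_t value)
{
   /* shared_atomic with .atomic_op = iadd */
   return __atomic_fetch_add(&shared_words[byte_offset / 4], value,
                             __ATOMIC_RELAXED);
}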
static bool
dxil_nir_lower_shared_to_var(nir_shader *nir)
{
unsigned words = DIV_ROUND_UP(nir->info.shared_size, 4);
/* Early exit in the common case that shared memory is not used. */
if (words == 0) {
return false;
}
/* First, lower bit sizes and vectors as required by lower_shared_to_var */
nir_lower_mem_access_bit_sizes_options lower_mem_access_options = {
.modes = nir_var_mem_shared,
.callback = mem_access_cb,
.may_lower_unaligned_stores_to_atomics = true,
};
NIR_PASS(_, nir, nir_lower_mem_access_bit_sizes, &lower_mem_access_options);
/* Then, back shared memory with an array of words and turn all shared access
* into array access. */
const glsl_type *type_ = glsl_array_type(glsl_uint_type(), words, 1);
nir_variable *var = nir_variable_create(nir, nir_var_mem_shared, type_, "shared");
nir_foreach_function_impl(impl, nir) {
nir_function_intrinsics_pass(impl, lower_shared_to_var, nir_metadata_control_flow, var);
}
/* After lowering, we've eliminated all shared memory in the shader. */
nir->info.shared_size = 0;
/* Now clean up the mess we made */
bool progress;
do {
progress = false;
NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
NIR_PASS(progress, nir, nir_opt_constant_folding);
NIR_PASS(progress, nir, nir_opt_algebraic);
NIR_PASS(progress, nir, nir_copy_prop);
NIR_PASS(progress, nir, nir_opt_cse);
NIR_PASS(progress, nir, nir_opt_dce);
} while (progress);
nir_def_replace(&intr->def, result);
return true;
}
bool
dxil_nir_lower_loads_stores_to_dxil(nir_shader *nir,
const struct dxil_nir_lower_loads_stores_options *options)
dxil_nir_scratch_and_shared_to_dxil(nir_shader *nir)
{
bool progress = nir_remove_dead_variables(nir, nir_var_function_temp | nir_var_mem_shared, NULL);
nir_variable *shared_var = NULL;
if (nir->info.shared_size) {
shared_var = nir_variable_create(nir, nir_var_mem_shared,
glsl_array_type(glsl_uint_type(), DIV_ROUND_UP(nir->info.shared_size, 4), 4),
"lowered_shared_mem");
}
unsigned ptr_size = nir->info.cs.ptr_size;
if (nir->info.stage == MESA_SHADER_KERNEL) {
/* All the derefs created here will be used as GEP indices so force 32-bit */
nir->info.cs.ptr_size = 32;
}
nir_foreach_function_impl(impl, nir) {
nir_builder b = nir_builder_create(impl);
nir_variable *scratch_var = NULL;
if (nir->scratch_size) {
const struct glsl_type *scratch_type = glsl_array_type(glsl_uint_type(), DIV_ROUND_UP(nir->scratch_size, 4), 4);
scratch_var = nir_local_variable_create(impl, scratch_type, "lowered_scratch_mem");
}
nir_foreach_block(block, impl) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
switch (intr->intrinsic) {
case nir_intrinsic_load_shared:
progress |= lower_32b_offset_load(&b, intr, shared_var);
break;
case nir_intrinsic_load_scratch:
progress |= lower_32b_offset_load(&b, intr, scratch_var);
break;
case nir_intrinsic_store_shared:
progress |= lower_32b_offset_store(&b, intr, shared_var);
break;
case nir_intrinsic_store_scratch:
progress |= lower_32b_offset_store(&b, intr, scratch_var);
break;
case nir_intrinsic_shared_atomic:
case nir_intrinsic_shared_atomic_swap:
progress |= lower_shared_atomic(&b, intr, shared_var);
break;
default:
break;
}
}
}
}
progress |= nir_lower_scratch_to_var(nir);
progress |= dxil_nir_lower_shared_to_var(nir);
if (nir->info.stage == MESA_SHADER_KERNEL) {
nir->info.cs.ptr_size = ptr_size;
}


@@ -41,11 +41,7 @@ bool dxil_nir_flatten_var_arrays(nir_shader *shader, nir_variable_mode modes);
bool dxil_nir_lower_var_bit_size(nir_shader *shader, nir_variable_mode modes,
unsigned min_bit_size, unsigned max_bit_size);
bool dxil_nir_remove_oob_array_accesses(nir_shader *shader);
struct dxil_nir_lower_loads_stores_options {
bool use_16bit_ssbo;
};
bool dxil_nir_lower_loads_stores_to_dxil(nir_shader *shader,
const struct dxil_nir_lower_loads_stores_options *options);
bool dxil_nir_scratch_and_shared_to_dxil(nir_shader *shader);
bool dxil_nir_lower_deref_ssbo(nir_shader *shader);
bool dxil_nir_opt_alu_deref_srcs(nir_shader *shader);
bool dxil_nir_lower_upcast_phis(nir_shader *shader, unsigned min_bit_size);


@@ -1016,6 +1016,7 @@ dxil_spirv_nir_passes(nir_shader *nir,
shared_var_info);
NIR_PASS(_, nir, dxil_nir_split_unaligned_loads_stores, nir_var_mem_shared);
NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_shared, nir_address_format_32bit_offset);
NIR_PASS(_, nir, dxil_nir_scratch_and_shared_to_dxil);
} else {
NIR_PASS(_, nir, nir_split_struct_vars, nir_var_mem_shared);
NIR_PASS(_, nir, dxil_nir_flatten_var_arrays, nir_var_mem_shared);
@@ -1100,10 +1101,6 @@ dxil_spirv_nir_passes(nir_shader *nir,
NIR_PASS(_, nir, nir_lower_tex, &lower_tex_options);
NIR_PASS(_, nir, dxil_nir_split_clip_cull_distance);
const struct dxil_nir_lower_loads_stores_options loads_stores_options = {
.use_16bit_ssbo = conf->shader_model_max >= SHADER_MODEL_6_2,
};
NIR_PASS(_, nir, dxil_nir_lower_loads_stores_to_dxil, &loads_stores_options);
NIR_PASS(_, nir, dxil_nir_split_typed_samplers);
NIR_PASS(_, nir, dxil_nir_lower_ubo_array_one_to_static);
NIR_PASS(_, nir, nir_opt_dce);