mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-06 11:38:05 +02:00
microsoft/compiler: Use lower_mem_access_bit_sizes for scratch/shared
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37778>
This commit is contained in:
parent
f7cbc7b1c5
commit
b3242516ad
5 changed files with 115 additions and 235 deletions
|
|
@ -117,8 +117,7 @@ compile_nir(struct d3d12_context *ctx, struct d3d12_shader_selector *sel,
|
|||
|
||||
NIR_PASS(_, nir, d3d12_lower_state_vars, shader);
|
||||
|
||||
const struct dxil_nir_lower_loads_stores_options loads_stores_options = {};
|
||||
NIR_PASS(_, nir, dxil_nir_lower_loads_stores_to_dxil, &loads_stores_options);
|
||||
NIR_PASS(_, nir, dxil_nir_scratch_and_shared_to_dxil);
|
||||
|
||||
if (key->stage == MESA_SHADER_FRAGMENT && key->fs.multisample_disabled)
|
||||
NIR_PASS(_, nir, d3d12_disable_multisampling);
|
||||
|
|
|
|||
|
|
@ -1109,9 +1109,6 @@ clc_spirv_to_dxil(struct clc_libclc *lib,
|
|||
NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ubo,
|
||||
nir_address_format_32bit_index_offset);
|
||||
NIR_PASS(_, nir, clc_nir_lower_system_values, work_properties_var);
|
||||
const struct dxil_nir_lower_loads_stores_options loads_stores_options = {
|
||||
.use_16bit_ssbo = false,
|
||||
};
|
||||
|
||||
/* Now that function-declared local vars have been sized, append args */
|
||||
for (unsigned i = 0; i < out_dxil->kernel->num_args; i++) {
|
||||
|
|
@ -1140,7 +1137,7 @@ clc_spirv_to_dxil(struct clc_libclc *lib,
|
|||
nir->info.shared_size += size;
|
||||
}
|
||||
|
||||
NIR_PASS(_, nir, dxil_nir_lower_loads_stores_to_dxil, &loads_stores_options);
|
||||
NIR_PASS(_, nir, dxil_nir_scratch_and_shared_to_dxil);
|
||||
NIR_PASS(_, nir, dxil_nir_opt_alu_deref_srcs);
|
||||
NIR_PASS(_, nir, nir_lower_fp16_casts, nir_lower_fp16_all);
|
||||
NIR_PASS(_, nir, nir_lower_convert_alu_types, NULL);
|
||||
|
|
|
|||
|
|
@ -39,166 +39,6 @@ cl_type_size_align(const struct glsl_type *type, unsigned *size,
|
|||
*align = glsl_get_cl_alignment(type);
|
||||
}
|
||||
|
||||
/* Repack an array of scalar source components, each src_bit_size wide, into a
 * single vector whose components are dst_bit_size wide.
 *
 * Three cases:
 *  - equal sizes: just build the vector as-is;
 *  - narrowing (src > dst): delegate to nir_extract_bits, which splits each
 *    wide component into several narrow ones;
 *  - widening (src < dst): manually OR together comps_per_dst consecutive
 *    source components, shifting each into its byte/bit position (little-endian
 *    packing: component j lands at bit offset j * src_bit_size).
 *
 * b             - builder positioned where the new ALU ops should be emitted
 * src_comps     - array of num_src_comps scalar SSA defs, all src_bit_size wide
 * Returns the packed vector (at most 4 destination components in the widening
 * path, per the fixed dst_comps[4] buffer).
 */
static nir_def *
load_comps_to_vec(nir_builder *b, unsigned src_bit_size,
                  nir_def **src_comps, unsigned num_src_comps,
                  unsigned dst_bit_size)
{
   if (src_bit_size == dst_bit_size)
      return nir_vec(b, src_comps, num_src_comps);
   else if (src_bit_size > dst_bit_size)
      return nir_extract_bits(b, src_comps, num_src_comps, 0, src_bit_size * num_src_comps / dst_bit_size, dst_bit_size);

   /* Widening path: fold comps_per_dst small components into each wide one.
    * DIV_ROUND_UP so a trailing partial group still gets a destination slot. */
   unsigned num_dst_comps = DIV_ROUND_UP(num_src_comps * src_bit_size, dst_bit_size);
   unsigned comps_per_dst = dst_bit_size / src_bit_size;
   nir_def *dst_comps[4];

   for (unsigned i = 0; i < num_dst_comps; i++) {
      unsigned src_offs = i * comps_per_dst;

      /* Zero-extend the first source component, then OR in the rest at
       * increasing bit offsets.  The j-loop bound also guards the tail group
       * against reading past num_src_comps. */
      dst_comps[i] = nir_u2uN(b, src_comps[src_offs], dst_bit_size);
      for (unsigned j = 1; j < comps_per_dst && src_offs + j < num_src_comps; j++) {
         nir_def *tmp = nir_ishl_imm(b, nir_u2uN(b, src_comps[src_offs + j], dst_bit_size),
                                     j * src_bit_size);
         dst_comps[i] = nir_ior(b, dst_comps[i], tmp);
      }
   }

   return nir_vec(b, dst_comps, num_dst_comps);
}
|
||||
|
||||
/* Lower a load_shared/load_scratch intrinsic into loads from a plain i32
 * array variable (var), which is how DXIL models TGSM/scratch memory.
 *
 * The byte offset is converted to a 32-bit word index (offset >> 2), one
 * array load is emitted per 32-bit word touched, and the words are then
 * repacked (via nir_extract_bits) into the original bit_size/num_components
 * shape before replacing all uses of the intrinsic's def.
 *
 * Returns true (the intrinsic is always rewritten when this is called).
 */
static bool
lower_32b_offset_load(nir_builder *b, nir_intrinsic_instr *intr, nir_variable *var)
{
   unsigned bit_size = intr->def.bit_size;
   unsigned num_components = intr->def.num_components;
   unsigned num_bits = num_components * bit_size;

   b->cursor = nir_before_instr(&intr->instr);

   /* load_shared carries its base in the intrinsic; scratch offsets may be
    * 64-bit (kernel pointers) and are truncated to 32 bits here. */
   nir_def *offset = intr->src[0].ssa;
   if (intr->intrinsic == nir_intrinsic_load_shared)
      offset = nir_iadd_imm(b, offset, nir_intrinsic_base(intr));
   else
      offset = nir_u2u32(b, offset);
   nir_def *index = nir_ushr_imm(b, offset, 2);
   nir_def *comps[NIR_MAX_VEC_COMPONENTS];
   nir_def *comps_32bit[NIR_MAX_VEC_COMPONENTS * 2];

   /* We need to split loads in 32-bit accesses because the buffer
    * is an i32 array and DXIL does not support type casts.
    */
   unsigned num_32bit_comps = DIV_ROUND_UP(num_bits, 32);
   for (unsigned i = 0; i < num_32bit_comps; i++)
      comps_32bit[i] = nir_load_array_var(b, var, nir_iadd_imm(b, index, i));
   /* Repack at most a vec4 of words per pass (nir_vec limit per iteration). */
   unsigned num_comps_per_pass = MIN2(num_32bit_comps, 4);

   for (unsigned i = 0; i < num_32bit_comps; i += num_comps_per_pass) {
      unsigned num_vec32_comps = MIN2(num_32bit_comps - i, 4);
      unsigned num_dest_comps = num_vec32_comps * 32 / bit_size;
      nir_def *vec32 = nir_vec(b, &comps_32bit[i], num_vec32_comps);

      /* If we have 16 bits or less to load we need to adjust the u32 value so
       * we can always extract the LSB.
       */
      if (num_bits <= 16) {
         /* Shift right by 8 * (offset % 4) so the addressed sub-word lands in
          * the low bits of the loaded word. */
         nir_def *shift =
            nir_imul_imm(b, nir_iand_imm(b, offset, 3), 8);
         vec32 = nir_ushr(b, vec32, shift);
      }

      /* And now comes the pack/unpack step to match the original type. */
      unsigned dest_index = i * 32 / bit_size;
      nir_def *temp_vec = nir_extract_bits(b, &vec32, 1, 0, num_dest_comps, bit_size);
      for (unsigned comp = 0; comp < num_dest_comps; ++comp, ++dest_index)
         comps[dest_index] = nir_channel(b, temp_vec, comp);
   }

   nir_def *result = nir_vec(b, comps, num_components);
   nir_def_replace(&intr->def, result);

   return true;
}
|
||||
|
||||
/* Emit a sub-word (< 32-bit) store into one element of the i32 backing array:
 * a read-modify-write that preserves the bits of the 32-bit slot not covered
 * by this store.
 *
 * offset    - byte offset of the store (used to position data within the word)
 * index     - 32-bit word index into var (already offset >> 2)
 * vec32     - the value to store, already widened to 32 bits
 * num_bits  - number of payload bits to store within the word
 * alignment - access alignment in bytes; <= 2 means the data may start at a
 *             non-zero byte within the word and must be shifted into place
 *
 * NOTE(review): mask is built as (1 << num_bits) - 1, which is UB for
 * num_bits == 32 — presumably callers only pass num_bits < 32 (the caller
 * gates on substore_num_bits < 32, but passes its total num_bits here);
 * TODO confirm against the caller.
 */
static void
lower_masked_store_vec32(nir_builder *b, nir_def *offset, nir_def *index,
   nir_def *vec32, unsigned num_bits, nir_variable *var, unsigned alignment)
{
   nir_def *mask = nir_imm_int(b, (1 << num_bits) - 1);

   /* If we have small alignments, we need to place them correctly in the u32 component. */
   if (alignment <= 2) {
      /* Shift left by 8 * (offset % 4) to move value and mask to the
       * addressed byte lane of the word. */
      nir_def *shift =
         nir_imul_imm(b, nir_iand_imm(b, offset, 3), 8);

      vec32 = nir_ishl(b, vec32, shift);
      mask = nir_ishl(b, mask, shift);
   }

   if (var->data.mode == nir_var_mem_shared) {
      /* Use the dedicated masked intrinsic */
      /* Shared memory is concurrently accessed, so the RMW is done with two
       * atomics: AND with ~mask to clear the lane, then OR in the new bits. */
      nir_deref_instr *deref = nir_build_deref_array(b, nir_build_deref_var(b, var), index);
      nir_deref_atomic(b, 32, &deref->def, nir_inot(b, mask), .atomic_op = nir_atomic_op_iand);
      nir_deref_atomic(b, 32, &deref->def, vec32, .atomic_op = nir_atomic_op_ior);
   } else {
      /* For scratch, since we don't need atomics, just generate the read-modify-write in NIR */
      nir_def *load = nir_load_array_var(b, var, index);

      nir_def *new_val = nir_ior(b, vec32,
                                 nir_iand(b,
                                          nir_inot(b, mask),
                                          load));

      nir_store_array_var(b, var, index, new_val, 1);
   }
}
|
||||
|
||||
/* Lower a store_shared/store_scratch intrinsic into stores to a plain i32
 * array variable (var), mirroring lower_32b_offset_load.
 *
 * The source value is split into 32-bit chunks; full words become direct
 * array stores, while sub-word tails go through lower_masked_store_vec32 to
 * preserve neighboring data in the same word.  The original intrinsic is
 * removed.  Returns true (always rewrites).
 */
static bool
lower_32b_offset_store(nir_builder *b, nir_intrinsic_instr *intr, nir_variable *var)
{
   unsigned num_components = nir_src_num_components(intr->src[0]);
   unsigned bit_size = nir_src_bit_size(intr->src[0]);
   unsigned num_bits = num_components * bit_size;

   b->cursor = nir_before_instr(&intr->instr);

   /* src[0] is the value, src[1] the offset.  As in the load path, shared
    * offsets gain the intrinsic base; scratch offsets are truncated to u32. */
   nir_def *offset = intr->src[1].ssa;
   if (intr->intrinsic == nir_intrinsic_store_shared)
      offset = nir_iadd_imm(b, offset, nir_intrinsic_base(intr));
   else
      offset = nir_u2u32(b, offset);
   nir_def *comps[NIR_MAX_VEC_COMPONENTS];

   unsigned comp_idx = 0;
   /* Scalarize the source vector so chunks can be repacked independently. */
   for (unsigned i = 0; i < num_components; i++)
      comps[i] = nir_channel(b, intr->src[0].ssa, i);

   /* Consume at least 32 bits per iteration; for 64-bit sources one source
    * component spans two words, hence MAX2. */
   unsigned step = MAX2(bit_size, 32);
   for (unsigned i = 0; i < num_bits; i += step) {
      /* For each 4byte chunk (or smaller) we generate a 32bit scalar store. */
      unsigned substore_num_bits = MIN2(num_bits - i, step);
      nir_def *local_offset = nir_iadd_imm(b, offset, i / 8);
      nir_def *vec32 = load_comps_to_vec(b, bit_size, &comps[comp_idx],
                                         substore_num_bits / bit_size, 32);
      nir_def *index = nir_ushr_imm(b, local_offset, 2);

      /* For anything less than 32bits we need to use the masked version of the
       * intrinsic to preserve data living in the same 32bit slot. */
      if (substore_num_bits < 32) {
         /* NOTE(review): total num_bits (not substore_num_bits) is passed as
          * the mask width — looks intentional only when the store is a single
          * sub-word chunk; TODO confirm. */
         lower_masked_store_vec32(b, local_offset, index, vec32, num_bits, var, nir_intrinsic_align(intr));
      } else {
         /* NOTE(review): this inner `i` shadows the outer chunk counter. */
         for (unsigned i = 0; i < vec32->num_components; ++i)
            nir_store_array_var(b, var, nir_iadd_imm(b, index, i), nir_channel(b, vec32, i), 1);
      }

      comp_idx += substore_num_bits / bit_size;
   }

   nir_instr_remove(&intr->instr);

   return true;
}
|
||||
|
||||
#define CONSTANT_LOCATION_UNVISITED 0
|
||||
#define CONSTANT_LOCATION_VALID 1
|
||||
#define CONSTANT_LOCATION_INVALID 2
|
||||
|
|
@ -654,83 +494,134 @@ dxil_nir_remove_oob_array_accesses(nir_shader *shader)
|
|||
NULL);
|
||||
}
|
||||
|
||||
static bool
|
||||
lower_shared_atomic(nir_builder *b, nir_intrinsic_instr *intr, nir_variable *var)
|
||||
/*
|
||||
* This pass operates only on 32-bit scalars, so this callback instructs
|
||||
* nir_lower_mem_access_bit_sizes_options to turn all shared access into
|
||||
* 32-bit scalars. We don't want to use 8-bit accesses, since that would be
|
||||
* challenging to optimize the resulting pack/unpack on some drivers. Larger
|
||||
* 32-bit access however requires nontrivial tracking to extract/insert. Since
|
||||
* nir_lower_mem_access_bit_sizes already has that code, we use it in this pass
|
||||
* instead of NIH'ing it here.
|
||||
*/
|
||||
static nir_mem_access_size_align
|
||||
mem_access_cb(nir_intrinsic_op intrin, uint8_t bytes, uint8_t bit_size,
|
||||
uint32_t align, uint32_t align_offset, bool offset_is_const,
|
||||
enum gl_access_qualifier access, const void *cb_data)
|
||||
{
|
||||
return (nir_mem_access_size_align){
|
||||
.num_components = 1,
|
||||
.bit_size = 32,
|
||||
.align = 4,
|
||||
.shift = nir_mem_access_shift_method_scalar,
|
||||
};
|
||||
}
|
||||
|
||||
/*
|
||||
* Thanks to nir_lower_mem_access_bit_sizes, we can lower shared intrinsics 1:1
|
||||
* to word-based array access.
|
||||
*/
|
||||
static bool
|
||||
lower_shared_to_var(nir_builder *b, nir_intrinsic_instr *intr, void *data)
|
||||
{
|
||||
nir_variable *shared = data;
|
||||
b->cursor = nir_before_instr(&intr->instr);
|
||||
|
||||
nir_def *offset =
|
||||
nir_iadd_imm(b, intr->src[0].ssa, nir_intrinsic_base(intr));
|
||||
nir_def *index = nir_ushr_imm(b, offset, 2);
|
||||
switch (intr->intrinsic) {
|
||||
case nir_intrinsic_store_shared: {
|
||||
nir_def *index = nir_udiv_aligned_4(b, intr->src[1].ssa);
|
||||
nir_def *value = intr->src[0].ssa;
|
||||
|
||||
nir_deref_instr *deref = nir_build_deref_array(b, nir_build_deref_var(b, var), index);
|
||||
nir_def *result;
|
||||
if (intr->intrinsic == nir_intrinsic_shared_atomic_swap)
|
||||
result = nir_deref_atomic_swap(b, 32, &deref->def, intr->src[1].ssa, intr->src[2].ssa,
|
||||
.atomic_op = nir_intrinsic_atomic_op(intr));
|
||||
else
|
||||
result = nir_deref_atomic(b, 32, &deref->def, intr->src[1].ssa,
|
||||
.atomic_op = nir_intrinsic_atomic_op(intr));
|
||||
index = nir_u2uN(b, index, nir_get_ptr_bitsize(b->shader));
|
||||
nir_store_array_var(b, shared, index, value, nir_component_mask(1));
|
||||
break;
|
||||
}
|
||||
case nir_intrinsic_load_shared: {
|
||||
nir_def *index = nir_udiv_aligned_4(b, intr->src[0].ssa);
|
||||
|
||||
index = nir_u2uN(b, index, nir_get_ptr_bitsize(b->shader));
|
||||
nir_def_rewrite_uses(&intr->def, nir_load_array_var(b, shared, index));
|
||||
break;
|
||||
}
|
||||
case nir_intrinsic_shared_atomic:
|
||||
case nir_intrinsic_shared_atomic_swap: {
|
||||
nir_def *index = nir_udiv_aligned_4(b, intr->src[0].ssa);
|
||||
|
||||
index = nir_u2uN(b, index, nir_get_ptr_bitsize(b->shader));
|
||||
nir_deref_instr *deref = nir_build_deref_array(b, nir_build_deref_var(b, shared), index);
|
||||
|
||||
if (intr->intrinsic == nir_intrinsic_shared_atomic_swap)
|
||||
nir_def_rewrite_uses(&intr->def, nir_deref_atomic_swap(b, 32, &deref->def, intr->src[1].ssa, intr->src[2].ssa,
|
||||
.atomic_op = nir_intrinsic_atomic_op(intr)));
|
||||
else
|
||||
nir_def_rewrite_uses(&intr->def, nir_deref_atomic(b, 32, &deref->def, intr->src[1].ssa,
|
||||
.atomic_op = nir_intrinsic_atomic_op(intr)));
|
||||
break;
|
||||
}
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
nir_instr_remove(&intr->instr);
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
dxil_nir_lower_shared_to_var(nir_shader *nir)
|
||||
{
|
||||
unsigned words = DIV_ROUND_UP(nir->info.shared_size, 4);
|
||||
|
||||
/* Early exit in the common case that scratch is not used. */
|
||||
if (words == 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/* First, lower bit sizes and vectors as required by lower_shared_to_var */
|
||||
nir_lower_mem_access_bit_sizes_options lower_mem_access_options = {
|
||||
.modes = nir_var_mem_shared,
|
||||
.callback = mem_access_cb,
|
||||
.may_lower_unaligned_stores_to_atomics = true,
|
||||
};
|
||||
NIR_PASS(_, nir, nir_lower_mem_access_bit_sizes, &lower_mem_access_options);
|
||||
|
||||
/* Then, back shared by an array of words and turn all shared access into
|
||||
* array access. */
|
||||
const glsl_type *type_ = glsl_array_type(glsl_uint_type(), words, 1);
|
||||
nir_variable *var = nir_variable_create(nir, nir_var_mem_shared, type_, "shared");
|
||||
|
||||
nir_foreach_function_impl(impl, nir) {
|
||||
nir_function_intrinsics_pass(impl, lower_shared_to_var, nir_metadata_control_flow, var);
|
||||
}
|
||||
|
||||
/* After lowering, we've eliminated all shared memory in the shader. */
|
||||
nir->info.shared_size = 0;
|
||||
|
||||
/* Now clean up the mess we made */
|
||||
bool progress;
|
||||
do {
|
||||
progress = false;
|
||||
NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
|
||||
NIR_PASS(progress, nir, nir_opt_constant_folding);
|
||||
NIR_PASS(progress, nir, nir_opt_algebraic);
|
||||
NIR_PASS(progress, nir, nir_copy_prop);
|
||||
NIR_PASS(progress, nir, nir_opt_cse);
|
||||
NIR_PASS(progress, nir, nir_opt_dce);
|
||||
} while (progress);
|
||||
|
||||
nir_def_replace(&intr->def, result);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
dxil_nir_lower_loads_stores_to_dxil(nir_shader *nir,
|
||||
const struct dxil_nir_lower_loads_stores_options *options)
|
||||
dxil_nir_scratch_and_shared_to_dxil(nir_shader *nir)
|
||||
{
|
||||
bool progress = nir_remove_dead_variables(nir, nir_var_function_temp | nir_var_mem_shared, NULL);
|
||||
nir_variable *shared_var = NULL;
|
||||
if (nir->info.shared_size) {
|
||||
shared_var = nir_variable_create(nir, nir_var_mem_shared,
|
||||
glsl_array_type(glsl_uint_type(), DIV_ROUND_UP(nir->info.shared_size, 4), 4),
|
||||
"lowered_shared_mem");
|
||||
}
|
||||
|
||||
unsigned ptr_size = nir->info.cs.ptr_size;
|
||||
if (nir->info.stage == MESA_SHADER_KERNEL) {
|
||||
/* All the derefs created here will be used as GEP indices so force 32-bit */
|
||||
nir->info.cs.ptr_size = 32;
|
||||
}
|
||||
nir_foreach_function_impl(impl, nir) {
|
||||
nir_builder b = nir_builder_create(impl);
|
||||
|
||||
nir_variable *scratch_var = NULL;
|
||||
if (nir->scratch_size) {
|
||||
const struct glsl_type *scratch_type = glsl_array_type(glsl_uint_type(), DIV_ROUND_UP(nir->scratch_size, 4), 4);
|
||||
scratch_var = nir_local_variable_create(impl, scratch_type, "lowered_scratch_mem");
|
||||
}
|
||||
|
||||
nir_foreach_block(block, impl) {
|
||||
nir_foreach_instr_safe(instr, block) {
|
||||
if (instr->type != nir_instr_type_intrinsic)
|
||||
continue;
|
||||
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
|
||||
|
||||
switch (intr->intrinsic) {
|
||||
case nir_intrinsic_load_shared:
|
||||
progress |= lower_32b_offset_load(&b, intr, shared_var);
|
||||
break;
|
||||
case nir_intrinsic_load_scratch:
|
||||
progress |= lower_32b_offset_load(&b, intr, scratch_var);
|
||||
break;
|
||||
case nir_intrinsic_store_shared:
|
||||
progress |= lower_32b_offset_store(&b, intr, shared_var);
|
||||
break;
|
||||
case nir_intrinsic_store_scratch:
|
||||
progress |= lower_32b_offset_store(&b, intr, scratch_var);
|
||||
break;
|
||||
case nir_intrinsic_shared_atomic:
|
||||
case nir_intrinsic_shared_atomic_swap:
|
||||
progress |= lower_shared_atomic(&b, intr, shared_var);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
progress |= nir_lower_scratch_to_var(nir);
|
||||
progress |= dxil_nir_lower_shared_to_var(nir);
|
||||
if (nir->info.stage == MESA_SHADER_KERNEL) {
|
||||
nir->info.cs.ptr_size = ptr_size;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -41,11 +41,7 @@ bool dxil_nir_flatten_var_arrays(nir_shader *shader, nir_variable_mode modes);
|
|||
bool dxil_nir_lower_var_bit_size(nir_shader *shader, nir_variable_mode modes,
|
||||
unsigned min_bit_size, unsigned max_bit_size);
|
||||
bool dxil_nir_remove_oob_array_accesses(nir_shader *shader);
|
||||
struct dxil_nir_lower_loads_stores_options {
|
||||
bool use_16bit_ssbo;
|
||||
};
|
||||
bool dxil_nir_lower_loads_stores_to_dxil(nir_shader *shader,
|
||||
const struct dxil_nir_lower_loads_stores_options *options);
|
||||
bool dxil_nir_scratch_and_shared_to_dxil(nir_shader *shader);
|
||||
bool dxil_nir_lower_deref_ssbo(nir_shader *shader);
|
||||
bool dxil_nir_opt_alu_deref_srcs(nir_shader *shader);
|
||||
bool dxil_nir_lower_upcast_phis(nir_shader *shader, unsigned min_bit_size);
|
||||
|
|
|
|||
|
|
@ -1016,6 +1016,7 @@ dxil_spirv_nir_passes(nir_shader *nir,
|
|||
shared_var_info);
|
||||
NIR_PASS(_, nir, dxil_nir_split_unaligned_loads_stores, nir_var_mem_shared);
|
||||
NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_shared, nir_address_format_32bit_offset);
|
||||
NIR_PASS(_, nir, dxil_nir_scratch_and_shared_to_dxil);
|
||||
} else {
|
||||
NIR_PASS(_, nir, nir_split_struct_vars, nir_var_mem_shared);
|
||||
NIR_PASS(_, nir, dxil_nir_flatten_var_arrays, nir_var_mem_shared);
|
||||
|
|
@ -1100,10 +1101,6 @@ dxil_spirv_nir_passes(nir_shader *nir,
|
|||
NIR_PASS(_, nir, nir_lower_tex, &lower_tex_options);
|
||||
|
||||
NIR_PASS(_, nir, dxil_nir_split_clip_cull_distance);
|
||||
const struct dxil_nir_lower_loads_stores_options loads_stores_options = {
|
||||
.use_16bit_ssbo = conf->shader_model_max >= SHADER_MODEL_6_2,
|
||||
};
|
||||
NIR_PASS(_, nir, dxil_nir_lower_loads_stores_to_dxil, &loads_stores_options);
|
||||
NIR_PASS(_, nir, dxil_nir_split_typed_samplers);
|
||||
NIR_PASS(_, nir, dxil_nir_lower_ubo_array_one_to_static);
|
||||
NIR_PASS(_, nir, nir_opt_dce);
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue