diff --git a/src/gallium/drivers/d3d12/d3d12_compiler.cpp b/src/gallium/drivers/d3d12/d3d12_compiler.cpp
index 143cfa9066c..8ade408bc7d 100644
--- a/src/gallium/drivers/d3d12/d3d12_compiler.cpp
+++ b/src/gallium/drivers/d3d12/d3d12_compiler.cpp
@@ -117,8 +117,7 @@ compile_nir(struct d3d12_context *ctx, struct d3d12_shader_selector *sel,
 
    NIR_PASS(_, nir, d3d12_lower_state_vars, shader);
 
-   const struct dxil_nir_lower_loads_stores_options loads_stores_options = {};
-   NIR_PASS(_, nir, dxil_nir_lower_loads_stores_to_dxil, &loads_stores_options);
+   NIR_PASS(_, nir, dxil_nir_scratch_and_shared_to_dxil);
 
    if (key->stage == MESA_SHADER_FRAGMENT && key->fs.multisample_disabled)
       NIR_PASS(_, nir, d3d12_disable_multisampling);
diff --git a/src/microsoft/clc/clc_compiler.c b/src/microsoft/clc/clc_compiler.c
index 525c96f023c..e687951f509 100644
--- a/src/microsoft/clc/clc_compiler.c
+++ b/src/microsoft/clc/clc_compiler.c
@@ -1109,9 +1109,6 @@ clc_spirv_to_dxil(struct clc_libclc *lib,
    NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ubo,
             nir_address_format_32bit_index_offset);
    NIR_PASS(_, nir, clc_nir_lower_system_values, work_properties_var);
-   const struct dxil_nir_lower_loads_stores_options loads_stores_options = {
-      .use_16bit_ssbo = false,
-   };
 
    /* Now that function-declared local vars have been sized, append args */
    for (unsigned i = 0; i < out_dxil->kernel->num_args; i++) {
@@ -1140,7 +1137,7 @@ clc_spirv_to_dxil(struct clc_libclc *lib,
       nir->info.shared_size += size;
    }
 
-   NIR_PASS(_, nir, dxil_nir_lower_loads_stores_to_dxil, &loads_stores_options);
+   NIR_PASS(_, nir, dxil_nir_scratch_and_shared_to_dxil);
    NIR_PASS(_, nir, dxil_nir_opt_alu_deref_srcs);
    NIR_PASS(_, nir, nir_lower_fp16_casts, nir_lower_fp16_all);
    NIR_PASS(_, nir, nir_lower_convert_alu_types, NULL);
diff --git a/src/microsoft/compiler/dxil_nir.c b/src/microsoft/compiler/dxil_nir.c
index 9e4f56fb360..3e754c90843 100644
--- a/src/microsoft/compiler/dxil_nir.c
+++ b/src/microsoft/compiler/dxil_nir.c
@@ -39,166 +39,6 @@ cl_type_size_align(const struct glsl_type *type, unsigned *size,
    *align = glsl_get_cl_alignment(type);
 }
 
-static nir_def *
-load_comps_to_vec(nir_builder *b, unsigned src_bit_size,
-                  nir_def **src_comps, unsigned num_src_comps,
-                  unsigned dst_bit_size)
-{
-   if (src_bit_size == dst_bit_size)
-      return nir_vec(b, src_comps, num_src_comps);
-   else if (src_bit_size > dst_bit_size)
-      return nir_extract_bits(b, src_comps, num_src_comps, 0, src_bit_size * num_src_comps / dst_bit_size, dst_bit_size);
-
-   unsigned num_dst_comps = DIV_ROUND_UP(num_src_comps * src_bit_size, dst_bit_size);
-   unsigned comps_per_dst = dst_bit_size / src_bit_size;
-   nir_def *dst_comps[4];
-
-   for (unsigned i = 0; i < num_dst_comps; i++) {
-      unsigned src_offs = i * comps_per_dst;
-
-      dst_comps[i] = nir_u2uN(b, src_comps[src_offs], dst_bit_size);
-      for (unsigned j = 1; j < comps_per_dst && src_offs + j < num_src_comps; j++) {
-         nir_def *tmp = nir_ishl_imm(b, nir_u2uN(b, src_comps[src_offs + j], dst_bit_size),
-                                     j * src_bit_size);
-         dst_comps[i] = nir_ior(b, dst_comps[i], tmp);
-      }
-   }
-
-   return nir_vec(b, dst_comps, num_dst_comps);
-}
-
-static bool
-lower_32b_offset_load(nir_builder *b, nir_intrinsic_instr *intr, nir_variable *var)
-{
-   unsigned bit_size = intr->def.bit_size;
-   unsigned num_components = intr->def.num_components;
-   unsigned num_bits = num_components * bit_size;
-
-   b->cursor = nir_before_instr(&intr->instr);
-
-   nir_def *offset = intr->src[0].ssa;
-   if (intr->intrinsic == nir_intrinsic_load_shared)
-      offset = nir_iadd_imm(b, offset,
-                            nir_intrinsic_base(intr));
-   else
-      offset = nir_u2u32(b, offset);
-   nir_def *index = nir_ushr_imm(b, offset, 2);
-   nir_def *comps[NIR_MAX_VEC_COMPONENTS];
-   nir_def *comps_32bit[NIR_MAX_VEC_COMPONENTS * 2];
-
-   /* We need to split loads in 32-bit accesses because the buffer
-    * is an i32 array and DXIL does not support type casts.
-    */
-   unsigned num_32bit_comps = DIV_ROUND_UP(num_bits, 32);
-   for (unsigned i = 0; i < num_32bit_comps; i++)
-      comps_32bit[i] = nir_load_array_var(b, var, nir_iadd_imm(b, index, i));
-   unsigned num_comps_per_pass = MIN2(num_32bit_comps, 4);
-
-   for (unsigned i = 0; i < num_32bit_comps; i += num_comps_per_pass) {
-      unsigned num_vec32_comps = MIN2(num_32bit_comps - i, 4);
-      unsigned num_dest_comps = num_vec32_comps * 32 / bit_size;
-      nir_def *vec32 = nir_vec(b, &comps_32bit[i], num_vec32_comps);
-
-      /* If we have 16 bits or less to load we need to adjust the u32 value so
-       * we can always extract the LSB.
-       */
-      if (num_bits <= 16) {
-         nir_def *shift =
-            nir_imul_imm(b, nir_iand_imm(b, offset, 3), 8);
-         vec32 = nir_ushr(b, vec32, shift);
-      }
-
-      /* And now comes the pack/unpack step to match the original type. */
-      unsigned dest_index = i * 32 / bit_size;
-      nir_def *temp_vec = nir_extract_bits(b, &vec32, 1, 0, num_dest_comps, bit_size);
-      for (unsigned comp = 0; comp < num_dest_comps; ++comp, ++dest_index)
-         comps[dest_index] = nir_channel(b, temp_vec, comp);
-   }
-
-   nir_def *result = nir_vec(b, comps, num_components);
-   nir_def_replace(&intr->def, result);
-
-   return true;
-}
-
-static void
-lower_masked_store_vec32(nir_builder *b, nir_def *offset, nir_def *index,
-                         nir_def *vec32, unsigned num_bits, nir_variable *var, unsigned alignment)
-{
-   nir_def *mask = nir_imm_int(b, (1 << num_bits) - 1);
-
-   /* If we have small alignments, we need to place them correctly in the u32 component. */
-   if (alignment <= 2) {
-      nir_def *shift =
-         nir_imul_imm(b, nir_iand_imm(b, offset, 3), 8);
-
-      vec32 = nir_ishl(b, vec32, shift);
-      mask = nir_ishl(b, mask, shift);
-   }
-
-   if (var->data.mode == nir_var_mem_shared) {
-      /* Use the dedicated masked intrinsic */
-      nir_deref_instr *deref = nir_build_deref_array(b, nir_build_deref_var(b, var), index);
-      nir_deref_atomic(b, 32, &deref->def, nir_inot(b, mask), .atomic_op = nir_atomic_op_iand);
-      nir_deref_atomic(b, 32, &deref->def, vec32, .atomic_op = nir_atomic_op_ior);
-   } else {
-      /* For scratch, since we don't need atomics, just generate the read-modify-write in NIR */
-      nir_def *load = nir_load_array_var(b, var, index);
-
-      nir_def *new_val = nir_ior(b, vec32,
-                                 nir_iand(b,
-                                          nir_inot(b, mask),
-                                          load));
-
-      nir_store_array_var(b, var, index, new_val, 1);
-   }
-}
-
-static bool
-lower_32b_offset_store(nir_builder *b, nir_intrinsic_instr *intr, nir_variable *var)
-{
-   unsigned num_components = nir_src_num_components(intr->src[0]);
-   unsigned bit_size = nir_src_bit_size(intr->src[0]);
-   unsigned num_bits = num_components * bit_size;
-
-   b->cursor = nir_before_instr(&intr->instr);
-
-   nir_def *offset = intr->src[1].ssa;
-   if (intr->intrinsic == nir_intrinsic_store_shared)
-      offset = nir_iadd_imm(b, offset, nir_intrinsic_base(intr));
-   else
-      offset = nir_u2u32(b, offset);
-   nir_def *comps[NIR_MAX_VEC_COMPONENTS];
-
-   unsigned comp_idx = 0;
-   for (unsigned i = 0; i < num_components; i++)
-      comps[i] = nir_channel(b, intr->src[0].ssa, i);
-
-   unsigned step = MAX2(bit_size, 32);
-   for (unsigned i = 0; i < num_bits; i += step) {
-      /* For each 4byte chunk (or smaller) we generate a 32bit scalar store.
-       */
-      unsigned substore_num_bits = MIN2(num_bits - i, step);
-      nir_def *local_offset = nir_iadd_imm(b, offset, i / 8);
-      nir_def *vec32 = load_comps_to_vec(b, bit_size, &comps[comp_idx],
-                                         substore_num_bits / bit_size, 32);
-      nir_def *index = nir_ushr_imm(b, local_offset, 2);
-
-      /* For anything less than 32bits we need to use the masked version of the
-       * intrinsic to preserve data living in the same 32bit slot. */
-      if (substore_num_bits < 32) {
-         lower_masked_store_vec32(b, local_offset, index, vec32, num_bits, var, nir_intrinsic_align(intr));
-      } else {
-         for (unsigned i = 0; i < vec32->num_components; ++i)
-            nir_store_array_var(b, var, nir_iadd_imm(b, index, i), nir_channel(b, vec32, i), 1);
-      }
-
-      comp_idx += substore_num_bits / bit_size;
-   }
-
-   nir_instr_remove(&intr->instr);
-
-   return true;
-}
-
 #define CONSTANT_LOCATION_UNVISITED 0
 #define CONSTANT_LOCATION_VALID 1
 #define CONSTANT_LOCATION_INVALID 2
@@ -654,83 +494,134 @@ dxil_nir_remove_oob_array_accesses(nir_shader *shader)
                                       NULL);
 }
 
-static bool
-lower_shared_atomic(nir_builder *b, nir_intrinsic_instr *intr, nir_variable *var)
+/*
+ * This pass operates only on 32-bit scalars, so this callback instructs
+ * nir_lower_mem_access_bit_sizes to turn all shared access into 32-bit
+ * scalars. We don't want to use 8-bit accesses, since the resulting
+ * pack/unpack code would be challenging to optimize on some drivers.
+ * Accesses larger than 32 bits, however, require nontrivial tracking to
+ * extract/insert components. Since nir_lower_mem_access_bit_sizes already
+ * has that code, we use it in this pass instead of NIH'ing it here.
+ */
+static nir_mem_access_size_align
+mem_access_cb(nir_intrinsic_op intrin, uint8_t bytes, uint8_t bit_size,
+              uint32_t align, uint32_t align_offset, bool offset_is_const,
+              enum gl_access_qualifier access, const void *cb_data)
 {
+   return (nir_mem_access_size_align){
+      .num_components = 1,
+      .bit_size = 32,
+      .align = 4,
+      .shift = nir_mem_access_shift_method_scalar,
+   };
+}
+
+/*
+ * Thanks to nir_lower_mem_access_bit_sizes, we can lower shared intrinsics 1:1
+ * to word-based array access.
+ */
+static bool
+lower_shared_to_var(nir_builder *b, nir_intrinsic_instr *intr, void *data)
+{
+   nir_variable *shared = data;
    b->cursor = nir_before_instr(&intr->instr);
 
-   nir_def *offset =
-      nir_iadd_imm(b, intr->src[0].ssa, nir_intrinsic_base(intr));
-   nir_def *index = nir_ushr_imm(b, offset, 2);
+   switch (intr->intrinsic) {
+   case nir_intrinsic_store_shared: {
+      nir_def *index = nir_udiv_aligned_4(b, intr->src[1].ssa);
+      nir_def *value = intr->src[0].ssa;
 
-   nir_deref_instr *deref = nir_build_deref_array(b, nir_build_deref_var(b, var), index);
-   nir_def *result;
-   if (intr->intrinsic == nir_intrinsic_shared_atomic_swap)
-      result = nir_deref_atomic_swap(b, 32, &deref->def, intr->src[1].ssa, intr->src[2].ssa,
-                                     .atomic_op = nir_intrinsic_atomic_op(intr));
-   else
-      result = nir_deref_atomic(b, 32, &deref->def, intr->src[1].ssa,
-                                .atomic_op = nir_intrinsic_atomic_op(intr));
+      index = nir_u2uN(b, index, nir_get_ptr_bitsize(b->shader));
+      nir_store_array_var(b, shared, index, value, nir_component_mask(1));
+      break;
+   }
+   case nir_intrinsic_load_shared: {
+      nir_def *index = nir_udiv_aligned_4(b, intr->src[0].ssa);
+
+      index = nir_u2uN(b, index, nir_get_ptr_bitsize(b->shader));
+      nir_def_rewrite_uses(&intr->def, nir_load_array_var(b, shared, index));
+      break;
+   }
+   case nir_intrinsic_shared_atomic:
+   case nir_intrinsic_shared_atomic_swap: {
+      nir_def *index = nir_udiv_aligned_4(b, intr->src[0].ssa);
+
+      index = nir_u2uN(b, index, nir_get_ptr_bitsize(b->shader));
+      nir_deref_instr *deref = nir_build_deref_array(b, nir_build_deref_var(b, shared), index);
+
+      if (intr->intrinsic == nir_intrinsic_shared_atomic_swap)
+         nir_def_rewrite_uses(&intr->def, nir_deref_atomic_swap(b, 32, &deref->def, intr->src[1].ssa, intr->src[2].ssa,
+                                                                .atomic_op = nir_intrinsic_atomic_op(intr)));
+      else
+         nir_def_rewrite_uses(&intr->def, nir_deref_atomic(b, 32, &deref->def, intr->src[1].ssa,
+                                                           .atomic_op = nir_intrinsic_atomic_op(intr)));
+      break;
+   }
+   default:
+      return false;
+   }
+
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+static bool
+dxil_nir_lower_shared_to_var(nir_shader *nir)
+{
+   unsigned words = DIV_ROUND_UP(nir->info.shared_size, 4);
+
+   /* Early exit in the common case that shared memory is not used. */
+   if (words == 0) {
+      return false;
+   }
+
+   /* First, lower bit sizes and vectors as required by lower_shared_to_var */
+   nir_lower_mem_access_bit_sizes_options lower_mem_access_options = {
+      .modes = nir_var_mem_shared,
+      .callback = mem_access_cb,
+      .may_lower_unaligned_stores_to_atomics = true,
+   };
+   NIR_PASS(_, nir, nir_lower_mem_access_bit_sizes, &lower_mem_access_options);
+
+   /* Then, back shared memory with an array of words and turn all shared
+    * access into array access. */
+   const glsl_type *type_ = glsl_array_type(glsl_uint_type(), words, 1);
+   nir_variable *var = nir_variable_create(nir, nir_var_mem_shared, type_, "shared");
+
+   nir_foreach_function_impl(impl, nir) {
+      nir_function_intrinsics_pass(impl, lower_shared_to_var, nir_metadata_control_flow, var);
+   }
+
+   /* After lowering, we've eliminated all shared memory in the shader.
*/ + nir->info.shared_size = 0; + + /* Now clean up the mess we made */ + bool progress; + do { + progress = false; + NIR_PASS(progress, nir, nir_lower_vars_to_ssa); + NIR_PASS(progress, nir, nir_opt_constant_folding); + NIR_PASS(progress, nir, nir_opt_algebraic); + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_cse); + NIR_PASS(progress, nir, nir_opt_dce); + } while (progress); - nir_def_replace(&intr->def, result); return true; } bool -dxil_nir_lower_loads_stores_to_dxil(nir_shader *nir, - const struct dxil_nir_lower_loads_stores_options *options) +dxil_nir_scratch_and_shared_to_dxil(nir_shader *nir) { bool progress = nir_remove_dead_variables(nir, nir_var_function_temp | nir_var_mem_shared, NULL); - nir_variable *shared_var = NULL; - if (nir->info.shared_size) { - shared_var = nir_variable_create(nir, nir_var_mem_shared, - glsl_array_type(glsl_uint_type(), DIV_ROUND_UP(nir->info.shared_size, 4), 4), - "lowered_shared_mem"); - } unsigned ptr_size = nir->info.cs.ptr_size; if (nir->info.stage == MESA_SHADER_KERNEL) { /* All the derefs created here will be used as GEP indices so force 32-bit */ nir->info.cs.ptr_size = 32; } - nir_foreach_function_impl(impl, nir) { - nir_builder b = nir_builder_create(impl); - - nir_variable *scratch_var = NULL; - if (nir->scratch_size) { - const struct glsl_type *scratch_type = glsl_array_type(glsl_uint_type(), DIV_ROUND_UP(nir->scratch_size, 4), 4); - scratch_var = nir_local_variable_create(impl, scratch_type, "lowered_scratch_mem"); - } - - nir_foreach_block(block, impl) { - nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - - switch (intr->intrinsic) { - case nir_intrinsic_load_shared: - progress |= lower_32b_offset_load(&b, intr, shared_var); - break; - case nir_intrinsic_load_scratch: - progress |= lower_32b_offset_load(&b, intr, scratch_var); - break; - case nir_intrinsic_store_shared: - progress |= lower_32b_offset_store(&b, intr, shared_var); - break; - case nir_intrinsic_store_scratch: - progress |= lower_32b_offset_store(&b, intr, scratch_var); - break; - case nir_intrinsic_shared_atomic: - case nir_intrinsic_shared_atomic_swap: - progress |= lower_shared_atomic(&b, intr, shared_var); - break; - default: - break; - } - } - } - } + progress |= nir_lower_scratch_to_var(nir); + progress |= dxil_nir_lower_shared_to_var(nir); if (nir->info.stage == MESA_SHADER_KERNEL) { nir->info.cs.ptr_size = ptr_size; } diff --git a/src/microsoft/compiler/dxil_nir.h b/src/microsoft/compiler/dxil_nir.h index 0a64633df33..267aa88f19d 100644 --- a/src/microsoft/compiler/dxil_nir.h +++ b/src/microsoft/compiler/dxil_nir.h @@ -41,11 +41,7 @@ bool dxil_nir_flatten_var_arrays(nir_shader *shader, nir_variable_mode modes); bool dxil_nir_lower_var_bit_size(nir_shader *shader, nir_variable_mode modes, unsigned min_bit_size, unsigned max_bit_size); bool dxil_nir_remove_oob_array_accesses(nir_shader *shader); -struct dxil_nir_lower_loads_stores_options { - bool use_16bit_ssbo; -}; -bool dxil_nir_lower_loads_stores_to_dxil(nir_shader *shader, - const struct dxil_nir_lower_loads_stores_options *options); +bool dxil_nir_scratch_and_shared_to_dxil(nir_shader *shader); bool dxil_nir_lower_deref_ssbo(nir_shader *shader); bool dxil_nir_opt_alu_deref_srcs(nir_shader *shader); bool dxil_nir_lower_upcast_phis(nir_shader *shader, unsigned min_bit_size); diff --git a/src/microsoft/spirv_to_dxil/dxil_spirv_nir.c b/src/microsoft/spirv_to_dxil/dxil_spirv_nir.c 
index b749f050c6b..66a996e9f64 100644 --- a/src/microsoft/spirv_to_dxil/dxil_spirv_nir.c +++ b/src/microsoft/spirv_to_dxil/dxil_spirv_nir.c @@ -1016,6 +1016,7 @@ dxil_spirv_nir_passes(nir_shader *nir, shared_var_info); NIR_PASS(_, nir, dxil_nir_split_unaligned_loads_stores, nir_var_mem_shared); NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_shared, nir_address_format_32bit_offset); + NIR_PASS(_, nir, dxil_nir_scratch_and_shared_to_dxil); } else { NIR_PASS(_, nir, nir_split_struct_vars, nir_var_mem_shared); NIR_PASS(_, nir, dxil_nir_flatten_var_arrays, nir_var_mem_shared); @@ -1100,10 +1101,6 @@ dxil_spirv_nir_passes(nir_shader *nir, NIR_PASS(_, nir, nir_lower_tex, &lower_tex_options); NIR_PASS(_, nir, dxil_nir_split_clip_cull_distance); - const struct dxil_nir_lower_loads_stores_options loads_stores_options = { - .use_16bit_ssbo = conf->shader_model_max >= SHADER_MODEL_6_2, - }; - NIR_PASS(_, nir, dxil_nir_lower_loads_stores_to_dxil, &loads_stores_options); NIR_PASS(_, nir, dxil_nir_split_typed_samplers); NIR_PASS(_, nir, dxil_nir_lower_ubo_array_one_to_static); NIR_PASS(_, nir, nir_opt_dce);
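
Reviewer note (not part of the patch): the masked read-modify-write that the deleted lower_masked_store_vec32 open-coded is now delegated to nir_lower_mem_access_bit_sizes, which mem_access_cb steers toward single 32-bit words. For readers unfamiliar with that lowering, below is a minimal standalone C sketch of the underlying word-masking technique; shared_words and store_u16 are hypothetical names for illustration only, not Mesa API.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Shared memory viewed as an array of 32-bit words, mirroring the
 * uint array variable the pass creates. */
static uint32_t shared_words[4];

/* Store a 16-bit value at a byte offset by masking it into the
 * containing word, preserving the neighboring bytes; this is the
 * read-modify-write shape the bit-size lowering produces for
 * sub-word stores. */
static void store_u16(uint32_t byte_offset, uint16_t value)
{
   uint32_t word  = byte_offset / 4;        /* array index */
   uint32_t shift = (byte_offset % 4) * 8;  /* bit position in the word */
   uint32_t mask  = (uint32_t)0xffff << shift;

   shared_words[word] = (shared_words[word] & ~mask) |
                        ((uint32_t)value << shift);
}

int main(void)
{
   shared_words[1] = 0xaaaaaaaa;
   store_u16(6, 0x1234);                        /* upper half of word 1 */
   printf("%08" PRIx32 "\n", shared_words[1]);  /* prints 1234aaaa */
   return 0;
}

Atomicity aside (for shared memory the real lowering can emit iand/ior deref atomics, per may_lower_unaligned_stores_to_atomics), this is the extract/insert bookkeeping the comment above mem_access_cb refers to.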