diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index 9591a1a497f..1002901124a 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -1259,8 +1259,6 @@ intrinsic("copy_ubo_to_uniform_ir3", [1, 1], indices=[BASE, RANGE])
 # DXIL specific intrinsics
 # src[] = { value, mask, index, offset }.
 intrinsic("store_ssbo_masked_dxil", [1, 1, 1, 1])
-# src[] = { index, 16-byte-based-offset }
-load("ubo_dxil", [1, 1], [], [CAN_ELIMINATE, CAN_REORDER])
 
 # Intrinsics used by the Midgard/Bifrost blend pipeline. These are defined
 # within a blend shader to read/write the raw value from the tile buffer,
diff --git a/src/microsoft/clc/clc_compiler.c b/src/microsoft/clc/clc_compiler.c
index fc73b2e730b..bcd8145eb47 100644
--- a/src/microsoft/clc/clc_compiler.c
+++ b/src/microsoft/clc/clc_compiler.c
@@ -909,7 +909,7 @@ clc_spirv_to_dxil(struct clc_libclc *lib,
 
    NIR_PASS_V(nir, dxil_nir_lower_deref_ssbo);
 
-   NIR_PASS_V(nir, dxil_nir_split_unaligned_loads_stores, nir_var_all & ~nir_var_mem_constant);
+   NIR_PASS_V(nir, dxil_nir_split_unaligned_loads_stores, nir_var_all & ~(nir_var_mem_constant | nir_var_mem_ubo));
 
    assert(nir->info.cs.ptr_size == 64);
    NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ssbo,
@@ -969,7 +969,6 @@ clc_spirv_to_dxil(struct clc_libclc *lib,
    }
 
    NIR_PASS_V(nir, clc_nir_lower_kernel_input_loads, inputs_var);
-   NIR_PASS_V(nir, dxil_nir_split_unaligned_loads_stores, nir_var_mem_ubo);
    NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ubo,
               nir_address_format_32bit_index_offset);
    NIR_PASS_V(nir, clc_nir_lower_system_values, work_properties_var);
diff --git a/src/microsoft/compiler/dxil_nir.c b/src/microsoft/compiler/dxil_nir.c
index 454b80aa0d5..2554d3e5f85 100644
--- a/src/microsoft/compiler/dxil_nir.c
+++ b/src/microsoft/compiler/dxil_nir.c
@@ -67,89 +67,6 @@ load_comps_to_vec(nir_builder *b, unsigned src_bit_size,
    return nir_vec(b, dst_comps, num_dst_comps);
 }
 
-static nir_ssa_def *
-ubo_load_select_32b_comps(nir_builder *b, nir_ssa_def *vec32,
-                          nir_ssa_def *offset, unsigned alignment)
-{
-   assert(alignment >= 16 || alignment == 8 ||
-          alignment == 4 || alignment == 2 ||
-          alignment == 1);
-   assert(vec32->num_components == 4);
-
-   if (alignment > 8)
-      return vec32;
-
-   nir_ssa_def *comps[4];
-   nir_ssa_def *cond;
-
-   for (unsigned i = 0; i < 4; i++)
-      comps[i] = nir_channel(b, vec32, i);
-
-   /* If we have 8bytes alignment or less, select which half the vec4 should
-    * be used.
-    */
-   cond = nir_ine(b, nir_iand(b, offset, nir_imm_int(b, 0x8)),
-                  nir_imm_int(b, 0));
-
-   comps[0] = nir_bcsel(b, cond, comps[2], comps[0]);
-   comps[1] = nir_bcsel(b, cond, comps[3], comps[1]);
-
-   if (alignment == 8)
-      return nir_vec(b, comps, 2);
-
-   /* 4 byte align or less needed, select which of the 32bit component should be
-    * used and return it. The sub-32bit split is handled in nir_extract_bits().
-    */
-   cond = nir_ine(b, nir_iand(b, offset, nir_imm_int(b, 0x4)),
-                  nir_imm_int(b, 0));
-   return nir_bcsel(b, cond, comps[1], comps[0]);
-}
-
-nir_ssa_def *
-build_load_ubo_dxil(nir_builder *b, nir_ssa_def *buffer,
-                    nir_ssa_def *offset, unsigned num_components,
-                    unsigned bit_size, unsigned alignment)
-{
-   nir_ssa_def *idx = nir_ushr(b, offset, nir_imm_int(b, 4));
-   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS];
-   unsigned num_bits = num_components * bit_size;
-   unsigned comp_idx = 0;
-
-   /* We need to split loads in 16byte chunks because that's the
-    * granularity of cBufferLoadLegacy().
-    */
-   for (unsigned i = 0; i < num_bits; i += (16 * 8)) {
-      /* For each 16byte chunk (or smaller) we generate a 32bit ubo vec
-       * load.
-       */
-      unsigned subload_num_bits = MIN2(num_bits - i, 16 * 8);
-      nir_ssa_def *vec32 =
-         nir_load_ubo_dxil(b, 4, 32, buffer, nir_iadd(b, idx, nir_imm_int(b, i / (16 * 8))));
-
-      /* First re-arrange the vec32 to account for intra 16-byte offset. */
-      assert(subload_num_bits / 8 <= alignment);
-      vec32 = ubo_load_select_32b_comps(b, vec32, offset, alignment);
-
-      /* If we have 2 bytes or less to load we need to adjust the u32 value so
-       * we can always extract the LSB.
-       */
-      if (alignment <= 2) {
-         nir_ssa_def *shift = nir_imul(b, nir_iand(b, offset,
-                                                   nir_imm_int(b, 3)),
-                                       nir_imm_int(b, 8));
-         vec32 = nir_ushr(b, vec32, shift);
-      }
-
-      /* And now comes the pack/unpack step to match the original type. */
-      nir_ssa_def *temp_vec = nir_extract_bits(b, &vec32, 1, 0, subload_num_bits / bit_size, bit_size);
-      for (unsigned comp = 0; comp < subload_num_bits / bit_size; ++comp, ++comp_idx)
-         comps[comp_idx] = nir_channel(b, temp_vec, comp);
-   }
-
-   assert(comp_idx == num_components);
-   return nir_vec(b, comps, num_components);
-}
-
 static bool
 lower_load_ssbo(nir_builder *b, nir_intrinsic_instr *intr, unsigned min_bit_size)
 {
@@ -888,26 +805,6 @@ dxil_nir_lower_var_bit_size(nir_shader *shader, nir_variable_mode modes,
    return true;
 }
 
-static bool
-lower_load_ubo(nir_builder *b, nir_intrinsic_instr *intr)
-{
-   assert(intr->dest.is_ssa);
-   assert(intr->src[0].is_ssa);
-   assert(intr->src[1].is_ssa);
-
-   b->cursor = nir_before_instr(&intr->instr);
-
-   nir_ssa_def *result =
-      build_load_ubo_dxil(b, intr->src[0].ssa, intr->src[1].ssa,
-                          nir_dest_num_components(intr->dest),
-                          nir_dest_bit_size(intr->dest),
-                          nir_intrinsic_align(intr));
-
-   nir_ssa_def_rewrite_uses(&intr->dest.ssa, result);
-   nir_instr_remove(&intr->instr);
-   return true;
-}
-
 static bool
 lower_shared_atomic(nir_builder *b, nir_intrinsic_instr *intr, nir_variable *var)
 {
@@ -978,9 +875,6 @@ dxil_nir_lower_loads_stores_to_dxil(nir_shader *nir,
            case nir_intrinsic_load_ssbo:
               progress |= lower_load_ssbo(&b, intr, options->use_16bit_ssbo ? 16 : 32);
               break;
-           case nir_intrinsic_load_ubo:
-              progress |= lower_load_ubo(&b, intr);
-              break;
            case nir_intrinsic_store_shared:
               progress |= lower_32b_offset_store(&b, intr, shared_var);
              break;
diff --git a/src/microsoft/compiler/dxil_nir.h b/src/microsoft/compiler/dxil_nir.h
index 03651db0bc0..e9c2a18af58 100644
--- a/src/microsoft/compiler/dxil_nir.h
+++ b/src/microsoft/compiler/dxil_nir.h
@@ -58,11 +58,6 @@ bool dxil_nir_split_typed_samplers(nir_shader *shader);
 bool dxil_nir_lower_sysval_to_load_input(nir_shader *s, nir_variable **sysval_vars);
 bool dxil_nir_lower_vs_vertex_conversion(nir_shader *s, enum pipe_format target_formats[]);
 
-nir_ssa_def *
-build_load_ubo_dxil(nir_builder *b, nir_ssa_def *buffer,
-                    nir_ssa_def *offset, unsigned num_components,
-                    unsigned bit_size, unsigned alignment);
-
 uint64_t
 dxil_sort_by_driver_location(nir_shader* s, nir_variable_mode modes);
 
diff --git a/src/microsoft/compiler/nir_to_dxil.c b/src/microsoft/compiler/nir_to_dxil.c
index d585d29122b..9f9b21b2059 100644
--- a/src/microsoft/compiler/nir_to_dxil.c
+++ b/src/microsoft/compiler/nir_to_dxil.c
@@ -4843,7 +4843,6 @@ emit_intrinsic(struct ntd_context *ctx, nir_intrinsic_instr *intr)
       return emit_atomic_deref(ctx, intr);
    case nir_intrinsic_deref_atomic_swap:
       return emit_atomic_deref_swap(ctx, intr);
-   case nir_intrinsic_load_ubo_dxil:
    case nir_intrinsic_load_ubo_vec4:
       return emit_load_ubo_vec4(ctx, intr);
    case nir_intrinsic_load_primitive_id:
@@ -6188,6 +6187,48 @@ lower_bit_size_callback(const nir_instr* instr, void *data)
    return ret;
 }
 
+static bool
+vectorize_filter(
+   unsigned align_mul,
+   unsigned align_offset,
+   unsigned bit_size,
+   unsigned num_components,
+   nir_intrinsic_instr *low, nir_intrinsic_instr *high,
+   void *data)
+{
+   return util_is_power_of_two_nonzero(num_components);
+}
+
+struct lower_mem_bit_sizes_data {
+   const nir_shader_compiler_options *nir_options;
+   const struct nir_to_dxil_options *dxil_options;
+};
+
+static nir_mem_access_size_align
+lower_mem_access_bit_sizes_cb(nir_intrinsic_op intrin,
+                              uint8_t bytes,
+                              uint8_t bit_size_in,
+                              uint32_t align_mul,
+                              uint32_t align_offset,
+                              bool offset_is_const,
+                              const void *cb_data)
+{
+   const struct lower_mem_bit_sizes_data *data = cb_data;
+   unsigned max_bit_size = data->nir_options->lower_int64_options ? 32 : 64;
+   unsigned min_bit_size = data->dxil_options->lower_int16 ? 32 : 16;
+   unsigned closest_bit_size = MAX2(min_bit_size, MIN2(max_bit_size, bit_size_in));
+   /* UBO loads can be done at whatever (supported) bit size, but require 16-byte
+    * alignment and can load up to 16 bytes per instruction. However, this pass requires
+    * loading 16 bytes of data to get 16-byte alignment. We're going to run lower_ubo_vec4,
+    * which can deal with unaligned vec4s, so for this pass let's just deal with bit size
+    * and total size restrictions. */
+   return (nir_mem_access_size_align) {
+      .align = closest_bit_size / 8,
+      .bit_size = closest_bit_size,
+      .num_components = DIV_ROUND_UP(MIN2(bytes, 16) * 8, closest_bit_size),
+   };
+}
+
 static void
 optimize_nir(struct nir_shader *s, const struct nir_to_dxil_options *opts)
 {
@@ -6222,6 +6263,7 @@ optimize_nir(struct nir_shader *s, const struct nir_to_dxil_options *opts)
       NIR_PASS(progress, s, nir_lower_64bit_phis);
       NIR_PASS(progress, s, nir_lower_phis_to_scalar, true);
       NIR_PASS(progress, s, nir_opt_loop_unroll);
+      NIR_PASS(progress, s, nir_lower_pack);
       NIR_PASS_V(s, nir_lower_system_values);
    } while (progress);
 
@@ -6488,10 +6530,34 @@ nir_to_dxil(struct nir_shader *s, const struct nir_to_dxil_options *opts,
    NIR_PASS_V(s, nir_lower_flrp, 16 | 32 | 64, true);
    NIR_PASS_V(s, nir_lower_io, nir_var_shader_in | nir_var_shader_out, type_size_vec4, nir_lower_io_lower_64bit_to_32);
    NIR_PASS_V(s, dxil_nir_ensure_position_writes);
-   NIR_PASS_V(s, nir_lower_pack);
    NIR_PASS_V(s, dxil_nir_lower_system_values);
    NIR_PASS_V(s, nir_lower_io_to_scalar, nir_var_shader_in | nir_var_system_value | nir_var_shader_out);
 
+   /* Do a round of optimization to try to vectorize loads/stores. Otherwise the addresses used for loads
+    * might be too opaque for the pass to see that they're next to each other. */
+   optimize_nir(s, opts);
+
+   /* Vectorize UBO accesses aggressively. This can help increase alignment to enable us to do better
+    * chunking of loads and stores after lowering bit sizes. Ignore load/store size limitations here; we'll
+    * address them with lower_mem_access_bit_sizes. */
+   nir_load_store_vectorize_options vectorize_opts = {
+      .callback = vectorize_filter,
+      .modes = nir_var_mem_ubo,
+   };
+   NIR_PASS_V(s, nir_opt_load_store_vectorize, &vectorize_opts);
+
+   /* Now that they're bloated to the max, address bit size restrictions and overall size limitations for
+    * a single load/store op. */
+   struct lower_mem_bit_sizes_data mem_size_data = { s->options, opts };
+   nir_lower_mem_access_bit_sizes_options mem_size_options = {
+      .modes = nir_var_mem_ubo,
+      .callback = lower_mem_access_bit_sizes_cb,
+      .cb_data = &mem_size_data
+   };
+   NIR_PASS_V(s, nir_lower_mem_access_bit_sizes, &mem_size_options);
+
+   /* Lastly, convert byte-addressed UBO loads to vec4-addressed loads. This pass can also deal with
+    * selecting sub-components from the load and with loads that straddle a vec4 boundary. */
    NIR_PASS_V(s, nir_lower_ubo_vec4);
 
    if (opts->shader_model_max < SHADER_MODEL_6_6) {
diff --git a/src/microsoft/spirv_to_dxil/dxil_spirv_nir.c b/src/microsoft/spirv_to_dxil/dxil_spirv_nir.c
index 469ed612d72..556c4ed967f 100644
--- a/src/microsoft/spirv_to_dxil/dxil_spirv_nir.c
+++ b/src/microsoft/spirv_to_dxil/dxil_spirv_nir.c
@@ -1048,9 +1048,7 @@ dxil_spirv_nir_passes(nir_shader *nir,
    }
 
    NIR_PASS_V(nir, nir_opt_deref);
-   NIR_PASS_V(nir, dxil_nir_split_unaligned_loads_stores,
-              nir_var_mem_ubo | nir_var_mem_push_const |
-              nir_var_mem_ssbo);
+   NIR_PASS_V(nir, dxil_nir_split_unaligned_loads_stores, nir_var_mem_ssbo);
 
    if (conf->inferred_read_only_images_as_srvs) {
       const nir_opt_access_options opt_access_options = {
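
Illustrative note, not part of the patch: the sketch below reproduces the sizing arithmetic of lower_mem_access_bit_sizes_cb as a standalone C program so the clamping behaviour can be checked in isolation. The show() helper and the locally redefined MAX2/MIN2/DIV_ROUND_UP macros are assumptions made only for this example; the arithmetic itself mirrors the callback added above.

/* Standalone illustration of the clamping done by lower_mem_access_bit_sizes_cb:
 * the access bit size is clamped to the range the backend supports, and a single
 * access is capped at 16 bytes (one cbuffer row). Helper macros are redefined
 * here so the example compiles on its own. */
#include <stdio.h>

#define MAX2(a, b)         ((a) > (b) ? (a) : (b))
#define MIN2(a, b)         ((a) < (b) ? (a) : (b))
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

static void
show(unsigned bytes, unsigned bit_size_in, unsigned min_bit_size, unsigned max_bit_size)
{
   unsigned closest_bit_size = MAX2(min_bit_size, MIN2(max_bit_size, bit_size_in));
   unsigned num_components = DIV_ROUND_UP(MIN2(bytes, 16) * 8, closest_bit_size);

   printf("%u bytes @ %u bits  ->  %u x %u-bit (align %u)\n",
          bytes, bit_size_in, num_components, closest_bit_size,
          closest_bit_size / 8);
}

int
main(void)
{
   /* 64-bit integers are lowered (max 32-bit), so a 24-byte 64-bit load is
    * re-issued as 32-bit components and capped at 16 bytes: 4 x 32-bit. */
   show(24, 64, 16, 32);

   /* A 2-byte 16-bit load on a backend without native 16-bit support
    * (min 32-bit) becomes a single 32-bit component. */
   show(2, 16, 32, 64);
   return 0;
}

In the first case the access is larger than what the callback allows for one instruction, and nir_lower_mem_access_bit_sizes is expected to cover the remaining bytes with additional loads before nir_lower_ubo_vec4 turns everything into vec4-addressed loads.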