diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index 1002901124a..c5b5193ed7c 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -1256,10 +1256,6 @@ store("uniform_ir3", [], indices=[BASE]) # vec4's. intrinsic("copy_ubo_to_uniform_ir3", [1, 1], indices=[BASE, RANGE]) -# DXIL specific intrinsics -# src[] = { value, mask, index, offset }. -intrinsic("store_ssbo_masked_dxil", [1, 1, 1, 1]) - # Intrinsics used by the Midgard/Bifrost blend pipeline. These are defined # within a blend shader to read/write the raw value from the tile buffer, # without applying any format conversion in the process. If the shader needs diff --git a/src/microsoft/clc/clc_compiler.c b/src/microsoft/clc/clc_compiler.c index bcd8145eb47..a41718ebfd3 100644 --- a/src/microsoft/clc/clc_compiler.c +++ b/src/microsoft/clc/clc_compiler.c @@ -909,7 +909,7 @@ clc_spirv_to_dxil(struct clc_libclc *lib, NIR_PASS_V(nir, dxil_nir_lower_deref_ssbo); - NIR_PASS_V(nir, dxil_nir_split_unaligned_loads_stores, nir_var_all & ~(nir_var_mem_constant | nir_var_mem_ubo)); + NIR_PASS_V(nir, dxil_nir_split_unaligned_loads_stores, nir_var_mem_shared | nir_var_function_temp); assert(nir->info.cs.ptr_size == 64); NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ssbo, diff --git a/src/microsoft/compiler/dxil_nir.c b/src/microsoft/compiler/dxil_nir.c index 2554d3e5f85..938848406a6 100644 --- a/src/microsoft/compiler/dxil_nir.c +++ b/src/microsoft/compiler/dxil_nir.c @@ -67,188 +67,6 @@ load_comps_to_vec(nir_builder *b, unsigned src_bit_size, return nir_vec(b, dst_comps, num_dst_comps); } -static bool -lower_load_ssbo(nir_builder *b, nir_intrinsic_instr *intr, unsigned min_bit_size) -{ - assert(intr->dest.is_ssa); - assert(intr->src[0].is_ssa); - assert(intr->src[1].is_ssa); - - b->cursor = nir_before_instr(&intr->instr); - - unsigned src_bit_size = nir_dest_bit_size(intr->dest); - unsigned store_bit_size = CLAMP(src_bit_size, min_bit_size, 32); - unsigned offset_mask = store_bit_size / 8 - 1; - - nir_ssa_def *buffer = intr->src[0].ssa; - nir_ssa_def *offset = nir_iand(b, intr->src[1].ssa, nir_imm_int(b, ~offset_mask)); - enum gl_access_qualifier access = nir_intrinsic_access(intr); - unsigned num_components = nir_dest_num_components(intr->dest); - unsigned num_bits = num_components * src_bit_size; - - nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS]; - unsigned comp_idx = 0; - - /* We need to split loads in 4-component chunks because that's the optimal - * granularity of bufferLoad(). Minimum alignment is 2-byte. - */ - for (unsigned i = 0; i < num_bits; i += 4 * store_bit_size) { - /* For each 4-component chunk (or smaller) we generate a N-bit ssbo vec load. */ - unsigned subload_num_bits = MIN2(num_bits - i, 4 * store_bit_size); - - /* The number of components to store depends on the number of bytes. */ - nir_ssa_def *result = - nir_load_ssbo(b, DIV_ROUND_UP(subload_num_bits, store_bit_size), store_bit_size, - buffer, nir_iadd(b, offset, nir_imm_int(b, i / 8)), - .align_mul = store_bit_size / 8, - .align_offset = 0, - .access = access); - - /* If we have an unaligned load we need to adjust the result value so - * we can always extract the LSB. - */ - if (nir_intrinsic_align(intr) < store_bit_size / 8) { - nir_ssa_def *shift = nir_imul(b, nir_iand(b, intr->src[1].ssa, nir_imm_int(b, offset_mask)), - nir_imm_int(b, 8)); - result = nir_ushr(b, result, shift); - } - - /* And now comes the pack/unpack step to match the original type. 
*/ - nir_ssa_def *temp_vec = nir_extract_bits(b, &result, 1, 0, subload_num_bits / src_bit_size, src_bit_size); - for (unsigned comp = 0; comp < subload_num_bits / src_bit_size; ++comp, ++comp_idx) - comps[comp_idx] = nir_channel(b, temp_vec, comp); - } - - assert(comp_idx == num_components); - nir_ssa_def *result = nir_vec(b, comps, num_components); - nir_ssa_def_rewrite_uses(&intr->dest.ssa, result); - nir_instr_remove(&intr->instr); - return true; -} - -static bool -lower_store_ssbo(nir_builder *b, nir_intrinsic_instr *intr, unsigned min_bit_size) -{ - b->cursor = nir_before_instr(&intr->instr); - - assert(intr->src[0].is_ssa); - assert(intr->src[1].is_ssa); - assert(intr->src[2].is_ssa); - - nir_ssa_def *val = intr->src[0].ssa; - nir_ssa_def *buffer = intr->src[1].ssa; - - unsigned src_bit_size = val->bit_size; - unsigned store_bit_size = CLAMP(src_bit_size, min_bit_size, 32); - unsigned masked_store_bit_size = 32; - unsigned num_components = val->num_components; - unsigned num_bits = num_components * src_bit_size; - - unsigned offset_mask = store_bit_size / 8 - 1; - unsigned masked_store_offset_mask = masked_store_bit_size / 8 - 1; - nir_ssa_def *offset = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, ~offset_mask)); - nir_ssa_def *masked_offset = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, ~masked_store_offset_mask)); - - nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS] = { 0 }; - unsigned comp_idx = 0; - - unsigned write_mask = nir_intrinsic_write_mask(intr); - for (unsigned i = 0; i < num_components; i++) - if (write_mask & (1 << i)) - comps[i] = nir_channel(b, val, i); - - /* We split stores in 4-component chunks because that's the optimal granularity - * of bufferStore(). Minimum alignment is 2-byte. */ - unsigned bit_offset = 0; - while (true) { - /* Skip over holes in the write mask */ - while (comp_idx < num_components && comps[comp_idx] == NULL) { - comp_idx++; - bit_offset += src_bit_size; - } - if (comp_idx >= num_components) - break; - - /* For each 4-component chunk (or smaller) we generate a ssbo vec - * store. If a component is skipped by the write mask, do a smaller - * sub-store - */ - unsigned num_src_comps_stored = 0, substore_num_bits = 0; - while(num_src_comps_stored + comp_idx < num_components && - substore_num_bits + bit_offset < num_bits && - substore_num_bits < 4 * store_bit_size && - comps[comp_idx + num_src_comps_stored]) { - ++num_src_comps_stored; - substore_num_bits += src_bit_size; - } - bool force_masked = false; - if (substore_num_bits > store_bit_size && - substore_num_bits % store_bit_size != 0) { - /* Split this into two, one unmasked store of the first bits, - * and then the second loop iteration will handle a masked store - * for the rest. */ - assert(num_src_comps_stored == 3); - if (store_bit_size == 16) { - assert(substore_num_bits < 32); - /* If we're already doing atomics to store, just do one - * 32bit masked store instead of a 16bit store and a masked - * store for the other 8 bits. */ - force_masked = true; - } else { - --num_src_comps_stored; - substore_num_bits = store_bit_size; - } - } - nir_intrinsic_instr *store; - - if (substore_num_bits < store_bit_size || force_masked) { - nir_ssa_def *store_vec = load_comps_to_vec(b, src_bit_size, &comps[comp_idx], - num_src_comps_stored, masked_store_bit_size); - nir_ssa_def *mask = nir_imm_intN_t(b, (1 << substore_num_bits) - 1, masked_store_bit_size); - - /* If we have small alignments we need to place them correctly in the component. 
*/ - if (nir_intrinsic_align(intr) <= masked_store_bit_size / 8) { - nir_ssa_def *pos = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, masked_store_offset_mask)); - nir_ssa_def *shift = nir_imul_imm(b, pos, 8); - - store_vec = nir_ishl(b, store_vec, shift); - mask = nir_ishl(b, mask, shift); - } - - nir_ssa_def *local_offset = nir_iadd(b, masked_offset, nir_imm_int(b, bit_offset / 8)); - store = nir_intrinsic_instr_create(b->shader, - nir_intrinsic_store_ssbo_masked_dxil); - store->src[0] = nir_src_for_ssa(store_vec); - store->src[1] = nir_src_for_ssa(nir_inot(b, mask)); - store->src[2] = nir_src_for_ssa(buffer); - store->src[3] = nir_src_for_ssa(local_offset); - } else { - nir_ssa_def *local_offset = nir_iadd(b, offset, nir_imm_int(b, bit_offset / 8)); - nir_ssa_def *store_vec = load_comps_to_vec(b, src_bit_size, &comps[comp_idx], - num_src_comps_stored, store_bit_size); - store = nir_intrinsic_instr_create(b->shader, - nir_intrinsic_store_ssbo); - store->src[0] = nir_src_for_ssa(store_vec); - store->src[1] = nir_src_for_ssa(buffer); - store->src[2] = nir_src_for_ssa(local_offset); - - nir_intrinsic_set_align(store, store_bit_size / 8, 0); - } - - /* The number of components to store depends on the number of bits. */ - store->num_components = DIV_ROUND_UP(substore_num_bits, store_bit_size); - nir_builder_instr_insert(b, &store->instr); - comp_idx += num_src_comps_stored; - bit_offset += substore_num_bits; - - if (nir_intrinsic_has_write_mask(store)) - nir_intrinsic_set_write_mask(store, (1 << store->num_components) - 1); - } - - nir_instr_remove(&intr->instr); - return true; -} - static bool lower_32b_offset_load(nir_builder *b, nir_intrinsic_instr *intr, nir_variable *var) { @@ -872,18 +690,12 @@ dxil_nir_lower_loads_stores_to_dxil(nir_shader *nir, case nir_intrinsic_load_scratch: progress |= lower_32b_offset_load(&b, intr, scratch_var); break; - case nir_intrinsic_load_ssbo: - progress |= lower_load_ssbo(&b, intr, options->use_16bit_ssbo ? 16 : 32); - break; case nir_intrinsic_store_shared: progress |= lower_32b_offset_store(&b, intr, shared_var); break; case nir_intrinsic_store_scratch: progress |= lower_32b_offset_store(&b, intr, scratch_var); break; - case nir_intrinsic_store_ssbo: - progress |= lower_store_ssbo(&b, intr, options->use_16bit_ssbo ? 
16 : 32); - break; case nir_intrinsic_shared_atomic: case nir_intrinsic_shared_atomic_swap: progress |= lower_shared_atomic(&b, intr, shared_var); diff --git a/src/microsoft/compiler/nir_to_dxil.c b/src/microsoft/compiler/nir_to_dxil.c index ae561b1e254..484ae9b0fd7 100644 --- a/src/microsoft/compiler/nir_to_dxil.c +++ b/src/microsoft/compiler/nir_to_dxil.c @@ -3531,32 +3531,6 @@ emit_store_ssbo(struct ntd_context *ctx, nir_intrinsic_instr *intr) emit_bufferstore_call(ctx, handle, coord, value, write_mask, overload); } -static bool -emit_store_ssbo_masked(struct ntd_context *ctx, nir_intrinsic_instr *intr) -{ - const struct dxil_value *value = - get_src(ctx, &intr->src[0], 0, nir_type_uint); - const struct dxil_value *mask = - get_src(ctx, &intr->src[1], 0, nir_type_uint); - const struct dxil_value* handle = get_resource_handle(ctx, &intr->src[2], DXIL_RESOURCE_CLASS_UAV, DXIL_RESOURCE_KIND_RAW_BUFFER); - const struct dxil_value *offset = - get_src(ctx, &intr->src[3], 0, nir_type_uint); - if (!value || !mask || !handle || !offset) - return false; - - const struct dxil_value *int32_undef = get_int32_undef(&ctx->mod); - if (!int32_undef) - return false; - - const struct dxil_value *coord[3] = { - offset, int32_undef, int32_undef - }; - - return - emit_atomic_binop(ctx, handle, DXIL_ATOMIC_AND, coord, mask) != NULL && - emit_atomic_binop(ctx, handle, DXIL_ATOMIC_OR, coord, value) != NULL; -} - static bool emit_load_ubo_vec4(struct ntd_context *ctx, nir_intrinsic_instr *intr) { @@ -4833,8 +4807,6 @@ emit_intrinsic(struct ntd_context *ctx, nir_intrinsic_instr *intr) return emit_load_ssbo(ctx, intr); case nir_intrinsic_store_ssbo: return emit_store_ssbo(ctx, intr); - case nir_intrinsic_store_ssbo_masked_dxil: - return emit_store_ssbo_masked(ctx, intr); case nir_intrinsic_load_deref: return emit_load_deref(ctx, intr); case nir_intrinsic_store_deref: @@ -6217,18 +6189,49 @@ lower_mem_access_bit_sizes_cb(nir_intrinsic_op intrin, const void *cb_data) { const struct lower_mem_bit_sizes_data *data = cb_data; - unsigned max_bit_size = data->nir_options->lower_int64_options ? 32 : 64; + unsigned max_bit_size = 32; unsigned min_bit_size = data->dxil_options->lower_int16 ? 32 : 16; unsigned closest_bit_size = MAX2(min_bit_size, MIN2(max_bit_size, bit_size_in)); - /* UBO loads can be done at whatever (supported) bit size, but require 16 byte - * alignment and can load up to 16 bytes per instruction. However this pass requires - * loading 16 bytes of data to get 16-byte alignment. We're going to run lower_ubo_vec4 - * which can deal with unaligned vec4s, so for this pass let's just deal with bit size - * and total size restrictions. */ + if (intrin == nir_intrinsic_load_ubo) { + /* UBO loads can be done at whatever (supported) bit size, but require 16 byte + * alignment and can load up to 16 bytes per instruction. However this pass requires + * loading 16 bytes of data to get 16-byte alignment. We're going to run lower_ubo_vec4 + * which can deal with unaligned vec4s, so for this pass let's just deal with bit size + * and total size restrictions. 
*/ + return (nir_mem_access_size_align) { + .align = closest_bit_size / 8, + .bit_size = closest_bit_size, + .num_components = DIV_ROUND_UP(MIN2(bytes, 16) * 8, closest_bit_size), + }; + } + + assert(intrin == nir_intrinsic_load_ssbo || intrin == nir_intrinsic_store_ssbo); + uint32_t align = nir_combined_align(align_mul, align_offset); + if (align < min_bit_size / 8) { + /* Unaligned load/store, use the minimum bit size, up to 4 components */ + unsigned ideal_num_components = intrin == nir_intrinsic_load_ssbo ? + DIV_ROUND_UP(bytes * 8, min_bit_size) : + (bytes * 8 / min_bit_size); + return (nir_mem_access_size_align) { + .align = min_bit_size / 8, + .bit_size = min_bit_size, + .num_components = MIN2(4, ideal_num_components), + }; + } + + /* Increase/decrease bit size to try to get closer to the requested byte size/align */ + unsigned bit_size = closest_bit_size; + unsigned target = MIN2(bytes, align); + while (target < bit_size / 8 && bit_size > min_bit_size) + bit_size /= 2; + while (target > bit_size / 8 * 4 && bit_size < max_bit_size) + bit_size *= 2; + + /* This is the best we can do */ return (nir_mem_access_size_align) { - .align = closest_bit_size / 8, - .bit_size = closest_bit_size, - .num_components = DIV_ROUND_UP(MIN2(bytes, 16) * 8, closest_bit_size), + .align = bit_size / 8, + .bit_size = bit_size, + .num_components = MIN2(4, DIV_ROUND_UP(bytes * 8, bit_size)), }; } @@ -6540,12 +6543,12 @@ nir_to_dxil(struct nir_shader *s, const struct nir_to_dxil_options *opts, * might be too opaque for the pass to see that they're next to each other. */ optimize_nir(s, opts); - /* Vectorize UBO accesses aggressively. This can help increase alignment to enable us to do better + /* Vectorize UBO/SSBO accesses aggressively. This can help increase alignment to enable us to do better * chunking of loads and stores after lowering bit sizes. Ignore load/store size limitations here, we'll * address them with lower_mem_access_bit_sizes */ nir_load_store_vectorize_options vectorize_opts = { .callback = vectorize_filter, - .modes = nir_var_mem_ubo, + .modes = nir_var_mem_ubo | nir_var_mem_ssbo, }; NIR_PASS_V(s, nir_opt_load_store_vectorize, &vectorize_opts); @@ -6553,8 +6556,9 @@ nir_to_dxil(struct nir_shader *s, const struct nir_to_dxil_options *opts, * a single load/store op. */ struct lower_mem_bit_sizes_data mem_size_data = { s->options, opts }; nir_lower_mem_access_bit_sizes_options mem_size_options = { - .modes = nir_var_mem_ubo, + .modes = nir_var_mem_ubo | nir_var_mem_ssbo, .callback = lower_mem_access_bit_sizes_cb, + .may_lower_unaligned_stores_to_atomics = true, .cb_data = &mem_size_data }; NIR_PASS_V(s, nir_lower_mem_access_bit_sizes, &mem_size_options); diff --git a/src/microsoft/spirv_to_dxil/dxil_spirv_nir.c b/src/microsoft/spirv_to_dxil/dxil_spirv_nir.c index 556c4ed967f..25a43e45a96 100644 --- a/src/microsoft/spirv_to_dxil/dxil_spirv_nir.c +++ b/src/microsoft/spirv_to_dxil/dxil_spirv_nir.c @@ -1048,7 +1048,6 @@ dxil_spirv_nir_passes(nir_shader *nir, } NIR_PASS_V(nir, nir_opt_deref); - NIR_PASS_V(nir, dxil_nir_split_unaligned_loads_stores, nir_var_mem_ssbo); if (conf->inferred_read_only_images_as_srvs) { const nir_opt_access_options opt_access_options = {
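
The new lower_mem_access_bit_sizes_cb logic can be hard to read straight out of the diff, so here is a minimal standalone sketch of its SSBO branch. It is plain C with local stand-ins for Mesa's MIN2/MAX2/DIV_ROUND_UP macros and for nir_mem_access_size_align; the struct and function names (size_align, pick_ssbo_access) are invented for illustration, and it assumes a load (stores only differ in how the unaligned component count is rounded).

#include <stdio.h>

/* Stand-ins for Mesa's util macros, for this sketch only. */
#define MIN2(a, b) ((a) < (b) ? (a) : (b))
#define MAX2(a, b) ((a) > (b) ? (a) : (b))
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Stand-in for nir_mem_access_size_align. */
struct size_align {
   unsigned num_components;
   unsigned bit_size;
   unsigned align; /* in bytes */
};

/* Mirrors the SSBO path of the new callback for a load. */
static struct size_align
pick_ssbo_access(unsigned bytes, unsigned align, unsigned bit_size_in,
                 unsigned min_bit_size, unsigned max_bit_size)
{
   unsigned closest_bit_size = MAX2(min_bit_size, MIN2(max_bit_size, bit_size_in));

   if (align < min_bit_size / 8) {
      /* Unaligned: fall back to the minimum bit size, at most 4 components;
       * the pass shifts/realigns the data and turns unaligned stores into
       * atomics (may_lower_unaligned_stores_to_atomics). */
      return (struct size_align) {
         .num_components = MIN2(4, DIV_ROUND_UP(bytes * 8, min_bit_size)),
         .bit_size = min_bit_size,
         .align = min_bit_size / 8,
      };
   }

   /* Aligned: walk the bit size toward MIN2(bytes, align) so one access never
    * needs more alignment than we have, nor more than 4 components. */
   unsigned bit_size = closest_bit_size;
   unsigned target = MIN2(bytes, align);
   while (target < bit_size / 8 && bit_size > min_bit_size)
      bit_size /= 2;
   while (target > bit_size / 8 * 4 && bit_size < max_bit_size)
      bit_size *= 2;

   return (struct size_align) {
      .num_components = MIN2(4, DIV_ROUND_UP(bytes * 8, bit_size)),
      .bit_size = bit_size,
      .align = bit_size / 8,
   };
}

int main(void)
{
   /* 12 bytes at 4-byte alignment, 32-bit data -> one 3x32-bit access. */
   struct size_align a = pick_ssbo_access(12, 4, 32, 16, 32);
   /* 16 bytes at only 2-byte alignment -> a 4x16-bit chunk; later iterations
    * of the pass cover the remaining bytes. */
   struct size_align b = pick_ssbo_access(16, 2, 32, 16, 32);
   printf("%ux%u (align %u), %ux%u (align %u)\n",
          a.num_components, a.bit_size, a.align,
          b.num_components, b.bit_size, b.align);
   return 0;
}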
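
For context on why dropping store_ssbo_masked_dxil and emit_store_ssbo_masked is safe: that path wrote sub-word data with a raw-buffer ATOMIC_AND of the inverted mask followed by an ATOMIC_OR of the shifted value, and nir_lower_mem_access_bit_sizes now requests the equivalent lowering through .may_lower_unaligned_stores_to_atomics. Below is a minimal sketch of that read-modify-write idiom on an ordinary 32-bit word, using plain C11 atomics; the function name masked_store32 is invented for illustration and bits is assumed to be less than 32.

#include <stdint.h>
#include <stdatomic.h>

/* Store 'value' (occupying 'bits' bits, bits < 32) at byte 'offset_in_word'
 * of a 32-bit word without disturbing the neighbouring bytes, using the same
 * AND-with-inverted-mask / OR-with-value pair the removed DXIL path emitted. */
static void
masked_store32(_Atomic uint32_t *word, unsigned offset_in_word,
               uint32_t value, unsigned bits)
{
   unsigned shift = offset_in_word * 8;
   uint32_t mask = ((1u << bits) - 1u) << shift;

   atomic_fetch_and(word, ~mask);                   /* clear destination bits */
   atomic_fetch_or(word, (value << shift) & mask);  /* merge in the new bits  */
}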