diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c
index 32ebac13524..d51cd55c87a 100644
--- a/src/compiler/nir/nir_divergence_analysis.c
+++ b/src/compiler/nir/nir_divergence_analysis.c
@@ -892,6 +892,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
    case nir_intrinsic_is_helper_invocation:
    case nir_intrinsic_load_scratch:
    case nir_intrinsic_load_scratch_nv:
+   case nir_intrinsic_load_scratch_intel:
    case nir_intrinsic_deref_atomic:
    case nir_intrinsic_deref_atomic_swap:
    case nir_intrinsic_ssbo_atomic:
diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index 264ea5c1d7c..0a2b9a26f47 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -2660,6 +2660,13 @@ store("ssbo_block_intel", [-1, 1], [ACCESS, ALIGN_MUL, ALIGN_OFFSET])
 # src[] = { value, offset }.
 store("shared_block_intel", [1], [BASE, ALIGN_MUL, ALIGN_OFFSET])
 
+# These offsets are into per-subgroup scratch memory, rather than the per-lane
+# offsets the standard NIR intrinsics use.
+# src[] = { offset }.
+load("scratch_intel", [1], [ACCESS], [CAN_ELIMINATE])
+# src[] = { value, offset }.
+store("scratch_intel", [1], [])
+
 # src[] = { address }.
 load("global_constant_uniform_block_intel", [1], [ACCESS, ALIGN_MUL, ALIGN_OFFSET, BASE],
      [CAN_ELIMINATE, CAN_REORDER])
diff --git a/src/compiler/nir/nir_lower_io.c b/src/compiler/nir/nir_lower_io.c
index b64bb4bd567..6bb281e6cba 100644
--- a/src/compiler/nir/nir_lower_io.c
+++ b/src/compiler/nir/nir_lower_io.c
@@ -992,6 +992,7 @@ nir_get_io_offset_src_number(const nir_intrinsic_instr *instr)
    case nir_intrinsic_load_global_nv:
    case nir_intrinsic_load_scratch:
    case nir_intrinsic_load_scratch_nv:
+   case nir_intrinsic_load_scratch_intel:
    case nir_intrinsic_load_fs_input_interp_deltas:
    case nir_intrinsic_shared_atomic:
    case nir_intrinsic_shared_atomic_nv:
@@ -1043,6 +1044,7 @@
    case nir_intrinsic_store_urb_lsc_intel:
    case nir_intrinsic_store_scratch:
    case nir_intrinsic_store_scratch_nv:
+   case nir_intrinsic_store_scratch_intel:
    case nir_intrinsic_ssbo_atomic:
    case nir_intrinsic_ssbo_atomic_swap:
    case nir_intrinsic_ldc_nv:
@@ -1189,6 +1191,7 @@ nir_get_io_data_src_number(const nir_intrinsic_instr *intr)
    case nir_intrinsic_store_global_nv:
    case nir_intrinsic_store_scratch:
    case nir_intrinsic_store_scratch_nv:
+   case nir_intrinsic_store_scratch_intel:
    case nir_intrinsic_store_raw_output_pan:
    case nir_intrinsic_store_combined_output_pan:
    case nir_intrinsic_store_tile_pan:
diff --git a/src/intel/compiler/brw/brw_from_nir.cpp b/src/intel/compiler/brw/brw_from_nir.cpp
index 96697e7664d..986ead9f8df 100644
--- a/src/intel/compiler/brw/brw_from_nir.cpp
+++ b/src/intel/compiler/brw/brw_from_nir.cpp
@@ -4561,82 +4561,6 @@ get_nir_buffer_intrinsic_index(nir_to_brw_state &ntb, const brw_builder &bld,
    return bld.emit_uniformize(retype(surf_index, type));
 }
 
-/**
- * The offsets we get from NIR act as if each SIMD channel has it's own blob
- * of contiguous space.  However, if we actually place each SIMD channel in
- * it's own space, we end up with terrible cache performance because each SIMD
- * channel accesses a different cache line even when they're all accessing the
- * same byte offset.  To deal with this problem, we swizzle the address using
- * a simple algorithm which ensures that any time a SIMD message reads or
- * writes the same address, it's all in the same cache line.  We have to keep
- * the bottom two bits fixed so that we can read/write up to a dword at a time
- * and the individual element is contiguous. We do this by splitting the
- * address as follows:
- *
- *    31                             4-6           2          0
- *    +-------------------------------+------------+----------+
- *    |        Hi address bits        | chan index | addr low |
- *    +-------------------------------+------------+----------+
- *
- * In other words, the bottom two address bits stay, and the top 30 get
- * shifted up so that we can stick the SIMD channel index in the middle.  This
- * way, we can access 8, 16, or 32-bit elements and, when accessing a 32-bit
- * at the same logical offset, the scratch read/write instruction acts on
- * continuous elements and we get good cache locality.
- */
-static brw_reg
-swizzle_nir_scratch_addr(nir_to_brw_state &ntb,
-                         const brw_builder &bld,
-                         const nir_src &nir_addr_src,
-                         bool in_dwords)
-{
-   brw_shader &s = ntb.s;
-
-   const brw_reg chan_index = bld.LOAD_SUBGROUP_INVOCATION();
-   const unsigned chan_index_bits = ffs(s.dispatch_width) - 1;
-
-   if (nir_src_is_const(nir_addr_src)) {
-      unsigned nir_addr = nir_src_as_uint(nir_addr_src);
-      if (in_dwords) {
-         /* In this case, we know the address is aligned to a DWORD and we want
-          * the final address in DWORDs.
-          */
-         return bld.OR(chan_index,
-                       brw_imm_ud(nir_addr << (chan_index_bits - 2)));
-      } else {
-         /* This case is substantially more annoying because we have to pay
-          * attention to those pesky two bottom bits.
-          */
-         unsigned addr_hi = (nir_addr & ~0x3u) << chan_index_bits;
-         unsigned addr_lo = (nir_addr & 0x3u);
-
-         return bld.OR(bld.SHL(chan_index, brw_imm_ud(2)),
-                       brw_imm_ud(addr_lo | addr_hi));
-      }
-   }
-
-   const brw_reg nir_addr =
-      retype(get_nir_src(ntb, nir_addr_src, 0), BRW_TYPE_UD);
-
-   if (in_dwords) {
-      /* In this case, we know the address is aligned to a DWORD and we want
-       * the final address in DWORDs.
-       */
-      return bld.OR(bld.SHL(nir_addr, brw_imm_ud(chan_index_bits - 2)),
-                    chan_index);
-   } else {
-      /* This case substantially more annoying because we have to pay
-       * attention to those pesky two bottom bits.
-       */
-      brw_reg chan_addr = bld.SHL(chan_index, brw_imm_ud(2));
-      brw_reg addr_bits =
-         bld.OR(bld.AND(nir_addr, brw_imm_ud(0x3u)),
-                bld.SHL(bld.AND(nir_addr, brw_imm_ud(~0x3u)),
-                        brw_imm_ud(chan_index_bits)));
-      return bld.OR(addr_bits, chan_addr);
-   }
-}
-
 static unsigned
 choose_block_size_dwords(const intel_device_info *devinfo, unsigned dwords)
 {
@@ -4919,6 +4843,8 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb,
    case nir_intrinsic_global_atomic_swap:
    case nir_intrinsic_load_scratch:
    case nir_intrinsic_store_scratch:
+   case nir_intrinsic_load_scratch_intel:
+   case nir_intrinsic_store_scratch_intel:
    case nir_intrinsic_load_shader_indirect_data_intel:
       brw_from_nir_emit_memory_access(ntb, bld, xbld, instr);
       break;
@@ -6098,8 +6024,8 @@ brw_from_nir_emit_memory_access(nir_to_brw_state &ntb,
       no_mask_handle = true;
       break;
    }
-   case nir_intrinsic_load_scratch:
-   case nir_intrinsic_store_scratch: {
+   case nir_intrinsic_load_scratch_intel:
+   case nir_intrinsic_store_scratch_intel: {
       mode = MEMORY_MODE_SCRATCH;
 
       const nir_src &addr = instr->src[is_store ? 1 : 0];
@@ -6113,25 +6039,17 @@
          if (devinfo->ver >= 20 || intel_has_extended_bindless(devinfo))
            bind = ubld.SHR(bind, brw_imm_ud(4));
 
-         /* load_scratch / store_scratch cannot be is_scalar yet. */
-         assert(xbld.dispatch_width() == bld.dispatch_width());
-
          srcs[MEMORY_LOGICAL_BINDING] = component(bind, 0);
-         srcs[MEMORY_LOGICAL_ADDRESS] =
-            swizzle_nir_scratch_addr(ntb, bld, addr, false);
       } else {
-         unsigned bit_size =
-            is_store ? nir_src_bit_size(instr->src[0]) : instr->def.bit_size;
-         bool dword_aligned = alignment >= 4 && bit_size == 32;
-
-         /* load_scratch / store_scratch cannot be is_scalar yet. */
-         assert(xbld.dispatch_width() == bld.dispatch_width());
-
         binding_type = LSC_ADDR_SURFTYPE_FLAT;
-         srcs[MEMORY_LOGICAL_ADDRESS] =
-            swizzle_nir_scratch_addr(ntb, bld, addr, dword_aligned);
       }
 
+      /* load_scratch / store_scratch cannot be is_scalar yet. */
+      assert(xbld.dispatch_width() == bld.dispatch_width());
+
+      srcs[MEMORY_LOGICAL_ADDRESS] =
+         retype(get_nir_src(ntb, addr, 0), BRW_TYPE_UD);
+
       if (is_store)
          ++s.shader_stats.spill_count;
       else
diff --git a/src/intel/compiler/brw/brw_lower_logical_sends.cpp b/src/intel/compiler/brw/brw_lower_logical_sends.cpp
index d4c9e29b1f2..a2618eb1788 100644
--- a/src/intel/compiler/brw/brw_lower_logical_sends.cpp
+++ b/src/intel/compiler/brw/brw_lower_logical_sends.cpp
@@ -1425,9 +1425,9 @@ lower_hdc_memory_logical_send(const brw_builder &bld, brw_mem_inst *mem)
       8 * lsc_data_size_bytes(data_size);
 
    const bool byte_scattered =
-      data_bit_size < 32 || (alignment != 0 && alignment < 4);
-   const bool dword_scattered = !byte_scattered && mode == MEMORY_MODE_SCRATCH;
-   const bool surface_access = !byte_scattered && !dword_scattered && !block;
+      data_bit_size < 32 || (alignment != 0 && alignment < 4) ||
+      mode == MEMORY_MODE_SCRATCH;
+   const bool surface_access = !byte_scattered && !block;
 
    /* SLM block reads must use the 16B-aligned OWord Block Read messages,
     * as the unaligned message doesn't exist for SLM.
@@ -1550,7 +1550,6 @@ lower_hdc_memory_logical_send(const brw_builder &bld, brw_mem_inst *mem)
         desc = brw_dp_oword_block_rw_desc(devinfo, false, components, !has_dest);
   } else if (addr_size == LSC_ADDR_SIZE_A64) {
      assert(binding_type == LSC_ADDR_SURFTYPE_FLAT);
-      assert(!dword_scattered);
 
      sfid = BRW_SFID_HDC1;
 
@@ -1595,9 +1594,6 @@
   } else if (byte_scattered) {
      desc = brw_dp_byte_scattered_rw_desc(devinfo, mem->exec_size,
                                           data_bit_size, !has_dest);
-   } else if (dword_scattered) {
-      desc = brw_dp_dword_scattered_rw_desc(devinfo, mem->exec_size,
-                                            !has_dest);
   } else {
      desc = brw_dp_untyped_surface_rw_desc(devinfo, mem->exec_size,
                                            components, !has_dest);
diff --git a/src/intel/compiler/brw/brw_nir.c b/src/intel/compiler/brw/brw_nir.c
index 50cca73644c..767e833a788 100644
--- a/src/intel/compiler/brw/brw_nir.c
+++ b/src/intel/compiler/brw/brw_nir.c
@@ -3,7 +3,6 @@
  * SPDX-License-Identifier: MIT
  */
 
-#include "intel_nir.h"
 #include "brw_nir.h"
 #include "brw_private.h"
 #include "brw_sampler.h"
@@ -11,6 +10,82 @@
 #include "compiler/nir/nir_builder.h"
 #include "dev/intel_debug.h"
 #include "util/sparse_bitset.h"
+#include "intel_nir.h"
+#include "nir.h"
+#include "nir_builder_opcodes.h"
+#include "nir_intrinsics.h"
+#include "nir_intrinsics_indices.h"
+
+/*
+ * Intel scratch swizzling can be described with the formula:
+ *
+ *   (SIMD width * round_down(offset_B, stride_B)) +
+ *   (lane * stride_B) +
+ *   (offset_B % stride_B)
+ */
+static nir_def *
+swizzle_scratch(nir_builder *b,
+                nir_def *offset_B,
+                unsigned stride_B,
+                unsigned align_B)
+{
+   struct shader_info *info = &b->shader->info;
+
+   assert(util_is_power_of_two_nonzero(stride_B));
+   assert(util_is_power_of_two_nonzero(align_B));
+
+   nir_def *trailing_B = NULL;
+   if (align_B < stride_B) {
+      trailing_B = nir_umod_imm(b, offset_B, stride_B);
+      offset_B = nir_iand_imm(b, offset_B, ~(stride_B - 1));
+   }
+
+   nir_def *simd_width = info->min_subgroup_size == info->max_subgroup_size ?
+      nir_imm_int(b, info->max_subgroup_size) :
+      nir_load_simd_width_intel(b);
+
+   nir_def *simd_offs_B = nir_imul(b, simd_width, offset_B);
+
+   nir_def *lane = nir_load_subgroup_invocation(b);
+   nir_def *lane_offs_B = nir_imul_imm(b, lane, stride_B);
+   nir_def *swizzled_B = nir_iadd(b, simd_offs_B, lane_offs_B);
+
+   return trailing_B ? nir_iadd(b, swizzled_B, trailing_B) : swizzled_B;
+}
+
+static bool
+lower_scratch(nir_builder *b, nir_intrinsic_instr *intr, void *data)
+{
+   b->cursor = nir_before_instr(&intr->instr);
+   b->constant_fold_alu = true;
+
+   unsigned stride = 4 /* TODO */;
+
+   if (intr->intrinsic == nir_intrinsic_load_scratch) {
+      nir_def *val =
+         nir_load_scratch_intel(b, intr->def.num_components, intr->def.bit_size,
+                                swizzle_scratch(b, intr->src[0].ssa, stride,
+                                                nir_intrinsic_align(intr)),
+                                .access = nir_intrinsic_access(intr));
+      nir_def_replace(&intr->def, val);
+   } else if (intr->intrinsic == nir_intrinsic_store_scratch) {
+      nir_store_scratch_intel(b, intr->src[0].ssa,
+                              swizzle_scratch(b, intr->src[1].ssa, stride,
+                                              nir_intrinsic_align(intr)));
+      nir_instr_remove(&intr->instr);
+   } else {
+      return false;
+   }
+
+   return true;
+}
+
+static bool
+intel_nir_lower_scratch(nir_shader *nir)
+{
+   return nir_shader_intrinsics_pass(nir, lower_scratch,
+                                     nir_metadata_control_flow, NULL);
+}
 /**
  * Returns the minimum number of vec4 elements needed to pack a type.
  */
@@ -2493,6 +2568,10 @@
    OPT(nir_opt_algebraic);
    OPT(nir_opt_cse);
 
+   if (pt->nir->scratch_size) {
+      OPT(intel_nir_lower_scratch);
+   }
+
    /* Do this after the vectorization & brw_nir_rebase_const_offset_ubo_loads
     * so that we maximize the offset put into the messages.
     */
@@ -3139,6 +3218,7 @@ lsc_op_for_nir_intrinsic(const nir_intrinsic_instr *intrin)
    case nir_intrinsic_load_ssbo_uniform_block_intel:
    case nir_intrinsic_load_ubo_uniform_block_intel:
    case nir_intrinsic_load_scratch:
+   case nir_intrinsic_load_scratch_intel:
    case nir_intrinsic_load_shader_indirect_data_intel:
       return LSC_OP_LOAD;
 
@@ -3149,7 +3229,7 @@
    case nir_intrinsic_store_global_block_intel:
    case nir_intrinsic_store_shared_block_intel:
    case nir_intrinsic_store_ssbo_block_intel:
-   case nir_intrinsic_store_scratch:
+   case nir_intrinsic_store_scratch_intel:
       return LSC_OP_STORE;
 
    case nir_intrinsic_image_load:
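
Note (not part of the patch, a reviewer's aid): the swizzle_scratch() comment added in brw_nir.c describes the new per-subgroup layout as (SIMD width * round_down(offset_B, stride_B)) + (lane * stride_B) + (offset_B % stride_B). The small host-side sketch below just re-evaluates that formula for concrete inputs, assuming a uniform SIMD width and the fixed 4-byte stride the pass currently hard-codes; the function name and figures are illustrative only.

/* Scalar model of the swizzle formula, for sanity-checking addresses. */
#include <assert.h>
#include <stdint.h>

static uint32_t
model_swizzle_scratch(uint32_t offset_B, uint32_t lane,
                      uint32_t stride_B, uint32_t simd_width)
{
   uint32_t aligned_B  = offset_B & ~(stride_B - 1); /* round_down(offset_B, stride_B) */
   uint32_t trailing_B = offset_B & (stride_B - 1);  /* offset_B % stride_B */
   return simd_width * aligned_B + lane * stride_B + trailing_B;
}

int main(void)
{
   /* SIMD16, 4-byte stride: dword 3 of lane 5 lands at 16*12 + 5*4 = 212. */
   assert(model_swizzle_scratch(12, 5, 4, 16) == 212);
   /* The same logical dword in the next lane sits in the adjacent slot. */
   assert(model_swizzle_scratch(12, 6, 4, 16) == 216);
   return 0;
}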