diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c
index 31444759a8c..06a55897b7e 100644
--- a/src/compiler/nir/nir_divergence_analysis.c
+++ b/src/compiler/nir/nir_divergence_analysis.c
@@ -226,7 +226,6 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
    case nir_intrinsic_load_scalar_arg_amd:
    case nir_intrinsic_load_smem_amd:
    case nir_intrinsic_load_resume_shader_address_amd:
-   case nir_intrinsic_load_global_const_block_intel:
    case nir_intrinsic_load_reloc_const_intel:
    case nir_intrinsic_load_btd_global_arg_addr_intel:
    case nir_intrinsic_load_btd_local_arg_addr_intel:
diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index a757fdeaecd..406697e5469 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -2133,14 +2133,6 @@
 image("load_raw_intel", src_comp=[1], dest_comp=0,
       flags=[CAN_ELIMINATE])
 image("store_raw_intel", src_comp=[1, 0])
-
-# Intrinsic to load a block of at least 32B of constant data from a 64-bit
-# global memory address. The memory address must be uniform and 32B-aligned.
-# The second source is a predicate which indicates whether or not to actually
-# do the load.
-# src[] = { address, predicate }.
-intrinsic("load_global_const_block_intel", src_comp=[1, 1], dest_comp=0,
-          bit_sizes=[32], indices=[BASE], flags=[CAN_ELIMINATE, CAN_REORDER])
 
 # Number of data items being operated on for a SIMD program.
 system_value("simd_width_intel", 1)
diff --git a/src/compiler/nir/nir_lower_shader_calls.c b/src/compiler/nir/nir_lower_shader_calls.c
index cfa5d1e5814..931c1aaf161 100644
--- a/src/compiler/nir/nir_lower_shader_calls.c
+++ b/src/compiler/nir/nir_lower_shader_calls.c
@@ -155,7 +155,6 @@ can_remat_instr(nir_instr *instr, struct sized_bitset *remat)
       case nir_intrinsic_load_vulkan_descriptor:
       case nir_intrinsic_load_push_constant:
       case nir_intrinsic_load_global_constant:
-      case nir_intrinsic_load_global_const_block_intel:
       case nir_intrinsic_load_desc_set_address_intel:
          /* These intrinsics don't need to be spilled as long as they don't
           * depend on any spilled values.
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 223893a8ab8..4e233a4ed20 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -6816,69 +6816,6 @@ fs_nir_emit_intrinsic(nir_to_brw_state &ntb,
       fs_nir_emit_global_atomic(ntb, bld, instr);
       break;
 
-   case nir_intrinsic_load_global_const_block_intel: {
-      assert(instr->def.bit_size == 32);
-      assert(instr->num_components == 8 || instr->num_components == 16);
-
-      const fs_builder ubld = bld.exec_all().group(instr->num_components, 0);
-      brw_reg load_val;
-
-      bool is_pred_const = nir_src_is_const(instr->src[1]);
-      if (is_pred_const && nir_src_as_uint(instr->src[1]) == 0) {
-         /* In this case, we don't want the UBO load at all. We really
-          * shouldn't get here but it's possible.
-          */
-         load_val = brw_imm_ud(0);
-      } else {
-         /* The uniform process may stomp the flag so do this first */
-         brw_reg addr = bld.emit_uniformize(get_nir_src(ntb, instr->src[0]));
-
-         load_val = ubld.vgrf(BRW_TYPE_UD);
-
-         /* If the predicate is constant and we got here, then it's non-zero
-          * and we don't need the predicate at all.
-          */
-         if (!is_pred_const) {
-            /* Load the predicate */
-            brw_reg pred = bld.emit_uniformize(get_nir_src(ntb, instr->src[1]));
-            fs_inst *mov = ubld.MOV(bld.null_reg_d(), pred);
-            mov->conditional_mod = BRW_CONDITIONAL_NZ;
-
-            /* Stomp the destination with 0 if we're OOB */
-            mov = ubld.MOV(load_val, brw_imm_ud(0));
-            mov->predicate = BRW_PREDICATE_NORMAL;
-            mov->predicate_inverse = true;
-         }
-
-         brw_reg srcs[A64_LOGICAL_NUM_SRCS];
-         srcs[A64_LOGICAL_ADDRESS] = addr;
-         srcs[A64_LOGICAL_SRC] = brw_reg(); /* No source data */
-         srcs[A64_LOGICAL_ARG] = brw_imm_ud(instr->num_components);
-         /* This intrinsic loads memory from a uniform address, sometimes
-          * shared across lanes. We never need to mask it.
-          */
-         srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0);
-
-         fs_inst *load = ubld.emit(SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL,
-                                   load_val, srcs, A64_LOGICAL_NUM_SRCS);
-         if (!is_pred_const)
-            load->predicate = BRW_PREDICATE_NORMAL;
-      }
-
-      /* From the HW perspective, we just did a single SIMD16 instruction
-       * which loaded a dword in each SIMD channel. From NIR's perspective,
-       * this instruction returns a vec16. Any users of this data in the
-       * back-end will expect a vec16 per SIMD channel so we have to emit a
-       * pile of MOVs to resolve this discrepancy. Fortunately, copy-prop
-       * will generally clean them up for us.
-       */
-      for (unsigned i = 0; i < instr->num_components; i++) {
-         bld.MOV(retype(offset(dest, bld, i), BRW_TYPE_UD),
-                 component(load_val, i));
-      }
-      break;
-   }
-
    case nir_intrinsic_load_global_constant_uniform_block_intel: {
       const unsigned total_dwords = ALIGN(instr->num_components,
                                           REG_SIZE * reg_unit(devinfo) / 4);
diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index fd41543b688..9959604ce6d 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -1421,8 +1421,7 @@ brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
    if (bit_size > 32)
       return false;
 
-   if (low->intrinsic == nir_intrinsic_load_global_const_block_intel ||
-       low->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel ||
+   if (low->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel ||
        low->intrinsic == nir_intrinsic_load_ssbo_uniform_block_intel ||
        low->intrinsic == nir_intrinsic_load_shared_uniform_block_intel ||
        low->intrinsic == nir_intrinsic_load_global_constant_uniform_block_intel) {
@@ -2175,8 +2174,12 @@ brw_nir_load_global_const(nir_builder *b, nir_intrinsic_instr *load_uniform,
       nir_def *data[2];
       for (unsigned i = 0; i < 2; i++) {
          nir_def *addr = nir_iadd_imm(b, base_addr, aligned_offset + i * 64);
-         data[i] = nir_load_global_const_block_intel(b, 16, addr,
-                                                     nir_imm_true(b));
+
+         data[i] = nir_load_global_constant_uniform_block_intel(
+            b, 16, 32, addr,
+            .access = ACCESS_CAN_REORDER | ACCESS_NON_WRITEABLE,
+            .align_mul = 64,
+            .align_offset = 64);
       }
 
       sysval = nir_extract_bits(b, data, 2, suboffset * 8,
diff --git a/src/intel/compiler/brw_nir_rt_builder.h b/src/intel/compiler/brw_nir_rt_builder.h
index 3f8189e4155..0db54d75093 100644
--- a/src/intel/compiler/brw_nir_rt_builder.h
+++ b/src/intel/compiler/brw_nir_rt_builder.h
@@ -60,10 +60,13 @@ brw_nir_rt_store(nir_builder *b, nir_def *addr, unsigned align,
 }
 
 static inline nir_def *
-brw_nir_rt_load_const(nir_builder *b, unsigned components,
-                      nir_def *addr, nir_def *pred)
+brw_nir_rt_load_const(nir_builder *b, unsigned components, nir_def *addr)
 {
-   return nir_load_global_const_block_intel(b, components, addr, pred);
+   return nir_load_global_constant_uniform_block_intel(
+      b, components, 32, addr,
+      .access = ACCESS_CAN_REORDER | ACCESS_NON_WRITEABLE,
+      .align_mul = 64,
+      .align_offset = 64);
 }
 
 static inline nir_def *
@@ -312,7 +315,7 @@ brw_nir_rt_load_globals_addr(nir_builder *b,
                              nir_def *addr)
 {
    nir_def *data;
-   data = brw_nir_rt_load_const(b, 16, addr, nir_imm_true(b));
+   data = brw_nir_rt_load_const(b, 16, addr);
    defs->base_mem_addr = nir_pack_64_2x32(b, nir_trim_vector(b, data, 2));
 
    defs->call_stack_handler_addr =
@@ -335,7 +338,7 @@ brw_nir_rt_load_globals_addr(nir_builder *b,
    defs->sw_stack_size = nir_channel(b, data, 12);
    defs->launch_size = nir_channels(b, data, 0x7u << 13);
 
-   data = brw_nir_rt_load_const(b, 8, nir_iadd_imm(b, addr, 64), nir_imm_true(b));
+   data = brw_nir_rt_load_const(b, 8, nir_iadd_imm(b, addr, 64));
    defs->call_sbt_addr =
       nir_pack_64_2x32_split(b, nir_channel(b, data, 0),
                              nir_extract_i16(b, nir_channel(b, data, 1),
diff --git a/src/intel/compiler/elk/elk_fs_nir.cpp b/src/intel/compiler/elk/elk_fs_nir.cpp
index 5fc12015fa9..aba761ef915 100644
--- a/src/intel/compiler/elk/elk_fs_nir.cpp
+++ b/src/intel/compiler/elk/elk_fs_nir.cpp
@@ -5251,69 +5251,6 @@ fs_nir_emit_intrinsic(nir_to_elk_state &ntb,
       fs_nir_emit_global_atomic(ntb, bld, instr);
      break;
 
-   case nir_intrinsic_load_global_const_block_intel: {
-      assert(instr->def.bit_size == 32);
-      assert(instr->num_components == 8 || instr->num_components == 16);
-
-      const fs_builder ubld = bld.exec_all().group(instr->num_components, 0);
-      elk_fs_reg load_val;
-
-      bool is_pred_const = nir_src_is_const(instr->src[1]);
-      if (is_pred_const && nir_src_as_uint(instr->src[1]) == 0) {
-         /* In this case, we don't want the UBO load at all. We really
-          * shouldn't get here but it's possible.
-          */
-         load_val = elk_imm_ud(0);
-      } else {
-         /* The uniform process may stomp the flag so do this first */
-         elk_fs_reg addr = bld.emit_uniformize(get_nir_src(ntb, instr->src[0]));
-
-         load_val = ubld.vgrf(ELK_REGISTER_TYPE_UD);
-
-         /* If the predicate is constant and we got here, then it's non-zero
-          * and we don't need the predicate at all.
-          */
-         if (!is_pred_const) {
-            /* Load the predicate */
-            elk_fs_reg pred = bld.emit_uniformize(get_nir_src(ntb, instr->src[1]));
-            elk_fs_inst *mov = ubld.MOV(bld.null_reg_d(), pred);
-            mov->conditional_mod = ELK_CONDITIONAL_NZ;
-
-            /* Stomp the destination with 0 if we're OOB */
-            mov = ubld.MOV(load_val, elk_imm_ud(0));
-            mov->predicate = ELK_PREDICATE_NORMAL;
-            mov->predicate_inverse = true;
-         }
-
-         elk_fs_reg srcs[A64_LOGICAL_NUM_SRCS];
-         srcs[A64_LOGICAL_ADDRESS] = addr;
-         srcs[A64_LOGICAL_SRC] = elk_fs_reg(); /* No source data */
-         srcs[A64_LOGICAL_ARG] = elk_imm_ud(instr->num_components);
-         /* This intrinsic loads memory from a uniform address, sometimes
-          * shared across lanes. We never need to mask it.
-          */
-         srcs[A64_LOGICAL_ENABLE_HELPERS] = elk_imm_ud(0);
-
-         elk_fs_inst *load = ubld.emit(ELK_SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL,
-                                       load_val, srcs, A64_LOGICAL_NUM_SRCS);
-         if (!is_pred_const)
-            load->predicate = ELK_PREDICATE_NORMAL;
-      }
-
-      /* From the HW perspective, we just did a single SIMD16 instruction
-       * which loaded a dword in each SIMD channel. From NIR's perspective,
-       * this instruction returns a vec16. Any users of this data in the
-       * back-end will expect a vec16 per SIMD channel so we have to emit a
-       * pile of MOVs to resolve this discrepancy. Fortunately, copy-prop
-       * will generally clean them up for us.
-       */
-      for (unsigned i = 0; i < instr->num_components; i++) {
-         bld.MOV(retype(offset(dest, bld, i), ELK_REGISTER_TYPE_UD),
-                 component(load_val, i));
-      }
-      break;
-   }
-
    case nir_intrinsic_load_global_constant_uniform_block_intel: {
       const unsigned total_dwords = ALIGN(instr->num_components,
                                           REG_SIZE * reg_unit(devinfo) / 4);
diff --git a/src/intel/compiler/elk/elk_nir.c b/src/intel/compiler/elk/elk_nir.c
index 9bc3cc57f22..48f8ec6297b 100644
--- a/src/intel/compiler/elk/elk_nir.c
+++ b/src/intel/compiler/elk/elk_nir.c
@@ -1139,8 +1139,7 @@ elk_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
    if (bit_size > 32)
       return false;
 
-   if (low->intrinsic == nir_intrinsic_load_global_const_block_intel ||
-       low->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel ||
+   if (low->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel ||
        low->intrinsic == nir_intrinsic_load_ssbo_uniform_block_intel ||
        low->intrinsic == nir_intrinsic_load_shared_uniform_block_intel ||
        low->intrinsic == nir_intrinsic_load_global_constant_uniform_block_intel) {
@@ -1873,8 +1872,7 @@ elk_nir_load_global_const(nir_builder *b, nir_intrinsic_instr *load_uniform,
       nir_def *data[2];
       for (unsigned i = 0; i < 2; i++) {
          nir_def *addr = nir_iadd_imm(b, base_addr, aligned_offset + i * 64);
-         data[i] = nir_load_global_const_block_intel(b, 16, addr,
-                                                     nir_imm_true(b));
+         data[i] = nir_load_global_constant_uniform_block_intel(b, 16, 32, addr);
       }
 
       sysval = nir_extract_bits(b, data, 2, suboffset * 8,
diff --git a/src/intel/compiler/intel_nir_blockify_uniform_loads.c b/src/intel/compiler/intel_nir_blockify_uniform_loads.c
index 11816a4aa5a..75f1a7a921b 100644
--- a/src/intel/compiler/intel_nir_blockify_uniform_loads.c
+++ b/src/intel/compiler/intel_nir_blockify_uniform_loads.c
@@ -106,39 +106,6 @@ intel_nir_blockify_uniform_loads_instr(nir_builder *b,
       intrin->intrinsic = nir_intrinsic_load_global_constant_uniform_block_intel;
       return true;
 
-   case nir_intrinsic_load_global_const_block_intel:
-      /* Only deal with the simple predication true case */
-      if (!nir_src_is_const(intrin->src[1]) ||
-          nir_src_as_uint(intrin->src[1]) == 0)
-         return false;
-
-      if (nir_src_is_divergent(intrin->src[0]))
-         return false;
-
-      if (intrin->def.bit_size != 32)
-         return false;
-
-      /* Without the LSC, we can only do block loads of at least 4dwords (1
-       * oword).
-       */
-      if (!devinfo->has_lsc && intrin->def.num_components < 4)
-         return false;
-
-      b->cursor = nir_before_instr(&intrin->instr);
-      nir_def *def =
-         nir_load_global_constant_uniform_block_intel(
-            b,
-            intrin->def.num_components,
-            intrin->def.bit_size,
-            intrin->src[0].ssa,
-            .access = ACCESS_NON_WRITEABLE | ACCESS_CAN_REORDER,
-            .align_mul = 4,
-            .align_offset = 4);
-
-      nir_def_rewrite_uses(&intrin->def, def);
-      nir_instr_remove(&intrin->instr);
-      return true;
-
    default:
       return false;
    }
diff --git a/src/intel/vulkan/anv_nir_lower_ubo_loads.c b/src/intel/vulkan/anv_nir_lower_ubo_loads.c
index cbcd869a69a..57c9b571142 100644
--- a/src/intel/vulkan/anv_nir_lower_ubo_loads.c
+++ b/src/intel/vulkan/anv_nir_lower_ubo_loads.c
@@ -58,17 +58,19 @@ lower_ubo_load_instr(nir_builder *b, nir_intrinsic_instr *load,
       /* Load two just in case we go over a 64B boundary */
       nir_def *data[2];
       for (unsigned i = 0; i < 2; i++) {
-         nir_def *pred;
+         nir_def *addr = nir_iadd_imm(b, base_addr, aligned_offset + i * 64);
+
+         data[i] = nir_load_global_constant_uniform_block_intel(
+            b, 16, 32, addr,
+            .access = nir_intrinsic_access(load),
+            .align_mul = 64,
+            .align_offset = 64);
          if (bound) {
-            pred = nir_igt_imm(b, bound, aligned_offset + i * 64 + 63);
-         } else {
-            pred = nir_imm_true(b);
+            data[i] = nir_bcsel(b,
+                                nir_igt_imm(b, bound, aligned_offset + i * 64 + 63),
+                                data[i],
+                                nir_imm_int(b, 0));
          }
-
-         nir_def *addr = nir_iadd_imm(b, base_addr,
-                                      aligned_offset + i * 64);
-
-         data[i] = nir_load_global_const_block_intel(b, 16, addr, pred);
       }
 
       val = nir_extract_bits(b, data, 2, suboffset * 8,
diff --git a/src/intel/vulkan_hasvk/anv_nir_lower_ubo_loads.c b/src/intel/vulkan_hasvk/anv_nir_lower_ubo_loads.c
index 31878328bb5..57c9b571142 100644
--- a/src/intel/vulkan_hasvk/anv_nir_lower_ubo_loads.c
+++ b/src/intel/vulkan_hasvk/anv_nir_lower_ubo_loads.c
@@ -44,7 +44,7 @@ lower_ubo_load_instr(nir_builder *b, nir_intrinsic_instr *load,
    unsigned byte_size = bit_size / 8;
 
    nir_def *val;
-   if (nir_src_is_const(load->src[1])) {
+   if (!nir_src_is_divergent(load->src[0]) && nir_src_is_const(load->src[1])) {
      uint32_t offset = nir_src_as_uint(load->src[1]);
 
       /* Things should be component-aligned. */
@@ -58,17 +58,19 @@ lower_ubo_load_instr(nir_builder *b, nir_intrinsic_instr *load,
      /* Load two just in case we go over a 64B boundary */
       nir_def *data[2];
       for (unsigned i = 0; i < 2; i++) {
-         nir_def *pred;
+         nir_def *addr = nir_iadd_imm(b, base_addr, aligned_offset + i * 64);
+
+         data[i] = nir_load_global_constant_uniform_block_intel(
+            b, 16, 32, addr,
+            .access = nir_intrinsic_access(load),
+            .align_mul = 64,
+            .align_offset = 64);
          if (bound) {
-            pred = nir_igt_imm(b, bound, aligned_offset + i * 64 + 63);
-         } else {
-            pred = nir_imm_true(b);
+            data[i] = nir_bcsel(b,
+                                nir_igt_imm(b, bound, aligned_offset + i * 64 + 63),
+                                data[i],
+                                nir_imm_int(b, 0));
          }
-
-         nir_def *addr = nir_iadd_imm(b, base_addr,
-                                      aligned_offset + i * 64);
-
-         data[i] = nir_load_global_const_block_intel(b, 16, addr, pred);
       }
 
       val = nir_extract_bits(b, data, 2, suboffset * 8,
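
For reference, the migration pattern applied throughout this series can be sketched as follows; the helper below is illustrative only (its name and parameters are not part of the patch). Instead of passing a predicate into the load, callers now emit nir_load_global_constant_uniform_block_intel unconditionally and, where a bound is known, zero the result afterwards with nir_bcsel, as the anv_nir_lower_ubo_loads.c hunks above do.

#include "nir_builder.h"

/* Illustrative sketch only, not part of the patch.  Loads a 64B block of 16
 * dwords from a uniform, read-only global address.  When a bound is given,
 * the result is replaced with zero if the block would extend past it, which
 * mirrors what the removed predicated intrinsic provided.  last_byte is the
 * offset of the last byte covered by the block (e.g. aligned_offset + 63 for
 * the first block in the UBO lowering above).
 */
static nir_def *
load_const_block_or_zero(nir_builder *b, nir_def *addr,
                         nir_def *bound, unsigned last_byte)
{
   nir_def *data = nir_load_global_constant_uniform_block_intel(
      b, 16, 32, addr,
      .access = ACCESS_CAN_REORDER | ACCESS_NON_WRITEABLE,
      .align_mul = 64,
      .align_offset = 64);

   /* With the predicate gone, out-of-bounds handling moves from skipping the
    * load to selecting zero after it; nir_bcsel replicates the scalar zero
    * across the 16 components of data.
    */
   if (bound != NULL)
      data = nir_bcsel(b, nir_igt_imm(b, bound, last_byte),
                       data, nir_imm_int(b, 0));

   return data;
}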