diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index 2688e1a24ea..db24434a382 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -1191,9 +1191,11 @@ image("store_raw_intel", src_comp=[1, 0]) # Intrinsic to load a block of at least 32B of constant data from a 64-bit # global memory address. The memory address must be uniform and 32B-aligned. -# src[] = { address }. -intrinsic("load_global_const_block_intel", src_comp=[1], dest_comp=0, bit_sizes=[32], - indices=[BASE], flags=[CAN_ELIMINATE, CAN_REORDER]) +# The second source is a predicate which indicates whether or not to actually +# do the load. +# src[] = { address, predicate }. +intrinsic("load_global_const_block_intel", src_comp=[1, 1], dest_comp=0, + bit_sizes=[32], indices=[BASE], flags=[CAN_ELIMINATE, CAN_REORDER]) # Number of data items being operated on for a SIMD program. system_value("simd_width_intel", 1) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 8123c89f410..f3f59006ad0 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -4665,12 +4665,43 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr assert(instr->num_components == 8 || instr->num_components == 16); const fs_builder ubld = bld.exec_all().group(instr->num_components, 0); - fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD); - ubld.emit(SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL, - tmp, - bld.emit_uniformize(get_nir_src(instr->src[0])), /* Address */ - fs_reg(), /* No source data */ - brw_imm_ud(instr->num_components)); + fs_reg load_val; + + bool is_pred_const = nir_src_is_const(instr->src[1]); + if (is_pred_const && nir_src_as_uint(instr->src[1]) == 0) { + /* In this case, we don't want the UBO load at all. We really + * shouldn't get here but it's possible. 
+ */ + load_val = brw_imm_ud(0); + } else { + /* Uniformizing may stomp the flag, so do it before setting the flag */ + fs_reg addr = bld.emit_uniformize(get_nir_src(instr->src[0])); + + load_val = ubld.vgrf(BRW_REGISTER_TYPE_UD); + + /* If the predicate is constant and we got here, then it's non-zero + * and we don't need the predicate at all. + */ + if (!is_pred_const) { + /* Load the predicate */ + fs_reg pred = bld.emit_uniformize(get_nir_src(instr->src[1])); + fs_inst *mov = ubld.MOV(bld.null_reg_d(), pred); + mov->conditional_mod = BRW_CONDITIONAL_NZ; + + /* Stomp the destination with 0 if the predicate is false */ + mov = ubld.MOV(load_val, brw_imm_ud(0)); + mov->predicate = BRW_PREDICATE_NORMAL; + mov->predicate_inverse = true; + } + + fs_inst *load = ubld.emit(SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL, + load_val, addr, + fs_reg(), /* No source data */ + brw_imm_ud(instr->num_components)); + + if (!is_pred_const) + load->predicate = BRW_PREDICATE_NORMAL; + } /* From the HW perspective, we just did a single SIMD16 instruction * which loaded a dword in each SIMD channel. 
From NIR's perspective, @@ -4681,7 +4712,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr */ for (unsigned i = 0; i < instr->num_components; i++) { bld.MOV(retype(offset(dest, bld, i), BRW_REGISTER_TYPE_UD), - component(tmp, i)); + component(load_val, i)); } break; } diff --git a/src/intel/compiler/brw_nir_lower_rt_intrinsics.c b/src/intel/compiler/brw_nir_lower_rt_intrinsics.c index 205604f6563..d5d95b97fce 100644 --- a/src/intel/compiler/brw_nir_lower_rt_intrinsics.c +++ b/src/intel/compiler/brw_nir_lower_rt_intrinsics.c @@ -164,7 +164,8 @@ lower_rt_intrinsics_impl(nir_function_impl *impl, nir_ssa_def *addr = nir_iadd_imm(b, nir_load_btd_global_arg_addr_intel(b), aligned_offset + i * 64); - data[i] = nir_load_global_const_block_intel(b, 16, addr); + data[i] = nir_load_global_const_block_intel(b, 16, addr, + nir_imm_true(b)); } sysval = nir_extract_bits(b, data, 2, suboffset * 8, diff --git a/src/intel/compiler/brw_nir_rt_builder.h b/src/intel/compiler/brw_nir_rt_builder.h index ffe4f875777..9c7b95d1c8f 100644 --- a/src/intel/compiler/brw_nir_rt_builder.h +++ b/src/intel/compiler/brw_nir_rt_builder.h @@ -217,7 +217,7 @@ brw_nir_rt_load_globals(nir_builder *b, nir_ssa_def *addr = nir_load_btd_global_arg_addr_intel(b); nir_ssa_def *data; - data = nir_load_global_const_block_intel(b, 16, addr); + data = nir_load_global_const_block_intel(b, 16, addr, nir_imm_true(b)); defs->base_mem_addr = nir_pack_64_2x32(b, nir_channels(b, data, 0x3)); defs->call_stack_handler_addr = @@ -240,7 +240,8 @@ brw_nir_rt_load_globals(nir_builder *b, defs->sw_stack_size = nir_channel(b, data, 12); defs->launch_size = nir_channels(b, data, 0x7u << 13); - data = nir_load_global_const_block_intel(b, 8, nir_iadd_imm(b, addr, 64)); + data = nir_load_global_const_block_intel(b, 8, nir_iadd_imm(b, addr, 64), + nir_imm_true(b)); defs->call_sbt_addr = nir_pack_64_2x32_split(b, nir_channel(b, data, 0), nir_extract_i16(b, nir_channel(b, data, 1),