diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index 2688e1a24ea..db24434a382 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -1191,9 +1191,11 @@ image("store_raw_intel", src_comp=[1, 0])
 
 # Intrinsic to load a block of at least 32B of constant data from a 64-bit
 # global memory address.  The memory address must be uniform and 32B-aligned.
-# src[] = { address }.
-intrinsic("load_global_const_block_intel", src_comp=[1], dest_comp=0, bit_sizes=[32],
-          indices=[BASE], flags=[CAN_ELIMINATE, CAN_REORDER])
+# The second source is a predicate which indicates whether or not to actually
+# do the load.
+# src[] = { address, predicate }.
+intrinsic("load_global_const_block_intel", src_comp=[1, 1], dest_comp=0,
+          bit_sizes=[32], indices=[BASE], flags=[CAN_ELIMINATE, CAN_REORDER])
 
 # Number of data items being operated on for a SIMD program.
 system_value("simd_width_intel", 1)
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 8123c89f410..f3f59006ad0 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4665,12 +4665,43 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       assert(instr->num_components == 8 || instr->num_components == 16);
 
       const fs_builder ubld = bld.exec_all().group(instr->num_components, 0);
-      fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
-      ubld.emit(SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL,
-                tmp,
-                bld.emit_uniformize(get_nir_src(instr->src[0])), /* Address */
-                fs_reg(), /* No source data */
-                brw_imm_ud(instr->num_components));
+      fs_reg load_val;
+
+      bool is_pred_const = nir_src_is_const(instr->src[1]);
+      if (is_pred_const && nir_src_as_uint(instr->src[1]) == 0) {
+         /* In this case, we don't want the UBO load at all.  We really
+          * shouldn't get here but it's possible.
+          */
+         load_val = brw_imm_ud(0);
+      } else {
+         /* The uniform process may stomp the flag so do this first */
+         fs_reg addr = bld.emit_uniformize(get_nir_src(instr->src[0]));
+
+         load_val = ubld.vgrf(BRW_REGISTER_TYPE_UD);
+
+         /* If the predicate is constant and we got here, then it's non-zero
+          * and we don't need the predicate at all.
+          */
+         if (!is_pred_const) {
+            /* Load the predicate */
+            fs_reg pred = bld.emit_uniformize(get_nir_src(instr->src[1]));
+            fs_inst *mov = ubld.MOV(bld.null_reg_d(), pred);
+            mov->conditional_mod = BRW_CONDITIONAL_NZ;
+
+            /* Stomp the destination with 0 if we're OOB */
+            mov = ubld.MOV(load_val, brw_imm_ud(0));
+            mov->predicate = BRW_PREDICATE_NORMAL;
+            mov->predicate_inverse = true;
+         }
+
+         fs_inst *load = ubld.emit(SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL,
+                                   load_val, addr,
+                                   fs_reg(), /* No source data */
+                                   brw_imm_ud(instr->num_components));
+
+         if (!is_pred_const)
+            load->predicate = BRW_PREDICATE_NORMAL;
+      }
 
       /* From the HW perspective, we just did a single SIMD16 instruction
        * which loaded a dword in each SIMD channel.  From NIR's perspective,
@@ -4681,7 +4712,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
        */
       for (unsigned i = 0; i < instr->num_components; i++) {
          bld.MOV(retype(offset(dest, bld, i), BRW_REGISTER_TYPE_UD),
-                 component(tmp, i));
+                 component(load_val, i));
       }
       break;
    }
diff --git a/src/intel/compiler/brw_nir_lower_rt_intrinsics.c b/src/intel/compiler/brw_nir_lower_rt_intrinsics.c
index 205604f6563..d5d95b97fce 100644
--- a/src/intel/compiler/brw_nir_lower_rt_intrinsics.c
+++ b/src/intel/compiler/brw_nir_lower_rt_intrinsics.c
@@ -164,7 +164,8 @@ lower_rt_intrinsics_impl(nir_function_impl *impl,
                   nir_ssa_def *addr =
                      nir_iadd_imm(b, nir_load_btd_global_arg_addr_intel(b),
                                      aligned_offset + i * 64);
-                  data[i] = nir_load_global_const_block_intel(b, 16, addr);
+                  data[i] = nir_load_global_const_block_intel(b, 16, addr,
+                                                              nir_imm_true(b));
                }
 
                sysval = nir_extract_bits(b, data, 2, suboffset * 8,
diff --git a/src/intel/compiler/brw_nir_rt_builder.h b/src/intel/compiler/brw_nir_rt_builder.h
index ffe4f875777..9c7b95d1c8f 100644
--- a/src/intel/compiler/brw_nir_rt_builder.h
+++ b/src/intel/compiler/brw_nir_rt_builder.h
@@ -217,7 +217,7 @@ brw_nir_rt_load_globals(nir_builder *b,
    nir_ssa_def *addr = nir_load_btd_global_arg_addr_intel(b);
 
    nir_ssa_def *data;
-   data = nir_load_global_const_block_intel(b, 16, addr);
+   data = nir_load_global_const_block_intel(b, 16, addr, nir_imm_true(b));
    defs->base_mem_addr = nir_pack_64_2x32(b, nir_channels(b, data, 0x3));
 
    defs->call_stack_handler_addr =
@@ -240,7 +240,8 @@ brw_nir_rt_load_globals(nir_builder *b,
    defs->sw_stack_size = nir_channel(b, data, 12);
    defs->launch_size = nir_channels(b, data, 0x7u << 13);
 
-   data = nir_load_global_const_block_intel(b, 8, nir_iadd_imm(b, addr, 64));
+   data = nir_load_global_const_block_intel(b, 8, nir_iadd_imm(b, addr, 64),
+                                                  nir_imm_true(b));
    defs->call_sbt_addr =
       nir_pack_64_2x32_split(b, nir_channel(b, data, 0),
                                 nir_extract_i16(b, nir_channel(b, data, 1),