diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index cfa9a6e8a73..279c57e2241 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -5787,11 +5787,13 @@ nir_lower_shader_calls(nir_shader *shader, void *mem_ctx); int nir_get_io_offset_src_number(const nir_intrinsic_instr *instr); +int nir_get_io_uniform_offset_src_number(const nir_intrinsic_instr *instr); int nir_get_io_index_src_number(const nir_intrinsic_instr *instr); int nir_get_io_data_src_number(const nir_intrinsic_instr *instr); int nir_get_io_arrayed_index_src_number(const nir_intrinsic_instr *instr); nir_src *nir_get_io_offset_src(nir_intrinsic_instr *instr); +nir_src *nir_get_io_uniform_offset_src(nir_intrinsic_instr *instr); nir_src *nir_get_io_index_src(nir_intrinsic_instr *instr); nir_src *nir_get_io_data_src(nir_intrinsic_instr *instr); nir_src *nir_get_io_arrayed_index_src(nir_intrinsic_instr *instr); @@ -5801,7 +5803,6 @@ static inline unsigned nir_get_io_base_size_nv(const nir_intrinsic_instr *intr) { switch (intr->intrinsic) { - case nir_intrinsic_global_atomic_nv: case nir_intrinsic_global_atomic_swap_nv: case nir_intrinsic_shared_atomic_nv: case nir_intrinsic_shared_atomic_swap_nv: @@ -5814,6 +5815,9 @@ nir_get_io_base_size_nv(const nir_intrinsic_instr *intr) case nir_intrinsic_store_shared_nv: case nir_intrinsic_store_shared_unlock_nv: return 24; + case nir_intrinsic_global_atomic_nv: + /* TODO: SM100+ only has 23 bits for the UGPR + GPR form */ + return 23; case nir_intrinsic_ldc_nv: case nir_intrinsic_ldcx_nv: return 16; diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index 1a7b029a3b6..daad6d6a305 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -941,7 +941,8 @@ intrinsic("load_vulkan_descriptor", src_comp=[-1], dest_comp=0, # The offset is sign-extended or zero-extended based on the SIGN_EXTEND index. # # NV variants all come with a 24 bit base, that is unsigned with a constant 0 address, -# signed otherwise. +# signed otherwise. Non-swap atomics also come with an additional uniform address source +# right after the non-uniform memory address.
# # PCO global variants use a vec3 for the memory address and data, where component X # has the low 32 address bits, component Y has the high 32 address bits, and component Z @@ -950,13 +951,13 @@ intrinsic("load_vulkan_descriptor", src_comp=[-1], dest_comp=0, intrinsic("deref_atomic", src_comp=[-1, 1], dest_comp=1, indices=[ACCESS, ATOMIC_OP]) intrinsic("ssbo_atomic", src_comp=[-1, 1, 1], dest_comp=1, indices=[ACCESS, ATOMIC_OP, OFFSET_SHIFT]) intrinsic("shared_atomic", src_comp=[1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP]) -intrinsic("shared_atomic_nv", src_comp=[1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP, OFFSET_SHIFT_NV]) +intrinsic("shared_atomic_nv", src_comp=[1, 1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP, OFFSET_SHIFT_NV]) intrinsic("task_payload_atomic", src_comp=[1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP]) intrinsic("global_atomic", src_comp=[1, 1], dest_comp=1, indices=[ATOMIC_OP]) intrinsic("global_atomic_2x32", src_comp=[2, 1], dest_comp=1, indices=[ATOMIC_OP]) intrinsic("global_atomic_amd", src_comp=[1, 1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP]) intrinsic("global_atomic_agx", src_comp=[1, 1, 1], dest_comp=1, indices=[ATOMIC_OP, SIGN_EXTEND]) -intrinsic("global_atomic_nv", src_comp=[1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP]) +intrinsic("global_atomic_nv", src_comp=[1, 1, 1], dest_comp=1, indices=[BASE, ATOMIC_OP]) intrinsic("global_atomic_pco", src_comp=[3], dest_comp=1, indices=[ATOMIC_OP], bit_sizes=[32]) intrinsic("deref_atomic_swap", src_comp=[-1, 1, 1], dest_comp=1, indices=[ACCESS, ATOMIC_OP]) @@ -1920,15 +1921,15 @@ load("global_amd", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flag # src[] = { value, address, unsigned 32-bit offset }. store("global_amd", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET, WRITE_MASK]) -# src[] = { address }. BASE is a 24 bit unsigned offset if a constant 0 address is given, -# signed otherwise. +# src[] = { address, uniform_address }. BASE is a 24 bit unsigned offset if a constant 0 address and +# a constant 0 uniform_address are given, signed otherwise. # load_global_nv has an additional boolean input that makes the load return 0 on false. -load("global_nv", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE]) -store("global_nv", [1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET]) -load("scratch_nv", [1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE]) -store("scratch_nv", [1], indices=[BASE, ALIGN_MUL, ALIGN_OFFSET]) -load("shared_nv", [1], indices=[BASE, OFFSET_SHIFT_NV, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE]) -store("shared_nv", [1], indices=[BASE, OFFSET_SHIFT_NV, ACCESS, ALIGN_MUL, ALIGN_OFFSET]) +load("global_nv", [1, 1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE]) +store("global_nv", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET]) +load("scratch_nv", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE]) +store("scratch_nv", [1, 1], indices=[BASE, ALIGN_MUL, ALIGN_OFFSET]) +load("shared_nv", [1, 1], indices=[BASE, OFFSET_SHIFT_NV, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE]) +store("shared_nv", [1, 1], indices=[BASE, OFFSET_SHIFT_NV, ACCESS, ALIGN_MUL, ALIGN_OFFSET]) # Same as shared_atomic_add, but with GDS.
src[] = {store_val, gds_addr, m0} intrinsic("gds_atomic_add_amd", src_comp=[1, 1, 1], dest_comp=1, indices=[BASE]) @@ -2942,7 +2943,8 @@ intrinsic("ssa_bar_nv", src_comp=[1]) intrinsic("cmat_muladd_nv", src_comp=[-1, -1, -1], dest_comp=0, bit_sizes=src2, indices=[FLAGS], flags=[CAN_ELIMINATE]) -intrinsic("cmat_load_shared_nv", src_comp=[1], dest_comp=0, indices=[NUM_MATRICES, MATRIX_LAYOUT, BASE], flags=[CAN_ELIMINATE]) +# src[] = { address, uniform_address } +intrinsic("cmat_load_shared_nv", src_comp=[1, 1], dest_comp=0, indices=[NUM_MATRICES, MATRIX_LAYOUT, BASE], flags=[CAN_ELIMINATE]) # Moves a 8x8 16bit matrix with transposition within a subgroup intrinsic("cmat_mov_transpose_nv", src_comp=[2], dest_comp=2, bit_sizes=[16], flags=[CAN_ELIMINATE, CAN_REORDER, SUBGROUP]) diff --git a/src/compiler/nir/nir_lower_io.c b/src/compiler/nir/nir_lower_io.c index 86f7a3591fd..4ae7cc7ca1a 100644 --- a/src/compiler/nir/nir_lower_io.c +++ b/src/compiler/nir/nir_lower_io.c @@ -1106,6 +1106,39 @@ nir_get_io_offset_src(nir_intrinsic_instr *instr) case nir_intrinsic_bindless_image_##name: \ case nir_intrinsic_image_heap_##name +/** + * Return the uniform offset source number for a load/store intrinsic or -1 if there's no uniform offset. + */ +int +nir_get_io_uniform_offset_src_number(const nir_intrinsic_instr *instr) +{ + switch (instr->intrinsic) { + case nir_intrinsic_cmat_load_shared_nv: + case nir_intrinsic_global_atomic_nv: + case nir_intrinsic_load_global_nv: + case nir_intrinsic_load_scratch_nv: + case nir_intrinsic_load_shared_nv: + case nir_intrinsic_shared_atomic_nv: + return 1; + case nir_intrinsic_store_global_nv: + case nir_intrinsic_store_scratch_nv: + case nir_intrinsic_store_shared_nv: + return 2; + default: + return -1; + } +} + +/** + * Return the uniform offset source for a load/store intrinsic. + */ +nir_src * +nir_get_io_uniform_offset_src(nir_intrinsic_instr *instr) +{ + const int idx = nir_get_io_uniform_offset_src_number(instr); + return idx >= 0 ? &instr->src[idx] : NULL; +} + /** * Return the index or handle source number for a load/store intrinsic or -1 * if there's no index or handle. diff --git a/src/compiler/nir/nir_opt_offsets.c b/src/compiler/nir/nir_opt_offsets.c index 5e53ac297c2..70a47461b84 100644 --- a/src/compiler/nir/nir_opt_offsets.c +++ b/src/compiler/nir/nir_opt_offsets.c @@ -193,11 +193,12 @@ try_fold_load_store_nv(nir_builder *b, assert(offset_idx >= 0); nir_src src = intrin->src[offset_idx]; + nir_src *uniform_src = nir_get_io_uniform_offset_src(intrin); int32_t min = 0; uint32_t max = BITFIELD_MASK(offset_bits); - if (!nir_src_is_const(src)) { + if (!nir_src_is_const(src) || (uniform_src && !nir_src_is_const(*uniform_src))) { max >>= 1; min = ~max; } @@ -211,6 +212,11 @@ try_fold_load_store_nv(nir_builder *b, return false; } + /* We deliberately don't try to fold the offset for the uniform source, + * because we rely on nir_opt_offsets running before anything is moved into + * the uniform source. However, we might run this pass again _after_ that, + * because eliminating a u2u64 on the _non-uniform_ source might let us fold + * more constants into base.
*/ return try_fold_load_store(b, intrin, state, offset_idx, min, max, false); } diff --git a/src/compiler/nir/nir_validate.c b/src/compiler/nir/nir_validate.c index 20116c51a2e..59d9e9a92dc 100644 --- a/src/compiler/nir/nir_validate.c +++ b/src/compiler/nir/nir_validate.c @@ -761,9 +761,11 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state) case nir_intrinsic_vild_nv: { int base = nir_intrinsic_base(instr); nir_src src = *nir_get_io_offset_src(instr); + nir_src *uniform_src = nir_get_io_uniform_offset_src(instr); unsigned const_bits = nir_get_io_base_size_nv(instr); - if (nir_src_is_const(src) && nir_src_as_int(src) == 0) { + if (nir_src_is_const(src) && nir_src_as_int(src) == 0 && + (!uniform_src || (nir_src_is_const(*uniform_src) && nir_src_as_int(*uniform_src) == 0))) { validate_assert(state, base >= 0 && base < BITFIELD_MASK(const_bits)); } else { int32_t max = BITFIELD_MASK(const_bits - 1); @@ -771,8 +773,14 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state) validate_assert(state, base >= min && base < max); } + if (uniform_src) { + validate_assert(state, uniform_src->ssa->bit_size >= src.ssa->bit_size); + if (state->impl->valid_metadata & nir_metadata_divergence) + validate_assert(state, !uniform_src->ssa->divergent); + } + if (instr->intrinsic == nir_intrinsic_load_global_nv) { - validate_assert(state, instr->src[1].ssa->bit_size == 1); + validate_assert(state, instr->src[2].ssa->bit_size == 1); } break; diff --git a/src/nouveau/compiler/nak/from_nir.rs b/src/nouveau/compiler/nak/from_nir.rs index a7665ed4ec8..fdc457b8685 100644 --- a/src/nouveau/compiler/nak/from_nir.rs +++ b/src/nouveau/compiler/nak/from_nir.rs @@ -2975,7 +2975,8 @@ impl<'a> ShaderFromNir<'a> { nir_intrinsic_global_atomic_nv => { let bit_size = intrin.def.bit_size(); let addr = self.get_src(&srcs[0]); - let data = self.get_src(&srcs[1]); + let uaddr = self.get_src(&srcs[1]); + let data = self.get_src(&srcs[2]); let atom_type = self.get_atomic_type(intrin); let atom_op = self.get_atomic_op(intrin, AtomCmpSrc::Separate); @@ -2992,7 +2993,7 @@ impl<'a> ShaderFromNir<'a> { dst.clone().into() }, addr: addr, - uniform_address: Src::ZERO, + uniform_address: uaddr, cmpr: 0.into(), data: data, atom_op: atom_op, @@ -3220,13 +3221,14 @@ impl<'a> ShaderFromNir<'a> { .get_eviction_priority(intrin.access()), }; let addr = self.get_src(&srcs[0]); - let pred = self.get_src(&srcs[1]); + let uaddr = self.get_src(&srcs[1]); + let pred = self.get_src(&srcs[2]); let dst = b.alloc_ssa_vec(RegFile::GPR, size_B.div_ceil(4)); b.push_op(OpLd { dst: dst.clone().into(), addr: addr, - uniform_addr: Src::ZERO, + uniform_addr: uaddr, pred: pred, offset: intrin.base(), stride: OffsetStride::X1, @@ -3333,12 +3335,13 @@ impl<'a> ShaderFromNir<'a> { eviction_priority: MemEvictionPriority::Normal, }; let addr = self.get_src(&srcs[0]); + let uaddr = self.get_src(&srcs[1]); let dst = b.alloc_ssa_vec(RegFile::GPR, size_B.div_ceil(4)); b.push_op(OpLd { dst: dst.clone().into(), addr: addr, - uniform_addr: Src::ZERO, + uniform_addr: uaddr, pred: true.into(), offset: intrin.base(), stride: OffsetStride::X1, @@ -3357,12 +3360,14 @@ impl<'a> ShaderFromNir<'a> { eviction_priority: MemEvictionPriority::Normal, }; let addr = self.get_src(&srcs[0]); + let uaddr = self.get_src(&srcs[1]); + let dst = b.alloc_ssa_vec(RegFile::GPR, size_B.div_ceil(4)); b.push_op(OpLd { dst: dst.clone().into(), addr: addr, - uniform_addr: Src::ZERO, + uniform_addr: uaddr, pred: true.into(), offset: intrin.base(), stride: 
intrin.offset_shift_nv().try_into().unwrap(), @@ -3673,7 +3678,8 @@ impl<'a> ShaderFromNir<'a> { nir_intrinsic_shared_atomic_nv => { let bit_size = intrin.def.bit_size(); let addr = self.get_src(&srcs[0]); - let data = self.get_src(&srcs[1]); + let uaddr = self.get_src(&srcs[1]); + let data = self.get_src(&srcs[2]); let atom_type = self.get_atomic_type(intrin); let atom_op = self.get_atomic_op(intrin, AtomCmpSrc::Separate); @@ -3683,7 +3689,7 @@ impl<'a> ShaderFromNir<'a> { b.push_op(OpAtom { dst: dst.clone().into(), addr: addr, - uniform_address: Src::ZERO, + uniform_address: uaddr, cmpr: 0.into(), data: data, atom_op: atom_op, @@ -3740,10 +3746,11 @@ impl<'a> ShaderFromNir<'a> { .get_eviction_priority(intrin.access()), }; let addr = self.get_src(&srcs[1]); + let uaddr = self.get_src(&srcs[2]); b.push_op(OpSt { addr: addr, - uniform_addr: Src::ZERO, + uniform_addr: uaddr, data: data, offset: intrin.base(), stride: OffsetStride::X1, @@ -3772,10 +3779,11 @@ impl<'a> ShaderFromNir<'a> { eviction_priority: MemEvictionPriority::Normal, }; let addr = self.get_src(&srcs[1]); + let uaddr = self.get_src(&srcs[2]); b.push_op(OpSt { addr: addr, - uniform_addr: Src::ZERO, + uniform_addr: uaddr, data: data, offset: intrin.base(), stride: OffsetStride::X1, @@ -3794,10 +3802,11 @@ impl<'a> ShaderFromNir<'a> { eviction_priority: MemEvictionPriority::Normal, }; let addr = self.get_src(&srcs[1]); + let uaddr = self.get_src(&srcs[2]); b.push_op(OpSt { addr: addr, - uniform_addr: Src::ZERO, + uniform_addr: uaddr, data: data, offset: intrin.base(), stride: intrin.offset_shift_nv().try_into().unwrap(), @@ -3912,12 +3921,13 @@ impl<'a> ShaderFromNir<'a> { }; let dst = b.alloc_ssa_vec(RegFile::GPR, comps); let addr = self.get_src(&srcs[0]); + let uaddr = self.get_src(&srcs[1]); b.push_op(OpLdsm { dst: dst.clone().into(), mat_size, mat_count, addr, - uniform_addr: Src::ZERO, + uniform_addr: uaddr, offset: intrin.base(), }); self.set_dst(&intrin.def, dst); diff --git a/src/nouveau/compiler/nak_nir.c b/src/nouveau/compiler/nak_nir.c index 9c129859c63..c6a85525473 100644 --- a/src/nouveau/compiler/nak_nir.c +++ b/src/nouveau/compiler/nak_nir.c @@ -1019,8 +1019,23 @@ nak_nir_lower_load_store(nir_shader *nir, const struct nak_compiler *nak) continue; nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + nir_src *addr; + + switch (intr->intrinsic) { + case nir_intrinsic_load_global_bounded: + case nir_intrinsic_load_global_constant_bounded: { + addr = &intr->src[0]; + break; + } + default: + addr = nir_get_io_offset_src(intr); + break; + } + if (!addr) + continue; + b.cursor = nir_before_instr(instr); - nir_src *addr = nir_get_io_offset_src(intr); + nir_def *uaddr = nir_imm_zero(&b, 1, addr->ssa->bit_size); nir_def *res = NULL; nir_intrinsic_instr *new = NULL; @@ -1028,7 +1043,7 @@ nak_nir_lower_load_store(nir_shader *nir, const struct nak_compiler *nak) case nir_intrinsic_load_global: case nir_intrinsic_load_global_constant: { nir_def *nir_true = nir_imm_bool(&b, true); - res = nir_load_global_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa, nir_true); + res = nir_load_global_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa, uaddr, nir_true); break; } case nir_intrinsic_load_global_bounded: @@ -1044,32 +1059,32 @@ nak_nir_lower_load_store(nir_shader *nir, const struct nak_compiler *nak) nir_def *addr = nir_iadd(&b, base->ssa, nir_u2u64(&b, offset->ssa)); nir_def *last_byte = nir_iadd_imm(&b, offset->ssa, load_size - 1); nir_def *cond = nir_ult(&b, last_byte, size->ssa); - res = 
nir_load_global_nv(&b, intr->def.num_components, intr->def.bit_size, addr, cond); + res = nir_load_global_nv(&b, intr->def.num_components, intr->def.bit_size, addr, uaddr, cond); break; } case nir_intrinsic_load_scratch: - res = nir_load_scratch_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa); + res = nir_load_scratch_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa, uaddr); break; case nir_intrinsic_load_shared: - res = nir_load_shared_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa); + res = nir_load_shared_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa, uaddr); break; case nir_intrinsic_store_global: - new = nir_store_global_nv(&b, intr->src[0].ssa, addr->ssa); + new = nir_store_global_nv(&b, intr->src[0].ssa, addr->ssa, uaddr); break; case nir_intrinsic_store_scratch: - new = nir_store_scratch_nv(&b, intr->src[0].ssa, addr->ssa); + new = nir_store_scratch_nv(&b, intr->src[0].ssa, addr->ssa, uaddr); break; case nir_intrinsic_store_shared: - new = nir_store_shared_nv(&b, intr->src[0].ssa, addr->ssa); + new = nir_store_shared_nv(&b, intr->src[0].ssa, addr->ssa, uaddr); break; case nir_intrinsic_global_atomic: - res = nir_global_atomic_nv(&b, intr->def.bit_size, addr->ssa, intr->src[1].ssa); + res = nir_global_atomic_nv(&b, intr->def.bit_size, addr->ssa, uaddr, intr->src[1].ssa); break; case nir_intrinsic_global_atomic_swap: res = nir_global_atomic_swap_nv(&b, intr->def.bit_size, addr->ssa, intr->src[1].ssa, intr->src[2].ssa); break; case nir_intrinsic_shared_atomic: - res = nir_shared_atomic_nv(&b, intr->def.bit_size, addr->ssa, intr->src[1].ssa); + res = nir_shared_atomic_nv(&b, intr->def.bit_size, addr->ssa, uaddr, intr->src[1].ssa); break; case nir_intrinsic_shared_atomic_swap: res = nir_shared_atomic_swap_nv(&b, intr->def.bit_size, addr->ssa, intr->src[1].ssa, intr->src[2].ssa); diff --git a/src/nouveau/compiler/nak_nir_lower_cmat.c b/src/nouveau/compiler/nak_nir_lower_cmat.c index 0490d21fd7b..e5c10171734 100644 --- a/src/nouveau/compiler/nak_nir_lower_cmat.c +++ b/src/nouveau/compiler/nak_nir_lower_cmat.c @@ -723,6 +723,7 @@ try_lower_cmat_load_to_ldsm(nir_builder *b, nir_intrinsic_instr *intr) nir_def *base = intr->src[1].ssa; offset = nir_u2uN(b, offset, base->bit_size); nir_def *addr = nir_iadd(b, base, offset); + nir_def *zero = nir_imm_zero(b, addr->num_components, addr->bit_size); /* flip the layout for B matrices */ if (desc.use == GLSL_CMAT_USE_B) { @@ -734,7 +735,7 @@ try_lower_cmat_load_to_ldsm(nir_builder *b, nir_intrinsic_instr *intr) /* Each thread loads 32 bits per matrix */ assert(length * bit_size == 32 * ldsm_count); - return nir_cmat_load_shared_nv(b, length, bit_size, addr, + return nir_cmat_load_shared_nv(b, length, bit_size, addr, zero, .num_matrices = ldsm_count, .matrix_layout = layout); } diff --git a/src/nouveau/compiler/nak_nir_lower_non_uniform_ldcx.c b/src/nouveau/compiler/nak_nir_lower_non_uniform_ldcx.c index 10507233910..7fd64e13b98 100644 --- a/src/nouveau/compiler/nak_nir_lower_non_uniform_ldcx.c +++ b/src/nouveau/compiler/nak_nir_lower_non_uniform_ldcx.c @@ -56,10 +56,12 @@ lower_ldcx_to_global(nir_builder *b, nir_intrinsic_instr *load, * simple less-than check here. 
*/ nir_def *cond = nir_ilt(b, offset, size); + nir_def *zero_addr = nir_imm_zero(b, addr->num_components, + addr->bit_size); nir_def *val = nir_load_global_nv(b, load->def.num_components, load->def.bit_size, nir_iadd(b, addr, nir_u2u64(b, offset)), - cond, + zero_addr, cond, .align_mul = nir_intrinsic_align_mul(load), .align_offset = nir_intrinsic_align_offset(load), .access = ACCESS_CAN_REORDER,
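A note on the new source layout for anyone updating other call sites: every *_nv load, store, and non-swap atomic now takes a uniform address source immediately after the regular address, and load_global_nv keeps its trailing predicate source. Callers that have no uniform address at hand pass a constant zero of the same bit size, exactly as the updated nak_nir.c and nak_nir_lower_non_uniform_ldcx.c hunks do. A minimal sketch of that pattern (the wrapper name emit_predicated_load_global_nv is made up for illustration and is not part of this series):

```c
#include "nir_builder.h"

/* Illustration only: the new load_global_nv source order is
 * { address, uniform_address, predicate }.
 */
static nir_def *
emit_predicated_load_global_nv(nir_builder *b, nir_intrinsic_instr *intr,
                               nir_def *addr, nir_def *cond)
{
   /* No uniform address known here: a constant-zero placeholder also keeps
    * the unsigned 24-bit BASE interpretation when addr is constant zero.
    */
   nir_def *uaddr = nir_imm_zero(b, 1, addr->bit_size);

   return nir_load_global_nv(b, intr->def.num_components, intr->def.bit_size,
                             addr, uaddr, cond,
                             .access = nir_intrinsic_access(intr),
                             .align_mul = nir_intrinsic_align_mul(intr),
                             .align_offset = nir_intrinsic_align_offset(intr));
}
```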
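The BASE range rules that nir_validate.c and nir_opt_offsets.c now enforce are easier to see with concrete numbers. The standalone sketch below is not Mesa code; it assumes BITFIELD_MASK(b) expands to (1u << b) - 1 (true for b < 32) and just prints the ranges: the full unsigned range is only legal when both the address and the new uniform address are constant zero, otherwise BASE becomes a signed displacement with half the range, and global_atomic_nv drops to 23 bits because of the SM100+ UGPR + GPR encoding mentioned in the TODO.

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Local stand-in for Mesa's util macro (valid for b < 32). */
#define BITFIELD_MASK(b) ((1u << (b)) - 1)

static void
print_base_range(const char *what, unsigned const_bits, bool const_zero_addrs)
{
   if (const_zero_addrs) {
      /* Both addr and uaddr are constant 0: BASE is the whole unsigned
       * address, mirroring "base >= 0 && base < BITFIELD_MASK(const_bits)".
       */
      printf("%s: base in [0, %u)\n", what, BITFIELD_MASK(const_bits));
   } else {
      /* BASE is a signed displacement on top of the register address(es),
       * mirroring the max = BITFIELD_MASK(const_bits - 1), min = ~max checks.
       */
      int32_t max = BITFIELD_MASK(const_bits - 1);
      int32_t min = ~max;
      printf("%s: base in [%d, %d)\n", what, min, max);
   }
}

int
main(void)
{
   print_base_range("load_global_nv, addr == uaddr == 0", 24, true);
   print_base_range("load_global_nv, register address", 24, false);
   print_base_range("global_atomic_nv, register address", 23, false);
   return 0;
}
```

This prints [0, 16777215), [-8388608, 8388607), and [-4194304, 4194303) respectively.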
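The nir_opt_offsets.c comment refers to a later pass that moves a uniform address into the new source; that pass is not part of this diff. Purely as an illustration of how the new nir_get_io_uniform_offset_src() helper is meant to be used, and of the validator rules added above (the uniform source must be at least as wide as the regular address and non-divergent), a hypothetical hoisting pass could look roughly like this; move_uniform_addr and its behaviour are assumptions, not code from the series:

```c
#include "nir_builder.h"

/* Hypothetical sketch, not part of this series: if the regular address of an
 * *_nv access turns out to be subgroup-uniform, move it into the uniform
 * address source and leave a constant zero behind so the backend can use the
 * UGPR + GPR addressing form.
 */
static bool
move_uniform_addr(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
   nir_src *uniform_src = nir_get_io_uniform_offset_src(intr);
   if (uniform_src == NULL)
      return false;

   /* Only touch accesses whose uniform source is still the constant-zero
    * placeholder inserted by the lowering above.
    */
   if (!nir_src_is_const(*uniform_src) || nir_src_as_uint(*uniform_src) != 0)
      return false;

   nir_src *offset_src = nir_get_io_offset_src(intr);
   if (offset_src == NULL || nir_src_is_const(*offset_src))
      return false;

   /* Needs up-to-date divergence information. */
   if (offset_src->ssa->divergent)
      return false;

   b->cursor = nir_before_instr(&intr->instr);
   nir_def *zero = nir_imm_zero(b, 1, offset_src->ssa->bit_size);

   /* Both sources keep the same bit size, so the validator's
    * "uniform source at least as wide as the regular address" rule still
    * holds, and the moved value is non-divergent as required.
    */
   nir_src_rewrite(uniform_src, offset_src->ssa);
   nir_src_rewrite(offset_src, zero);
   return true;
}
```

Such a pass would be run via nir_shader_intrinsics_pass() after divergence analysis and, per the nir_opt_offsets.c comment, only once nir_opt_offsets has already folded whatever constants it can into BASE.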