diff --git a/src/amd/common/ac_nir_lower_esgs_io_to_mem.c b/src/amd/common/ac_nir_lower_esgs_io_to_mem.c
index cad54d1a70a..9a483b3c694 100644
--- a/src/amd/common/ac_nir_lower_esgs_io_to_mem.c
+++ b/src/amd/common/ac_nir_lower_esgs_io_to_mem.c
@@ -71,13 +71,15 @@ emit_split_buffer_load(nir_builder *b, nir_ssa_def *desc, nir_ssa_def *v_off, ni
       full_dwords++;
    }
 
+   nir_ssa_def *zero = nir_imm_int(b, 0);
+
    for (unsigned i = 0; i < full_dwords; ++i)
-      comps[i] = nir_load_buffer_amd(b, 1, 32, desc, v_off, s_off,
+      comps[i] = nir_load_buffer_amd(b, 1, 32, desc, v_off, s_off, zero,
                                      .base = component_stride * i, .memory_modes = nir_var_shader_in,
                                      .access = ACCESS_COHERENT);
 
    if (remaining_bytes)
-      comps[full_dwords] = nir_load_buffer_amd(b, 1, remaining_bytes * 8, desc, v_off, s_off,
+      comps[full_dwords] = nir_load_buffer_amd(b, 1, remaining_bytes * 8, desc, v_off, s_off, zero,
                                                .base = component_stride * full_dwords,
                                                .memory_modes = nir_var_shader_in,
                                                .access = ACCESS_COHERENT);
@@ -90,6 +92,8 @@ emit_split_buffer_store(nir_builder *b, nir_ssa_def *d, nir_ssa_def *desc, nir_s
                         unsigned component_stride, unsigned num_components, unsigned bit_size,
                         unsigned writemask, bool swizzled, bool slc)
 {
+   nir_ssa_def *zero = nir_imm_int(b, 0);
+
    while (writemask) {
       int start, count;
       u_bit_scan_consecutive_range(&writemask, &start, &count);
@@ -106,7 +110,7 @@ emit_split_buffer_store(nir_builder *b, nir_ssa_def *d, nir_ssa_def *desc, nir_s
             store_bytes = MIN2(store_bytes, 2);
 
          nir_ssa_def *store_val = nir_extract_bits(b, &d, 1, start_byte * 8u, 1, store_bytes * 8u);
-         nir_store_buffer_amd(b, store_val, desc, v_off, s_off, .is_swizzled = swizzled, .slc_amd = slc,
+         nir_store_buffer_amd(b, store_val, desc, v_off, s_off, zero, .is_swizzled = swizzled, .slc_amd = slc,
                               .base = start_byte, .memory_modes = nir_var_shader_out, .access = ACCESS_COHERENT);
 
          start_byte += store_bytes;
diff --git a/src/amd/common/ac_nir_lower_ngg.c b/src/amd/common/ac_nir_lower_ngg.c
index 6c7957575c0..7c1ad54f00b 100644
--- a/src/amd/common/ac_nir_lower_ngg.c
+++ b/src/amd/common/ac_nir_lower_ngg.c
@@ -1700,9 +1700,10 @@ ngg_build_streamout_vertex(nir_builder *b, nir_xfb_info *info,
       nir_ssa_def *out_data =
          nir_load_shared(b, count, 32, vtx_lds_addr, .base = offset);
 
+      nir_ssa_def *zero = nir_imm_int(b, 0);
       nir_store_buffer_amd(b, out_data, so_buffer[out->buffer],
                            vtx_buffer_offsets[out->buffer],
-                           nir_imm_int(b, 0),
+                           zero, zero,
                            .base = out->offset, .slc_amd = true);
    }
 
@@ -3188,7 +3189,8 @@ ms_store_arrayed_output_intrin(nir_builder *b,
    } else if (out_mode == ms_out_mode_vram) {
       nir_ssa_def *ring = nir_load_ring_mesh_scratch_amd(b);
       nir_ssa_def *off = nir_load_ring_mesh_scratch_offset_amd(b);
-      nir_store_buffer_amd(b, store_val, ring, addr, off,
+      nir_ssa_def *zero = nir_imm_int(b, 0);
+      nir_store_buffer_amd(b, store_val, ring, addr, off, zero,
                            .base = const_off,
                            .write_mask = write_mask,
                            .memory_modes = nir_var_shader_out,
@@ -3242,7 +3244,8 @@ ms_load_arrayed_output(nir_builder *b,
    } else if (out_mode == ms_out_mode_vram) {
       nir_ssa_def *ring = nir_load_ring_mesh_scratch_amd(b);
      nir_ssa_def *off = nir_load_ring_mesh_scratch_offset_amd(b);
-      return nir_load_buffer_amd(b, num_components, load_bit_size, ring, addr, off,
+      nir_ssa_def *zero = nir_imm_int(b, 0);
+      return nir_load_buffer_amd(b, num_components, load_bit_size, ring, addr, off, zero,
                                 .base = const_off,
                                 .memory_modes = nir_var_shader_out,
                                 .access = ACCESS_COHERENT);
diff --git a/src/amd/common/ac_nir_lower_taskmesh_io_to_mem.c b/src/amd/common/ac_nir_lower_taskmesh_io_to_mem.c
index 17cba47c957..2808248cb87 100644
--- a/src/amd/common/ac_nir_lower_taskmesh_io_to_mem.c
+++ b/src/amd/common/ac_nir_lower_taskmesh_io_to_mem.c
@@ -240,8 +240,9 @@ task_write_draw_ring(nir_builder *b,
    nir_ssa_def *ring = nir_load_ring_task_draw_amd(b);
    nir_ssa_def *scalar_off = nir_imul_imm(b, ptr, s->draw_entry_bytes);
    nir_ssa_def *vector_off = nir_imm_int(b, 0);
+   nir_ssa_def *zero = nir_imm_int(b, 0);
 
-   nir_store_buffer_amd(b, store_val, ring, vector_off, scalar_off,
+   nir_store_buffer_amd(b, store_val, ring, vector_off, scalar_off, zero,
                         .base = const_off,
                         .memory_modes = nir_var_shader_out, .access = ACCESS_COHERENT);
 }
@@ -305,8 +306,9 @@ lower_task_payload_store(nir_builder *b,
    nir_ssa_def *ring = nir_load_ring_task_payload_amd(b);
    nir_ssa_def *ptr = task_ring_entry_index(b, s);
    nir_ssa_def *ring_off = nir_imul_imm(b, ptr, s->payload_entry_bytes);
+   nir_ssa_def *zero = nir_imm_int(b, 0);
 
-   nir_store_buffer_amd(b, store_val, ring, addr, ring_off, .base = base,
+   nir_store_buffer_amd(b, store_val, ring, addr, ring_off, zero, .base = base,
                         .write_mask = write_mask,
                         .memory_modes = nir_var_mem_task_payload,
                         .access = ACCESS_COHERENT);
@@ -331,8 +333,9 @@ lower_taskmesh_payload_load(nir_builder *b,
    nir_ssa_def *addr = intrin->src[0].ssa;
    nir_ssa_def *ring = nir_load_ring_task_payload_amd(b);
    nir_ssa_def *ring_off = nir_imul_imm(b, ptr, s->payload_entry_bytes);
+   nir_ssa_def *zero = nir_imm_int(b, 0);
 
-   return nir_load_buffer_amd(b, num_components, bit_size, ring, addr, ring_off, .base = base,
+   return nir_load_buffer_amd(b, num_components, bit_size, ring, addr, ring_off, zero, .base = base,
                               .memory_modes = nir_var_mem_task_payload,
                              .access = ACCESS_COHERENT);
 }
diff --git a/src/amd/common/ac_nir_lower_tess_io_to_mem.c b/src/amd/common/ac_nir_lower_tess_io_to_mem.c
index 198ff521540..0dfe8ab39b9 100644
--- a/src/amd/common/ac_nir_lower_tess_io_to_mem.c
+++ b/src/amd/common/ac_nir_lower_tess_io_to_mem.c
@@ -442,7 +442,8 @@ lower_hs_output_store(nir_builder *b,
       nir_ssa_def *hs_ring_tess_offchip = nir_load_ring_tess_offchip_amd(b);
       nir_ssa_def *offchip_offset = nir_load_ring_tess_offchip_offset_amd(b);
 
-      nir_store_buffer_amd(b, store_val, hs_ring_tess_offchip, vmem_off, offchip_offset,
+      nir_ssa_def *zero = nir_imm_int(b, 0);
+      nir_store_buffer_amd(b, store_val, hs_ring_tess_offchip, vmem_off, offchip_offset, zero,
                            .write_mask = write_mask, .memory_modes = nir_var_shader_out,
                            .access = ACCESS_COHERENT);
    }
@@ -587,6 +588,7 @@ hs_emit_write_tess_factors(nir_shader *shader,
                          .align_mul = 16u, .align_offset = st->tcs_tess_lvl_in_loc % 16u)
       : NULL;
 
+   nir_ssa_def *zero = nir_imm_int(b, 0);
    nir_ssa_def *rel_patch_id = nir_load_tess_rel_patch_id_amd(b);
    nir_ssa_def *tess_factors_base = nir_load_ring_tess_factors_offset_amd(b);
    nir_ssa_def *tess_factors_offset = nir_imul_imm(b, rel_patch_id, (inner_comps + outer_comps) * 4u);
@@ -596,7 +598,7 @@ hs_emit_write_tess_factors(nir_shader *shader,
       /* Store the dynamic HS control word. */
      nir_if *rel_patch_id_zero = nir_push_if(b, nir_ieq_imm(b, rel_patch_id, 0));
      nir_ssa_def *ctrlw = nir_imm_int(b, 0x80000000u);
-      nir_store_buffer_amd(b, ctrlw, tessfactor_ring, nir_imm_zero(b, 1, 32), tess_factors_base,
+      nir_store_buffer_amd(b, ctrlw, tessfactor_ring, zero, tess_factors_base, zero,
                           .access = ACCESS_COHERENT);
      tess_factors_const_offset += 4;
      nir_pop_if(b, rel_patch_id_zero);
@@ -606,17 +608,17 @@ hs_emit_write_tess_factors(nir_shader *shader,
 
    if (shader->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES) {
       /* LINES reversal */
      nir_ssa_def *t = nir_vec2(b, nir_channel(b, tessfactors_outer, 1), nir_channel(b, tessfactors_outer, 0));
-      nir_store_buffer_amd(b, t, tessfactor_ring, tess_factors_offset, tess_factors_base,
+      nir_store_buffer_amd(b, t, tessfactor_ring, tess_factors_offset, tess_factors_base, zero,
                           .base = tess_factors_const_offset, .access = ACCESS_COHERENT);
    } else if (shader->info.tess._primitive_mode == TESS_PRIMITIVE_TRIANGLES) {
      nir_ssa_def *t = nir_vec4(b, nir_channel(b, tessfactors_outer, 0), nir_channel(b, tessfactors_outer, 1),
                                nir_channel(b, tessfactors_outer, 2), nir_channel(b, tessfactors_inner, 0));
-      nir_store_buffer_amd(b, t, tessfactor_ring, tess_factors_offset, tess_factors_base,
+      nir_store_buffer_amd(b, t, tessfactor_ring, tess_factors_offset, tess_factors_base, zero,
                           .base = tess_factors_const_offset, .access = ACCESS_COHERENT);
    } else {
-      nir_store_buffer_amd(b, tessfactors_outer, tessfactor_ring, tess_factors_offset, tess_factors_base,
+      nir_store_buffer_amd(b, tessfactors_outer, tessfactor_ring, tess_factors_offset, tess_factors_base, zero,
                           .base = tess_factors_const_offset, .access = ACCESS_COHERENT);
-      nir_store_buffer_amd(b, tessfactors_inner, tessfactor_ring, tess_factors_offset, tess_factors_base,
+      nir_store_buffer_amd(b, tessfactors_inner, tessfactor_ring, tess_factors_offset, tess_factors_base, zero,
                           .base = tess_factors_const_offset + 4u * outer_comps, .access = ACCESS_COHERENT);
    }
@@ -626,12 +628,12 @@ hs_emit_write_tess_factors(nir_shader *shader,
       nir_ssa_def *offchip_offset = nir_load_ring_tess_offchip_offset_amd(b);
      nir_ssa_def *vmem_off_outer = hs_per_patch_output_vmem_offset(b, st, NULL, st->tcs_tess_lvl_out_loc);
 
-      nir_store_buffer_amd(b, tessfactors_outer, hs_ring_tess_offchip, vmem_off_outer, offchip_offset,
+      nir_store_buffer_amd(b, tessfactors_outer, hs_ring_tess_offchip, vmem_off_outer, offchip_offset, zero,
                           .memory_modes = nir_var_shader_out, .access = ACCESS_COHERENT);
 
      if (inner_comps) {
         nir_ssa_def *vmem_off_inner = hs_per_patch_output_vmem_offset(b, st, NULL, st->tcs_tess_lvl_in_loc);
-         nir_store_buffer_amd(b, tessfactors_inner, hs_ring_tess_offchip, vmem_off_inner, offchip_offset,
+         nir_store_buffer_amd(b, tessfactors_inner, hs_ring_tess_offchip, vmem_off_inner, offchip_offset, zero,
                              .memory_modes = nir_var_shader_out, .access = ACCESS_COHERENT);
      }
   }
@@ -655,8 +657,11 @@ lower_tes_input_load(nir_builder *b,
                          ? hs_per_vertex_output_vmem_offset(b, st, intrin)
                          : hs_per_patch_output_vmem_offset(b, st, intrin, 0);
 
+   nir_ssa_def *zero = nir_imm_int(b, 0);
+
    return nir_load_buffer_amd(b, intrin->dest.ssa.num_components,
-                              intrin->dest.ssa.bit_size, offchip_ring, off, offchip_offset,
+                              intrin->dest.ssa.bit_size, offchip_ring,
+                              off, offchip_offset, zero,
                               .access = ACCESS_COHERENT);
 }
 
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 4993529e2b3..8df3b7db762 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -3991,6 +3991,7 @@ struct LoadEmitInfo {
    unsigned num_components;
    unsigned component_size;
    Temp resource = Temp(0, s1); /* buffer resource or base 64-bit address */
+   Temp idx = Temp(0, v1);      /* buffer index */
    unsigned component_stride = 0;
    unsigned const_offset = 0;
    unsigned align_mul = 0;
@@ -4417,6 +4418,14 @@ mubuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigne
       soffset = Operand(info.soffset);
    }
 
+   bool offen = !vaddr.isUndefined();
+   bool idxen = info.idx.id();
+
+   if (offen && idxen)
+      vaddr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), info.idx, vaddr);
+   else if (idxen)
+      vaddr = Operand(info.idx);
+
    unsigned bytes_size = 0;
    aco_opcode op;
    if (bytes_needed == 1 || align_ % 2) {
@@ -4442,7 +4451,8 @@ mubuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigne
    mubuf->operands[0] = Operand(info.resource);
    mubuf->operands[1] = vaddr;
    mubuf->operands[2] = soffset;
-   mubuf->offen = (offset.type() == RegType::vgpr);
+   mubuf->offen = offen;
+   mubuf->idxen = idxen;
    mubuf->glc = info.glc;
    mubuf->dlc =
       info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
@@ -5078,7 +5088,7 @@ resolve_excess_vmem_const_offset(Builder& bld, Temp& voffset, unsigned const_off
 }
 
 void
-emit_single_mubuf_store(isel_context* ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata,
+emit_single_mubuf_store(isel_context* ctx, Temp descriptor, Temp voffset, Temp soffset, Temp idx, Temp vdata,
                         unsigned const_offset, memory_sync_info sync, bool glc, bool slc,
                         bool swizzled)
 {
@@ -5090,20 +5100,30 @@ emit_single_mubuf_store(isel_context* ctx, Temp descriptor, Temp voffset, Temp s
    aco_opcode op = get_buffer_store_op(vdata.bytes());
    const_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset);
 
-   Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
+   bool offen = voffset.id();
+   bool idxen = idx.id();
+
    Operand soffset_op = soffset.id() ? Operand(soffset) : Operand::zero();
    glc &= ctx->program->gfx_level < GFX11;
+
+   Operand vaddr_op(v1);
+   if (offen && idxen)
+      vaddr_op = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), idx, voffset);
+   else if (offen)
+      vaddr_op = Operand(voffset);
+   else if (idxen)
+      vaddr_op = Operand(idx);
+
    Builder::Result r =
-      bld.mubuf(op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), const_offset,
-                /* offen */ !voffset_op.isUndefined(), /* swizzled */ swizzled,
-                /* idxen*/ false, /* addr64 */ false, /* disable_wqm */ false,
-                /* glc */ glc, /* dlc*/ false, /* slc */ slc);
+      bld.mubuf(op, Operand(descriptor), vaddr_op, soffset_op, Operand(vdata), const_offset,
+                offen, swizzled, idxen, /* addr64 */ false, /* disable_wqm */ false, glc,
+                /* dlc*/ false, slc);
 
    r.instr->mubuf().sync = sync;
 }
 
 void
-store_vmem_mubuf(isel_context* ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset,
+store_vmem_mubuf(isel_context* ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset, Temp idx,
                  unsigned base_const_offset, unsigned elem_size_bytes, unsigned write_mask,
                  bool swizzled, memory_sync_info sync, bool glc, bool slc)
 {
@@ -5120,13 +5140,13 @@ store_vmem_mubuf(isel_context* ctx, Temp src, Temp descriptor, Temp voffset, Tem
 
    for (unsigned i = 0; i < write_count; i++) {
       unsigned const_offset = offsets[i] + base_const_offset;
-      emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, sync,
+      emit_single_mubuf_store(ctx, descriptor, voffset, soffset, idx, write_datas[i], const_offset, sync,
                               glc, slc, swizzled);
    }
 }
 
 void
-load_vmem_mubuf(isel_context* ctx, Temp dst, Temp descriptor, Temp voffset, Temp soffset,
+load_vmem_mubuf(isel_context* ctx, Temp dst, Temp descriptor, Temp voffset, Temp soffset, Temp idx,
                 unsigned base_const_offset, unsigned elem_size_bytes, unsigned num_components,
                 unsigned swizzle_element_size, bool glc, bool slc, memory_sync_info sync)
 {
@@ -5136,6 +5156,7 @@ load_vmem_mubuf(isel_context* ctx, Temp dst, Temp descriptor, Temp voffset, Temp
 
    Builder bld(ctx->program, ctx->block);
    LoadEmitInfo info = {Operand(voffset), dst, num_components, elem_size_bytes, descriptor};
+   info.idx = idx;
    info.component_stride = swizzle_element_size;
    info.glc = glc;
    info.slc = slc;
@@ -7175,10 +7196,13 @@ visit_load_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
 {
    Builder bld(ctx->program, ctx->block);
 
+   bool idxen = !nir_src_is_const(intrin->src[3]) || nir_src_as_uint(intrin->src[3]);
+
    Temp dst = get_ssa_temp(ctx, &intrin->dest.ssa);
    Temp descriptor = bld.as_uniform(get_ssa_temp(ctx, intrin->src[0].ssa));
    Temp v_offset = as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[1].ssa));
    Temp s_offset = bld.as_uniform(get_ssa_temp(ctx, intrin->src[2].ssa));
+   Temp idx = idxen ? as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[3].ssa)) : Temp();
 
    bool swizzled = nir_intrinsic_is_swizzled(intrin);
    bool slc = nir_intrinsic_slc_amd(intrin);
@@ -7192,7 +7216,7 @@ visit_load_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
    nir_variable_mode mem_mode = nir_intrinsic_memory_modes(intrin);
    memory_sync_info sync(aco_storage_mode_from_nir_mem_mode(mem_mode));
 
-   load_vmem_mubuf(ctx, dst, descriptor, v_offset, s_offset, const_offset, elem_size_bytes,
+   load_vmem_mubuf(ctx, dst, descriptor, v_offset, s_offset, idx, const_offset, elem_size_bytes,
                    num_components, swizzle_element_size, glc, slc, sync);
 }
 
@@ -7201,10 +7225,13 @@ visit_store_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
 {
    Builder bld(ctx->program, ctx->block);
 
+   bool idxen = !nir_src_is_const(intrin->src[4]) || nir_src_as_uint(intrin->src[4]);
+
    Temp store_src = get_ssa_temp(ctx, intrin->src[0].ssa);
    Temp descriptor = bld.as_uniform(get_ssa_temp(ctx, intrin->src[1].ssa));
    Temp v_offset = as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[2].ssa));
    Temp s_offset = bld.as_uniform(get_ssa_temp(ctx, intrin->src[3].ssa));
+   Temp idx = idxen ? as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[4].ssa)) : Temp();
 
    bool swizzled = nir_intrinsic_is_swizzled(intrin);
    bool glc = nir_intrinsic_access(intrin) & ACCESS_COHERENT;
@@ -7217,7 +7244,7 @@ visit_store_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
    nir_variable_mode mem_mode = nir_intrinsic_memory_modes(intrin);
    memory_sync_info sync(aco_storage_mode_from_nir_mem_mode(mem_mode));
 
-   store_vmem_mubuf(ctx, store_src, descriptor, v_offset, s_offset, const_offset, elem_size_bytes,
+   store_vmem_mubuf(ctx, store_src, descriptor, v_offset, s_offset, idx, const_offset, elem_size_bytes,
                     write_mask, swizzled, sync, glc, slc);
 }
 
@@ -11942,7 +11969,7 @@ select_gs_copy_shader(Program* program, struct nir_shader* gs_shader, ac_shader_
             Temp val = bld.tmp(v1);
             unsigned const_offset = offset * program->info.gs.vertices_out * 16 * 4;
 
-            load_vmem_mubuf(&ctx, val, gsvs_ring, vtx_offset, Temp(), const_offset, 4, 1, 0u, true,
+            load_vmem_mubuf(&ctx, val, gsvs_ring, vtx_offset, Temp(), Temp(), const_offset, 4, 1, 0, true,
                             true, memory_sync_info());
 
             ctx.outputs.mask[i] |= 1 << j;
diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c
index 90145131feb..7da1ff6ca32 100644
--- a/src/amd/llvm/ac_nir_to_llvm.c
+++ b/src/amd/llvm/ac_nir_to_llvm.c
@@ -4184,9 +4184,12 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
       /* Currently ignored. */
       break;
    case nir_intrinsic_load_buffer_amd: {
+      bool idxen = !nir_src_is_const(instr->src[3]) || nir_src_as_uint(instr->src[3]);
+
      LLVMValueRef descriptor = get_src(ctx, instr->src[0]);
      LLVMValueRef addr_voffset = get_src(ctx, instr->src[1]);
      LLVMValueRef addr_soffset = get_src(ctx, instr->src[2]);
+      LLVMValueRef vidx = idxen ? get_src(ctx, instr->src[3]) : NULL;
      unsigned num_components = instr->dest.ssa.num_components;
      unsigned const_offset = nir_intrinsic_base(instr);
      bool swizzled = nir_intrinsic_is_swizzled(instr);
@@ -4218,16 +4221,20 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
 
      LLVMValueRef voffset = LLVMBuildAdd(ctx->ac.builder, addr_voffset,
                                          LLVMConstInt(ctx->ac.i32, const_offset, 0), "");
-      result = ac_build_buffer_load(&ctx->ac, descriptor, num_components, NULL, voffset,
+      result = ac_build_buffer_load(&ctx->ac, descriptor, num_components, vidx, voffset,
                                    addr_soffset, channel_type, cache_policy, reorder, false);
+      result = ac_to_integer(&ctx->ac, ac_trim_vector(&ctx->ac, result, num_components));
      break;
   }
 
   case nir_intrinsic_store_buffer_amd: {
+      bool idxen = !nir_src_is_const(instr->src[4]) || nir_src_as_uint(instr->src[4]);
+
      LLVMValueRef store_data = get_src(ctx, instr->src[0]);
      LLVMValueRef descriptor = get_src(ctx, instr->src[1]);
      LLVMValueRef addr_voffset = get_src(ctx, instr->src[2]);
      LLVMValueRef addr_soffset = get_src(ctx, instr->src[3]);
+      LLVMValueRef vidx = idxen ? get_src(ctx, instr->src[4]) : NULL;
      unsigned const_offset = nir_intrinsic_base(instr);
      bool swizzled = nir_intrinsic_is_swizzled(instr);
      bool coherent = nir_intrinsic_access(instr) & ACCESS_COHERENT;
@@ -4251,7 +4258,7 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
                                             LLVMConstInt(ctx->ac.i32, const_offset + start * 4, 0), "");
 
         LLVMValueRef data = extract_vector_range(&ctx->ac, store_data, start, count);
-         ac_build_buffer_store_dword(&ctx->ac, descriptor, data, NULL, voffset, addr_soffset,
+         ac_build_buffer_store_dword(&ctx->ac, descriptor, data, vidx, voffset, addr_soffset,
                                     cache_policy);
      }
      break;
diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index 3452f43fd22..8f29e8a6ebc 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -1290,10 +1290,12 @@ store("tf_r600", [])
 
 # AMD GCN/RDNA specific intrinsics
 
-# src[] = { descriptor, base address, scalar offset }
-intrinsic("load_buffer_amd", src_comp=[4, 1, 1], dest_comp=0, indices=[BASE, IS_SWIZZLED, SLC_AMD, MEMORY_MODES, ACCESS], flags=[CAN_ELIMINATE])
-# src[] = { store value, descriptor, base address, scalar offset }
-intrinsic("store_buffer_amd", src_comp=[0, 4, 1, 1], indices=[BASE, WRITE_MASK, IS_SWIZZLED, SLC_AMD, MEMORY_MODES, ACCESS])
+# src[] = { descriptor, vector byte offset, scalar byte offset, index offset }
+# The index offset is multiplied by the stride in the descriptor. The vector/scalar byte offsets
+# are in bytes.
+intrinsic("load_buffer_amd", src_comp=[4, 1, 1, 1], dest_comp=0, indices=[BASE, IS_SWIZZLED, SLC_AMD, MEMORY_MODES, ACCESS], flags=[CAN_ELIMINATE])
+# src[] = { store value, descriptor, vector byte offset, scalar byte offset, index offset }
+intrinsic("store_buffer_amd", src_comp=[0, 4, 1, 1, 1], indices=[BASE, WRITE_MASK, IS_SWIZZLED, SLC_AMD, MEMORY_MODES, ACCESS])
 
 # src[] = { address, unsigned 32-bit offset }.
 load("global_amd", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE])
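Note on usage: after this change every caller of load/store_buffer_amd has to pass the new index source. Callers that address the buffer purely by byte offsets pass a constant zero, which the backends detect (idxen is only set when the index source is non-constant or non-zero); callers that want a structured access pass a real index that the hardware multiplies by the stride in the descriptor. The following is a minimal sketch, not part of this patch; the helper name example_load_vec4 and its parameters are hypothetical and only illustrate the updated builder signature.

/* Hypothetical helper (illustration only): emits the reworked load_buffer_amd
 * either with a per-vertex index (structured, idxen path in the backends) or
 * with a constant-zero index (raw byte addressing, old behavior). */
static nir_ssa_def *
example_load_vec4(nir_builder *b, nir_ssa_def *desc, nir_ssa_def *v_off,
                  nir_ssa_def *s_off, nir_ssa_def *vertex_index)
{
   if (vertex_index) {
      /* The hardware scales this index by the stride stored in the descriptor. */
      return nir_load_buffer_amd(b, 4, 32, desc, v_off, s_off, vertex_index,
                                 .memory_modes = nir_var_shader_in,
                                 .access = ACCESS_COHERENT);
   }

   /* Byte-offset-only access: pass a constant zero as the index source. */
   nir_ssa_def *zero = nir_imm_int(b, 0);
   return nir_load_buffer_amd(b, 4, 32, desc, v_off, s_off, zero,
                              .memory_modes = nir_var_shader_in,
                              .access = ACCESS_COHERENT);
}

On the backend side, a constant-zero index keeps the previous code paths (offen only), while a non-trivial index lands in the VADDR slot (idxen), combined with the byte offset through p_create_vector as in the ACO hunks above.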