From 96df4499ac159d55101fda867aa0fb2effe3dc16 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Tue, 2 Aug 2022 15:10:50 +0100 Subject: [PATCH] radv,aco: implement 64-bit vertex inputs Note that, from 22.4.1. Vertex Input Extraction of Vulkan spec: The input variable in the shader must be declared as a 64-bit data type if and only if format is a 64-bit data type. Signed-off-by: Rhys Perry Reviewed-by: Samuel Pitoiset Part-of: --- .../compiler/aco_instruction_selection.cpp | 46 +++++++++++--- src/amd/compiler/aco_shader_info.h | 1 + src/amd/llvm/ac_nir_to_llvm.c | 7 ++- src/amd/vulkan/radv_aco_shader_info.h | 1 + src/amd/vulkan/radv_nir_to_llvm.c | 21 ++++--- src/amd/vulkan/radv_shader.c | 10 ++- src/amd/vulkan/radv_shader.h | 1 + src/amd/vulkan/radv_shader_args.c | 2 +- src/amd/vulkan/radv_shader_info.c | 61 +++++++++++++------ 9 files changed, 108 insertions(+), 42 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 135e63a077b..0d8ee6e5641 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -5450,17 +5450,23 @@ visit_load_input(isel_context* ctx, nir_intrinsic_instr* instr) "Unimplemented non-zero nir_intrinsic_load_input offset"); unsigned location = nir_intrinsic_base(instr) - VERT_ATTRIB_GENERIC0; - unsigned component = nir_intrinsic_component(instr); unsigned bitsize = instr->dest.ssa.bit_size; + unsigned component = nir_intrinsic_component(instr) >> (bitsize == 64 ? 1 : 0); unsigned num_components = instr->dest.ssa.num_components; - Temp input = get_arg(ctx, ctx->args->vs_inputs[location]); - aco_ptr vec{create_instruction( aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)}; std::array elems; for (unsigned i = 0; i < num_components; i++) { - elems[i] = emit_extract_vector(ctx, input, component + i, bitsize == 64 ? v2 : v1); + if (bitsize == 64) { + Temp input = get_arg(ctx, ctx->args->vs_inputs[location + (component + i) / 2]); + elems[i] = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), + emit_extract_vector(ctx, input, (component + i) * 2 % 4, v1), + emit_extract_vector(ctx, input, (component + i) * 2 % 4 + 1, v1)); + } else { + Temp input = get_arg(ctx, ctx->args->vs_inputs[location]); + elems[i] = emit_extract_vector(ctx, input, component + i, v1); + } if (bitsize == 16) { if (nir_alu_type_get_base_type(nir_intrinsic_dest_type(instr)) == nir_type_float) elems[i] = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), elems[i]); @@ -5483,8 +5489,8 @@ visit_load_input(isel_context* ctx, nir_intrinsic_instr* instr) convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.vertex_buffers)); unsigned location = nir_intrinsic_base(instr) - VERT_ATTRIB_GENERIC0; - unsigned component = nir_intrinsic_component(instr); unsigned bitsize = instr->dest.ssa.bit_size; + unsigned component = nir_intrinsic_component(instr) >> (bitsize == 64 ? 1 : 0); unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location]; uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location]; uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location]; @@ -5639,8 +5645,8 @@ visit_load_input(isel_context* ctx, nir_intrinsic_instr* instr) channels[channel_start] = fetch_dst; } else { for (unsigned i = 0; i < MIN2(fetch_component, num_channels - channel_start); i++) - channels[channel_start + i] = - emit_extract_vector(ctx, fetch_dst, i, bitsize == 16 ? v2b : v1); + channels[channel_start + i] = emit_extract_vector( + ctx, fetch_dst, i, RegClass::get(RegType::vgpr, bitsize / 8u)); } channel_start += fetch_component; @@ -5664,6 +5670,12 @@ visit_load_input(isel_context* ctx, nir_intrinsic_instr* instr) num_temp++; elems[i] = channel; + } else if (bitsize == 64) { + /* 22.1.1. Attribute Location and Component Assignment of Vulkan 1.3 specification: + * For 64-bit data types, no default attribute values are provided. Input variables + * must not use more components than provided by the attribute. + */ + vec->operands[i] = Operand(v2); } else if (is_float && idx == 3) { vec->operands[i] = bitsize == 16 ? Operand::c16(0x3c00u) : Operand::c32(0x3f800000u); } else if (!is_float && idx == 3) { @@ -11477,7 +11489,7 @@ add_startpgm(struct isel_context* ctx) } if (ctx->stage.has(SWStage::VS) && ctx->program->info.vs.dynamic_inputs) { - unsigned num_attributes = util_last_bit(ctx->program->info.vs.vb_desc_usage_mask); + unsigned num_attributes = util_last_bit(ctx->program->info.vs.input_slot_usage_mask); for (unsigned i = 0; i < num_attributes; i++) { Definition def(get_arg(ctx, ctx->args->vs_inputs[i])); @@ -12262,7 +12274,7 @@ select_vs_prolog(Program* program, const struct aco_vs_prolog_key* key, ac_shade bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(program->gfx_level)); - for (unsigned i = 0; i < num_descs; i++, loc++) { + for (unsigned i = 0; i < num_descs;) { PhysReg dest(attributes_start.reg() + loc * 4u); /* calculate index */ @@ -12307,6 +12319,10 @@ select_vs_prolog(Program* program, const struct aco_vs_prolog_key* key, ac_shade bld.mubuf(aco_opcode::buffer_load_dword, Definition(dest.advance(j * 4u), v1), Operand(cur_desc, s4), fetch_index, Operand::c32(0u), offset, false, false, true); + else if (vtx_info->chan_byte_size == 8) + bld.mtbuf(aco_opcode::tbuffer_load_format_xy, + Definition(dest.advance(j * 8u), v2), Operand(cur_desc, s4), + fetch_index, Operand::c32(0u), dfmt, nfmt, offset, false, true); else bld.mtbuf(aco_opcode::tbuffer_load_format_x, Definition(dest.advance(j * 4u), v1), Operand(cur_desc, s4), fetch_index, Operand::c32(0u), dfmt, nfmt, @@ -12316,13 +12332,23 @@ select_vs_prolog(Program* program, const struct aco_vs_prolog_key* key, ac_shade nfmt == V_008F0C_BUF_NUM_FORMAT_UINT || nfmt == V_008F0C_BUF_NUM_FORMAT_SINT ? 1u : 0x3f800000u; - for (unsigned j = vtx_info->num_channels; j < 4; j++) { + /* 22.1.1. Attribute Location and Component Assignment of Vulkan 1.3 specification: + * For 64-bit data types, no default attribute values are provided. Input variables must + * not use more components than provided by the attribute. + */ + for (unsigned j = vtx_info->num_channels; vtx_info->chan_byte_size != 8 && j < 4; j++) { bld.vop1(aco_opcode::v_mov_b32, Definition(dest.advance(j * 4u), v1), Operand::c32(j == 3 ? one : 0u)); } + + unsigned slots = vtx_info->chan_byte_size == 8 && vtx_info->num_channels > 2 ? 2 : 1; + loc += slots; + i += slots; } else { bld.mubuf(aco_opcode::buffer_load_format_xyzw, Definition(dest, v4), Operand(cur_desc, s4), fetch_index, Operand::c32(0u), 0u, false, false, true); + loc++; + i++; } } } diff --git a/src/amd/compiler/aco_shader_info.h b/src/amd/compiler/aco_shader_info.h index 57182cc67f1..973624a231c 100644 --- a/src/amd/compiler/aco_shader_info.h +++ b/src/amd/compiler/aco_shader_info.h @@ -115,6 +115,7 @@ struct aco_shader_info { uint64_t tcs_temp_only_input_mask; bool use_per_attribute_vb_descs; uint32_t vb_desc_usage_mask; + uint32_t input_slot_usage_mask; bool has_prolog; bool dynamic_inputs; } vs; diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c index 3b7d9713cf1..08e3e897987 100644 --- a/src/amd/llvm/ac_nir_to_llvm.c +++ b/src/amd/llvm/ac_nir_to_llvm.c @@ -3405,8 +3405,11 @@ static LLVMValueRef visit_load(struct ac_nir_context *ctx, nir_intrinsic_instr * case 32: break; case 64: - unreachable("64-bit IO should have been lowered"); - return NULL; + if (ctx->stage != MESA_SHADER_VERTEX || is_output) { + unreachable("64-bit IO should have been lowered"); + return NULL; + } + break; default: unreachable("unhandled load type"); return NULL; diff --git a/src/amd/vulkan/radv_aco_shader_info.h b/src/amd/vulkan/radv_aco_shader_info.h index a9fcd4b934a..909be9abffe 100644 --- a/src/amd/vulkan/radv_aco_shader_info.h +++ b/src/amd/vulkan/radv_aco_shader_info.h @@ -82,6 +82,7 @@ radv_aco_convert_shader_info(struct aco_shader_info *aco_info, ASSIGN_FIELD(vs.tcs_temp_only_input_mask); ASSIGN_FIELD(vs.use_per_attribute_vb_descs); ASSIGN_FIELD(vs.vb_desc_usage_mask); + ASSIGN_FIELD(vs.input_slot_usage_mask); ASSIGN_FIELD(vs.has_prolog); ASSIGN_FIELD(vs.dynamic_inputs); ASSIGN_FIELD_CP(gs.output_usage_mask); diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index a79c25e5e13..a5fd3f3eef5 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -355,10 +355,10 @@ radv_get_sampler_desc(struct ac_shader_abi *abi, unsigned descriptor_set, unsign static LLVMValueRef radv_fixup_vertex_input_fetches(struct radv_shader_context *ctx, LLVMValueRef value, - unsigned num_channels, bool is_float) + unsigned num_channels, bool is_float, bool is_64bit) { - LLVMValueRef zero = is_float ? ctx->ac.f32_0 : ctx->ac.i32_0; - LLVMValueRef one = is_float ? ctx->ac.f32_1 : ctx->ac.i32_1; + LLVMValueRef zero = is_64bit ? ctx->ac.i64_0 : (is_float ? ctx->ac.f32_0 : ctx->ac.i32_0); + LLVMValueRef one = is_64bit ? ctx->ac.i64_0 : (is_float ? ctx->ac.f32_1 : ctx->ac.i32_1); LLVMValueRef chan[4]; if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) { @@ -446,8 +446,10 @@ load_vs_input(struct radv_shader_context *ctx, unsigned driver_location, LLVMTyp * dynamic) is unaligned and also if the VBO offset is aligned to a scalar (eg. stride is 8 and * VBO offset is 2 for R16G16B16A16_SNORM). */ + unsigned chan_dwords = vtx_info->chan_byte_size == 8 ? 2 : 1; if (((ctx->ac.gfx_level == GFX6 || ctx->ac.gfx_level >= GFX10) && vtx_info->chan_byte_size) || - !(vtx_info->has_hw_format & BITFIELD_BIT(vtx_info->num_channels - 1))) { + !(vtx_info->has_hw_format & BITFIELD_BIT(vtx_info->num_channels - 1)) || + vtx_info->element_size > 16) { unsigned chan_format = vtx_info->hw_format[0] & 0xf; LLVMValueRef values[4]; @@ -466,7 +468,7 @@ load_vs_input(struct radv_shader_context *ctx, unsigned driver_location, LLVMTyp values[chan] = ac_build_struct_tbuffer_load( &ctx->ac, t_list, chan_index, LLVMConstInt(ctx->ac.i32, chan_offset, false), - ctx->ac.i32_0, 1, chan_format, num_format, 0, true); + ctx->ac.i32_0, chan_dwords, chan_format, num_format, 0, true); } input = ac_build_gather_values(&ctx->ac, values, num_channels); @@ -482,10 +484,15 @@ load_vs_input(struct radv_shader_context *ctx, unsigned driver_location, LLVMTyp input = ac_build_struct_tbuffer_load( &ctx->ac, t_list, buffer_index, LLVMConstInt(ctx->ac.i32, attrib_offset, false), - ctx->ac.i32_0, num_channels, data_format, num_format, 0, true); + ctx->ac.i32_0, num_channels * chan_dwords, data_format, num_format, 0, true); } - input = radv_fixup_vertex_input_fetches(ctx, input, num_channels, is_float); + if (vtx_info->chan_byte_size == 8) + input = + LLVMBuildBitCast(ctx->ac.builder, input, LLVMVectorType(ctx->ac.i64, num_channels), ""); + + input = radv_fixup_vertex_input_fetches(ctx, input, num_channels, is_float, + vtx_info->chan_byte_size == 8); for (unsigned chan = 0; chan < 4; chan++) { LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false); diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index c2a17155d16..1653bae61fe 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -1143,8 +1143,14 @@ radv_lower_io(struct radv_device *device, nir_shader *nir) nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, MESA_SHADER_FRAGMENT); } - NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out, type_size_vec4, - nir_lower_io_lower_64bit_to_32); + if (nir->info.stage == MESA_SHADER_VERTEX) { + NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in, type_size_vec4, 0); + NIR_PASS(_, nir, nir_lower_io, nir_var_shader_out, type_size_vec4, + nir_lower_io_lower_64bit_to_32); + } else { + NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out, type_size_vec4, + nir_lower_io_lower_64bit_to_32); + } /* This pass needs actual constants */ NIR_PASS(_, nir, nir_opt_constant_folding); diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h index 137048e0c76..d5c083ec096 100644 --- a/src/amd/vulkan/radv_shader.h +++ b/src/amd/vulkan/radv_shader.h @@ -260,6 +260,7 @@ struct radv_shader_info { bool needs_base_instance; bool use_per_attribute_vb_descs; uint32_t vb_desc_usage_mask; + uint32_t input_slot_usage_mask; bool has_prolog; bool dynamic_inputs; } vs; diff --git a/src/amd/vulkan/radv_shader_args.c b/src/amd/vulkan/radv_shader_args.c index b04cca3e630..4dfdedcbca6 100644 --- a/src/amd/vulkan/radv_shader_args.c +++ b/src/amd/vulkan/radv_shader_args.c @@ -340,7 +340,7 @@ declare_vs_input_vgprs(enum amd_gfx_level gfx_level, const struct radv_shader_in if (info->vs.dynamic_inputs) { assert(info->vs.use_per_attribute_vb_descs); - unsigned num_attributes = util_last_bit(info->vs.vb_desc_usage_mask); + unsigned num_attributes = util_last_bit(info->vs.input_slot_usage_mask); for (unsigned i = 0; i < num_attributes; i++) ac_add_arg(&args->ac, AC_ARG_VGPR, 4, AC_ARG_INT, &args->vs_inputs[i]); /* Ensure the main shader doesn't use less vgprs than the prolog. The prolog requires one diff --git a/src/amd/vulkan/radv_shader_info.c b/src/amd/vulkan/radv_shader_info.c index 537d2ce8439..4b3535a134a 100644 --- a/src/amd/vulkan/radv_shader_info.c +++ b/src/amd/vulkan/radv_shader_info.c @@ -42,8 +42,11 @@ gather_intrinsic_load_input_info(const nir_shader *nir, const nir_intrinsic_inst unsigned idx = nir_intrinsic_io_semantics(instr).location; unsigned component = nir_intrinsic_component(instr); unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa); + mask = (instr->dest.ssa.bit_size == 64 ? util_widen_mask(mask, 2) : mask) << component; - info->vs.input_usage_mask[idx] |= mask << component; + info->vs.input_usage_mask[idx] |= mask & 0xf; + if (mask >> 4) + info->vs.input_usage_mask[idx + 1] |= mask >> 4; break; } default: @@ -312,6 +315,40 @@ assign_outinfo_params(struct radv_vs_output_info *outinfo, uint64_t mask, } } +static void +gather_info_input_decl_vs(const nir_shader *nir, unsigned location, const struct glsl_type *type, + const struct radv_pipeline_key *key, struct radv_shader_info *info) +{ + if (glsl_type_is_scalar(type) || glsl_type_is_vector(type)) { + if (key->vs.instance_rate_inputs & BITFIELD_BIT(location)) { + info->vs.needs_instance_id = true; + info->vs.needs_base_instance = true; + } + + if (info->vs.use_per_attribute_vb_descs) + info->vs.vb_desc_usage_mask |= BITFIELD_BIT(location); + else + info->vs.vb_desc_usage_mask |= BITFIELD_BIT(key->vs.vertex_attribute_bindings[location]); + + info->vs.input_slot_usage_mask |= + BITFIELD_RANGE(location, glsl_count_attribute_slots(type, false)); + } else if (glsl_type_is_matrix(type) || glsl_type_is_array(type)) { + const struct glsl_type *elem = glsl_get_array_element(type); + unsigned stride = glsl_count_attribute_slots(elem, false); + + for (unsigned i = 0; i < glsl_get_length(type); ++i) + gather_info_input_decl_vs(nir, location + i * stride, elem, key, info); + } else { + assert(glsl_type_is_struct_or_ifc(type)); + + for (unsigned i = 0; i < glsl_get_length(type); i++) { + const struct glsl_type *field = glsl_get_struct_field(type, i); + gather_info_input_decl_vs(nir, location, field, key, info); + location += glsl_count_attribute_slots(field, false); + } + } +} + static void gather_shader_info_vs(struct radv_device *device, const nir_shader *nir, const struct radv_pipeline_key *pipeline_key, struct radv_shader_info *info) @@ -331,25 +368,9 @@ gather_shader_info_vs(struct radv_device *device, const nir_shader *nir, info->vs.needs_base_instance |= info->vs.has_prolog; info->vs.needs_draw_id |= info->vs.has_prolog; - nir_foreach_shader_in_variable(var, nir) { - unsigned attrib_count = glsl_count_attribute_slots(var->type, true); - - for (unsigned i = 0; i < attrib_count; ++i) { - unsigned attrib_index = var->data.location + i - VERT_ATTRIB_GENERIC0; - - if (pipeline_key->vs.instance_rate_inputs & (1u << attrib_index)) { - info->vs.needs_instance_id = true; - info->vs.needs_base_instance = true; - } - - if (info->vs.use_per_attribute_vb_descs) { - info->vs.vb_desc_usage_mask |= 1u << attrib_index; - } else { - info->vs.vb_desc_usage_mask |= - 1u << pipeline_key->vs.vertex_attribute_bindings[attrib_index]; - } - } - } + nir_foreach_shader_in_variable(var, nir) + gather_info_input_decl_vs(nir, var->data.location - VERT_ATTRIB_GENERIC0, var->type, + pipeline_key, info); } static void