diff --git a/docs/envvars.rst b/docs/envvars.rst
index 0a0a8e5877a..92e568715e2 100644
--- a/docs/envvars.rst
+++ b/docs/envvars.rst
@@ -670,6 +670,8 @@ RADV driver environment variables
       disable VRS for flat shading (only on GFX10.3+)
    ``preoptir``
       dump LLVM IR before any optimizations
+   ``prologs``
+      dump vertex shader prologs
    ``shaders``
       dump shaders
    ``shaderstats``
diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp
index 60499f2aa5e..b70dc530d08 100644
--- a/src/amd/compiler/aco_interface.cpp
+++ b/src/amd/compiler/aco_interface.cpp
@@ -263,3 +263,10 @@ aco_compile_shader(unsigned shader_count, struct nir_shader* const* shaders,
 
    *binary = (radv_shader_binary*)legacy_binary;
 }
+
+void
+aco_compile_vs_prolog(const struct radv_vs_prolog_key* key, struct radv_prolog_binary** binary,
+                      const struct radv_shader_args* args)
+{
+   unreachable("TODO");
+}
diff --git a/src/amd/compiler/aco_interface.h b/src/amd/compiler/aco_interface.h
index 1575cf59478..33d2762ba69 100644
--- a/src/amd/compiler/aco_interface.h
+++ b/src/amd/compiler/aco_interface.h
@@ -44,6 +44,9 @@ extern const struct aco_compiler_statistic_info* aco_statistic_infos;
 void aco_compile_shader(unsigned shader_count, struct nir_shader* const* shaders,
                         struct radv_shader_binary** binary, const struct radv_shader_args* args);
 
+void aco_compile_vs_prolog(const struct radv_vs_prolog_key* key, struct radv_prolog_binary** binary,
+                           const struct radv_shader_args* args);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 3ab71a8ccca..5998a527e4f 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -38,6 +38,7 @@
 
 struct radv_shader_args;
 struct radv_shader_info;
+struct radv_vs_prolog_key;
 
 namespace aco {
 
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 2353ab68f51..da3285874c2 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -37,6 +37,8 @@
 
 #include "ac_debug.h"
 
+#include "util/fast_idiv_by_const.h"
+
 enum {
    RADV_PREFETCH_VBO_DESCRIPTORS = (1 << 0),
    RADV_PREFETCH_VS = (1 << 1),
@@ -2647,8 +2649,300 @@ radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer)
       cmd_buffer->state.context_roll_without_scissor_emitted = true;
 }
 
+union vs_prolog_key_header {
+   struct {
+      uint32_t key_size : 8;
+      uint32_t num_attributes : 6;
+      uint32_t as_ls : 1;
+      uint32_t is_ngg : 1;
+      uint32_t wave32 : 1;
+      uint32_t next_stage : 3;
+      uint32_t instance_rate_inputs : 1;
+      uint32_t alpha_adjust_lo : 1;
+      uint32_t alpha_adjust_hi : 1;
+      uint32_t misaligned_mask : 1;
+      uint32_t post_shuffle : 1;
+      uint32_t nontrivial_divisors : 1;
+      /* We need this to ensure the padding bits are always zeroed.
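+       * The key is variable-sized: optional dwords (instance_rate_inputs,
+       * nontrivial_divisors, misaligned_mask plus the packed formats,
+       * post_shuffle and alpha_adjust_lo/hi) follow the header in the order of
+       * the flags above, so radv_hash_vs_prolog() and radv_cmp_vs_prolog() can
+       * treat the whole key as a flat blob of key_size bytes.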
+       */
+      uint32_t padding0 : 6;
+   };
+   uint32_t v;
+};
+
+uint32_t
+radv_hash_vs_prolog(const void *key_)
+{
+   const uint32_t *key = key_;
+   union vs_prolog_key_header header;
+   header.v = key[0];
+   return _mesa_hash_data(key, header.key_size);
+}
+
+bool
+radv_cmp_vs_prolog(const void *a_, const void *b_)
+{
+   const uint32_t *a = a_;
+   const uint32_t *b = b_;
+   if (a[0] != b[0])
+      return false;
+
+   union vs_prolog_key_header header;
+   header.v = a[0];
+   return memcmp(a, b, header.key_size) == 0;
+}
+
+static struct radv_shader_prolog *
+lookup_vs_prolog(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant *vs_shader,
+                 uint32_t *nontrivial_divisors)
+{
+   STATIC_ASSERT(sizeof(union vs_prolog_key_header) == 4);
+   assert(vs_shader->info.vs.dynamic_inputs);
+
+   struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;
+   struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
+   struct radv_device *device = cmd_buffer->device;
+   enum chip_class chip = device->physical_device->rad_info.chip_class;
+
+   unsigned num_attributes = util_last_bit(vs_shader->info.vs.vb_desc_usage_mask);
+   uint32_t attribute_mask = BITFIELD_MASK(num_attributes);
+
+   uint32_t instance_rate_inputs = state->instance_rate_inputs & attribute_mask;
+   *nontrivial_divisors = state->nontrivial_divisors & attribute_mask;
+   uint32_t misaligned_mask = 0;
+   if (chip == GFX6 || chip >= GFX10) {
+      u_foreach_bit(index, state->attribute_mask & attribute_mask)
+      {
+         uint8_t req = state->format_align_req_minus_1[index];
+         struct radv_vertex_binding *vb = &cmd_buffer->vertex_bindings[state->bindings[index]];
+         VkDeviceSize offset = vb->offset + state->offsets[index];
+         if (vb->buffer && ((offset & req) || (vb->stride & req)))
+            misaligned_mask |= 1u << index;
+      }
+   }
+
+   struct radv_vs_prolog_key key;
+   key.state = state;
+   key.num_attributes = num_attributes;
+   key.misaligned_mask = misaligned_mask;
+   /* The instance ID input VGPR is placed differently when as_ls=true.
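+    * Because of that, it only needs to affect the key when the prolog actually
+    * loads instance-rate attributes, so it is masked with instance_rate_inputs
+    * below.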
+    */
+   key.as_ls = vs_shader->info.vs.as_ls && instance_rate_inputs;
+   key.is_ngg = vs_shader->info.is_ngg;
+   key.wave32 = vs_shader->info.wave_size == 32;
+   key.next_stage = MESA_SHADER_VERTEX;
+   if (pipeline->shaders[MESA_SHADER_TESS_CTRL] == vs_shader)
+      key.next_stage = MESA_SHADER_TESS_CTRL;
+   else if (pipeline->shaders[MESA_SHADER_GEOMETRY] == vs_shader)
+      key.next_stage = MESA_SHADER_GEOMETRY;
+
+   uint32_t key_words[16];
+   unsigned key_size = 1;
+
+   union vs_prolog_key_header header;
+   header.v = 0;
+   header.num_attributes = num_attributes;
+   header.as_ls = key.as_ls;
+   header.is_ngg = key.is_ngg;
+   header.wave32 = key.wave32;
+   header.next_stage = key.next_stage;
+
+   if (instance_rate_inputs & ~*nontrivial_divisors) {
+      header.instance_rate_inputs = true;
+      key_words[key_size++] = instance_rate_inputs;
+   }
+   if (*nontrivial_divisors) {
+      header.nontrivial_divisors = true;
+      key_words[key_size++] = *nontrivial_divisors;
+   }
+   if (misaligned_mask) {
+      header.misaligned_mask = true;
+      key_words[key_size++] = misaligned_mask;
+
+      uint8_t *formats = (uint8_t *)&key_words[key_size];
+      unsigned num_formats = 0;
+      u_foreach_bit(index, misaligned_mask) formats[num_formats++] = state->formats[index];
+      while (num_formats & 0x3)
+         formats[num_formats++] = 0;
+      key_size += num_formats / 4u;
+
+      if (state->post_shuffle & attribute_mask) {
+         header.post_shuffle = true;
+         key_words[key_size++] = state->post_shuffle & attribute_mask;
+      }
+   }
+   if (state->alpha_adjust_lo & attribute_mask) {
+      header.alpha_adjust_lo = true;
+      key_words[key_size++] = state->alpha_adjust_lo & attribute_mask;
+   }
+   if (state->alpha_adjust_hi & attribute_mask) {
+      header.alpha_adjust_hi = true;
+      key_words[key_size++] = state->alpha_adjust_hi & attribute_mask;
+   }
+
+   header.key_size = key_size * sizeof(key_words[0]);
+   key_words[0] = header.v;
+
+   uint32_t hash = radv_hash_vs_prolog(key_words);
+
+   if (cmd_buffer->state.emitted_vs_prolog &&
+       cmd_buffer->state.emitted_vs_prolog_key_hash == hash &&
+       radv_cmp_vs_prolog(key_words, cmd_buffer->state.emitted_vs_prolog_key))
+      return cmd_buffer->state.emitted_vs_prolog;
+
+   u_rwlock_rdlock(&device->vs_prologs_lock);
+   struct hash_entry *prolog_entry =
+      _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, key_words);
+   u_rwlock_rdunlock(&device->vs_prologs_lock);
+
+   if (!prolog_entry) {
+      u_rwlock_wrlock(&device->vs_prologs_lock);
+      prolog_entry = _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, key_words);
+      if (prolog_entry) {
+         u_rwlock_wrunlock(&device->vs_prologs_lock);
+         return prolog_entry->data;
+      }
+
+      struct radv_shader_prolog *prolog = radv_create_vs_prolog(device, &key);
+      uint32_t *key2 = malloc(key_size * 4);
+      if (!prolog || !key2) {
+         free(key2);
+         u_rwlock_wrunlock(&device->vs_prologs_lock);
+         return NULL;
+      }
+      memcpy(key2, key_words, key_size * 4);
+      _mesa_hash_table_insert_pre_hashed(device->vs_prologs, hash, key2, prolog);
+
+      u_rwlock_wrunlock(&device->vs_prologs_lock);
+      return prolog;
+   }
+
+   return prolog_entry->data;
+}
+
 static void
-radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer)
+emit_prolog_regs(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant *vs_shader,
+                 struct radv_shader_prolog *prolog, bool pipeline_is_dirty)
+{
+   /* no need to re-emit anything in this case */
+   if (cmd_buffer->state.emitted_vs_prolog == prolog && !pipeline_is_dirty)
+      return;
+
+   enum chip_class chip = cmd_buffer->device->physical_device->rad_info.chip_class;
+   struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
+   uint64_t prolog_va = radv_buffer_get_va(prolog->bo) + prolog->alloc->offset;
+
+   assert(cmd_buffer->state.emitted_pipeline == cmd_buffer->state.pipeline);
+   assert(vs_shader->info.num_input_sgprs <= prolog->num_preserved_sgprs);
+
+   uint32_t rsrc1 = vs_shader->config.rsrc1;
+   if (chip < GFX10 && G_00B228_SGPRS(prolog->rsrc1) > G_00B228_SGPRS(vs_shader->config.rsrc1))
+      rsrc1 = (rsrc1 & C_00B228_SGPRS) | (prolog->rsrc1 & ~C_00B228_SGPRS);
+
+   /* The main shader must not use fewer VGPRs than the prolog; otherwise shared VGPRs might not
+    * work.
+    */
+   assert(G_00B848_VGPRS(vs_shader->config.rsrc1) >= G_00B848_VGPRS(prolog->rsrc1));
+
+   unsigned pgm_lo_reg = R_00B120_SPI_SHADER_PGM_LO_VS;
+   unsigned rsrc1_reg = R_00B128_SPI_SHADER_PGM_RSRC1_VS;
+   if (vs_shader->info.is_ngg || pipeline->shaders[MESA_SHADER_GEOMETRY] == vs_shader) {
+      pgm_lo_reg = chip >= GFX10 ? R_00B320_SPI_SHADER_PGM_LO_ES : R_00B210_SPI_SHADER_PGM_LO_ES;
+      rsrc1_reg = R_00B228_SPI_SHADER_PGM_RSRC1_GS;
+   } else if (pipeline->shaders[MESA_SHADER_TESS_CTRL] == vs_shader) {
+      pgm_lo_reg = chip >= GFX10 ? R_00B520_SPI_SHADER_PGM_LO_LS : R_00B410_SPI_SHADER_PGM_LO_LS;
+      rsrc1_reg = R_00B428_SPI_SHADER_PGM_RSRC1_HS;
+   } else if (vs_shader->info.vs.as_ls) {
+      pgm_lo_reg = R_00B520_SPI_SHADER_PGM_LO_LS;
+      rsrc1_reg = R_00B528_SPI_SHADER_PGM_RSRC1_LS;
+   } else if (vs_shader->info.vs.as_es) {
+      pgm_lo_reg = R_00B320_SPI_SHADER_PGM_LO_ES;
+      rsrc1_reg = R_00B328_SPI_SHADER_PGM_RSRC1_ES;
+   }
+
+   radeon_set_sh_reg_seq(cmd_buffer->cs, pgm_lo_reg, 2);
+   radeon_emit(cmd_buffer->cs, prolog_va >> 8);
+   radeon_emit(cmd_buffer->cs, S_00B124_MEM_BASE(prolog_va >> 40));
+
+   if (chip < GFX10)
+      radeon_set_sh_reg(cmd_buffer->cs, rsrc1_reg, rsrc1);
+   else
+      assert(rsrc1 == vs_shader->config.rsrc1);
+
+   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, prolog->bo);
+}
+
+static void
+emit_prolog_inputs(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant *vs_shader,
+                   uint32_t nontrivial_divisors, bool pipeline_is_dirty)
+{
+   /* no need to re-emit anything in this case */
+   if (!nontrivial_divisors && !pipeline_is_dirty)
+      return;
+
+   struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;
+   uint64_t input_va = radv_shader_variant_get_va(vs_shader);
+
+   if (nontrivial_divisors) {
+      unsigned inputs_offset;
+      uint32_t *inputs;
+      unsigned size = 8 + util_bitcount(nontrivial_divisors) * 8;
+      if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, &inputs_offset, (void **)&inputs))
+         return;
+
+      *(inputs++) = input_va;
+      *(inputs++) = input_va >> 32;
+
+      u_foreach_bit(index, nontrivial_divisors)
+      {
+         uint32_t div = state->divisors[index];
+         if (div == 0) {
+            *(inputs++) = 0;
+            *(inputs++) = 1;
+         } else if (util_is_power_of_two_or_zero(div)) {
+            *(inputs++) = util_logbase2(div) | (1 << 8);
+            *(inputs++) = 0xffffffffu;
+         } else {
+            struct util_fast_udiv_info info = util_compute_fast_udiv_info(div, 32, 32);
+            *(inputs++) = info.pre_shift | (info.increment << 8) | (info.post_shift << 16);
+            *(inputs++) = info.multiplier;
+         }
+      }
+
+      input_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + inputs_offset;
+   }
+
+   struct radv_userdata_info *loc =
+      &vs_shader->info.user_sgprs_locs.shader_data[AC_UD_VS_PROLOG_INPUTS];
+   uint32_t base_reg = cmd_buffer->state.pipeline->user_data_0[MESA_SHADER_VERTEX];
+   assert(loc->sgpr_idx != -1);
+   assert(loc->num_sgprs == 2);
+   radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4,
+                            input_va, true);
+}
+
+static void
+radv_emit_vertex_state(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
+{
+   struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
+   struct radv_shader_variant *vs_shader = radv_get_shader(pipeline, MESA_SHADER_VERTEX);
+
+   if (!vs_shader->info.vs.has_prolog)
+      return;
+
+   uint32_t nontrivial_divisors;
+   struct radv_shader_prolog *prolog =
+      lookup_vs_prolog(cmd_buffer, vs_shader, &nontrivial_divisors);
+   if (!prolog) {
+      cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
+      return;
+   }
+   emit_prolog_regs(cmd_buffer, vs_shader, prolog, pipeline_is_dirty);
+   emit_prolog_inputs(cmd_buffer, vs_shader, nontrivial_divisors, pipeline_is_dirty);
+
+   cmd_buffer->state.emitted_vs_prolog = prolog;
+}
+
+static void
+radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
 {
    uint64_t states =
       cmd_buffer->state.dirty & cmd_buffer->state.emitted_pipeline->graphics.needed_dynamic_state;
@@ -2717,6 +3011,9 @@ radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer)
    if (states & RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE)
       radv_emit_color_write_enable(cmd_buffer);
 
+   if (states & RADV_CMD_DIRTY_VERTEX_STATE)
+      radv_emit_vertex_state(cmd_buffer, pipeline_is_dirty);
+
    cmd_buffer->state.dirty &= ~states;
 }
 
@@ -2923,33 +3220,105 @@ radv_flush_constants(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stag
    cmd_buffer->push_constant_stages |= dirty_stages;
 }
 
+enum radv_dst_sel {
+   DST_SEL_0001 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_0) |
+                  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
+   DST_SEL_X001 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_0) |
+                  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
+   DST_SEL_XY01 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+                  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
+   DST_SEL_XYZ1 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+                  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
+   DST_SEL_XYZW = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+                  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W),
+   DST_SEL_ZYXW = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+                  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W),
+};
+
+static const uint32_t data_format_dst_sel[] = {
+   [V_008F0C_BUF_DATA_FORMAT_INVALID] = DST_SEL_0001,
+   [V_008F0C_BUF_DATA_FORMAT_8] = DST_SEL_X001,
+   [V_008F0C_BUF_DATA_FORMAT_16] = DST_SEL_X001,
+   [V_008F0C_BUF_DATA_FORMAT_8_8] = DST_SEL_XY01,
+   [V_008F0C_BUF_DATA_FORMAT_32] = DST_SEL_X001,
+   [V_008F0C_BUF_DATA_FORMAT_16_16] = DST_SEL_XY01,
+   [V_008F0C_BUF_DATA_FORMAT_10_11_11] = DST_SEL_XYZ1,
+   [V_008F0C_BUF_DATA_FORMAT_11_11_10] = DST_SEL_XYZ1,
+   [V_008F0C_BUF_DATA_FORMAT_10_10_10_2] = DST_SEL_XYZW,
+   [V_008F0C_BUF_DATA_FORMAT_2_10_10_10] = DST_SEL_XYZW,
+   [V_008F0C_BUF_DATA_FORMAT_8_8_8_8] = DST_SEL_XYZW,
+   [V_008F0C_BUF_DATA_FORMAT_32_32] = DST_SEL_XY01,
+   [V_008F0C_BUF_DATA_FORMAT_16_16_16_16] = DST_SEL_XYZW,
+   [V_008F0C_BUF_DATA_FORMAT_32_32_32] = DST_SEL_XYZ1,
+   [V_008F0C_BUF_DATA_FORMAT_32_32_32_32] = DST_SEL_XYZW,
+};
+
 static void
 radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
 {
    if ((pipeline_is_dirty || (cmd_buffer->state.dirty & RADV_CMD_DIRTY_VERTEX_BUFFER)) &&
        cmd_buffer->state.pipeline->vb_desc_usage_mask) {
       struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
+      struct radv_shader_variant *vs_shader = radv_get_shader(pipeline, MESA_SHADER_VERTEX);
+      enum chip_class chip = cmd_buffer->device->physical_device->rad_info.chip_class;
       unsigned vb_offset;
       void *vb_ptr;
       unsigned desc_index = 0;
       uint32_t mask = pipeline->vb_desc_usage_mask;
       uint64_t va;
+      struct radv_vs_input_state *vs_state =
+         vs_shader->info.vs.dynamic_inputs ? &cmd_buffer->state.dynamic_vs_input : NULL;
 
       /* allocate some descriptor state for vertex buffers */
       if (!radv_cmd_buffer_upload_alloc(cmd_buffer, pipeline->vb_desc_alloc_size, &vb_offset,
                                         &vb_ptr))
          return;
 
+      assert(!vs_state || pipeline->use_per_attribute_vb_descs);
+
       while (mask) {
          unsigned i = u_bit_scan(&mask);
          uint32_t *desc = &((uint32_t *)vb_ptr)[desc_index++ * 4];
-         uint32_t offset;
-         unsigned binding = pipeline->use_per_attribute_vb_descs ? pipeline->attrib_bindings[i] : i;
+         uint32_t offset, rsrc_word3;
+         unsigned binding =
+            vs_state ? cmd_buffer->state.dynamic_vs_input.bindings[i]
+                     : (pipeline->use_per_attribute_vb_descs ? pipeline->attrib_bindings[i] : i);
         struct radv_buffer *buffer = cmd_buffer->vertex_bindings[binding].buffer;
         unsigned num_records;
         unsigned stride;
 
+         if (vs_state) {
+            unsigned format = vs_state->formats[i];
+            unsigned dfmt = format & 0xf;
+            unsigned nfmt = (format >> 4) & 0x7;
+
+            rsrc_word3 =
+               vs_state->post_shuffle & (1u << i) ? DST_SEL_ZYXW : data_format_dst_sel[dfmt];
+
+            if (chip >= GFX10)
+               rsrc_word3 |= S_008F0C_FORMAT(ac_get_tbuffer_format(chip, dfmt, nfmt));
+            else
+               rsrc_word3 |= S_008F0C_NUM_FORMAT(nfmt) | S_008F0C_DATA_FORMAT(dfmt);
+         } else {
+            if (chip >= GFX10)
+               rsrc_word3 = DST_SEL_XYZW | S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_UINT);
+            else
+               rsrc_word3 = DST_SEL_XYZW | S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
+                            S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+         }
+
         if (!buffer) {
-            memset(desc, 0, 4 * 4);
+            if (vs_state) {
+               /* Stride needs to be non-zero on GFX9, or else bounds checking is disabled. We need
+                * to include the format/word3 so that the alpha channel is 1 for formats without an
+                * alpha channel.
+                */
+               desc[0] = 0;
+               desc[1] = S_008F04_STRIDE(16);
+               desc[2] = 0;
+               desc[3] = rsrc_word3;
+            } else {
+               memset(desc, 0, 4 * 4);
+            }
            continue;
         }
 
@@ -2957,6 +3326,8 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_
         offset = cmd_buffer->vertex_bindings[binding].offset;
         va += offset + buffer->offset;
+         if (vs_state)
+            va += vs_state->offsets[i];
 
         if (cmd_buffer->vertex_bindings[binding].size) {
            num_records = cmd_buffer->vertex_bindings[binding].size;
@@ -2970,9 +3341,9 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_
            stride = pipeline->binding_stride[binding];
         }
 
-         enum chip_class chip = cmd_buffer->device->physical_device->rad_info.chip_class;
         if (pipeline->use_per_attribute_vb_descs) {
-            uint32_t attrib_end = pipeline->attrib_ends[i];
+            uint32_t attrib_end = vs_state ? vs_state->offsets[i] + vs_state->format_sizes[i]
+                                           : pipeline->attrib_ends[i];
 
            if (num_records < attrib_end) {
               num_records = 0; /* not enough space for one vertex */
@@ -2997,7 +3368,14 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_
               * num_records and stride are zero. This doesn't seem necessary on GFX8, GFX10 and
               * GFX10.3 but it doesn't hurt.
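+              * With dynamic vertex input we instead write the same kind of null descriptor as
+              * the !buffer case above: a non-zero stride keeps GFX9's bounds checking enabled
+              * and rsrc_word3 keeps the alpha channel reading 1 for alpha-less formats.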
              */
-            memset(desc, 0, 16);
+            if (vs_state) {
+               desc[0] = 0;
+               desc[1] = S_008F04_STRIDE(16);
+               desc[2] = 0;
+               desc[3] = rsrc_word3;
+            } else {
+               memset(desc, 0, 16);
+            }
            continue;
           }
        } else {
@@ -3005,22 +3383,13 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_
            num_records = DIV_ROUND_UP(num_records, stride);
         }
 
-         uint32_t rsrc_word3 =
-            S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
-            S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
-
         if (chip >= GFX10) {
            /* OOB_SELECT chooses the out-of-bounds check:
             * - 1: index >= NUM_RECORDS (Structured)
             * - 3: offset >= NUM_RECORDS (Raw)
             */
            int oob_select = stride ? V_008F0C_OOB_SELECT_STRUCTURED : V_008F0C_OOB_SELECT_RAW;
-
-            rsrc_word3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_UINT) |
-                          S_008F0C_OOB_SELECT(oob_select) | S_008F0C_RESOURCE_LEVEL(1);
-         } else {
-            rsrc_word3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
-                          S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+            rsrc_word3 |= S_008F0C_OOB_SELECT(oob_select) | S_008F0C_RESOURCE_LEVEL(1);
         }
 
         desc[0] = va;
@@ -4009,7 +4378,7 @@ radv_CmdBindVertexBuffers2EXT(VkCommandBuffer commandBuffer, uint32_t firstBindi
       return;
    }
 
-   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER;
+   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_STATE;
 }
 
 static uint32_t
@@ -4397,7 +4766,7 @@ radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipeline
      if (!pipeline)
         break;
 
-      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PIPELINE;
+      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PIPELINE | RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT;
      cmd_buffer->push_constant_stages |= pipeline->active_stages;
 
      /* the new vertex shader might not have the same user regs */
@@ -5712,7 +6081,7 @@ radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer,
    /* Index, vertex and streamout buffers don't change context regs, and
     * pipeline is already handled.
     */
-   used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_VERTEX_BUFFER |
+   used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_VERTEX_STATE |
                     RADV_CMD_DIRTY_STREAMOUT_BUFFER | RADV_CMD_DIRTY_PIPELINE);
 
    if (cmd_buffer->state.dirty & used_states)
@@ -5918,7 +6287,8 @@ radv_emit_ngg_culling_state(struct radv_cmd_buffer *cmd_buffer, const struct rad
 }
 
 static void
-radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info)
+radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
+                              bool pipeline_is_dirty)
 {
    bool late_scissor_emission;
 
@@ -5955,7 +6325,7 @@ radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, const struct r
      }
    }
 
-   radv_cmd_buffer_flush_dynamic_state(cmd_buffer);
+   radv_cmd_buffer_flush_dynamic_state(cmd_buffer, pipeline_is_dirty);
 
    radv_emit_draw_registers(cmd_buffer, info);
 
@@ -6004,7 +6374,7 @@ radv_before_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info
       * the CUs are idle is very short.
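+      * That also covers the VS prolog registers, since emit_prolog_regs() and
+      * emit_prolog_inputs() only add SET_SH updates of their own.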
      * (there are only SET_SH packets between the wait and the draw)
      */
-      radv_emit_all_graphics_states(cmd_buffer, info);
+      radv_emit_all_graphics_states(cmd_buffer, info, pipeline_is_dirty);
      si_emit_cache_flush(cmd_buffer);
      /* <-- CUs are idle here --> */
@@ -6024,7 +6394,7 @@ radv_before_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info
 
      radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty);
 
-      radv_emit_all_graphics_states(cmd_buffer, info);
+      radv_emit_all_graphics_states(cmd_buffer, info, pipeline_is_dirty);
    }
 
    radv_describe_draw(cmd_buffer);
diff --git a/src/amd/vulkan/radv_debug.h b/src/amd/vulkan/radv_debug.h
index 0bfdd4889ce..5c0dd14220e 100644
--- a/src/amd/vulkan/radv_debug.h
+++ b/src/amd/vulkan/radv_debug.h
@@ -63,6 +63,7 @@ enum {
    RADV_DEBUG_NO_VRS_FLAT_SHADING = 1ull << 32,
    RADV_DEBUG_NO_ATOC_DITHERING = 1ull << 33,
    RADV_DEBUG_NO_NGGC = 1ull << 34,
+   RADV_DEBUG_DUMP_PROLOGS = 1ull << 35,
 };
 
 enum {
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index 6cc96e00d9b..a866812f9fd 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -853,6 +853,7 @@ static const struct debug_control radv_debug_options[] = {
    {"novrsflatshading", RADV_DEBUG_NO_VRS_FLAT_SHADING},
    {"noatocdithering", RADV_DEBUG_NO_ATOC_DITHERING},
    {"nonggc", RADV_DEBUG_NO_NGGC},
+   {"prologs", RADV_DEBUG_DUMP_PROLOGS},
    {NULL, 0}};
 
 const char *
@@ -2666,6 +2667,30 @@ radv_device_finish_border_color(struct radv_device *device)
    }
 }
 
+static VkResult
+radv_device_init_vs_prologs(struct radv_device *device)
+{
+   u_rwlock_init(&device->vs_prologs_lock);
+   device->vs_prologs = _mesa_hash_table_create(NULL, &radv_hash_vs_prolog, &radv_cmp_vs_prolog);
+   if (!device->vs_prologs)
+      return vk_error(device->physical_device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   return VK_SUCCESS;
+}
+
+static void
+radv_device_finish_vs_prologs(struct radv_device *device)
+{
+   if (device->vs_prologs) {
+      hash_table_foreach(device->vs_prologs, entry)
+      {
+         free((void *)entry->key);
+         radv_prolog_destroy(device, entry->data);
+      }
+      _mesa_hash_table_destroy(device->vs_prologs, NULL);
+   }
+}
+
 VkResult
 radv_device_init_vrs_state(struct radv_device *device)
 {
@@ -2799,6 +2824,7 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr
    bool custom_border_colors = false;
    bool attachment_vrs_enabled = false;
    bool image_float32_atomics = false;
+   bool vs_prologs = false;
 
    /* Check enabled features */
    if (pCreateInfo->pEnabledFeatures) {
@@ -3090,6 +3116,12 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr
         goto fail;
    }
 
+   if (vs_prologs) {
+      result = radv_device_init_vs_prologs(device);
+      if (result != VK_SUCCESS)
+         goto fail;
+   }
+
    for (int family = 0; family < RADV_MAX_QUEUE_FAMILIES; ++family) {
      device->empty_cs[family] = device->ws->cs_create(device->ws, family);
      if (!device->empty_cs[family])
@@ -3156,6 +3188,7 @@ fail:
    if (device->gfx_init)
      device->ws->buffer_destroy(device->ws, device->gfx_init);
 
+   radv_device_finish_vs_prologs(device);
    radv_device_finish_border_color(device);
 
    for (unsigned i = 0; i < RADV_MAX_QUEUE_FAMILIES; i++) {
@@ -3186,6 +3219,7 @@ radv_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
    if (device->gfx_init)
      device->ws->buffer_destroy(device->ws, device->gfx_init);
 
+   radv_device_finish_vs_prologs(device);
    radv_device_finish_border_color(device);
    radv_device_finish_vrs_image(device);
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index a266fd01dfc..d18943b0ecb 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -2742,8 +2742,8 @@ radv_determine_ngg_settings(struct radv_pipeline *pipeline,
         : nir[es_stage]->info.tess.primitive_mode == GL_ISOLINES ? 2
                                                                  : 3;
 
-   infos[es_stage].has_ngg_culling =
-      radv_consider_culling(device, nir[es_stage], ps_inputs_read, num_vertices_per_prim);
+   infos[es_stage].has_ngg_culling = radv_consider_culling(
+      device, nir[es_stage], ps_inputs_read, num_vertices_per_prim, &infos[es_stage]);
 
    nir_function_impl *impl = nir_shader_get_entrypoint(nir[es_stage]);
    infos[es_stage].has_ngg_early_prim_export = exec_list_is_singular(&impl->body);
@@ -5386,7 +5386,10 @@ radv_pipeline_init_vertex_input_state(struct radv_pipeline *pipeline,
    }
 
    pipeline->use_per_attribute_vb_descs = info->vs.use_per_attribute_vb_descs;
-   pipeline->vb_desc_usage_mask = info->vs.vb_desc_usage_mask;
+   if (info->vs.dynamic_inputs)
+      pipeline->vb_desc_usage_mask = BITFIELD_MASK(util_last_bit(info->vs.vb_desc_usage_mask));
+   else
+      pipeline->vb_desc_usage_mask = info->vs.vb_desc_usage_mask;
    pipeline->vb_desc_alloc_size = util_bitcount(pipeline->vb_desc_usage_mask) * 16;
 }
 
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index c65acb1cd64..60ea3b3c2aa 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -832,6 +832,9 @@ struct radv_device {
      struct radv_buffer *buffer; /* HTILE */
      struct radv_device_memory *mem;
    } vrs;
+
+   struct u_rwlock vs_prologs_lock;
+   struct hash_table *vs_prologs;
 };
 
 VkResult _radv_device_set_lost(struct radv_device *device, const char *file, int line,
@@ -997,7 +1000,8 @@ enum radv_dynamic_state_bits {
    RADV_DYNAMIC_LOGIC_OP = 1ull << 26,
    RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE = 1ull << 27,
    RADV_DYNAMIC_COLOR_WRITE_ENABLE = 1ull << 28,
-   RADV_DYNAMIC_ALL = (1ull << 29) - 1,
+   RADV_DYNAMIC_VERTEX_INPUT = 1ull << 29,
+   RADV_DYNAMIC_ALL = (1ull << 30) - 1,
 };
 
 enum radv_cmd_dirty_bits {
@@ -1032,12 +1036,14 @@ enum radv_cmd_dirty_bits {
    RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP = 1ull << 26,
    RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE = 1ull << 27,
    RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE = 1ull << 28,
-   RADV_CMD_DIRTY_DYNAMIC_ALL = (1ull << 29) - 1,
-   RADV_CMD_DIRTY_PIPELINE = 1ull << 29,
-   RADV_CMD_DIRTY_INDEX_BUFFER = 1ull << 30,
-   RADV_CMD_DIRTY_FRAMEBUFFER = 1ull << 31,
-   RADV_CMD_DIRTY_VERTEX_BUFFER = 1ull << 32,
-   RADV_CMD_DIRTY_STREAMOUT_BUFFER = 1ull << 33
+   RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT = 1ull << 29,
+   RADV_CMD_DIRTY_DYNAMIC_ALL = (1ull << 30) - 1,
+   RADV_CMD_DIRTY_PIPELINE = 1ull << 30,
+   RADV_CMD_DIRTY_INDEX_BUFFER = 1ull << 31,
+   RADV_CMD_DIRTY_FRAMEBUFFER = 1ull << 32,
+   RADV_CMD_DIRTY_VERTEX_BUFFER = 1ull << 33,
+   RADV_CMD_DIRTY_STREAMOUT_BUFFER = 1ull << 34,
+   RADV_CMD_DIRTY_VERTEX_STATE = RADV_CMD_DIRTY_VERTEX_BUFFER | RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT,
 };
 
 enum radv_cmd_flush_bits {
@@ -1349,6 +1355,7 @@ struct radv_cmd_state {
    struct radv_render_pass *pass;
    const struct radv_subpass *subpass;
    struct radv_dynamic_state dynamic;
+   struct radv_vs_input_state dynamic_vs_input;
    struct radv_attachment_state *attachments;
    struct radv_streamout_state streamout;
    VkRect2D render_area;
@@ -1414,6 +1421,10 @@ struct radv_cmd_state {
    bool uses_draw_indirect_multi;
 
    uint32_t rt_stack_size;
+
+   struct radv_shader_prolog *emitted_vs_prolog;
+   uint32_t *emitted_vs_prolog_key;
+   uint32_t emitted_vs_prolog_key_hash;
 };
 
 struct radv_cmd_pool {
@@ -1531,6 +1542,10 @@ void si_cp_dma_clear_buffer(struct radv_cmd_buffer *cmd_buffer, uint64_t va, uin
 void si_cp_dma_wait_for_idle(struct radv_cmd_buffer *cmd_buffer);
 
 void radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer);
+
+uint32_t radv_hash_vs_prolog(const void *key_);
+bool radv_cmp_vs_prolog(const void *a_, const void *b_);
+
 bool radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer *cmd_buffer, unsigned size,
                                   unsigned *out_offset, void **ptr);
 void radv_cmd_buffer_set_subpass(struct radv_cmd_buffer *cmd_buffer,
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index defa4298a0d..e610b92b0cf 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -888,8 +888,8 @@ radv_lower_io_to_mem(struct radv_device *device, struct nir_shader *nir,
 }
 
 bool
-radv_consider_culling(struct radv_device *device, struct nir_shader *nir,
-                      uint64_t ps_inputs_read, unsigned num_vertices_per_primitive)
+radv_consider_culling(struct radv_device *device, struct nir_shader *nir, uint64_t ps_inputs_read,
+                      unsigned num_vertices_per_primitive, const struct radv_shader_info *info)
 {
    /* Culling doesn't make sense for meta shaders. */
    if (!!nir->info.name)
@@ -899,6 +899,10 @@ radv_consider_culling(struct radv_device *device, struct nir_shader *nir,
    if (nir->info.outputs_written & (VARYING_BIT_VIEWPORT | VARYING_BIT_VIEWPORT_MASK))
      return false;
 
+   /* We don't support culling with vertex shader prologs. */
+   if (info->vs.has_prolog)
+      return false;
+
    if (!device->physical_device->use_ngg_culling)
      return false;
 
@@ -1910,6 +1914,72 @@ radv_create_trap_handler_shader(struct radv_device *device)
    return shader;
 }
 
+static struct radv_shader_prolog *
+upload_vs_prolog(struct radv_device *device, struct radv_prolog_binary *bin, unsigned wave_size)
+{
+   struct radv_shader_prolog *prolog = malloc(sizeof(struct radv_shader_prolog));
+   if (!prolog)
+      return NULL;
+
+   prolog->alloc = alloc_shader_memory(device, bin->code_size, NULL);
+   if (!prolog->alloc) {
+      free(prolog);
+      return NULL;
+   }
+
+   prolog->bo = prolog->alloc->arena->bo;
+   char *dest_ptr = prolog->alloc->arena->ptr + prolog->alloc->offset;
+
+   memcpy(dest_ptr, bin->data, bin->code_size);
+
+   prolog->rsrc1 = S_00B848_VGPRS((bin->num_vgprs - 1) / (wave_size == 32 ? 8 : 4)) |
+                   S_00B228_SGPRS((bin->num_sgprs - 1) / 8);
+   prolog->num_preserved_sgprs = bin->num_preserved_sgprs;
+
+   return prolog;
+}
+
+struct radv_shader_prolog *
+radv_create_vs_prolog(struct radv_device *device, const struct radv_vs_prolog_key *key)
+{
+   struct radv_nir_compiler_options options = {0};
+   options.explicit_scratch_args = true;
+   options.family = device->physical_device->rad_info.family;
+   options.chip_class = device->physical_device->rad_info.chip_class;
+   options.info = &device->physical_device->rad_info;
+   options.address32_hi = device->physical_device->rad_info.address32_hi;
+   options.dump_shader = device->instance->debug_flags & RADV_DEBUG_DUMP_PROLOGS;
+
+   struct radv_shader_info info = {0};
+   info.wave_size = key->wave32 ? 32 : 64;
+   info.vs.needs_instance_id = true;
+   info.vs.needs_base_instance = true;
+   info.vs.needs_draw_id = true;
+   info.vs.use_per_attribute_vb_descs = true;
+   info.vs.vb_desc_usage_mask = BITFIELD_MASK(key->num_attributes);
+   info.vs.has_prolog = true;
+   info.vs.as_ls = key->as_ls;
+   info.is_ngg = key->is_ngg;
+
+   struct radv_shader_args args = {0};
+   args.options = &options;
+   args.shader_info = &info;
+   radv_declare_shader_args(&args, key->next_stage, key->next_stage != MESA_SHADER_VERTEX,
+                            MESA_SHADER_VERTEX);
+
+#ifdef LLVM_AVAILABLE
+   if (options.dump_shader)
+      ac_init_llvm_once();
+#endif
+
+   struct radv_prolog_binary *binary = NULL;
+   aco_compile_vs_prolog(key, &binary, &args);
+   struct radv_shader_prolog *prolog = upload_vs_prolog(device, binary, info.wave_size);
+   free(binary);
+
+   return prolog;
+}
+
 void
 radv_shader_variant_destroy(struct radv_device *device, struct radv_shader_variant *variant)
 {
@@ -1926,6 +1996,16 @@ radv_shader_variant_destroy(struct radv_device *device, struct radv_shader_varia
    free(variant);
 }
 
+void
+radv_prolog_destroy(struct radv_device *device, struct radv_shader_prolog *prolog)
+{
+   if (!prolog)
+      return;
+
+   free_shader_memory(device, prolog->alloc);
+   free(prolog);
+}
+
 uint64_t
 radv_shader_variant_get_va(const struct radv_shader_variant *variant)
 {
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
index c7fc550f49b..105ccfd2d05 100644
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -46,6 +46,7 @@ struct radv_device;
 struct radv_pipeline;
 struct radv_pipeline_cache;
 struct radv_pipeline_key;
+struct radv_vs_input_state;
 
 enum radv_vs_input_alpha_adjust {
    ALPHA_ADJUST_NONE = 0,
@@ -71,6 +72,7 @@ struct radv_pipeline_key {
      enum radv_vs_input_alpha_adjust vertex_alpha_adjust[MAX_VERTEX_ATTRIBS];
      uint32_t vertex_post_shuffle;
      uint32_t provoking_vtx_last : 1;
+      uint32_t dynamic_input_state : 1;
      uint8_t topology;
    } vs;
 
@@ -145,6 +147,7 @@ enum radv_ud_index {
    AC_UD_SHADER_START = 9,
    AC_UD_VS_VERTEX_BUFFERS = AC_UD_SHADER_START,
    AC_UD_VS_BASE_VERTEX_START_INSTANCE,
+   AC_UD_VS_PROLOG_INPUTS,
    AC_UD_VS_MAX_UD,
    AC_UD_PS_MAX_UD,
    AC_UD_CS_GRID_SIZE = AC_UD_SHADER_START,
@@ -259,6 +262,8 @@ struct radv_shader_info {
      bool needs_base_instance;
      bool use_per_attribute_vb_descs;
      uint32_t vb_desc_usage_mask;
+      bool has_prolog;
+      bool dynamic_inputs;
    } vs;
    struct {
      uint8_t output_usage_mask[VARYING_SLOT_VAR31 + 1];
@@ -353,6 +358,37 @@ struct radv_shader_info {
    struct gfx10_ngg_info ngg_info;
 };
 
+struct radv_vs_input_state {
+   uint32_t attribute_mask;
+   uint8_t bindings[MAX_VERTEX_ATTRIBS];
+
+   uint32_t instance_rate_inputs;
+   uint32_t nontrivial_divisors;
+   uint32_t divisors[MAX_VERTEX_ATTRIBS];
+
+   uint32_t offsets[MAX_VERTEX_ATTRIBS];
+
+   uint32_t post_shuffle;
+   /* Having two separate fields instead of a single uint64_t makes it easier to remove attributes
+    * using bitwise arithmetic.
+    */
+   uint32_t alpha_adjust_lo;
+   uint32_t alpha_adjust_hi;
+   uint8_t formats[MAX_VERTEX_ATTRIBS];
+   uint8_t format_align_req_minus_1[MAX_VERTEX_ATTRIBS];
+   uint8_t format_sizes[MAX_VERTEX_ATTRIBS];
+};
+
+struct radv_vs_prolog_key {
+   struct radv_vs_input_state *state;
+   unsigned num_attributes;
+   uint32_t misaligned_mask;
+   bool as_ls;
+   bool is_ngg;
+   bool wave32;
+   gl_shader_stage next_stage;
+};
+
 enum radv_shader_binary_type { RADV_BINARY_TYPE_LEGACY, RADV_BINARY_TYPE_RTLD };
 
 struct radv_shader_binary {
@@ -387,6 +423,14 @@ struct radv_shader_binary_rtld {
    uint8_t data[0];
 };
 
+struct radv_prolog_binary {
+   uint8_t num_sgprs;
+   uint8_t num_vgprs;
+   uint8_t num_preserved_sgprs;
+   unsigned code_size;
+   uint8_t data[0];
+};
+
 struct radv_shader_arena {
    struct list_head list;
    struct list_head entries;
@@ -429,6 +473,13 @@ struct radv_shader_variant {
    uint32_t *statistics;
 };
 
+struct radv_shader_prolog {
+   struct radeon_winsys_bo *bo;
+   union radv_shader_arena_block *alloc;
+   uint32_t rsrc1;
+   uint8_t num_preserved_sgprs;
+};
+
 void radv_optimize_nir(const struct radv_device *device, struct nir_shader *shader,
                        bool optimize_conservatively, bool allow_copies);
 void radv_optimize_nir_algebraic(nir_shader *shader, bool opt_offsets);
@@ -469,8 +520,13 @@ radv_create_gs_copy_shader(struct radv_device *device, struct nir_shader *nir,
 
 struct radv_shader_variant *radv_create_trap_handler_shader(struct radv_device *device);
 
+struct radv_shader_prolog *radv_create_vs_prolog(struct radv_device *device,
+                                                 const struct radv_vs_prolog_key *key);
+
 void radv_shader_variant_destroy(struct radv_device *device, struct radv_shader_variant *variant);
 
+void radv_prolog_destroy(struct radv_device *device, struct radv_shader_prolog *prolog);
+
 uint64_t radv_shader_variant_get_va(const struct radv_shader_variant *variant);
 struct radv_shader_variant *radv_find_shader_variant(struct radv_device *device, uint64_t pc);
 
@@ -577,7 +633,8 @@ void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
                     const struct radv_pipeline_key *pl_key);
 
 bool radv_consider_culling(struct radv_device *device, struct nir_shader *nir,
-                           uint64_t ps_inputs_read, unsigned num_vertices_per_primitive);
+                           uint64_t ps_inputs_read, unsigned num_vertices_per_primitive,
+                           const struct radv_shader_info *info);
 
 void radv_get_nir_options(struct radv_physical_device *device);
 
diff --git a/src/amd/vulkan/radv_shader_args.c b/src/amd/vulkan/radv_shader_args.c
index a1ff13fe217..e2f1b4a0600 100644
--- a/src/amd/vulkan/radv_shader_args.c
+++ b/src/amd/vulkan/radv_shader_args.c
@@ -184,6 +184,10 @@ allocate_user_sgprs(struct radv_shader_args *args, gl_shader_stage stage, bool h
    /* 2 user sgprs will always be allocated for scratch/rings */
    user_sgpr_count += 2;
 
+   /* prolog inputs */
+   if (args->shader_info->vs.has_prolog)
+      user_sgpr_count += 2;
+
    switch (stage) {
    case MESA_SHADER_COMPUTE:
      if (args->shader_info->cs.uses_sbt)
@@ -281,6 +285,9 @@ static void
 declare_vs_specific_input_sgprs(struct radv_shader_args *args, gl_shader_stage stage,
                                 bool has_previous_stage, gl_shader_stage previous_stage)
 {
+   if (args->shader_info->vs.has_prolog)
+      ac_add_arg(&args->ac, AC_ARG_SGPR, 2, AC_ARG_INT, &args->prolog_inputs);
+
    if (!args->is_gs_copy_shader &&
       (stage == MESA_SHADER_VERTEX ||
        (has_previous_stage && previous_stage == MESA_SHADER_VERTEX))) {
      if (args->shader_info->vs.vb_desc_usage_mask) {
@@ -328,6 +335,17 @@ declare_vs_input_vgprs(struct radv_shader_args *args)
        }
      }
    }
+
+   if (args->shader_info->vs.dynamic_inputs) {
+      assert(args->shader_info->vs.use_per_attribute_vb_descs);
+      unsigned num_attributes = util_last_bit(args->shader_info->vs.vb_desc_usage_mask);
+      for (unsigned i = 0; i < num_attributes; i++)
+         ac_add_arg(&args->ac, AC_ARG_VGPR, 4, AC_ARG_INT, &args->vs_inputs[i]);
+      /* Ensure the main shader doesn't use fewer VGPRs than the prolog. The prolog requires one
+       * VGPR more than the number of shader arguments in the case of non-trivial divisors on GFX8.
+       */
+      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
+   }
 }
 
 static void
@@ -463,6 +481,9 @@ set_vs_specific_input_locs(struct radv_shader_args *args, gl_shader_stage stage,
                            bool has_previous_stage, gl_shader_stage previous_stage,
                            uint8_t *user_sgpr_idx)
 {
+   if (args->prolog_inputs.used)
+      set_loc_shader(args, AC_UD_VS_PROLOG_INPUTS, user_sgpr_idx, 2);
+
    if (!args->is_gs_copy_shader &&
       (stage == MESA_SHADER_VERTEX ||
        (has_previous_stage && previous_stage == MESA_SHADER_VERTEX))) {
      if (args->ac.vertex_buffers.used) {
diff --git a/src/amd/vulkan/radv_shader_args.h b/src/amd/vulkan/radv_shader_args.h
index a7c13152fcb..a6d4b8b7be7 100644
--- a/src/amd/vulkan/radv_shader_args.h
+++ b/src/amd/vulkan/radv_shader_args.h
@@ -45,6 +45,9 @@ struct radv_shader_args {
    struct ac_arg ngg_viewport_scale[2];
    struct ac_arg ngg_viewport_translate[2];
 
+   struct ac_arg prolog_inputs;
+   struct ac_arg vs_inputs[MAX_VERTEX_ATTRIBS];
+
    bool is_gs_copy_shader;
    bool is_trap_handler_shader;
 };
diff --git a/src/amd/vulkan/radv_shader_info.c b/src/amd/vulkan/radv_shader_info.c
index 10069a4cc42..40042e3b4fe 100644
--- a/src/amd/vulkan/radv_shader_info.c
+++ b/src/amd/vulkan/radv_shader_info.c
@@ -608,12 +608,23 @@ radv_nir_shader_info_pass(struct radv_device *device, const struct nir_shader *n
    }
 
    if (nir->info.stage == MESA_SHADER_VERTEX) {
+      if (pipeline_key->vs.dynamic_input_state && nir->info.inputs_read) {
+         info->vs.has_prolog = true;
+         info->vs.dynamic_inputs = true;
+      }
+
      /* Use per-attribute vertex descriptors to prevent faults and
       * for correct bounds checking.
       */
-      info->vs.use_per_attribute_vb_descs = device->robust_buffer_access;
+      info->vs.use_per_attribute_vb_descs = device->robust_buffer_access || info->vs.dynamic_inputs;
    }
 
+   /* We have to ensure consistent input register assignments between the main shader and the
+    * prolog. */
+   info->vs.needs_instance_id |= info->vs.has_prolog;
+   info->vs.needs_base_instance |= info->vs.has_prolog;
+   info->vs.needs_draw_id |= info->vs.has_prolog;
+
    nir_foreach_shader_in_variable (variable, nir)
      gather_info_input_decl(nir, variable, pipeline_key, info);
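
A note for reviewers on the divisor encoding used by emit_prolog_inputs(): each attribute with a
nontrivial divisor is uploaded as two dwords, (pre_shift | increment << 8 | post_shift << 16) and
the multiplier. The C sketch below shows one way such a pair can be evaluated. The math mirrors
util_fast_udiv32() from util/fast_idiv_by_const.h, but the helper itself is hypothetical: the real
decode will be emitted by aco_compile_vs_prolog(), which is still a stub in this patch, and the
assumption that the base instance is simply added after the division is mine, not part of the diff.

   #include <stdint.h>

   /* Hypothetical helper (not part of this patch): compute the fetch index of
    * an instance-rate attribute from the two dwords that emit_prolog_inputs()
    * uploads per attribute. Same math as util_fast_udiv32().
    */
   static uint32_t
   prolog_instance_index(uint32_t instance_id, uint32_t base_instance,
                         uint32_t word0, uint32_t multiplier)
   {
      uint32_t pre_shift = word0 & 0xff;
      uint32_t increment = (word0 >> 8) & 0xff;
      uint32_t post_shift = (word0 >> 16) & 0xff;

      /* divisor == 0 is encoded as {0, 1}: the high 32 bits of n * 1 are
       * always 0, so every instance fetches element base_instance.
       * A power-of-two divisor is encoded as {log2(div) | 1 << 8, ~0u}:
       * ((n + 1) * 0xffffffff) >> 32 == n, so only the pre-shift remains.
       */
      uint32_t n = instance_id >> pre_shift;
      n = (uint32_t)((((uint64_t)n + increment) * multiplier) >> 32);
      return base_instance + (n >> post_shift);
   }

For example, util_compute_fast_udiv_info(3, 32, 32) yields roughly {pre_shift = 0, increment = 0,
post_shift = 1, multiplier = 0xaaaaaaab}, which maps instance IDs 0..5 to indices 0,0,0,1,1,1
relative to base_instance.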