diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index a0156990ab5..7400f42d533 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -11053,7 +11053,7 @@ static void write_tcs_tess_factors(isel_context *ctx) store_vmem_mubuf(ctx, tf_vec, hs_ring_tess_factor, byte_offset, tf_base, tf_const_offset, 4, (1 << stride) - 1, true, memory_sync_info()); /* Store to offchip for TES to read - only if TES reads them */ - if (ctx->args->options->key.tcs.tes_reads_tess_factors) { + if (ctx->args->shader_info->tcs.tes_reads_tess_factors) { Temp hs_ring_tess_offchip = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u)); Temp oc_lds = get_arg(ctx, ctx->args->ac.tess_offchip_offset); diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp index 869acb9da10..eb0add80925 100644 --- a/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -448,14 +448,14 @@ setup_tcs_info(isel_context *ctx, nir_shader *nir, nir_shader *vs) ctx->tcs_num_inputs = ctx->program->info->tcs.num_linked_inputs; ctx->tcs_num_outputs = ctx->program->info->tcs.num_linked_outputs; ctx->tcs_num_patch_outputs = ctx->program->info->tcs.num_linked_patch_outputs; - ctx->tcs_num_patches = ctx->args->shader_info->tcs.num_patches; + ctx->tcs_num_patches = ctx->args->shader_info->num_tess_patches; ctx->program->config->lds_size = ctx->args->shader_info->tcs.num_lds_blocks; } void setup_tes_variables(isel_context *ctx, nir_shader *nir) { - ctx->tcs_num_patches = ctx->args->options->key.tes.num_patches; + ctx->tcs_num_patches = ctx->args->shader_info->num_tess_patches; ctx->tcs_num_outputs = ctx->program->info->tes.num_linked_inputs; if (ctx->stage == tess_eval_vs || ctx->stage == tess_eval_ngg) { diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index 2f889e9eadb..06c6b85525b 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -3465,7 +3465,7 @@ write_tess_factors(struct radv_shader_context *ctx) 16 + tf_offset, ac_glc); //store to offchip for TES to read - only if TES reads them - if (ctx->args->options->key.tcs.tes_reads_tess_factors) { + if (ctx->args->shader_info->tcs.tes_reads_tess_factors) { LLVMValueRef inner_vec, outer_vec, tf_outer_offset; LLVMValueRef tf_inner_offset; @@ -3986,12 +3986,12 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, ctx.abi.load_patch_vertices_in = load_patch_vertices_in; ctx.abi.store_tcs_outputs = store_tcs_output; ctx.tcs_num_inputs = ctx.args->shader_info->tcs.num_linked_inputs; - ctx.tcs_num_patches = args->shader_info->tcs.num_patches; + ctx.tcs_num_patches = args->shader_info->num_tess_patches; } else if (shaders[shader_idx]->info.stage == MESA_SHADER_TESS_EVAL) { ctx.abi.load_tess_varyings = load_tes_input; ctx.abi.load_tess_coord = load_tess_coord; ctx.abi.load_patch_vertices_in = load_patch_vertices_in; - ctx.tcs_num_patches = args->options->key.tes.num_patches; + ctx.tcs_num_patches = args->shader_info->num_tess_patches; } else if (shaders[shader_idx]->info.stage == MESA_SHADER_VERTEX) { ctx.abi.load_base_vertex = radv_load_base_vertex; } else if (shaders[shader_idx]->info.stage == MESA_SHADER_FRAGMENT) { diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index 546d3805930..695908dd3d1 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -1451,7 +1451,7 @@ radv_compute_ia_multi_vgt_param_helpers(struct radv_pipeline *pipeline) const struct radv_device *device = pipeline->device; if (radv_pipeline_has_tess(pipeline)) - ia_multi_vgt_param.primgroup_size = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.tcs.num_patches; + ia_multi_vgt_param.primgroup_size = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.num_tess_patches; else if (radv_pipeline_has_gs(pipeline)) ia_multi_vgt_param.primgroup_size = 64; else @@ -2733,8 +2733,6 @@ radv_fill_shader_keys(struct radv_device *device, keys[MESA_SHADER_VERTEX].vs_common_out.as_ls = true; keys[MESA_SHADER_TESS_CTRL].tcs.input_vertices = key->tess_input_vertices; keys[MESA_SHADER_TESS_CTRL].tcs.primitive_mode = nir[MESA_SHADER_TESS_EVAL]->info.tess.primitive_mode; - - keys[MESA_SHADER_TESS_CTRL].tcs.tes_reads_tess_factors = !!(nir[MESA_SHADER_TESS_EVAL]->info.inputs_read & (VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER)); } if (nir[MESA_SHADER_GEOMETRY]) { @@ -2916,13 +2914,6 @@ radv_fill_shader_info(struct radv_pipeline *pipeline, filled_stages |= (1 << MESA_SHADER_FRAGMENT); } - if (nir[MESA_SHADER_TESS_CTRL]) { - infos[MESA_SHADER_TESS_CTRL].tcs.tes_inputs_read = - nir[MESA_SHADER_TESS_EVAL]->info.inputs_read; - infos[MESA_SHADER_TESS_CTRL].tcs.tes_patch_inputs_read = - nir[MESA_SHADER_TESS_EVAL]->info.patch_inputs_read; - } - if (pipeline->device->physical_device->rad_info.chip_class >= GFX9 && nir[MESA_SHADER_TESS_CTRL]) { struct nir_shader *combined_nir[] = {nir[MESA_SHADER_VERTEX], nir[MESA_SHADER_TESS_CTRL]}; @@ -2937,9 +2928,6 @@ radv_fill_shader_info(struct radv_pipeline *pipeline, &infos[MESA_SHADER_TESS_CTRL]); } - keys[MESA_SHADER_TESS_EVAL].tes.num_patches = - infos[MESA_SHADER_TESS_CTRL].tcs.num_patches; - filled_stages |= (1 << MESA_SHADER_VERTEX); filled_stages |= (1 << MESA_SHADER_TESS_CTRL); } @@ -2965,12 +2953,6 @@ radv_fill_shader_info(struct radv_pipeline *pipeline, active_stages ^= filled_stages; while (active_stages) { int i = u_bit_scan(&active_stages); - - if (i == MESA_SHADER_TESS_EVAL) { - keys[MESA_SHADER_TESS_EVAL].tes.num_patches = - infos[MESA_SHADER_TESS_CTRL].tcs.num_patches; - } - radv_nir_shader_info_init(&infos[i]); radv_nir_shader_info_pass(nir[i], pipeline->layout, &keys[i], &infos[i]); @@ -2991,7 +2973,7 @@ radv_fill_shader_info(struct radv_pipeline *pipeline, static void merge_tess_info(struct shader_info *tes_info, - const struct shader_info *tcs_info) + struct shader_info *tcs_info) { /* The Vulkan 1.0.38 spec, section 21.1 Tessellator says: * @@ -3026,6 +3008,81 @@ merge_tess_info(struct shader_info *tes_info, tes_info->tess.primitive_mode |= tcs_info->tess.primitive_mode; tes_info->tess.ccw |= tcs_info->tess.ccw; tes_info->tess.point_mode |= tcs_info->tess.point_mode; + + /* Copy the merged info back to the TCS */ + tcs_info->tess.tcs_vertices_out = tes_info->tess.tcs_vertices_out; + tcs_info->tess.spacing = tes_info->tess.spacing; + tcs_info->tess.primitive_mode = tes_info->tess.primitive_mode; + tcs_info->tess.ccw = tes_info->tess.ccw; + tcs_info->tess.point_mode = tes_info->tess.point_mode; +} + +static void +gather_tess_info(struct radv_device *device, + nir_shader **nir, struct radv_shader_info *infos, + const struct radv_pipeline_key *pipeline_key) +{ + merge_tess_info(&nir[MESA_SHADER_TESS_EVAL]->info, &nir[MESA_SHADER_TESS_CTRL]->info); + + /* Number of tessellation patches per workgroup processed by the current pipeline. */ + unsigned num_patches = + get_tcs_num_patches( + pipeline_key->tess_input_vertices, + nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_vertices_out, + infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_inputs, + infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_outputs, + infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_patch_outputs, + device->tess_offchip_block_dw_size, + device->physical_device->rad_info.chip_class, + device->physical_device->rad_info.family); + + /* LDS size used by VS+TCS for storing TCS inputs and outputs. */ + unsigned tcs_lds_size = + calculate_tess_lds_size( + device->physical_device->rad_info.chip_class, + pipeline_key->tess_input_vertices, + nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_vertices_out, + infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_inputs, + num_patches, + infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_outputs, + infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_patch_outputs); + + infos[MESA_SHADER_TESS_CTRL].num_tess_patches = num_patches; + infos[MESA_SHADER_TESS_CTRL].tcs.num_lds_blocks = tcs_lds_size; + infos[MESA_SHADER_TESS_CTRL].tcs.tes_reads_tess_factors = !!(nir[MESA_SHADER_TESS_EVAL]->info.inputs_read & (VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER)); + infos[MESA_SHADER_TESS_CTRL].tcs.tes_inputs_read = nir[MESA_SHADER_TESS_EVAL]->info.inputs_read; + infos[MESA_SHADER_TESS_CTRL].tcs.tes_patch_inputs_read = nir[MESA_SHADER_TESS_EVAL]->info.patch_inputs_read; + + infos[MESA_SHADER_TESS_EVAL].num_tess_patches = num_patches; + infos[MESA_SHADER_GEOMETRY].num_tess_patches = num_patches; + + if (!radv_use_llvm_for_stage(device, MESA_SHADER_VERTEX)) { + /* When the number of TCS input and output vertices are the same (typically 3): + * - There is an equal amount of LS and HS invocations + * - In case of merged LSHS shaders, the LS and HS halves of the shader + * always process the exact same vertex. We can use this knowledge to optimize them. + * + * We don't set tcs_in_out_eq if the float controls differ because that might + * involve different float modes for the same block and our optimizer + * doesn't handle a instruction dominating another with a different mode. + */ + infos[MESA_SHADER_VERTEX].vs.tcs_in_out_eq = + device->physical_device->rad_info.chip_class >= GFX9 && + pipeline_key->tess_input_vertices == nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_vertices_out && + nir[MESA_SHADER_VERTEX]->info.float_controls_execution_mode == nir[MESA_SHADER_TESS_CTRL]->info.float_controls_execution_mode; + + if (infos[MESA_SHADER_VERTEX].vs.tcs_in_out_eq) + infos[MESA_SHADER_VERTEX].vs.tcs_temp_only_input_mask = + nir[MESA_SHADER_TESS_CTRL]->info.inputs_read & + nir[MESA_SHADER_VERTEX]->info.outputs_written & + ~nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_cross_invocation_inputs_read & + ~nir[MESA_SHADER_TESS_CTRL]->info.inputs_read_indirectly & + ~nir[MESA_SHADER_VERTEX]->info.outputs_accessed_indirectly; + + /* Copy data to TCS so it can be accessed by the backend if they are merged. */ + infos[MESA_SHADER_TESS_CTRL].vs.tcs_in_out_eq = infos[MESA_SHADER_VERTEX].vs.tcs_in_out_eq; + infos[MESA_SHADER_TESS_CTRL].vs.tcs_temp_only_input_mask = infos[MESA_SHADER_VERTEX].vs.tcs_temp_only_input_mask; + } } static @@ -3314,11 +3371,6 @@ VkResult radv_create_shaders(struct radv_pipeline *pipeline, radv_stop_feedback(stage_feedbacks[i], false); } - if (nir[MESA_SHADER_TESS_CTRL]) { - nir_lower_patch_vertices(nir[MESA_SHADER_TESS_EVAL], nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_vertices_out, NULL); - merge_tess_info(&nir[MESA_SHADER_TESS_EVAL]->info, &nir[MESA_SHADER_TESS_CTRL]->info); - } - bool optimize_conservatively = flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT; radv_link_shaders(pipeline, nir, optimize_conservatively); @@ -3337,6 +3389,14 @@ VkResult radv_create_shaders(struct radv_pipeline *pipeline, } } + infos[MESA_SHADER_VERTEX].vs.as_ls = !!nir[MESA_SHADER_TESS_CTRL]; + infos[MESA_SHADER_VERTEX].vs.as_es = !!nir[MESA_SHADER_GEOMETRY] && !nir[MESA_SHADER_TESS_CTRL]; + infos[MESA_SHADER_TESS_EVAL].tes.as_es = !!nir[MESA_SHADER_GEOMETRY] && !!nir[MESA_SHADER_TESS_CTRL]; + + if (nir[MESA_SHADER_TESS_CTRL]) { + nir_lower_patch_vertices(nir[MESA_SHADER_TESS_EVAL], nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_vertices_out, NULL); + gather_tess_info(device, nir, infos, pipeline_key); + } for (int i = 0; i < MESA_SHADER_STAGES; ++i) { if (nir[i]) { @@ -3351,68 +3411,6 @@ VkResult radv_create_shaders(struct radv_pipeline *pipeline, } NIR_PASS_V(nir[i], nir_lower_memory_model); - if (i == MESA_SHADER_VERTEX) { - if (nir[MESA_SHADER_TESS_CTRL] && !radv_use_llvm_for_stage(device, i)) { - /* When the number of TCS input and output vertices are the same (typically 3): - * - There is an equal amount of LS and HS invocations - * - In case of merged LSHS shaders, the LS and HS halves of the shader - * always process the exact same vertex. We can use this knowledge to optimize them. - * - * We don't set tcs_in_out_eq if the float controls differ because that might - * involve different float modes for the same block and our optimizer - * doesn't handle a instruction dominating another with a different mode. - */ - infos[i].vs.tcs_in_out_eq = - device->physical_device->rad_info.chip_class >= GFX9 && - pipeline_key->tess_input_vertices == nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_vertices_out && - nir[MESA_SHADER_VERTEX]->info.float_controls_execution_mode == nir[MESA_SHADER_TESS_CTRL]->info.float_controls_execution_mode; - - if (infos[i].vs.tcs_in_out_eq) - infos[i].vs.tcs_temp_only_input_mask = - nir[MESA_SHADER_TESS_CTRL]->info.inputs_read & - nir[MESA_SHADER_VERTEX]->info.outputs_written & - ~nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_cross_invocation_inputs_read & - ~nir[MESA_SHADER_TESS_CTRL]->info.inputs_read_indirectly & - ~nir[MESA_SHADER_VERTEX]->info.outputs_accessed_indirectly; - - /* Copy data to TCS so it can be accessed by the backend if they are merged. */ - infos[MESA_SHADER_TESS_CTRL].vs.tcs_in_out_eq = infos[i].vs.tcs_in_out_eq; - infos[MESA_SHADER_TESS_CTRL].vs.tcs_temp_only_input_mask = infos[i].vs.tcs_temp_only_input_mask; - } - } else if (i == MESA_SHADER_TESS_CTRL) { - /* Copy correct primitive mode from TES info. */ - nir[i]->info.tess.primitive_mode = nir[MESA_SHADER_TESS_EVAL]->info.tess.primitive_mode; - - /* Number of tessellation patches processed per workgroup in the current pipeline. */ - unsigned tcs_num_patches = - get_tcs_num_patches( - pipeline_key->tess_input_vertices, - nir[i]->info.tess.tcs_vertices_out, - infos[i].tcs.num_linked_inputs, - infos[i].tcs.num_linked_outputs, - infos[i].tcs.num_linked_patch_outputs, - device->tess_offchip_block_dw_size, - device->physical_device->rad_info.chip_class, - device->physical_device->rad_info.family); - - /* LDS size used by VS+TCS for storing TCS inputs and outputs. */ - unsigned tcs_lds_size = - calculate_tess_lds_size( - device->physical_device->rad_info.chip_class, - pipeline_key->tess_input_vertices, - nir[i]->info.tess.tcs_vertices_out, - infos[i].tcs.num_linked_inputs, - tcs_num_patches, - infos[i].tcs.num_linked_outputs, - infos[i].tcs.num_linked_patch_outputs); - - infos[i].tcs.num_patches = tcs_num_patches; - infos[i].tcs.num_lds_blocks = tcs_lds_size; - } else if (i == MESA_SHADER_TESS_EVAL) { - /* Copy num_patches from TCS info. */ - keys[i].tes.num_patches = infos[MESA_SHADER_TESS_CTRL].tcs.num_patches; - } - bool lower_to_scalar = false; nir_load_store_vectorize_options vectorize_opts = { @@ -3613,7 +3611,6 @@ VkResult radv_create_shaders(struct radv_pipeline *pipeline, radv_stop_feedback(stage_feedbacks[MESA_SHADER_TESS_CTRL], false); } modules[MESA_SHADER_VERTEX] = NULL; - keys[MESA_SHADER_TESS_EVAL].tes.num_patches = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.tcs.num_patches; } if (device->physical_device->rad_info.chip_class >= GFX9 && modules[MESA_SHADER_GEOMETRY]) { @@ -3637,10 +3634,6 @@ VkResult radv_create_shaders(struct radv_pipeline *pipeline, for (int i = 0; i < MESA_SHADER_STAGES; ++i) { if(modules[i] && !pipeline->shaders[i]) { - if (i == MESA_SHADER_TESS_EVAL) { - keys[MESA_SHADER_TESS_EVAL].tes.num_patches = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.tcs.num_patches; - } - radv_start_feedback(stage_feedbacks[i]); pipeline->shaders[i] = radv_shader_variant_compile(device, modules[i], &nir[i], 1, @@ -4757,7 +4750,7 @@ radv_pipeline_generate_tess_state(struct radeon_cmdbuf *ctx_cs, num_tcs_input_cp = pCreateInfo->pTessellationState->patchControlPoints; num_tcs_output_cp = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.tcs.tcs_vertices_out; //TCS VERTICES OUT - num_patches = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.tcs.num_patches; + num_patches = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.num_tess_patches; ls_hs_config = S_028B58_NUM_PATCHES(num_patches) | S_028B58_HS_NUM_INPUT_CP(num_tcs_input_cp) | @@ -5254,7 +5247,7 @@ gfx10_pipeline_generate_ge_cntl(struct radeon_cmdbuf *ctx_cs, unsigned vertgroup_size = 256; /* 256 = disable vertex grouping */ if (radv_pipeline_has_tess(pipeline)) { - primgroup_size = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.tcs.num_patches; + primgroup_size = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.num_tess_patches; } else if (radv_pipeline_has_gs(pipeline)) { const struct gfx9_gs_info *gs_state = &pipeline->shaders[MESA_SHADER_GEOMETRY]->info.gs_ring_info; diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h index c0fd533f30b..a7ebdb82e74 100644 --- a/src/amd/vulkan/radv_shader.h +++ b/src/amd/vulkan/radv_shader.h @@ -79,15 +79,12 @@ struct radv_vs_variant_key { struct radv_tes_variant_key { struct radv_vs_out_key out; - - uint8_t num_patches; }; struct radv_tcs_variant_key { struct radv_vs_variant_key vs_key; unsigned primitive_mode; unsigned input_vertices; - uint32_t tes_reads_tess_factors:1; }; struct radv_fs_variant_key { @@ -259,6 +256,7 @@ struct radv_shader_info { bool need_indirect_descriptor_sets; bool is_ngg; bool is_ngg_passthrough; + uint32_t num_tess_patches; struct { uint8_t input_usage_mask[RADV_VERT_ATTRIB_MAX]; uint8_t output_usage_mask[VARYING_SLOT_VAR31 + 1]; @@ -337,11 +335,11 @@ struct radv_shader_info { uint64_t tes_inputs_read; uint64_t tes_patch_inputs_read; unsigned tcs_vertices_out; - uint32_t num_patches; uint32_t num_lds_blocks; uint8_t num_linked_inputs; uint8_t num_linked_outputs; uint8_t num_linked_patch_outputs; + bool tes_reads_tess_factors:1; } tcs; struct radv_streamout_info so;