diff --git a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c index 3d42621f8fd..fbd0cd00ccd 100644 --- a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c +++ b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c @@ -54,7 +54,9 @@ static nir_def *build_attr_ring_desc(nir_builder *b, struct si_shader *shader, b->shader->info.vs.blit_sgprs_amd - 1) : ac_nir_load_arg(b, &args->ac, args->gs_attr_address); - unsigned stride = 16 * si_shader_num_alloc_param_exports(shader); + unsigned per_vertex_params = MAX2(1, si_shader_num_alloc_param_exports(shader)); + unsigned total_params = per_vertex_params + shader->info.nr_prim_param_exports; + unsigned stride = 16 * total_params; uint32_t desc[4]; ac_build_attr_ring_descriptor(sel->screen->info.gfx_level, diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 13d5829e727..1516c79c713 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -1224,6 +1224,7 @@ static void si_nir_assign_param_offsets(nir_shader *nir, struct si_shader *shade uint64_t outputs_written = 0; uint32_t outputs_written_16bit = 0; + uint64_t per_primitive_outputs = 0; nir_function_impl *impl = nir_shader_get_entrypoint(nir); assert(impl); @@ -1235,7 +1236,8 @@ static void si_nir_assign_param_offsets(nir_shader *nir, struct si_shader *shade nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); if (intr->intrinsic != nir_intrinsic_store_output && - intr->intrinsic != nir_intrinsic_store_per_vertex_output) + intr->intrinsic != nir_intrinsic_store_per_vertex_output && + intr->intrinsic != nir_intrinsic_store_per_primitive_output) continue; /* No indirect indexing allowed. */ @@ -1250,6 +1252,9 @@ static void si_nir_assign_param_offsets(nir_shader *nir, struct si_shader *shade else outputs_written |= BITFIELD64_BIT(sem.location); + if (intr->intrinsic == nir_intrinsic_store_per_primitive_output) + per_primitive_outputs |= BITFIELD64_BIT(sem.location); + /* Assign the param index if it's unassigned. */ if (nir_slot_is_varying(sem.location, MESA_SHADER_FRAGMENT) && !sem.no_varying && (sem.gs_streams & 0x3) == 0 && @@ -1259,7 +1264,10 @@ static void si_nir_assign_param_offsets(nir_shader *nir, struct si_shader *shade /* It must not be remapped (duplicated). */ assert(slot_remap[sem.location] == -1); - temp_info->vs_output_param_offset[sem.location] = info->nr_param_exports++; + temp_info->vs_output_param_offset[sem.location] = + intr->intrinsic == nir_intrinsic_store_per_primitive_output ? + info->nr_prim_param_exports++ : + info->nr_param_exports++; } } } @@ -1274,9 +1282,19 @@ static void si_nir_assign_param_offsets(nir_shader *nir, struct si_shader *shade temp_info->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = info->nr_param_exports++; } + /* per primitive outputs come after per vertex outputs */ + unsigned per_primitive_outputs_offset = info->nr_param_exports; + if (sel->screen->info.gfx_level >= GFX11) + per_primitive_outputs_offset = MAX2(per_primitive_outputs_offset, 1); + u_foreach_bit64 (i, per_primitive_outputs) { + if (temp_info->vs_output_param_offset[i] != AC_EXP_PARAM_DEFAULT_VAL_0000) + temp_info->vs_output_param_offset[i] += per_primitive_outputs_offset; + } + /* Update outputs written info, we may remove some outputs before. */ nir->info.outputs_written = outputs_written; nir->info.outputs_written_16bit = outputs_written_16bit; + nir->info.per_primitive_outputs = per_primitive_outputs; } static void si_assign_param_offsets(nir_shader *nir, struct si_shader *shader, @@ -1284,6 +1302,7 @@ static void si_assign_param_offsets(nir_shader *nir, struct si_shader *shader, { /* Initialize this first. */ shader->info.nr_param_exports = 0; + shader->info.nr_prim_param_exports = 0; STATIC_ASSERT(sizeof(temp_info->vs_output_param_offset[0]) == 1); memset(temp_info->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED, @@ -1296,7 +1315,7 @@ static void si_assign_param_offsets(nir_shader *nir, struct si_shader *shader, memset(slot_remap, -1, NUM_TOTAL_VARYING_SLOTS); /* This sets DEFAULT_VAL for constant outputs in vs_output_param_offset. */ - /* TODO: This doesn't affect GS. */ + /* TODO: This doesn't affect GS and MS. */ NIR_PASS(_, nir, ac_nir_optimize_outputs, false, slot_remap, temp_info->vs_output_param_offset); @@ -2039,6 +2058,10 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi S_028644_DEFAULT_VAL(offset); } + if (sscreen->info.gfx_level >= GFX11 && + (nir->info.per_primitive_outputs & BITFIELD64_BIT(semantic))) + ps_input_cntl |= S_028644_PRIM_ATTR(1); + shader->info.vs_output_ps_input_cntl[semantic] = ps_input_cntl; } } diff --git a/src/gallium/drivers/radeonsi/si_shader_info.c b/src/gallium/drivers/radeonsi/si_shader_info.c index 7ffa077ec37..9a1bb3cb9c3 100644 --- a/src/gallium/drivers/radeonsi/si_shader_info.c +++ b/src/gallium/drivers/radeonsi/si_shader_info.c @@ -341,6 +341,7 @@ static void scan_instruction(const struct nir_shader *nir, struct si_shader_info case nir_intrinsic_load_per_vertex_output: case nir_intrinsic_store_output: case nir_intrinsic_store_per_vertex_output: + case nir_intrinsic_store_per_primitive_output: scan_io_usage(nir, info, intr, false, colors_lowered); break; case nir_intrinsic_load_deref: diff --git a/src/gallium/drivers/radeonsi/si_shader_info.h b/src/gallium/drivers/radeonsi/si_shader_info.h index 170c764d14f..ee1c62f56f4 100644 --- a/src/gallium/drivers/radeonsi/si_shader_info.h +++ b/src/gallium/drivers/radeonsi/si_shader_info.h @@ -224,6 +224,7 @@ struct si_shader_variant_info { bool uses_mesh_scratch_ring : 1; uint8_t nr_pos_exports; uint8_t nr_param_exports; + uint8_t nr_prim_param_exports; uint8_t clipdist_mask; uint8_t culldist_mask; uint8_t num_streamout_vec4s; diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index e9492f479c3..b81633d9774 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -1600,8 +1600,11 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader S_028A84_NGG_DISABLE_PROVOK_REUSE(shader->key.ge.mono.u.vs_export_prim_id || gs_sel->info.writes_primid); + unsigned num_params = si_shader_num_alloc_param_exports(shader); + unsigned num_prim_params = shader->info.nr_prim_param_exports; + bool no_pc_export = num_params == 0 && num_prim_params == 0; + if (sscreen->info.gfx_level >= GFX12) { - unsigned num_params = si_shader_num_alloc_param_exports(shader); unsigned wave_limit_per_se = 0x3ff; /* This tuning adds up to 50% streamout performance. */ @@ -1632,7 +1635,8 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader S_00B220_WAVE_LIMIT(wave_limit_per_se) | S_00B220_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)); shader->ngg.spi_vs_out_config = S_00B0C4_VS_EXPORT_COUNT(MAX2(num_params, 1) - 1) | - S_00B0C4_NO_PC_EXPORT(num_params == 0); + S_00B0C4_PRIM_EXPORT_COUNT(num_prim_params) | + S_00B0C4_NO_PC_EXPORT(no_pc_export); } else { unsigned late_alloc_wave64, cu_mask; @@ -1662,8 +1666,9 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader C_00B21C_CU_EN, 0, &sscreen->info); shader->ngg.spi_shader_pgm_rsrc4_gs = S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64); shader->ngg.spi_vs_out_config = - S_0286C4_VS_EXPORT_COUNT(MAX2(shader->info.nr_param_exports, 1) - 1) | - S_0286C4_NO_PC_EXPORT(shader->info.nr_param_exports == 0); + S_0286C4_VS_EXPORT_COUNT(MAX2(num_params, 1) - 1) | + S_0286C4_PRIM_EXPORT_COUNT(num_prim_params) | + S_0286C4_NO_PC_EXPORT(no_pc_export); if (sscreen->info.gfx_level >= GFX11) { shader->ngg.spi_shader_pgm_rsrc4_gs |=