diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp index 26466ee8acc..1684a50edd6 100644 --- a/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -398,8 +398,7 @@ setup_vs_variables(isel_context *ctx, nir_shader *nir) if (ctx->stage == vertex_ngg && ctx->args->options->key.vs_common_out.export_prim_id) { /* We need to store the primitive IDs in LDS */ unsigned lds_size = ctx->program->info->ngg_info.esgs_ring_size; - ctx->program->config->lds_size = (lds_size + ctx->program->lds_alloc_granule - 1) / - ctx->program->lds_alloc_granule; + ctx->program->config->lds_size = DIV_ROUND_UP(lds_size, ctx->program->lds_encoding_granule); } } @@ -424,7 +423,7 @@ void setup_gs_variables(isel_context *ctx, nir_shader *nir) unsigned total_lds_bytes = esgs_ring_bytes + ngg_emit_bytes + ngg_gs_scratch_bytes; assert(total_lds_bytes >= ctx->ngg_gs_emit_addr); assert(total_lds_bytes >= ctx->ngg_gs_scratch_addr); - ctx->program->config->lds_size = (total_lds_bytes + ctx->program->lds_alloc_granule - 1) / ctx->program->lds_alloc_granule; + ctx->program->config->lds_size = DIV_ROUND_UP(total_lds_bytes, ctx->program->lds_encoding_granule); /* Make sure we have enough room for emitted GS vertices */ if (nir->info.gs.vertices_out) @@ -488,8 +487,7 @@ setup_tcs_info(isel_context *ctx, nir_shader *nir, nir_shader *vs) ctx->args->shader_info->tcs.num_patches = ctx->tcs_num_patches; ctx->args->shader_info->tcs.num_lds_blocks = lds_size; - ctx->program->config->lds_size = (lds_size + ctx->program->lds_alloc_granule - 1) / - ctx->program->lds_alloc_granule; + ctx->program->config->lds_size = DIV_ROUND_UP(lds_size, ctx->program->lds_encoding_granule); } void @@ -520,8 +518,7 @@ setup_variables(isel_context *ctx, nir_shader *nir) break; } case MESA_SHADER_COMPUTE: { - ctx->program->config->lds_size = (nir->info.cs.shared_size + ctx->program->lds_alloc_granule - 1) / - ctx->program->lds_alloc_granule; + ctx->program->config->lds_size = DIV_ROUND_UP(nir->info.cs.shared_size, ctx->program->lds_encoding_granule); break; } case MESA_SHADER_VERTEX: { @@ -544,7 +541,7 @@ setup_variables(isel_context *ctx, nir_shader *nir) } /* Make sure we fit the available LDS space. */ - assert((ctx->program->config->lds_size * ctx->program->lds_alloc_granule) <= ctx->program->lds_limit); + assert((ctx->program->config->lds_size * ctx->program->lds_encoding_granule) <= ctx->program->lds_limit); } void diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp index 7fea714daed..706c0344d12 100644 --- a/src/amd/compiler/aco_ir.cpp +++ b/src/amd/compiler/aco_ir.cpp @@ -93,7 +93,8 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info, program->wave_size = info->wave_size; program->lane_mask = program->wave_size == 32 ? s1 : s2; - program->lds_alloc_granule = chip_class >= GFX7 ? 512 : 256; + program->lds_encoding_granule = chip_class >= GFX7 ? 512 : 256; + program->lds_alloc_granule = chip_class >= GFX10_3 ? 1024 : program->lds_encoding_granule; program->lds_limit = chip_class >= GFX7 ? 65536 : 32768; /* apparently gfx702 also has 16-bank LDS but I can't find a family for that */ program->has_16bank_lds = family == CHIP_KABINI || family == CHIP_STONEY; diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 75af4297966..833f335ff01 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -1817,6 +1817,7 @@ public: Temp scratch_offset; uint16_t min_waves = 0; + uint16_t lds_encoding_granule; uint16_t lds_alloc_granule; uint32_t lds_limit; /* in bytes */ bool has_16bank_lds; diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp index 8d5d75eff09..5aaec94e985 100644 --- a/src/amd/compiler/aco_live_var_analysis.cpp +++ b/src/amd/compiler/aco_live_var_analysis.cpp @@ -353,7 +353,8 @@ void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand) unsigned waves_per_workgroup = calc_waves_per_workgroup(program); unsigned workgroups_per_cu_wgp = max_waves_per_simd * simd_per_cu_wgp / waves_per_workgroup; if (program->config->lds_size) { - unsigned lds = program->config->lds_size * program->lds_alloc_granule; + unsigned lds = program->config->lds_size * program->lds_encoding_granule; + lds = align(lds, program->lds_alloc_granule); workgroups_per_cu_wgp = std::min(workgroups_per_cu_wgp, lds_limit / lds); } if (waves_per_workgroup > 1 && program->chip_class < GFX10)