From d08b09cb7e96cecb4f224f698ed4e7ef5bacd707 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Fri, 19 Nov 2021 04:01:34 -0500 Subject: [PATCH] radeonsi: use si_shader::wave_size Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- .../drivers/radeonsi/gfx10_shader_ngg.c | 5 ++- src/gallium/drivers/radeonsi/si_compute.c | 8 ++--- src/gallium/drivers/radeonsi/si_debug.c | 11 +++---- src/gallium/drivers/radeonsi/si_shader.c | 32 +++++++++---------- src/gallium/drivers/radeonsi/si_shader_llvm.c | 2 +- .../drivers/radeonsi/si_shader_llvm_gs.c | 4 +-- src/gallium/drivers/radeonsi/si_sqtt.c | 2 +- .../drivers/radeonsi/si_state_draw.cpp | 2 +- .../drivers/radeonsi/si_state_shaders.cpp | 11 +++---- 9 files changed, 35 insertions(+), 42 deletions(-) diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c index 8c63e0f5f52..9d2bb1ac054 100644 --- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c +++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c @@ -2276,14 +2276,13 @@ retry_select_mode: /* Round up towards full wave sizes for better ALU utilization. */ if (!max_vert_out_per_gs_instance) { - const unsigned wavesize = si_get_shader_wave_size(shader); unsigned orig_max_esverts; unsigned orig_max_gsprims; do { orig_max_esverts = max_esverts; orig_max_gsprims = max_gsprims; - max_esverts = align(max_esverts, wavesize); + max_esverts = align(max_esverts, shader->wave_size); max_esverts = MIN2(max_esverts, max_esverts_base); if (esvert_lds_size) max_esverts = @@ -2293,7 +2292,7 @@ retry_select_mode: /* Hardware restriction: minimum value of max_esverts */ max_esverts = MAX2(max_esverts, min_esverts); - max_gsprims = align(max_gsprims, wavesize); + max_gsprims = align(max_gsprims, shader->wave_size); max_gsprims = MIN2(max_gsprims, max_gsprims_base); if (gsprim_lds_size) { /* Don't count unusable vertices to the LDS size. Those are vertices above diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 864732385e4..0772e49670b 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -68,7 +68,7 @@ static const amd_kernel_code_t *si_compute_get_code_object(const struct si_compu if (!ac_rtld_open(&rtld, (struct ac_rtld_open_info){.info = &sel->screen->info, .shader_type = MESA_SHADER_COMPUTE, - .wave_size = sel->screen->compute_wave_size, + .wave_size = program->shader.wave_size, .num_parts = 1, .elf_ptrs = &program->shader.binary.elf_buffer, .elf_sizes = &program->shader.binary.elf_size})) @@ -193,7 +193,7 @@ static void si_create_compute_state_async(void *job, void *gdata, int thread_ind bool scratch_enabled = shader->config.scratch_bytes_per_wave > 0; shader->config.rsrc1 = S_00B848_VGPRS((shader->config.num_vgprs - 1) / - ((sscreen->compute_wave_size == 32 || + ((shader->wave_size == 32 || sscreen->info.wave64_vgpr_alloc_granularity == 8) ? 8 : 4)) | S_00B848_DX10_CLAMP(1) | S_00B848_MEM_ORDERED(si_shader_mem_ordered(shader)) | @@ -770,7 +770,7 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_ bool render_cond_bit = sctx->render_cond_enabled; unsigned threads_per_threadgroup = info->block[0] * info->block[1] * info->block[2]; unsigned waves_per_threadgroup = - DIV_ROUND_UP(threads_per_threadgroup, sscreen->compute_wave_size); + DIV_ROUND_UP(threads_per_threadgroup, sctx->cs_shader_state.program->shader.wave_size); unsigned threadgroups_per_cu = 1; if (sctx->chip_class >= GFX10 && waves_per_threadgroup == 1) @@ -792,7 +792,7 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_ /* If the KMD allows it (there is a KMD hw register for it), * allow launching waves out-of-order. (same as Vulkan) */ S_00B800_ORDER_MODE(sctx->chip_class >= GFX7) | - S_00B800_CS_W32_EN(sscreen->compute_wave_size == 32); + S_00B800_CS_W32_EN(sctx->cs_shader_state.program->shader.wave_size == 32); const uint *last_block = info->last_block; bool partial_block_en = last_block[0] || last_block[1] || last_block[2]; diff --git a/src/gallium/drivers/radeonsi/si_debug.c b/src/gallium/drivers/radeonsi/si_debug.c index a195bc18b3c..c88eb734241 100644 --- a/src/gallium/drivers/radeonsi/si_debug.c +++ b/src/gallium/drivers/radeonsi/si_debug.c @@ -919,28 +919,27 @@ static void si_print_annotated_shader(struct si_shader *shader, struct ac_wave_i */ unsigned num_inst = 0; uint64_t inst_addr = start_addr; - unsigned wave_size = si_get_shader_wave_size(shader); struct ac_rtld_binary rtld_binaries[5] = {}; struct si_shader_inst *instructions = calloc(shader->bo->b.b.width0 / 4, sizeof(struct si_shader_inst)); if (shader->prolog) { si_add_split_disasm(screen, &rtld_binaries[0], &shader->prolog->binary, &inst_addr, &num_inst, - instructions, stage, wave_size); + instructions, stage, shader->wave_size); } if (shader->previous_stage) { si_add_split_disasm(screen, &rtld_binaries[1], &shader->previous_stage->binary, &inst_addr, - &num_inst, instructions, stage, wave_size); + &num_inst, instructions, stage, shader->wave_size); } if (shader->prolog2) { si_add_split_disasm(screen, &rtld_binaries[2], &shader->prolog2->binary, &inst_addr, - &num_inst, instructions, stage, wave_size); + &num_inst, instructions, stage, shader->wave_size); } si_add_split_disasm(screen, &rtld_binaries[3], &shader->binary, &inst_addr, &num_inst, - instructions, stage, wave_size); + instructions, stage, shader->wave_size); if (shader->epilog) { si_add_split_disasm(screen, &rtld_binaries[4], &shader->epilog->binary, &inst_addr, &num_inst, - instructions, stage, wave_size); + instructions, stage, shader->wave_size); } fprintf(f, COLOR_YELLOW "%s - annotated disassembly:" COLOR_RESET "\n", diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 80b3d6144a7..7ebc3831b33 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -819,7 +819,7 @@ static bool si_shader_binary_open(struct si_screen *screen, struct si_shader *sh .halt_at_entry = screen->options.halt_shaders, }, .shader_type = sel->info.stage, - .wave_size = si_get_shader_wave_size(shader), + .wave_size = shader->wave_size, .num_parts = num_parts, .elf_ptrs = part_elfs, .elf_sizes = part_sizes, @@ -992,7 +992,7 @@ static void si_calculate_max_simd_waves(struct si_shader *shader) case MESA_SHADER_COMPUTE: { unsigned max_workgroup_size = si_get_max_workgroup_size(shader); lds_per_wave = (conf->lds_size * lds_increment) / - DIV_ROUND_UP(max_workgroup_size, sscreen->compute_wave_size); + DIV_ROUND_UP(max_workgroup_size, shader->wave_size); } break; default:; @@ -1025,7 +1025,7 @@ void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shad if (screen->options.debug_disassembly) si_shader_dump_disassembly(screen, &shader->binary, shader->selector->info.stage, - si_get_shader_wave_size(shader), debug, "main", NULL); + shader->wave_size, debug, "main", NULL); pipe_debug_message(debug, SHADER_INFO, "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d " @@ -1123,25 +1123,24 @@ void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader, if (!check_debug_option || (si_can_dump_shader(sscreen, stage) && !(sscreen->debug_flags & DBG(NO_ASM)))) { - unsigned wave_size = si_get_shader_wave_size(shader); fprintf(file, "\n%s:\n", si_get_shader_name(shader)); if (shader->prolog) - si_shader_dump_disassembly(sscreen, &shader->prolog->binary, stage, wave_size, debug, + si_shader_dump_disassembly(sscreen, &shader->prolog->binary, stage, shader->wave_size, debug, "prolog", file); if (shader->previous_stage) si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary, stage, - wave_size, debug, "previous stage", file); + shader->wave_size, debug, "previous stage", file); if (shader->prolog2) - si_shader_dump_disassembly(sscreen, &shader->prolog2->binary, stage, wave_size, + si_shader_dump_disassembly(sscreen, &shader->prolog2->binary, stage, shader->wave_size, debug, "prolog2", file); - si_shader_dump_disassembly(sscreen, &shader->binary, stage, wave_size, debug, "main", + si_shader_dump_disassembly(sscreen, &shader->binary, stage, shader->wave_size, debug, "main", file); if (shader->epilog) - si_shader_dump_disassembly(sscreen, &shader->epilog->binary, stage, wave_size, debug, + si_shader_dump_disassembly(sscreen, &shader->epilog->binary, stage, shader->wave_size, debug, "epilog", file); fprintf(file, "\n"); } @@ -1330,7 +1329,7 @@ void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_ { memset(key, 0, sizeof(*key)); key->vs_prolog.states = *prolog_key; - key->vs_prolog.wave32 = si_get_shader_wave_size(shader_out) == 32; + key->vs_prolog.wave32 = shader_out->wave_size == 32; key->vs_prolog.num_input_sgprs = num_input_sgprs; key->vs_prolog.num_inputs = info->num_inputs; key->vs_prolog.as_ls = shader_out->key.ge.as_ls; @@ -1522,14 +1521,13 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi /* Validate SGPR and VGPR usage for compute to detect compiler bugs. */ if (sel->info.stage == MESA_SHADER_COMPUTE) { - unsigned wave_size = sscreen->compute_wave_size; unsigned max_vgprs = - sscreen->info.num_physical_wave64_vgprs_per_simd * (wave_size == 32 ? 2 : 1); + sscreen->info.num_physical_wave64_vgprs_per_simd * (shader->wave_size == 32 ? 2 : 1); unsigned max_sgprs = sscreen->info.num_physical_sgprs_per_simd; unsigned max_sgprs_per_wave = 128; unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */ unsigned threads_per_tg = si_get_max_workgroup_size(shader); - unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, wave_size); + unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, shader->wave_size); unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg); max_vgprs = max_vgprs / waves_per_simd; @@ -1709,7 +1707,7 @@ static bool si_shader_select_tcs_parts(struct si_screen *sscreen, struct ac_llvm /* Get the epilog. */ union si_shader_part_key epilog_key; memset(&epilog_key, 0, sizeof(epilog_key)); - epilog_key.tcs_epilog.wave32 = si_get_shader_wave_size(shader) == 32; + epilog_key.tcs_epilog.wave32 = shader->wave_size == 32; epilog_key.tcs_epilog.states = shader->key.ge.part.tcs.epilog; shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs, MESA_SHADER_TESS_CTRL, false, @@ -1754,7 +1752,7 @@ void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *ke memset(key, 0, sizeof(*key)); key->ps_prolog.states = shader->key.ps.part.prolog; - key->ps_prolog.wave32 = si_get_shader_wave_size(shader) == 32; + key->ps_prolog.wave32 = shader->wave_size == 32; key->ps_prolog.colors_read = info->colors_read; key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs; key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs; @@ -1888,7 +1886,7 @@ void si_get_ps_epilog_key(struct si_shader *shader, union si_shader_part_key *ke { struct si_shader_info *info = &shader->selector->info; memset(key, 0, sizeof(*key)); - key->ps_epilog.wave32 = si_get_shader_wave_size(shader) == 32; + key->ps_epilog.wave32 = shader->wave_size == 32; key->ps_epilog.colors_written = info->colors_written; key->ps_epilog.color_types = info->output_color_types; key->ps_epilog.writes_z = info->writes_z; @@ -2013,7 +2011,7 @@ void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader) shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs); if (shader->selector->info.stage == MESA_SHADER_COMPUTE && - si_get_max_workgroup_size(shader) > sscreen->compute_wave_size) { + si_get_max_workgroup_size(shader) > shader->wave_size) { si_multiwave_lds_size_workaround(sscreen, &shader->config.lds_size); } } diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c index 4e7a8a49431..0577287f73c 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c @@ -1090,7 +1090,7 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler * struct si_shader_selector *sel = shader->selector; struct si_shader_context ctx; - si_llvm_context_init(&ctx, sscreen, compiler, si_get_shader_wave_size(shader)); + si_llvm_context_init(&ctx, sscreen, compiler, shader->wave_size); LLVMValueRef ngg_cull_main_fn = NULL; if (sel->info.stage <= MESA_SHADER_TESS_EVAL && shader->key.ge.opt.ngg_culling) { diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c index 27041f9125d..d059f860629 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c @@ -427,9 +427,7 @@ struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen, shader->is_gs_copy_shader = true; shader->wave_size = si_get_shader_wave_size(shader); - si_llvm_context_init(&ctx, sscreen, compiler, - si_get_wave_size(sscreen, MESA_SHADER_VERTEX, - false, false)); + si_llvm_context_init(&ctx, sscreen, compiler, shader->wave_size); ctx.shader = shader; ctx.stage = MESA_SHADER_VERTEX; diff --git a/src/gallium/drivers/radeonsi/si_sqtt.c b/src/gallium/drivers/radeonsi/si_sqtt.c index 52cb5331188..17c037037ea 100644 --- a/src/gallium/drivers/radeonsi/si_sqtt.c +++ b/src/gallium/drivers/radeonsi/si_sqtt.c @@ -1014,7 +1014,7 @@ si_sqtt_add_code_object(struct si_context* sctx, record->shader_data[gl_shader_stage].hw_stage = hw_stage; record->shader_data[gl_shader_stage].is_combined = false; record->shader_data[gl_shader_stage].scratch_memory_size = shader->config.scratch_bytes_per_wave; - record->shader_data[gl_shader_stage].wavefront_size = si_get_shader_wave_size(shader); + record->shader_data[gl_shader_stage].wavefront_size = shader->wave_size; record->shader_stages_mask |= 1 << gl_shader_stage; record->num_shaders_combined++; diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index 2a8a9f1b1f0..90b01e033a1 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -631,7 +631,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx, unsigned *num_pa * if it's only partially filled. */ unsigned temp_verts_per_tg = *num_patches * max_verts_per_patch; - unsigned wave_size = sctx->screen->ge_wave_size; + unsigned wave_size = ls_current->wave_size; if (temp_verts_per_tg > wave_size && (wave_size - temp_verts_per_tg % wave_size >= MAX2(max_verts_per_patch, 8))) diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index 3a6ae910327..548262dc0c2 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -586,7 +586,7 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader) si_pm4_set_reg( pm4, R_00B428_SPI_SHADER_PGM_RSRC1_HS, - S_00B428_VGPRS((shader->config.num_vgprs - 1) / (sscreen->ge_wave_size == 32 ? 8 : 4)) | + S_00B428_VGPRS((shader->config.num_vgprs - 1) / (shader->wave_size == 32 ? 8 : 4)) | (sscreen->info.chip_class <= GFX9 ? S_00B428_SGPRS((shader->config.num_sgprs - 1) / 8) : 0) | S_00B428_DX10_CLAMP(1) | S_00B428_MEM_ORDERED(si_shader_mem_ordered(shader)) | @@ -1207,7 +1207,6 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader else gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */ - unsigned wave_size = si_get_shader_wave_size(shader); unsigned late_alloc_wave64, cu_mask; ac_compute_late_alloc(&sscreen->info, true, shader->key.ge.opt.ngg_culling, @@ -1217,7 +1216,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); si_pm4_set_reg( pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, - S_00B228_VGPRS((shader->config.num_vgprs - 1) / (wave_size == 32 ? 8 : 4)) | + S_00B228_VGPRS((shader->config.num_vgprs - 1) / (shader->wave_size == 32 ? 8 : 4)) | S_00B228_FLOAT_MODE(shader->config.float_mode) | S_00B228_DX10_CLAMP(1) | S_00B228_MEM_ORDERED(si_shader_mem_ordered(shader)) | /* Disable the WGP mode on gfx10.3 because it can hang. (it happened on VanGogh) @@ -1511,7 +1510,7 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader, S_00B124_MEM_BASE(sscreen->info.address32_hi >> 8)); uint32_t rsrc1 = - S_00B128_VGPRS((shader->config.num_vgprs - 1) / (sscreen->ge_wave_size == 32 ? 8 : 4)) | + S_00B128_VGPRS((shader->config.num_vgprs - 1) / (shader->wave_size == 32 ? 8 : 4)) | S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt) | S_00B128_DX10_CLAMP(1) | S_00B128_MEM_ORDERED(si_shader_mem_ordered(shader)) | S_00B128_FLOAT_MODE(shader->config.float_mode); @@ -1715,7 +1714,7 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader) /* Set interpolation controls. */ spi_ps_in_control = S_0286D8_NUM_INTERP(num_interp) | - S_0286D8_PS_W32_EN(sscreen->ps_wave_size == 32); + S_0286D8_PS_W32_EN(shader->wave_size == 32); shader->ctx_reg.ps.num_interp = num_interp; shader->ctx_reg.ps.spi_baryc_cntl = spi_baryc_cntl; @@ -1731,7 +1730,7 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader) S_00B024_MEM_BASE(sscreen->info.address32_hi >> 8)); uint32_t rsrc1 = - S_00B028_VGPRS((shader->config.num_vgprs - 1) / (sscreen->ps_wave_size == 32 ? 8 : 4)) | + S_00B028_VGPRS((shader->config.num_vgprs - 1) / (shader->wave_size == 32 ? 8 : 4)) | S_00B028_DX10_CLAMP(1) | S_00B028_MEM_ORDERED(si_shader_mem_ordered(shader)) | S_00B028_FLOAT_MODE(shader->config.float_mode);