From 98866159588b489edbf651a32bbe49177bb63ec6 Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Tue, 24 May 2022 02:44:53 -0700 Subject: [PATCH] intel/compiler: Move spill/fill tracking to the register allocator Originally, we had virtual opcodes for scratch access, and let the generator count spills/fills separately from other sends. Later, we started using the generic SHADER_OPCODE_SEND for spills/fills on some generations of hardware, and simply detected stateless messages there. But then we started using stateless messages for other things: - anv uses stateless messages for the buffer device address feature. - nir_opt_large_constants generates stateless messages. - XeHP curbe setup can generate stateless messages. So counting stateless messages is not accurate. Instead, we move the spill/fill accounting to the register allocator, as it generates such things, as well as the load/store_scratch intrinsic handling, as those are basically spill/fills, just at a higher level. Reviewed-by: Lionel Landwerlin Reviewed-by: Ian Romanick Part-of: --- src/intel/compiler/brw_fs.h | 2 + src/intel/compiler/brw_fs_generator.cpp | 44 +++++++++++----------- src/intel/compiler/brw_fs_nir.cpp | 3 ++ src/intel/compiler/brw_fs_reg_allocate.cpp | 30 +++++++++------ src/intel/compiler/brw_fs_visitor.cpp | 2 + 5 files changed, 47 insertions(+), 34 deletions(-) diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index 4afb77146bc..5990cb422c8 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -81,6 +81,8 @@ offset(const fs_reg ®, const brw::fs_builder &bld, unsigned delta) struct shader_stats { const char *scheduler_mode; unsigned promoted_constants; + unsigned spill_count; + unsigned fill_count; }; /** diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp index 4ca3e4cc9e9..54db3b6721b 100644 --- a/src/intel/compiler/brw_fs_generator.cpp +++ b/src/intel/compiler/brw_fs_generator.cpp @@ -1826,13 +1826,6 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, int start_offset = p->next_insn_offset; - /* `send_count` explicitly does not include spills or fills, as we'd - * like to use it as a metric for intentional memory access or other - * shared function use. Otherwise, subtle changes to scheduling or - * register allocation could cause it to fluctuate wildly - and that - * effect is already counted in spill/fill counts. - */ - int spill_count = 0, fill_count = 0; int loop_count = 0, send_count = 0, nop_count = 0; bool is_accum_used = false; @@ -2265,15 +2258,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, case SHADER_OPCODE_SEND: generate_send(inst, dst, src[0], src[1], src[2], inst->ex_mlen > 0 ? src[3] : brw_null_reg()); - if ((inst->desc & 0xff) == BRW_BTI_STATELESS || - (inst->desc & 0xff) == GFX8_BTI_STATELESS_NON_COHERENT) { - if (inst->size_written) - fill_count++; - else - spill_count++; - } else { - send_count++; - } + send_count++; break; case SHADER_OPCODE_GET_BUFFER_SIZE: @@ -2306,17 +2291,17 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, case SHADER_OPCODE_GFX4_SCRATCH_WRITE: generate_scratch_write(inst, src[0]); - spill_count++; + send_count++; break; case SHADER_OPCODE_GFX4_SCRATCH_READ: generate_scratch_read(inst, dst); - fill_count++; + send_count++; break; case SHADER_OPCODE_GFX7_SCRATCH_READ: generate_scratch_read_gfx7(inst, dst); - fill_count++; + send_count++; break; case SHADER_OPCODE_SCRATCH_HEADER: @@ -2630,6 +2615,15 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, /* end of program sentinel */ disasm_new_inst_group(disasm_info, p->next_insn_offset); + /* `send_count` explicitly does not include spills or fills, as we'd + * like to use it as a metric for intentional memory access or other + * shared function use. Otherwise, subtle changes to scheduling or + * register allocation could cause it to fluctuate wildly - and that + * effect is already counted in spill/fill counts. + */ + send_count -= shader_stats.spill_count; + send_count -= shader_stats.fill_count; + #ifndef NDEBUG bool validated = #else @@ -2661,7 +2655,9 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, shader_name, sha1buf, dispatch_width, before_size / 16, loop_count, perf.latency, - spill_count, fill_count, send_count, + shader_stats.spill_count, + shader_stats.fill_count, + send_count, shader_stats.scheduler_mode, shader_stats.promoted_constants, before_size, after_size, @@ -2693,7 +2689,9 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, _mesa_shader_stage_to_abbrev(stage), dispatch_width, before_size / 16 - nop_count, loop_count, perf.latency, - spill_count, fill_count, send_count, + shader_stats.spill_count, + shader_stats.fill_count, + send_count, shader_stats.scheduler_mode, shader_stats.promoted_constants, before_size, after_size); @@ -2703,8 +2701,8 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, stats->sends = send_count; stats->loops = loop_count; stats->cycles = perf.latency; - stats->spills = spill_count; - stats->fills = fill_count; + stats->spills = shader_stats.spill_count; + stats->fills = shader_stats.fill_count; } return start_offset; diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 954abde45a3..b51435f47c7 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -5177,6 +5177,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr read_result, srcs, SURFACE_LOGICAL_NUM_SRCS); bld.MOV(dest, read_result); } + + shader_stats.fill_count += DIV_ROUND_UP(dispatch_width, 16); break; } @@ -5250,6 +5252,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL, fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); } + shader_stats.spill_count += DIV_ROUND_UP(dispatch_width, 16); break; } diff --git a/src/intel/compiler/brw_fs_reg_allocate.cpp b/src/intel/compiler/brw_fs_reg_allocate.cpp index 42077fb6621..9045e35fb7e 100644 --- a/src/intel/compiler/brw_fs_reg_allocate.cpp +++ b/src/intel/compiler/brw_fs_reg_allocate.cpp @@ -348,10 +348,10 @@ private: void build_interference_graph(bool allow_spilling); void discard_interference_graph(); - void emit_unspill(const fs_builder &bld, fs_reg dst, - uint32_t spill_offset, unsigned count); - void emit_spill(const fs_builder &bld, fs_reg src, - uint32_t spill_offset, unsigned count); + void emit_unspill(const fs_builder &bld, struct shader_stats *stats, + fs_reg dst, uint32_t spill_offset, unsigned count); + void emit_spill(const fs_builder &bld, struct shader_stats *stats, + fs_reg src, uint32_t spill_offset, unsigned count); void set_spill_costs(); int choose_spill_reg(); @@ -738,7 +738,9 @@ fs_reg_alloc::discard_interference_graph() } void -fs_reg_alloc::emit_unspill(const fs_builder &bld, fs_reg dst, +fs_reg_alloc::emit_unspill(const fs_builder &bld, + struct shader_stats *stats, + fs_reg dst, uint32_t spill_offset, unsigned count) { const intel_device_info *devinfo = bld.shader->devinfo; @@ -747,6 +749,8 @@ fs_reg_alloc::emit_unspill(const fs_builder &bld, fs_reg dst, assert(count % reg_size == 0); for (unsigned i = 0; i < count / reg_size; i++) { + ++stats->fill_count; + fs_inst *unspill_inst; if (devinfo->ver >= 9) { fs_reg header = this->scratch_header; @@ -803,7 +807,9 @@ fs_reg_alloc::emit_unspill(const fs_builder &bld, fs_reg dst, } void -fs_reg_alloc::emit_spill(const fs_builder &bld, fs_reg src, +fs_reg_alloc::emit_spill(const fs_builder &bld, + struct shader_stats *stats, + fs_reg src, uint32_t spill_offset, unsigned count) { const intel_device_info *devinfo = bld.shader->devinfo; @@ -812,6 +818,8 @@ fs_reg_alloc::emit_spill(const fs_builder &bld, fs_reg src, assert(count % reg_size == 0); for (unsigned i = 0; i < count / reg_size; i++) { + ++stats->spill_count; + fs_inst *spill_inst; if (devinfo->ver >= 9) { fs_reg header = this->scratch_header; @@ -1098,8 +1106,8 @@ fs_reg_alloc::spill_reg(unsigned spill_reg) * 32 bit channels. It shouldn't hurt in any case because the * unspill destination is a block-local temporary. */ - emit_unspill(ibld.exec_all().group(width, 0), unspill_dst, - subset_spill_offset, count); + emit_unspill(ibld.exec_all().group(width, 0), &fs->shader_stats, + unspill_dst, subset_spill_offset, count); } } @@ -1153,10 +1161,10 @@ fs_reg_alloc::spill_reg(unsigned spill_reg) */ if (inst->is_partial_write() || (!inst->force_writemask_all && !per_channel)) - emit_unspill(ubld, spill_src, subset_spill_offset, - regs_written(inst)); + emit_unspill(ubld, &fs->shader_stats, spill_src, + subset_spill_offset, regs_written(inst)); - emit_spill(ubld.at(block, inst->next), spill_src, + emit_spill(ubld.at(block, inst->next), &fs->shader_stats, spill_src, subset_spill_offset, regs_written(inst)); } diff --git a/src/intel/compiler/brw_fs_visitor.cpp b/src/intel/compiler/brw_fs_visitor.cpp index 5da019bf938..760f2135139 100644 --- a/src/intel/compiler/brw_fs_visitor.cpp +++ b/src/intel/compiler/brw_fs_visitor.cpp @@ -1172,6 +1172,8 @@ fs_visitor::init() this->shader_stats.scheduler_mode = NULL; this->shader_stats.promoted_constants = 0, + this->shader_stats.spill_count = 0, + this->shader_stats.fill_count = 0, this->grf_used = 0; this->spilled_any_registers = false;