2010-10-10 15:42:37 -07:00
|
|
|
/*
|
|
|
|
|
* Copyright © 2010 Intel Corporation
|
|
|
|
|
*
|
|
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
|
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
|
|
|
* to deal in the Software without restriction, including without limitation
|
|
|
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
|
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
|
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
|
|
|
*
|
|
|
|
|
* The above copyright notice and this permission notice (including the next
|
|
|
|
|
* paragraph) shall be included in all copies or substantial portions of the
|
|
|
|
|
* Software.
|
|
|
|
|
*
|
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
|
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
|
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
|
|
|
* IN THE SOFTWARE.
|
|
|
|
|
*
|
|
|
|
|
* Authors:
|
|
|
|
|
* Eric Anholt <eric@anholt.net>
|
|
|
|
|
*
|
|
|
|
|
*/
|
|
|
|
|
|
2024-12-06 14:25:29 -08:00
|
|
|
#pragma once
|
2012-04-10 12:01:50 -07:00
|
|
|
|
2024-02-28 13:59:35 -08:00
|
|
|
#include "brw_cfg.h"
|
|
|
|
|
#include "brw_compiler.h"
|
2024-12-06 13:05:43 -08:00
|
|
|
#include "brw_inst.h"
|
2024-02-19 22:57:48 -08:00
|
|
|
#include "brw_ir_allocator.h"
|
2016-03-09 17:03:57 -08:00
|
|
|
#include "brw_fs_live_variables.h"
|
2020-03-26 14:59:02 -07:00
|
|
|
#include "brw_ir_performance.h"
|
2016-01-18 12:54:03 +02:00
|
|
|
#include "compiler/nir/nir.h"
|
2010-10-10 15:42:37 -07:00
|
|
|
|
2014-06-14 22:53:40 -07:00
|
|
|
struct bblock_t;
|
2012-06-06 10:57:54 -07:00
|
|
|
namespace {
|
2013-01-07 19:42:38 -08:00
|
|
|
struct acp_entry;
|
2012-06-06 10:57:54 -07:00
|
|
|
}
|
2012-05-10 16:10:15 -07:00
|
|
|
|
2024-02-19 22:57:48 -08:00
|
|
|
struct fs_visitor;
|
2016-03-13 16:35:49 -07:00
|
|
|
|
2012-06-05 11:37:22 -07:00
|
|
|
namespace brw {
   /**
    * Register pressure analysis of a shader.  Estimates how many registers
    * are live at any point of the program in GRF units.
    */
   struct register_pressure {
      register_pressure(const fs_visitor *v);
      ~register_pressure();

      /* Analysis-framework hook: results remain valid as long as the
       * instructions, their data flow, and the variables are untouched. */
      analysis_dependency_class
      dependency_class() const
      {
         return (DEPENDENCY_INSTRUCTION_IDENTITY |
                 DEPENDENCY_INSTRUCTION_DATA_FLOW |
                 DEPENDENCY_VARIABLES);
      }

      /* No consistency checking implemented yet; always reports success. */
      bool
      validate(const fs_visitor *) const
      {
         /* FINISHME */
         return true;
      }

      /* Per-instruction (indexed by IP) count of live registers, in GRFs. */
      unsigned *regs_live_at_ip;
   };

   /**
    * Per-VGRF definition analysis.  For each VGRF it records the defining
    * instruction and its containing block (when a unique definition is
    * known) along with a use count.
    */
   class def_analysis {
   public:
      def_analysis(const fs_visitor *v);
      ~def_analysis();

      /** Defining instruction of \p reg, or NULL if \p reg has no
       *  tracked unique definition. */
      fs_inst *
      get(const brw_reg &reg) const
      {
         return reg.file == VGRF && reg.nr < def_count ?
                def_insts[reg.nr] : NULL;
      }

      /** Block containing the definition of \p reg, or NULL if untracked. */
      bblock_t *
      get_block(const brw_reg &reg) const
      {
         return reg.file == VGRF && reg.nr < def_count ?
                def_blocks[reg.nr] : NULL;
      }

      /** Recorded number of uses of \p reg, or 0 if untracked. */
      uint32_t
      get_use_count(const brw_reg &reg) const
      {
         return reg.file == VGRF && reg.nr < def_count ?
                def_use_counts[reg.nr] : 0;
      }

      /** Number of VGRFs covered by the def_* arrays. */
      unsigned count() const { return def_count; }
      unsigned ssa_count() const;

      void print_stats(const fs_visitor *) const;

      /* Also depends on DEPENDENCY_BLOCKS since definitions are tied to
       * their containing basic block. */
      analysis_dependency_class
      dependency_class() const
      {
         return DEPENDENCY_INSTRUCTION_IDENTITY |
                DEPENDENCY_INSTRUCTION_DATA_FLOW |
                DEPENDENCY_VARIABLES |
                DEPENDENCY_BLOCKS;
      }

      bool validate(const fs_visitor *) const;

   private:
      void mark_invalid(int);
      bool fully_defines(const fs_visitor *v, fs_inst *);
      void update_for_reads(const idom_tree &idom, bblock_t *block, fs_inst *);
      void update_for_write(const fs_visitor *v, bblock_t *block, fs_inst *);

      /* Arrays indexed by VGRF number, each def_count entries long. */
      fs_inst **def_insts;
      bblock_t **def_blocks;
      uint32_t *def_use_counts;
      unsigned def_count;
   };
}
|
|
|
|
|
|
2024-02-28 13:59:35 -08:00
|
|
|
#define UBO_START ((1 << 16) - 4)
|
|
|
|
|
|
|
|
|
|
/**
 * Scratch data used when compiling a GLSL geometry shader.
 */
struct brw_gs_compile
{
   struct brw_gs_prog_key key;
   /* Layout of the VUE entries coming into the geometry shader. */
   struct intel_vue_map input_vue_map;

   /* NOTE(review): presumably sizes of the GS control data header —
    * confirm against the GS compile code that fills these in. */
   unsigned control_data_bits_per_vertex;
   unsigned control_data_header_size_bits;
};
|
2015-06-29 22:50:28 -07:00
|
|
|
|
2024-12-29 15:41:04 -08:00
|
|
|
class brw_builder;
|
2015-06-25 10:55:51 -07:00
|
|
|
|
2024-10-10 23:04:52 -07:00
|
|
|
/** Statistics gathered while compiling a single shader variant. */
struct brw_shader_stats {
   /* Name of the instruction-scheduler heuristic that ended up used. */
   const char *scheduler_mode;
   unsigned promoted_constants;
   /* Number of register spill/fill operations emitted during allocation. */
   unsigned spill_count;
   unsigned fill_count;
   unsigned max_register_pressure;
   unsigned non_ssa_registers_after_nir;
};
|
|
|
|
|
|
2022-08-19 12:18:21 -07:00
|
|
|
/** Register numbers for thread payload fields. */
struct thread_payload {
   /** The number of thread payload registers the hardware will supply. */
   uint8_t num_regs;

   /* Virtual destructor so the stage-specific subclasses below can be
    * deleted through a thread_payload pointer. */
   virtual ~thread_payload() = default;

protected:
   /* Only stage-specific subclasses may be instantiated. */
   thread_payload() : num_regs() {}
};
|
|
|
|
|
|
2022-08-21 21:22:12 -07:00
|
|
|
/** Thread payload layout for vertex shaders. */
struct vs_thread_payload : public thread_payload {
   vs_thread_payload(const fs_visitor &v);

   brw_reg urb_handles;
};
|
|
|
|
|
|
2022-08-19 14:41:52 -07:00
|
|
|
/** Thread payload layout for tessellation control shaders. */
struct tcs_thread_payload : public thread_payload {
   tcs_thread_payload(const fs_visitor &v);

   brw_reg patch_urb_output;
   brw_reg primitive_id;
   brw_reg icp_handle_start;
};
|
|
|
|
|
|
2022-08-21 20:51:58 -07:00
|
|
|
/** Thread payload layout for tessellation evaluation shaders. */
struct tes_thread_payload : public thread_payload {
   tes_thread_payload(const fs_visitor &v);

   brw_reg patch_urb_input;
   brw_reg primitive_id;
   /* NOTE(review): presumably the u/v/w tessellation coordinates —
    * confirm against the TES payload setup code. */
   brw_reg coords[3];
   brw_reg urb_output;
};
|
|
|
|
|
|
2022-08-22 22:23:17 -07:00
|
|
|
/** Thread payload layout for geometry shaders. */
struct gs_thread_payload : public thread_payload {
   gs_thread_payload(fs_visitor &v);

   brw_reg urb_handles;
   brw_reg primitive_id;
   brw_reg instance_id;
   brw_reg icp_handle_start;
};
|
|
|
|
|
|
2022-08-19 12:18:21 -07:00
|
|
|
/** Thread payload layout for fragment shaders. */
struct fs_thread_payload : public thread_payload {
   fs_thread_payload(const fs_visitor &v,
                     bool &source_depth_to_render_target);

   /* Register numbers for the fixed-function FS payload fields.  The [2]
    * arrays hold one entry per SIMD half (NOTE(review): presumably indexed
    * by SIMD16 half — confirm against the constructor). */
   uint8_t subspan_coord_reg[2];
   uint8_t source_depth_reg[2];
   uint8_t source_w_reg[2];
   uint8_t aa_dest_stencil_reg[2];
   uint8_t dest_depth_reg[2];
   uint8_t sample_pos_reg[2];
   uint8_t sample_mask_in_reg[2];
   uint8_t barycentric_coord_reg[INTEL_BARYCENTRIC_MODE_COUNT][2];

   uint8_t depth_w_coef_reg;
   uint8_t pc_bary_coef_reg;
   uint8_t npc_bary_coef_reg;
   uint8_t sample_offsets_reg;
};
|
|
|
|
|
|
2022-08-22 21:47:02 -07:00
|
|
|
/** Thread payload layout for compute-like (workgroup-based) shaders. */
struct cs_thread_payload : public thread_payload {
   cs_thread_payload(const fs_visitor &v);

   /* Emits instructions that materialize the subgroup ID into \p dest. */
   void load_subgroup_id(const brw_builder &bld, brw_reg &dest) const;

   brw_reg local_invocation_id[3];

   brw_reg inline_parameter;

protected:
   /* Backing register consumed by load_subgroup_id(). */
   brw_reg subgroup_id_;
};
|
|
|
|
|
|
|
|
|
|
/** Thread payload layout shared by task and mesh shaders. */
struct task_mesh_thread_payload : public cs_thread_payload {
   task_mesh_thread_payload(fs_visitor &v);

   brw_reg extended_parameter_0;
   brw_reg local_index;

   brw_reg urb_output;

   /* URB to read Task memory inputs. Only valid for MESH stage. */
   brw_reg task_urb_input;
};
|
|
|
|
|
|
2022-08-25 17:00:15 -07:00
|
|
|
/** Thread payload layout for ray-tracing (bindless) shader stages. */
struct bs_thread_payload : public thread_payload {
   bs_thread_payload(const fs_visitor &v);

   brw_reg global_arg_ptr;
   brw_reg local_arg_ptr;

   /* Emits instructions that materialize the shader type into \p dest. */
   void load_shader_type(const brw_builder &bld, brw_reg &dest) const;
};
|
|
|
|
|
|
2024-08-27 10:16:11 -07:00
|
|
|
/**
 * Compilation phases of a shader, in the order they are executed.
 * Used to track how far along the compile pipeline a shader is
 * (see brw_shader_phase_update()).
 */
enum brw_shader_phase {
   BRW_SHADER_PHASE_INITIAL = 0,
   BRW_SHADER_PHASE_AFTER_NIR,
   BRW_SHADER_PHASE_AFTER_OPT_LOOP,
   BRW_SHADER_PHASE_AFTER_EARLY_LOWERING,
   BRW_SHADER_PHASE_AFTER_MIDDLE_LOWERING,
   BRW_SHADER_PHASE_AFTER_LATE_LOWERING,
   BRW_SHADER_PHASE_AFTER_REGALLOC,

   /* Larger value than any other phase. */
   BRW_SHADER_PHASE_INVALID,
};
|
|
|
|
|
|
2012-11-09 01:05:47 -08:00
|
|
|
/**
 * The fragment shader front-end.
 *
 * Translates either GLSL IR or Mesa IR (for ARB_fragment_program) into FS IR.
 */
struct fs_visitor
{
public:
   /* Generic constructor used by most shader stages. */
   fs_visitor(const struct brw_compiler *compiler,
              const struct brw_compile_params *params,
              const brw_base_prog_key *key,
              struct brw_stage_prog_data *prog_data,
              const nir_shader *shader,
              unsigned dispatch_width,
              bool needs_register_pressure,
              bool debug_enabled);
   /* Fragment-shader constructor; additionally takes the number of
    * polygons dispatched per thread. */
   fs_visitor(const struct brw_compiler *compiler,
              const struct brw_compile_params *params,
              const brw_wm_prog_key *key,
              struct brw_wm_prog_data *prog_data,
              const nir_shader *shader,
              unsigned dispatch_width,
              unsigned num_polygons,
              bool needs_register_pressure,
              bool debug_enabled);
   /* Geometry-shader constructor; takes the GS scratch compile data. */
   fs_visitor(const struct brw_compiler *compiler,
              const struct brw_compile_params *params,
              struct brw_gs_compile *gs_compile,
              struct brw_gs_prog_data *prog_data,
              const nir_shader *shader,
              bool needs_register_pressure,
              bool debug_enabled);
   void init();
   ~fs_visitor();

   void import_uniforms(fs_visitor *v);

   void assign_curb_setup();
   void convert_attr_sources_to_hw_regs(fs_inst *inst);
   void calculate_payload_ranges(bool allow_spilling,
                                 unsigned payload_node_count,
                                 int *payload_last_use_ip) const;
   /* Invalidate any cached analysis results that depend on \p c. */
   void invalidate_analysis(brw::analysis_dependency_class c);

   /* Record a compile failure; sets `failed` and `fail_msg` below. */
   void vfail(const char *msg, va_list args);
   void fail(const char *msg, ...);
   /* Cap max_dispatch_width at \p n, recording \p msg as the reason. */
   void limit_dispatch_width(unsigned n, const char *msg);

   void emit_urb_writes(const brw_reg &gs_vertex_count = brw_reg());
   void emit_gs_control_data_bits(const brw_reg &vertex_count);
   brw_reg gs_urb_channel_mask(const brw_reg &dword_index);
   brw_reg gs_urb_per_slot_dword_index(const brw_reg &vertex_count);
   /* Try to tag an existing trailing URB write with EOT instead of
    * emitting a separate thread-ending message; returns success. */
   bool mark_last_urb_write_with_eot();
   void emit_cs_terminate();

   const struct brw_compiler *compiler;
   void *log_data; /* Passed to compiler->*_log functions */

   const struct intel_device_info * const devinfo;
   const nir_shader *nir;

   /** ralloc context for temporary data used during compile */
   void *mem_ctx;

   /** List of fs_inst. */
   exec_list instructions;

   /* Control-flow graph; built from `instructions`. */
   cfg_t *cfg;

   gl_shader_stage stage;
   bool debug_enabled;

   /* Virtual register allocator. */
   brw::simple_allocator alloc;

   const brw_base_prog_key *const key;

   /* Only non-NULL when compiling a geometry shader. */
   struct brw_gs_compile *gs_compile;

   struct brw_stage_prog_data *prog_data;

   /* Lazily-computed, cached analysis passes. */
   brw_analysis<brw::fs_live_variables, fs_visitor> live_analysis;
   brw_analysis<brw::register_pressure, fs_visitor> regpressure_analysis;
   brw_analysis<brw::performance, fs_visitor> performance_analysis;
   brw_analysis<brw::idom_tree, fs_visitor> idom_analysis;
   brw_analysis<brw::def_analysis, fs_visitor> def_analysis;

   /** Number of uniform variable components visited. */
   unsigned uniforms;

   /** Byte-offset for the next available spot in the scratch space buffer. */
   unsigned last_scratch;

   brw_reg frag_depth;
   brw_reg frag_stencil;
   brw_reg sample_mask;
   brw_reg outputs[VARYING_SLOT_MAX];
   brw_reg dual_src_output;
   int first_non_payload_grf;

   /* Current position in the compile pipeline (see brw_shader_phase). */
   enum brw_shader_phase phase;

   /* Set by fail()/vfail() when compilation cannot proceed. */
   bool failed;
   char *fail_msg;

   /* Stage-specific payload; concrete type depends on `stage`. */
   thread_payload *payload_;

   thread_payload &payload() {
      return *this->payload_;
   }

   /* Stage-checked downcasts of payload_. */
   vs_thread_payload &vs_payload() {
      assert(stage == MESA_SHADER_VERTEX);
      return *static_cast<vs_thread_payload *>(this->payload_);
   }

   tcs_thread_payload &tcs_payload() {
      assert(stage == MESA_SHADER_TESS_CTRL);
      return *static_cast<tcs_thread_payload *>(this->payload_);
   }

   tes_thread_payload &tes_payload() {
      assert(stage == MESA_SHADER_TESS_EVAL);
      return *static_cast<tes_thread_payload *>(this->payload_);
   }

   gs_thread_payload &gs_payload() {
      assert(stage == MESA_SHADER_GEOMETRY);
      return *static_cast<gs_thread_payload *>(this->payload_);
   }

   fs_thread_payload &fs_payload() {
      assert(stage == MESA_SHADER_FRAGMENT);
      return *static_cast<fs_thread_payload *>(this->payload_);
   };

   const fs_thread_payload &fs_payload() const {
      assert(stage == MESA_SHADER_FRAGMENT);
      return *static_cast<const fs_thread_payload *>(this->payload_);
   };

   cs_thread_payload &cs_payload() {
      assert(gl_shader_stage_uses_workgroup(stage));
      return *static_cast<cs_thread_payload *>(this->payload_);
   }

   task_mesh_thread_payload &task_mesh_payload() {
      assert(stage == MESA_SHADER_TASK || stage == MESA_SHADER_MESH);
      return *static_cast<task_mesh_thread_payload *>(this->payload_);
   }

   bs_thread_payload &bs_payload() {
      assert(stage >= MESA_SHADER_RAYGEN && stage <= MESA_SHADER_CALLABLE);
      return *static_cast<bs_thread_payload *>(this->payload_);
   }

   bool source_depth_to_render_target;

   brw_reg pixel_x;
   brw_reg pixel_y;
   brw_reg pixel_z;
   brw_reg wpos_w;
   brw_reg pixel_w;
   brw_reg delta_xy[INTEL_BARYCENTRIC_MODE_COUNT];
   brw_reg final_gs_vertex_count;
   brw_reg control_data_bits;
   brw_reg invocation_id;

   unsigned grf_used;
   bool spilled_any_registers;
   bool needs_register_pressure;

   const unsigned dispatch_width; /**< 8, 16 or 32 */
   const unsigned max_polygons;
   unsigned max_dispatch_width;

   /* The API selected subgroup size */
   unsigned api_subgroup_size; /**< 0, 8, 16, 32 */

   unsigned next_address_register_nr;

   struct brw_shader_stats shader_stats;

   /* Dump the IR after an optimizer pass for INTEL_DEBUG consumption. */
   void debug_optimizer(const nir_shader *nir,
                        const char *pass_name,
                        int iteration, int pass_num) const;
};
|
|
|
|
|
|
2024-12-07 09:53:31 -08:00
|
|
|
void brw_print_instructions(const fs_visitor &s, FILE *file = stderr);
|
2024-07-12 16:32:36 -07:00
|
|
|
|
2024-12-07 09:53:31 -08:00
|
|
|
void brw_print_instruction(const fs_visitor &s, const fs_inst *inst,
|
|
|
|
|
FILE *file = stderr,
|
|
|
|
|
const brw::def_analysis *defs = nullptr);
|
2024-07-12 16:32:36 -07:00
|
|
|
|
2024-06-14 13:19:58 -07:00
|
|
|
void brw_print_swsb(FILE *f, const struct intel_device_info *devinfo, const tgl_swsb swsb);
|
|
|
|
|
|
2020-01-04 14:32:09 -08:00
|
|
|
/**
|
|
|
|
|
* Return the flag register used in fragment shaders to keep track of live
|
2021-03-29 15:46:12 -07:00
|
|
|
* samples. On Gfx7+ we use f1.0-f1.1 to allow discard jumps in SIMD32
|
2024-02-17 22:43:47 -08:00
|
|
|
* dispatch mode.
|
2020-01-04 14:32:09 -08:00
|
|
|
*/
|
|
|
|
|
static inline unsigned
|
2023-12-05 17:16:34 -08:00
|
|
|
sample_mask_flag_subreg(const fs_visitor &s)
|
2020-01-04 14:32:09 -08:00
|
|
|
{
|
2023-12-05 17:16:34 -08:00
|
|
|
assert(s.stage == MESA_SHADER_FRAGMENT);
|
2024-02-17 22:43:47 -08:00
|
|
|
return 2;
|
2020-01-04 14:32:09 -08:00
|
|
|
}
|
|
|
|
|
|
2024-12-07 10:28:03 -08:00
|
|
|
/**
 * Build a uniform-register reference for the dynamically-provided MSAA
 * flags of a fragment shader, read as an unsigned dword.
 */
inline brw_reg
brw_dynamic_msaa_flags(const struct brw_wm_prog_data *wm_prog_data)
{
   const auto param_index = wm_prog_data->msaa_flags_param;
   return brw_uniform_reg(param_index, BRW_TYPE_UD);
}
|
|
|
|
|
|
2024-11-18 11:33:35 +02:00
|
|
|
enum intel_barycentric_mode brw_barycentric_mode(const struct brw_wm_prog_key *key,
|
|
|
|
|
nir_intrinsic_instr *intr);
|
2017-03-20 16:04:38 +00:00
|
|
|
|
2019-08-25 23:59:25 -07:00
|
|
|
uint32_t brw_fb_write_msg_control(const fs_inst *inst,
|
|
|
|
|
const struct brw_wm_prog_data *prog_data);
|
|
|
|
|
|
2018-12-11 18:45:43 +01:00
|
|
|
void brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data);
|
2019-08-25 23:59:25 -07:00
|
|
|
|
2022-08-30 00:47:32 -07:00
|
|
|
int brw_get_subgroup_id_param_index(const intel_device_info *devinfo,
|
|
|
|
|
const brw_stage_prog_data *prog_data);
|
|
|
|
|
|
2023-12-05 15:11:09 -08:00
|
|
|
void nir_to_brw(fs_visitor *s);
|
2022-08-30 00:47:32 -07:00
|
|
|
|
2024-08-27 10:16:11 -07:00
|
|
|
void brw_shader_phase_update(fs_visitor &s, enum brw_shader_phase phase);
|
|
|
|
|
|
2024-04-01 12:00:16 -07:00
|
|
|
/* IR consistency checking: a real implementation in debug builds, a no-op
 * inline stub in release (NDEBUG) builds so call sites need no #ifdefs. */
#ifndef NDEBUG
void brw_validate(const fs_visitor &s);
#else
static inline void brw_validate(const fs_visitor &s) {}
#endif
|
|
|
|
|
|
2024-07-12 17:08:46 -07:00
|
|
|
void brw_calculate_cfg(fs_visitor &s);
|
|
|
|
|
|
2024-12-06 11:37:57 -08:00
|
|
|
void brw_optimize(fs_visitor &s);
|
2024-01-04 16:42:50 -08:00
|
|
|
|
2024-12-06 22:39:15 -08:00
|
|
|
/**
 * Instruction-scheduling heuristics.  The PRE_* modes are pre-register-
 * allocation variants (NOTE(review): presumably trading latency hiding
 * against register pressure — confirm in the scheduler implementation);
 * POST runs after register allocation; NONE skips scheduling.
 */
enum brw_instruction_scheduler_mode {
   BRW_SCHEDULE_PRE,
   BRW_SCHEDULE_PRE_NON_LIFO,
   BRW_SCHEDULE_PRE_LIFO,
   BRW_SCHEDULE_POST,
   BRW_SCHEDULE_NONE,
};
|
|
|
|
|
|
|
|
|
|
class brw_instruction_scheduler;
|
|
|
|
|
|
|
|
|
|
brw_instruction_scheduler *brw_prepare_scheduler(fs_visitor &s, void *mem_ctx);
|
|
|
|
|
void brw_schedule_instructions_pre_ra(fs_visitor &s, brw_instruction_scheduler *sched,
|
|
|
|
|
brw_instruction_scheduler_mode mode);
|
2024-07-12 16:55:33 -07:00
|
|
|
void brw_schedule_instructions_post_ra(fs_visitor &s);
|
|
|
|
|
|
|
|
|
|
void brw_allocate_registers(fs_visitor &s, bool allow_spilling);
|
|
|
|
|
bool brw_assign_regs(fs_visitor &s, bool allow_spilling, bool spill_all);
|
|
|
|
|
void brw_assign_regs_trivial(fs_visitor &s);
|
|
|
|
|
|
2024-12-06 11:37:57 -08:00
|
|
|
/*
 * Lowering passes.  Each takes the visitor and returns true if it made
 * progress (i.e. modified the IR).
 *
 * NOTE(review): individual pass semantics are implied by their names
 * only — confirm details against each definition.
 */

bool brw_lower_3src_null_dest(fs_visitor &s);

bool brw_lower_alu_restrictions(fs_visitor &s);

bool brw_lower_barycentrics(fs_visitor &s);

bool brw_lower_constant_loads(fs_visitor &s);

bool brw_lower_csel(fs_visitor &s);

bool brw_lower_derivatives(fs_visitor &s);

bool brw_lower_dpas(fs_visitor &s);

bool brw_lower_find_live_channel(fs_visitor &s);

bool brw_lower_indirect_mov(fs_visitor &s);

bool brw_lower_integer_multiplication(fs_visitor &s);

bool brw_lower_load_payload(fs_visitor &s);

bool brw_lower_load_subgroup_invocation(fs_visitor &s);

bool brw_lower_logical_sends(fs_visitor &s);

bool brw_lower_pack(fs_visitor &s);

bool brw_lower_regioning(fs_visitor &s);

bool brw_lower_scalar_fp64_MAD(fs_visitor &s);

bool brw_lower_scoreboard(fs_visitor &s);
|
brw: move final send lowering up into the IR
Because we do emit the final send message form in code generation, a
lot of emissions look like this :
add(8) vgrf0, u0, 0x100
mov(1) a0.1, vgrf0 # emitted by the generator
send(8) ..., a0.1
By moving address register manipulation in the IR, we can get this
down to :
add(1) a0.1, u0, 0x100
send(8) ..., a0.1
This reduces register pressure around some send messages by 1 vgrf.
All lost shaders in the below results are fragment SIMD32, due to the
throughput estimator. If turned off, we lose no SIMD32 shaders with
this change.
DG2 results:
Assassin's Creed Valhalla:
Totals from 2044 (96.87% of 2110) affected shaders:
Instrs: 852879 -> 832044 (-2.44%); split: -2.45%, +0.00%
Subgroup size: 23832 -> 23824 (-0.03%)
Cycle count: 53345742 -> 52144277 (-2.25%); split: -5.08%, +2.82%
Spill count: 729 -> 554 (-24.01%); split: -28.40%, +4.39%
Fill count: 2005 -> 1256 (-37.36%)
Scratch Memory Size: 25600 -> 19456 (-24.00%); split: -32.00%, +8.00%
Max live registers: 116765 -> 115058 (-1.46%)
Max dispatch width: 19152 -> 18872 (-1.46%); split: +0.21%, -1.67%
Cyberpunk 2077:
Totals from 1181 (93.43% of 1264) affected shaders:
Instrs: 667192 -> 663615 (-0.54%); split: -0.55%, +0.01%
Subgroup size: 13016 -> 13032 (+0.12%)
Cycle count: 17383539 -> 17986073 (+3.47%); split: -0.93%, +4.39%
Spill count: 12 -> 8 (-33.33%)
Fill count: 9 -> 6 (-33.33%)
Dota2:
Totals from 173 (11.59% of 1493) affected shaders:
Cycle count: 274403 -> 280817 (+2.34%); split: -0.01%, +2.34%
Max live registers: 5787 -> 5779 (-0.14%)
Max dispatch width: 1344 -> 1152 (-14.29%)
Hitman3:
Totals from 5072 (95.39% of 5317) affected shaders:
Instrs: 2879952 -> 2841804 (-1.32%); split: -1.32%, +0.00%
Cycle count: 153208505 -> 165860401 (+8.26%); split: -2.22%, +10.48%
Spill count: 3942 -> 3200 (-18.82%)
Fill count: 10158 -> 8846 (-12.92%)
Scratch Memory Size: 257024 -> 223232 (-13.15%)
Max live registers: 328467 -> 324631 (-1.17%)
Max dispatch width: 43928 -> 42768 (-2.64%); split: +0.09%, -2.73%
Fortnite:
Totals from 360 (4.82% of 7472) affected shaders:
Instrs: 778068 -> 777925 (-0.02%)
Subgroup size: 3128 -> 3136 (+0.26%)
Cycle count: 38684183 -> 38734579 (+0.13%); split: -0.06%, +0.19%
Max live registers: 50689 -> 50658 (-0.06%)
Hogwarts Legacy:
Totals from 1376 (84.00% of 1638) affected shaders:
Instrs: 758810 -> 749727 (-1.20%); split: -1.23%, +0.03%
Cycle count: 27778983 -> 28805469 (+3.70%); split: -1.42%, +5.12%
Spill count: 2475 -> 2299 (-7.11%); split: -7.47%, +0.36%
Fill count: 2677 -> 2445 (-8.67%); split: -9.90%, +1.23%
Scratch Memory Size: 99328 -> 89088 (-10.31%)
Max live registers: 84969 -> 84671 (-0.35%); split: -0.58%, +0.23%
Max dispatch width: 11848 -> 11920 (+0.61%)
Metro Exodus:
Totals from 92 (0.21% of 43072) affected shaders:
Instrs: 262995 -> 262968 (-0.01%)
Cycle count: 13818007 -> 13851266 (+0.24%); split: -0.01%, +0.25%
Max live registers: 11152 -> 11140 (-0.11%)
Red Dead Redemption 2 :
Totals from 451 (7.71% of 5847) affected shaders:
Instrs: 754178 -> 753811 (-0.05%); split: -0.05%, +0.00%
Cycle count: 3484078523 -> 3484111965 (+0.00%); split: -0.00%, +0.00%
Max live registers: 42294 -> 42185 (-0.26%)
Spiderman Remastered:
Totals from 6820 (98.02% of 6958) affected shaders:
Instrs: 6921500 -> 6747933 (-2.51%); split: -4.16%, +1.65%
Cycle count: 234400692460 -> 236846720707 (+1.04%); split: -0.20%, +1.25%
Spill count: 72971 -> 72622 (-0.48%); split: -8.08%, +7.61%
Fill count: 212921 -> 198483 (-6.78%); split: -12.37%, +5.58%
Scratch Memory Size: 3491840 -> 3410944 (-2.32%); split: -12.05%, +9.74%
Max live registers: 493149 -> 487458 (-1.15%)
Max dispatch width: 56936 -> 56856 (-0.14%); split: +0.06%, -0.20%
Strange Brigade:
Totals from 3769 (91.21% of 4132) affected shaders:
Instrs: 1354476 -> 1321474 (-2.44%)
Cycle count: 25351530 -> 25339190 (-0.05%); split: -1.64%, +1.59%
Max live registers: 199057 -> 193656 (-2.71%)
Max dispatch width: 30272 -> 30240 (-0.11%)
Witcher 3:
Totals from 25 (2.40% of 1041) affected shaders:
Instrs: 24621 -> 24606 (-0.06%)
Cycle count: 2218793 -> 2217503 (-0.06%); split: -0.11%, +0.05%
Max live registers: 1963 -> 1955 (-0.41%)
LNL results:
Assassin's Creed Valhalla:
Totals from 1928 (98.02% of 1967) affected shaders:
Instrs: 856107 -> 835756 (-2.38%); split: -2.48%, +0.11%
Subgroup size: 41264 -> 41280 (+0.04%)
Cycle count: 64606590 -> 62371700 (-3.46%); split: -5.57%, +2.11%
Spill count: 915 -> 669 (-26.89%); split: -32.79%, +5.90%
Fill count: 2414 -> 1617 (-33.02%); split: -36.62%, +3.60%
Scratch Memory Size: 62464 -> 44032 (-29.51%); split: -36.07%, +6.56%
Max live registers: 205483 -> 202192 (-1.60%)
Cyberpunk 2077:
Totals from 1177 (96.40% of 1221) affected shaders:
Instrs: 682237 -> 678931 (-0.48%); split: -0.51%, +0.03%
Subgroup size: 24912 -> 24944 (+0.13%)
Cycle count: 24355928 -> 25089292 (+3.01%); split: -0.80%, +3.81%
Spill count: 8 -> 3 (-62.50%)
Fill count: 6 -> 3 (-50.00%)
Max live registers: 126922 -> 125472 (-1.14%)
Dota2:
Totals from 428 (32.47% of 1318) affected shaders:
Instrs: 89355 -> 89740 (+0.43%)
Cycle count: 1152412 -> 1152706 (+0.03%); split: -0.52%, +0.55%
Max live registers: 32863 -> 32847 (-0.05%)
Fortnite:
Totals from 5354 (81.72% of 6552) affected shaders:
Instrs: 4135059 -> 4239015 (+2.51%); split: -0.01%, +2.53%
Cycle count: 132557506 -> 132427302 (-0.10%); split: -0.75%, +0.65%
Spill count: 7144 -> 7234 (+1.26%); split: -0.46%, +1.72%
Fill count: 12086 -> 12403 (+2.62%); split: -0.73%, +3.35%
Scratch Memory Size: 600064 -> 604160 (+0.68%); split: -1.02%, +1.71%
Hitman3:
Totals from 4912 (97.09% of 5059) affected shaders:
Instrs: 2952124 -> 2916824 (-1.20%); split: -1.20%, +0.00%
Cycle count: 179985656 -> 189175250 (+5.11%); split: -2.44%, +7.55%
Spill count: 3739 -> 3136 (-16.13%)
Fill count: 10657 -> 9564 (-10.26%)
Scratch Memory Size: 373760 -> 318464 (-14.79%)
Max live registers: 597566 -> 589460 (-1.36%)
Hogwarts Legacy:
Totals from 1471 (96.33% of 1527) affected shaders:
Instrs: 748749 -> 766214 (+2.33%); split: -0.71%, +3.05%
Cycle count: 33301528 -> 34426308 (+3.38%); split: -1.30%, +4.68%
Spill count: 3278 -> 3070 (-6.35%); split: -8.30%, +1.95%
Fill count: 4553 -> 4097 (-10.02%); split: -10.85%, +0.83%
Scratch Memory Size: 251904 -> 217088 (-13.82%)
Max live registers: 168911 -> 168106 (-0.48%); split: -0.59%, +0.12%
Metro Exodus:
Totals from 18356 (49.81% of 36854) affected shaders:
Instrs: 7559386 -> 7621591 (+0.82%); split: -0.01%, +0.83%
Cycle count: 195240612 -> 196455186 (+0.62%); split: -1.22%, +1.84%
Spill count: 595 -> 546 (-8.24%)
Fill count: 1604 -> 1408 (-12.22%)
Max live registers: 2086937 -> 2086933 (-0.00%)
Red Dead Redemption 2:
Totals from 4171 (79.31% of 5259) affected shaders:
Instrs: 2619392 -> 2719587 (+3.83%); split: -0.00%, +3.83%
Subgroup size: 86416 -> 86432 (+0.02%)
Cycle count: 8542836160 -> 8531976886 (-0.13%); split: -0.65%, +0.53%
Fill count: 12949 -> 12970 (+0.16%); split: -0.43%, +0.59%
Scratch Memory Size: 401408 -> 385024 (-4.08%)
Spiderman Remastered:
Totals from 6639 (98.94% of 6710) affected shaders:
Instrs: 6877980 -> 6800592 (-1.13%); split: -3.11%, +1.98%
Cycle count: 282183352210 -> 282100051824 (-0.03%); split: -0.62%, +0.59%
Spill count: 63147 -> 64218 (+1.70%); split: -7.12%, +8.82%
Fill count: 184931 -> 175591 (-5.05%); split: -10.81%, +5.76%
Scratch Memory Size: 5318656 -> 5970944 (+12.26%); split: -5.91%, +18.17%
Max live registers: 918240 -> 906604 (-1.27%)
Strange Brigade:
Totals from 3675 (92.24% of 3984) affected shaders:
Instrs: 1462231 -> 1429345 (-2.25%); split: -2.25%, +0.00%
Cycle count: 37404050 -> 37345292 (-0.16%); split: -1.25%, +1.09%
Max live registers: 361849 -> 351265 (-2.92%)
Witcher 3:
Totals from 13 (46.43% of 28) affected shaders:
Instrs: 593 -> 660 (+11.30%)
Cycle count: 28302 -> 28714 (+1.46%)
Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28199>
2024-02-29 20:51:50 +02:00
|
|
|
bool brw_lower_send_descriptors(fs_visitor &s);

bool brw_lower_send_gather(fs_visitor &s);

bool brw_lower_sends_overlapping_payload(fs_visitor &s);

/* Split instructions wider than the hardware allows into narrower
 * ones — presumably driven by brw_get_lowered_simd_width() below;
 * confirm in the definition. */
bool brw_lower_simd_width(fs_visitor &s);

/* Unlike the surrounding shader-wide passes, this operates on a single
 * instruction: lowers the modifiers of source \p i of \p inst located
 * in \p block. */
bool brw_lower_src_modifiers(fs_visitor &s, bblock_t *block, fs_inst *inst, unsigned i);

bool brw_lower_sub_sat(fs_visitor &s);

bool brw_lower_subgroup_ops(fs_visitor &s);

bool brw_lower_uniform_pull_constant_loads(fs_visitor &s);

/* Replace virtual GRFs with fixed hardware GRF numbers.  Returns void:
 * unlike the bool-returning passes, it always applies. */
void brw_lower_vgrfs_to_fixed_grfs(fs_visitor &s);
|
|
|
|
|
|
brw: move final send lowering up into the IR
Because we do emit the final send message form in code generation, a
lot of emissions look like this :
add(8) vgrf0, u0, 0x100
mov(1) a0.1, vgrf0 # emitted by the generator
send(8) ..., a0.1
By moving address register manipulation in the IR, we can get this
down to :
add(1) a0.1, u0, 0x100
send(8) ..., a0.1
This reduces register pressure around some send messages by 1 vgrf.
All lost shaders in the below results are fragment SIMD32, due to the
throughput estimator. If turned off, we lose no SIMD32 shaders with
this change.
DG2 results:
Assassin's Creed Valhalla:
Totals from 2044 (96.87% of 2110) affected shaders:
Instrs: 852879 -> 832044 (-2.44%); split: -2.45%, +0.00%
Subgroup size: 23832 -> 23824 (-0.03%)
Cycle count: 53345742 -> 52144277 (-2.25%); split: -5.08%, +2.82%
Spill count: 729 -> 554 (-24.01%); split: -28.40%, +4.39%
Fill count: 2005 -> 1256 (-37.36%)
Scratch Memory Size: 25600 -> 19456 (-24.00%); split: -32.00%, +8.00%
Max live registers: 116765 -> 115058 (-1.46%)
Max dispatch width: 19152 -> 18872 (-1.46%); split: +0.21%, -1.67%
Cyberpunk 2077:
Totals from 1181 (93.43% of 1264) affected shaders:
Instrs: 667192 -> 663615 (-0.54%); split: -0.55%, +0.01%
Subgroup size: 13016 -> 13032 (+0.12%)
Cycle count: 17383539 -> 17986073 (+3.47%); split: -0.93%, +4.39%
Spill count: 12 -> 8 (-33.33%)
Fill count: 9 -> 6 (-33.33%)
Dota2:
Totals from 173 (11.59% of 1493) affected shaders:
Cycle count: 274403 -> 280817 (+2.34%); split: -0.01%, +2.34%
Max live registers: 5787 -> 5779 (-0.14%)
Max dispatch width: 1344 -> 1152 (-14.29%)
Hitman3:
Totals from 5072 (95.39% of 5317) affected shaders:
Instrs: 2879952 -> 2841804 (-1.32%); split: -1.32%, +0.00%
Cycle count: 153208505 -> 165860401 (+8.26%); split: -2.22%, +10.48%
Spill count: 3942 -> 3200 (-18.82%)
Fill count: 10158 -> 8846 (-12.92%)
Scratch Memory Size: 257024 -> 223232 (-13.15%)
Max live registers: 328467 -> 324631 (-1.17%)
Max dispatch width: 43928 -> 42768 (-2.64%); split: +0.09%, -2.73%
Fortnite:
Totals from 360 (4.82% of 7472) affected shaders:
Instrs: 778068 -> 777925 (-0.02%)
Subgroup size: 3128 -> 3136 (+0.26%)
Cycle count: 38684183 -> 38734579 (+0.13%); split: -0.06%, +0.19%
Max live registers: 50689 -> 50658 (-0.06%)
Hogwarts Legacy:
Totals from 1376 (84.00% of 1638) affected shaders:
Instrs: 758810 -> 749727 (-1.20%); split: -1.23%, +0.03%
Cycle count: 27778983 -> 28805469 (+3.70%); split: -1.42%, +5.12%
Spill count: 2475 -> 2299 (-7.11%); split: -7.47%, +0.36%
Fill count: 2677 -> 2445 (-8.67%); split: -9.90%, +1.23%
Scratch Memory Size: 99328 -> 89088 (-10.31%)
Max live registers: 84969 -> 84671 (-0.35%); split: -0.58%, +0.23%
Max dispatch width: 11848 -> 11920 (+0.61%)
Metro Exodus:
Totals from 92 (0.21% of 43072) affected shaders:
Instrs: 262995 -> 262968 (-0.01%)
Cycle count: 13818007 -> 13851266 (+0.24%); split: -0.01%, +0.25%
Max live registers: 11152 -> 11140 (-0.11%)
Red Dead Redemption 2 :
Totals from 451 (7.71% of 5847) affected shaders:
Instrs: 754178 -> 753811 (-0.05%); split: -0.05%, +0.00%
Cycle count: 3484078523 -> 3484111965 (+0.00%); split: -0.00%, +0.00%
Max live registers: 42294 -> 42185 (-0.26%)
Spiderman Remastered:
Totals from 6820 (98.02% of 6958) affected shaders:
Instrs: 6921500 -> 6747933 (-2.51%); split: -4.16%, +1.65%
Cycle count: 234400692460 -> 236846720707 (+1.04%); split: -0.20%, +1.25%
Spill count: 72971 -> 72622 (-0.48%); split: -8.08%, +7.61%
Fill count: 212921 -> 198483 (-6.78%); split: -12.37%, +5.58%
Scratch Memory Size: 3491840 -> 3410944 (-2.32%); split: -12.05%, +9.74%
Max live registers: 493149 -> 487458 (-1.15%)
Max dispatch width: 56936 -> 56856 (-0.14%); split: +0.06%, -0.20%
Strange Brigade:
Totals from 3769 (91.21% of 4132) affected shaders:
Instrs: 1354476 -> 1321474 (-2.44%)
Cycle count: 25351530 -> 25339190 (-0.05%); split: -1.64%, +1.59%
Max live registers: 199057 -> 193656 (-2.71%)
Max dispatch width: 30272 -> 30240 (-0.11%)
Witcher 3:
Totals from 25 (2.40% of 1041) affected shaders:
Instrs: 24621 -> 24606 (-0.06%)
Cycle count: 2218793 -> 2217503 (-0.06%); split: -0.11%, +0.05%
Max live registers: 1963 -> 1955 (-0.41%)
LNL results:
Assassin's Creed Valhalla:
Totals from 1928 (98.02% of 1967) affected shaders:
Instrs: 856107 -> 835756 (-2.38%); split: -2.48%, +0.11%
Subgroup size: 41264 -> 41280 (+0.04%)
Cycle count: 64606590 -> 62371700 (-3.46%); split: -5.57%, +2.11%
Spill count: 915 -> 669 (-26.89%); split: -32.79%, +5.90%
Fill count: 2414 -> 1617 (-33.02%); split: -36.62%, +3.60%
Scratch Memory Size: 62464 -> 44032 (-29.51%); split: -36.07%, +6.56%
Max live registers: 205483 -> 202192 (-1.60%)
Cyberpunk 2077:
Totals from 1177 (96.40% of 1221) affected shaders:
Instrs: 682237 -> 678931 (-0.48%); split: -0.51%, +0.03%
Subgroup size: 24912 -> 24944 (+0.13%)
Cycle count: 24355928 -> 25089292 (+3.01%); split: -0.80%, +3.81%
Spill count: 8 -> 3 (-62.50%)
Fill count: 6 -> 3 (-50.00%)
Max live registers: 126922 -> 125472 (-1.14%)
Dota2:
Totals from 428 (32.47% of 1318) affected shaders:
Instrs: 89355 -> 89740 (+0.43%)
Cycle count: 1152412 -> 1152706 (+0.03%); split: -0.52%, +0.55%
Max live registers: 32863 -> 32847 (-0.05%)
Fortnite:
Totals from 5354 (81.72% of 6552) affected shaders:
Instrs: 4135059 -> 4239015 (+2.51%); split: -0.01%, +2.53%
Cycle count: 132557506 -> 132427302 (-0.10%); split: -0.75%, +0.65%
Spill count: 7144 -> 7234 (+1.26%); split: -0.46%, +1.72%
Fill count: 12086 -> 12403 (+2.62%); split: -0.73%, +3.35%
Scratch Memory Size: 600064 -> 604160 (+0.68%); split: -1.02%, +1.71%
Hitman3:
Totals from 4912 (97.09% of 5059) affected shaders:
Instrs: 2952124 -> 2916824 (-1.20%); split: -1.20%, +0.00%
Cycle count: 179985656 -> 189175250 (+5.11%); split: -2.44%, +7.55%
Spill count: 3739 -> 3136 (-16.13%)
Fill count: 10657 -> 9564 (-10.26%)
Scratch Memory Size: 373760 -> 318464 (-14.79%)
Max live registers: 597566 -> 589460 (-1.36%)
Hogwarts Legacy:
Totals from 1471 (96.33% of 1527) affected shaders:
Instrs: 748749 -> 766214 (+2.33%); split: -0.71%, +3.05%
Cycle count: 33301528 -> 34426308 (+3.38%); split: -1.30%, +4.68%
Spill count: 3278 -> 3070 (-6.35%); split: -8.30%, +1.95%
Fill count: 4553 -> 4097 (-10.02%); split: -10.85%, +0.83%
Scratch Memory Size: 251904 -> 217088 (-13.82%)
Max live registers: 168911 -> 168106 (-0.48%); split: -0.59%, +0.12%
Metro Exodus:
Totals from 18356 (49.81% of 36854) affected shaders:
Instrs: 7559386 -> 7621591 (+0.82%); split: -0.01%, +0.83%
Cycle count: 195240612 -> 196455186 (+0.62%); split: -1.22%, +1.84%
Spill count: 595 -> 546 (-8.24%)
Fill count: 1604 -> 1408 (-12.22%)
Max live registers: 2086937 -> 2086933 (-0.00%)
Red Dead Redemption 2:
Totals from 4171 (79.31% of 5259) affected shaders:
Instrs: 2619392 -> 2719587 (+3.83%); split: -0.00%, +3.83%
Subgroup size: 86416 -> 86432 (+0.02%)
Cycle count: 8542836160 -> 8531976886 (-0.13%); split: -0.65%, +0.53%
Fill count: 12949 -> 12970 (+0.16%); split: -0.43%, +0.59%
Scratch Memory Size: 401408 -> 385024 (-4.08%)
Spiderman Remastered:
Totals from 6639 (98.94% of 6710) affected shaders:
Instrs: 6877980 -> 6800592 (-1.13%); split: -3.11%, +1.98%
Cycle count: 282183352210 -> 282100051824 (-0.03%); split: -0.62%, +0.59%
Spill count: 63147 -> 64218 (+1.70%); split: -7.12%, +8.82%
Fill count: 184931 -> 175591 (-5.05%); split: -10.81%, +5.76%
Scratch Memory Size: 5318656 -> 5970944 (+12.26%); split: -5.91%, +18.17%
Max live registers: 918240 -> 906604 (-1.27%)
Strange Brigade:
Totals from 3675 (92.24% of 3984) affected shaders:
Instrs: 1462231 -> 1429345 (-2.25%); split: -2.25%, +0.00%
Cycle count: 37404050 -> 37345292 (-0.16%); split: -1.25%, +1.09%
Max live registers: 361849 -> 351265 (-2.92%)
Witcher 3:
Totals from 13 (46.43% of 28) affected shaders:
Instrs: 593 -> 660 (+11.30%)
Cycle count: 28302 -> 28714 (+1.46%)
Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28199>
2024-02-29 20:51:50 +02:00
|
|
|
/*
 * Optimization passes.  Each takes the visitor and returns true if it
 * made progress (i.e. modified the IR).
 */

bool brw_opt_address_reg_load(fs_visitor &s);

bool brw_opt_algebraic(fs_visitor &s);

bool brw_opt_bank_conflicts(fs_visitor &s);

bool brw_opt_cmod_propagation(fs_visitor &s);

bool brw_opt_combine_constants(fs_visitor &s);
|
brw: Combine convergent texture buffer fetches into fewer loads
Borderlands 3 (both DX11 and DX12 renderers) have a common pattern
across many shaders:
con 32x4 %510 = (uint32)txf %2 (handle), %1191 (0x10) (coord), %1 (0x0) (lod), 0 (texture)
con 32x4 %512 = (uint32)txf %2 (handle), %1511 (0x11) (coord), %1 (0x0) (lod), 0 (texture)
...
con 32x4 %550 = (uint32)txf %2 (handle), %1549 (0x25) (coord), %1 (0x0) (lod), 0 (texture)
con 32x4 %552 = (uint32)txf %2 (handle), %1551 (0x26) (coord), %1 (0x0) (lod), 0 (texture)
A single basic block contains piles of texelFetches from a 1D buffer
texture, with constant coordinates. In most cases, only the .x channel
of the result is read. So we have something on the order of 28 sampler
messages, each asking for...a single uint32_t scalar value. Because our
sampler doesn't have any support for convergent block loads (like the
untyped LSC transpose messages for SSBOs)...this means we were emitting
SIMD8/16 (or SIMD16/32 on Xe2) sampler messages for every single scalar,
replicating what's effectively a SIMD1 value to the entire register.
This is hugely wasteful, both in terms of register pressure, and also in
back-and-forth sending and receiving memory messages.
The good news is we can take advantage of our explicit SIMD model to
handle this more efficiently. This patch adds a new optimization pass
that detects a series of SHADER_OPCODE_TXF_LOGICAL, in the same basic
block, with constant offsets, from the same texture. It constructs a
new divergent coordinate where each channel is one of the constants
(i.e <10, 11, 12, ..., 26> in the above example). It issues a new
NoMask divergent texel fetch which loads N useful channels in one go,
and replaces the rest with expansion MOVs that splat the SIMD1 result
back to the full SIMD width. (These get copy propagated away.)
We can pick the SIMD size of the load independently of the native shader
width as well. On Xe2, those 28 convergent loads become a single SIMD32
ld message. On earlier hardware, we use 2 SIMD16 messages. Or we can
use a smaller size when there aren't many to combine.
In fossil-db, this cuts 27% of send messages in affected shaders, 3-6%
of cycles, 2-3% of instructions, and 8-12% of live registers. On A770,
this improves performance of Borderlands 3 by roughly 2.5-3.5%.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32573>
2024-12-09 13:25:18 -08:00
|
|
|
/* Combine series of constant-coordinate (convergent) texel fetches from
 * the same texture within a basic block into fewer, wider NoMask loads,
 * splatting each scalar result back to full SIMD width. */
bool brw_opt_combine_convergent_txf(fs_visitor &s);
|
2024-12-06 11:37:57 -08:00
|
|
|
bool brw_opt_compact_virtual_grfs(fs_visitor &s);

/* Unlike the surrounding shader-wide passes, this folds a single
 * instruction \p inst; returns true if it changed it. */
bool brw_opt_constant_fold_instruction(const intel_device_info *devinfo, fs_inst *inst);

bool brw_opt_copy_propagation(fs_visitor &s);

/* Copy propagation variant operating on def/use information — confirm
 * the exact relationship with brw_opt_copy_propagation() above. */
bool brw_opt_copy_propagation_defs(fs_visitor &s);

bool brw_opt_cse_defs(fs_visitor &s);

bool brw_opt_dead_code_eliminate(fs_visitor &s);

bool brw_opt_eliminate_find_live_channel(fs_visitor &s);

bool brw_opt_register_coalesce(fs_visitor &s);

bool brw_opt_remove_extra_rounding_modes(fs_visitor &s);

bool brw_opt_remove_redundant_halts(fs_visitor &s);

bool brw_opt_saturate_propagation(fs_visitor &s);

bool brw_opt_send_gather_to_send(fs_visitor &s);

bool brw_opt_send_to_send_gather(fs_visitor &s);

bool brw_opt_split_sends(fs_visitor &s);

bool brw_opt_split_virtual_grfs(fs_visitor &s);

bool brw_opt_zero_samples(fs_visitor &s);
|
|
|
|
|
|
|
|
|
|
/*
 * Hardware workaround passes.  Each returns true if it made progress.
 * Specific workaround numbers/conditions live in the definitions.
 */

bool brw_workaround_emit_dummy_mov_instruction(fs_visitor &s);

bool brw_workaround_memory_fence_before_eot(fs_visitor &s);

bool brw_workaround_nomask_control_flow(fs_visitor &s);

bool brw_workaround_source_arf_before_eot(fs_visitor &s);
|
2024-01-03 15:47:28 -08:00
|
|
|
|
2024-01-04 22:07:17 -08:00
|
|
|
/* Helpers. */
|
2024-12-06 11:37:57 -08:00
|
|
|
/* Return the maximum SIMD width at which \p inst can be executed for
 * \p shader's target — presumably the width brw_lower_simd_width()
 * splits instructions down to; confirm in the definition. */
unsigned brw_get_lowered_simd_width(const fs_visitor *shader,
                                    const fs_inst *inst);
|