mesa/src/intel/compiler/brw_fs.h


/*
* Copyright © 2010 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* Authors:
* Eric Anholt <eric@anholt.net>
*
*/
#ifndef BRW_FS_H
#define BRW_FS_H
#include "brw_shader.h"
#include "brw_ir_fs.h"
#include "brw_fs_live_variables.h"
#include "brw_ir_performance.h"
#include "compiler/nir/nir.h"
struct bblock_t;
namespace {
struct acp_entry;
}
class fs_visitor;
namespace brw {
/**
* Register pressure analysis of a shader. Estimates how many registers
* are live at any point of the program in GRF units.
*/
struct register_pressure {
register_pressure(const fs_visitor *v);
~register_pressure();
analysis_dependency_class
dependency_class() const
{
return (DEPENDENCY_INSTRUCTION_IDENTITY |
DEPENDENCY_INSTRUCTION_DATA_FLOW |
DEPENDENCY_VARIABLES);
}
bool
validate(const fs_visitor *) const
{
/* FINISHME */
return true;
}
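/**
 * Estimated register pressure in GRF units at each instruction, indexed
 * by IP (the instruction's position in program order).
 */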
unsigned *regs_live_at_ip;
};
}
struct brw_gs_compile;
namespace brw {
class fs_builder;
}
struct shader_stats {
const char *scheduler_mode;
unsigned promoted_constants;
unsigned spill_count;
unsigned fill_count;
unsigned max_register_pressure;
};
/** Register numbers for thread payload fields. */
struct thread_payload {
/** The number of thread payload registers the hardware will supply. */
uint8_t num_regs;
virtual ~thread_payload() = default;
protected:
thread_payload() : num_regs() {}
};
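/* Each stage-specific payload struct below records where the fixed-function
 * hardware delivers its thread payload fields, either as GRF numbers or as
 * fs_regs set up by the corresponding constructor. fs_visitor exposes the
 * correctly-typed view through the stage-checked accessors further down
 * (vs_payload(), fs_payload(), cs_payload(), etc.). */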
struct vs_thread_payload : public thread_payload {
vs_thread_payload(const fs_visitor &v);
fs_reg urb_handles;
};
struct tcs_thread_payload : public thread_payload {
tcs_thread_payload(const fs_visitor &v);
fs_reg patch_urb_output;
fs_reg primitive_id;
fs_reg icp_handle_start;
};
struct tes_thread_payload : public thread_payload {
tes_thread_payload(const fs_visitor &v);
fs_reg patch_urb_input;
fs_reg primitive_id;
fs_reg coords[3];
fs_reg urb_output;
};
struct gs_thread_payload : public thread_payload {
gs_thread_payload(fs_visitor &v);
fs_reg urb_handles;
fs_reg primitive_id;
fs_reg instance_id;
fs_reg icp_handle_start;
};
struct fs_thread_payload : public thread_payload {
fs_thread_payload(const fs_visitor &v,
bool &source_depth_to_render_target,
bool &runtime_check_aads_emit);
uint8_t subspan_coord_reg[2];
uint8_t source_depth_reg[2];
uint8_t source_w_reg[2];
uint8_t aa_dest_stencil_reg[2];
uint8_t dest_depth_reg[2];
uint8_t sample_pos_reg[2];
uint8_t sample_mask_in_reg[2];
uint8_t depth_w_coef_reg;
uint8_t barycentric_coord_reg[BRW_BARYCENTRIC_MODE_COUNT][2];
};
struct cs_thread_payload : public thread_payload {
cs_thread_payload(const fs_visitor &v);
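/* Emit code loading this thread's subgroup index into `dest`, based on the
 * subgroup_id_ payload field below; see the .cpp implementation for the
 * exact sequence. */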
void load_subgroup_id(const brw::fs_builder &bld, fs_reg &dest) const;
fs_reg local_invocation_id[3];
protected:
fs_reg subgroup_id_;
};
struct task_mesh_thread_payload : public cs_thread_payload {
task_mesh_thread_payload(fs_visitor &v);
fs_reg extended_parameter_0;
fs_reg local_index;
fs_reg inline_parameter;
fs_reg urb_output;
/* URB handle used to read Task memory inputs. Only valid for the MESH stage. */
fs_reg task_urb_input;
};
struct bs_thread_payload : public thread_payload {
bs_thread_payload(const fs_visitor &v);
fs_reg global_arg_ptr;
fs_reg local_arg_ptr;
void load_shader_type(const brw::fs_builder &bld, fs_reg &dest) const;
};
class fs_instruction_scheduler;
/**
* The fragment shader front-end.
*
* Translates NIR into FS IR.
*/
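/*
 * Rough compilation flow (an illustrative sketch only; the authoritative
 * entry points are the brw_compile_*() functions in the corresponding
 * .cpp files, and argument lists are abbreviated here):
 *
 *    fs_visitor v(compiler, params, wm_key, wm_prog_data, nir,
 *                 dispatch_width, num_polygons,
 *                 needs_register_pressure, debug_enabled);
 *    if (!v.run_fs(allow_spilling, false))
 *       return NULL;        v.fail_msg explains why compilation failed
 *
 *    fs_generator g(compiler, params, &wm_prog_data->base,
 *                   v.runtime_check_aads_emit, MESA_SHADER_FRAGMENT);
 *    g.generate_code(v.cfg, v.dispatch_width, v.shader_stats,
 *                    v.performance_analysis.require(), stats);
 *    const unsigned *assembly = g.get_assembly();
 */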
class fs_visitor : public backend_shader
{
public:
fs_visitor(const struct brw_compiler *compiler,
const struct brw_compile_params *params,
const brw_base_prog_key *key,
struct brw_stage_prog_data *prog_data,
const nir_shader *shader,
unsigned dispatch_width,
bool needs_register_pressure,
bool debug_enabled);
fs_visitor(const struct brw_compiler *compiler,
const struct brw_compile_params *params,
const brw_wm_prog_key *key,
struct brw_wm_prog_data *prog_data,
const nir_shader *shader,
unsigned dispatch_width,
unsigned num_polygons,
bool needs_register_pressure,
bool debug_enabled);
fs_visitor(const struct brw_compiler *compiler,
const struct brw_compile_params *params,
struct brw_gs_compile *gs_compile,
struct brw_gs_prog_data *prog_data,
const nir_shader *shader,
bool needs_register_pressure,
bool debug_enabled);
void init();
~fs_visitor();
fs_reg vgrf(const glsl_type *const type);
void import_uniforms(fs_visitor *v);
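/**
 * Emit a single VARYING_PULL_CONSTANT_LOAD of `components` components
 * read from the given surface starting at varying_offset + const_offset,
 * which only needs to be `alignment`-byte aligned.
 */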
void VARYING_PULL_CONSTANT_LOAD(const brw::fs_builder &bld,
const fs_reg &dst,
const fs_reg &surface,
const fs_reg &surface_handle,
const fs_reg &varying_offset,
uint32_t const_offset,
uint8_t alignment,
unsigned components);
void DEP_RESOLVE_MOV(const brw::fs_builder &bld, int grf);
bool run_fs(bool allow_spilling, bool do_rep_send);
bool run_vs();
bool run_tcs();
bool run_tes();
bool run_gs();
bool run_cs(bool allow_spilling);
bool run_bs(bool allow_spilling);
bool run_task(bool allow_spilling);
bool run_mesh(bool allow_spilling);
void optimize();
void allocate_registers(bool allow_spilling);
uint32_t compute_max_register_pressure();
bool fixup_sends_duplicate_payload();
void fixup_3src_null_dest();
void emit_dummy_memory_fence_before_eot();
void emit_dummy_mov_instruction();
bool fixup_nomask_control_flow();
void assign_curb_setup();
void assign_urb_setup();
void convert_attr_sources_to_hw_regs(fs_inst *inst);
void assign_vs_urb_setup();
void assign_tcs_urb_setup();
void assign_tes_urb_setup();
void assign_gs_urb_setup();
bool assign_regs(bool allow_spilling, bool spill_all);
void assign_regs_trivial();
void calculate_payload_ranges(unsigned payload_node_count,
int *payload_last_use_ip) const;
bool split_virtual_grfs();
bool compact_virtual_grfs();
void assign_constant_locations();
bool get_pull_locs(const fs_reg &src, unsigned *out_surf_index,
unsigned *out_pull_index);
bool lower_constant_loads();
virtual void invalidate_analysis(brw::analysis_dependency_class c);
#ifndef NDEBUG
void validate();
#else
void validate() {}
#endif
bool opt_algebraic();
bool opt_redundant_halt();
bool opt_cse();
bool opt_cse_local(const brw::fs_live_variables &live, bblock_t *block, int &ip);
bool opt_copy_propagation();
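/**
 * GRF bank conflict mitigation: rearrange the post-register-allocation GRF
 * layout to reduce bank conflicts on ternary instructions (mostly MAD).
 */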
bool opt_bank_conflicts();
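/**
 * Detect a SEND with a single payload built by an adjacent LOAD_PAYLOAD,
 * split that payload in two at a natural boundary, and rewrite the SEND to
 * use both smaller payloads, reducing payload construction and register
 * pressure.
 */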
bool opt_split_sends();
bool register_coalesce();
bool eliminate_find_live_channel();
bool dead_code_eliminate();
bool remove_extra_rounding_modes();
fs_instruction_scheduler *prepare_scheduler(void *mem_ctx);
void schedule_instructions_pre_ra(fs_instruction_scheduler *sched,
instruction_scheduler_mode mode);
void schedule_instructions_post_ra();
void vfail(const char *msg, va_list args);
void fail(const char *msg, ...);
void limit_dispatch_width(unsigned n, const char *msg);
bool lower_uniform_pull_constant_loads();
bool lower_load_payload();
bool lower_pack();
bool lower_regioning();
bool lower_logical_sends();
bool lower_integer_multiplication();
bool lower_simd_width();
bool lower_barycentrics();
bool lower_derivatives();
bool lower_find_live_channel();
bool lower_scoreboard();
bool lower_sub_sat();
bool opt_combine_constants();
void emit_repclear_shader();
void emit_interpolation_setup_gfx4();
void emit_interpolation_setup_gfx6();
bool opt_peephole_sel();
bool opt_saturate_propagation();
bool opt_cmod_propagation();
bool opt_zero_samples();
void set_tcs_invocation_id();
void emit_alpha_test();
fs_inst *emit_single_fb_write(const brw::fs_builder &bld,
fs_reg color1, fs_reg color2,
fs_reg src0_alpha, unsigned components);
void do_emit_fb_writes(int nr_color_regions, bool replicate_alpha);
void emit_fb_writes();
void emit_urb_writes(const fs_reg &gs_vertex_count = fs_reg());
void emit_gs_control_data_bits(const fs_reg &vertex_count);
void emit_gs_thread_end();
bool mark_last_urb_write_with_eot();
void emit_tcs_thread_end();
void emit_urb_fence();
void emit_cs_terminate();
fs_reg interp_reg(const brw::fs_builder &bld, unsigned location,
unsigned channel, unsigned comp);
fs_reg per_primitive_reg(const brw::fs_builder &bld,
int location, unsigned comp);
virtual void dump_instruction_to_file(const backend_instruction *inst, FILE *file) const;
virtual void dump_instructions_to_file(FILE *file) const;
const brw_base_prog_key *const key;
const struct brw_sampler_prog_key_data *key_tex;
struct brw_gs_compile *gs_compile;
struct brw_stage_prog_data *prog_data;
brw_analysis<brw::fs_live_variables, backend_shader> live_analysis;
brw_analysis<brw::register_pressure, fs_visitor> regpressure_analysis;
brw_analysis<brw::performance, fs_visitor> performance_analysis;
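/*
 * Illustrative use of the analysis wrappers above (a sketch, assuming the
 * brw_analysis require()/invalidate() interface; exact call sites vary):
 *
 *    const brw::fs_live_variables &live = live_analysis.require();
 *    ...mutate the IR...
 *    invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
 *                        DEPENDENCY_VARIABLES);
 *
 * Passes that change the IR must call invalidate_analysis() with the
 * dependency classes they affect so that stale results are recomputed on
 * the next require().
 */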
/** Number of uniform variable components visited. */
unsigned uniforms;
/** Byte-offset for the next available spot in the scratch space buffer. */
unsigned last_scratch;
/**
* Array mapping UNIFORM register numbers to the push parameter index,
* or -1 if this uniform register isn't being uploaded as a push constant.
*/
int *push_constant_loc;
fs_reg frag_depth;
fs_reg frag_stencil;
fs_reg sample_mask;
fs_reg outputs[VARYING_SLOT_MAX];
fs_reg dual_src_output;
int first_non_payload_grf;
/** Either BRW_MAX_GRF or GFX7_MRF_HACK_START */
unsigned max_grf;
bool failed;
char *fail_msg;
thread_payload *payload_;
thread_payload &payload() {
return *this->payload_;
}
vs_thread_payload &vs_payload() {
assert(stage == MESA_SHADER_VERTEX);
return *static_cast<vs_thread_payload *>(this->payload_);
}
tcs_thread_payload &tcs_payload() {
assert(stage == MESA_SHADER_TESS_CTRL);
return *static_cast<tcs_thread_payload *>(this->payload_);
}
tes_thread_payload &tes_payload() {
assert(stage == MESA_SHADER_TESS_EVAL);
return *static_cast<tes_thread_payload *>(this->payload_);
}
gs_thread_payload &gs_payload() {
assert(stage == MESA_SHADER_GEOMETRY);
return *static_cast<gs_thread_payload *>(this->payload_);
}
fs_thread_payload &fs_payload() {
assert(stage == MESA_SHADER_FRAGMENT);
return *static_cast<fs_thread_payload *>(this->payload_);
};
cs_thread_payload &cs_payload() {
assert(gl_shader_stage_uses_workgroup(stage));
return *static_cast<cs_thread_payload *>(this->payload_);
}
task_mesh_thread_payload &task_mesh_payload() {
assert(stage == MESA_SHADER_TASK || stage == MESA_SHADER_MESH);
return *static_cast<task_mesh_thread_payload *>(this->payload_);
}
bs_thread_payload &bs_payload() {
assert(stage >= MESA_SHADER_RAYGEN && stage <= MESA_SHADER_CALLABLE);
return *static_cast<bs_thread_payload *>(this->payload_);
}
bool source_depth_to_render_target;
bool runtime_check_aads_emit;
fs_reg pixel_x;
fs_reg pixel_y;
fs_reg pixel_z;
fs_reg wpos_w;
fs_reg pixel_w;
fs_reg delta_xy[BRW_BARYCENTRIC_MODE_COUNT];
fs_reg final_gs_vertex_count;
fs_reg control_data_bits;
fs_reg invocation_id;
unsigned grf_used;
bool spilled_any_registers;
bool needs_register_pressure;
const unsigned dispatch_width; /**< 8, 16 or 32 */
const unsigned max_polygons;
unsigned max_dispatch_width;
/* The API-selected subgroup size */
unsigned api_subgroup_size; /**< 0, 8, 16, 32 */
struct shader_stats shader_stats;
void lower_mul_dword_inst(fs_inst *inst, bblock_t *block);
void lower_mul_qword_inst(fs_inst *inst, bblock_t *block);
void lower_mulh_inst(fs_inst *inst, bblock_t *block);
unsigned workgroup_size() const;
void debug_optimizer(const nir_shader *nir,
const char *pass_name,
int iteration, int pass_num) const;
};
/**
* Return the flag register used in fragment shaders to keep track of live
* samples. On Gfx7+ we use f1.0-f1.1 to allow discard jumps in SIMD32
* dispatch mode, while earlier generations are constrained to f0.1, which
* limits the dispatch width to SIMD16 for fragment shaders that use discard.
*/
static inline unsigned
sample_mask_flag_subreg(const fs_visitor &s)
{
assert(s.stage == MESA_SHADER_FRAGMENT);
return s.devinfo->ver >= 7 ? 2 : 1;
}
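/* The value returned above is a flag subregister index in 16-bit units, so
 * 2 selects f1.0 and 1 selects f0.1, matching the comment above (inferred
 * from the flag_subreg conventions used elsewhere in the backend). */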
/**
* The fragment shader code generator.
*
* Translates FS IR to actual i965 assembly code.
*/
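/*
 * Consumes the cfg_t produced by fs_visitor and emits binary instructions
 * through brw_codegen; see the usage sketch above fs_visitor for how the
 * two classes fit together.
 */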
class fs_generator
{
public:
fs_generator(const struct brw_compiler *compiler,
const struct brw_compile_params *params,
struct brw_stage_prog_data *prog_data,
bool runtime_check_aads_emit,
gl_shader_stage stage);
~fs_generator();
void enable_debug(const char *shader_name);
int generate_code(const cfg_t *cfg, int dispatch_width,
struct shader_stats shader_stats,
const brw::performance &perf,
struct brw_compile_stats *stats,
unsigned max_polygons = 0);
void add_const_data(void *data, unsigned size);
void add_resume_sbt(unsigned num_resume_shaders, uint64_t *sbt);
const unsigned *get_assembly();
private:
void fire_fb_write(fs_inst *inst,
struct brw_reg payload,
struct brw_reg implied_header,
GLuint nr);
void generate_send(fs_inst *inst,
struct brw_reg dst,
struct brw_reg desc,
struct brw_reg ex_desc,
struct brw_reg payload,
struct brw_reg payload2);
void generate_fb_write(fs_inst *inst, struct brw_reg payload);
void generate_fb_read(fs_inst *inst, struct brw_reg dst,
struct brw_reg payload);
void generate_cs_terminate(fs_inst *inst, struct brw_reg payload);
void generate_barrier(fs_inst *inst, struct brw_reg src);
bool generate_linterp(fs_inst *inst, struct brw_reg dst,
struct brw_reg *src);
void generate_tex(fs_inst *inst, struct brw_reg dst,
struct brw_reg surface_index,
struct brw_reg sampler_index);
void generate_ddx(const fs_inst *inst,
struct brw_reg dst, struct brw_reg src);
void generate_ddy(const fs_inst *inst,
struct brw_reg dst, struct brw_reg src);
void generate_scratch_write(fs_inst *inst, struct brw_reg src);
void generate_scratch_read(fs_inst *inst, struct brw_reg dst);
void generate_scratch_read_gfx7(fs_inst *inst, struct brw_reg dst);
void generate_scratch_header(fs_inst *inst, struct brw_reg dst);
void generate_uniform_pull_constant_load(fs_inst *inst, struct brw_reg dst,
struct brw_reg index,
struct brw_reg offset);
void generate_varying_pull_constant_load_gfx4(fs_inst *inst,
struct brw_reg dst,
struct brw_reg index);
void generate_set_sample_id(fs_inst *inst,
struct brw_reg dst,
struct brw_reg src0,
struct brw_reg src1);
void generate_halt(fs_inst *inst);
void generate_mov_indirect(fs_inst *inst,
struct brw_reg dst,
struct brw_reg reg,
struct brw_reg indirect_byte_offset);
void generate_shuffle(fs_inst *inst,
struct brw_reg dst,
struct brw_reg src,
struct brw_reg idx);
void generate_quad_swizzle(const fs_inst *inst,
struct brw_reg dst, struct brw_reg src,
unsigned swiz);
bool patch_halt_jumps();
const struct brw_compiler *compiler;
const struct brw_compile_params *params;
const struct intel_device_info *devinfo;
struct brw_codegen *p;
struct brw_stage_prog_data * const prog_data;
unsigned dispatch_width; /**< 8, 16 or 32 */
exec_list discard_halt_patches;
bool runtime_check_aads_emit;
bool debug_flag;
const char *shader_name;
gl_shader_stage stage;
void *mem_ctx;
};
namespace brw {
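/* Build an fs_reg covering the payload registers recorded in regs[], where
 * regs[0] and regs[1] hold the GRF numbers for the two SIMD16 halves (the
 * second entry only matters in SIMD32); see the definitions for the exact
 * layout. */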
fs_reg
fetch_payload_reg(const brw::fs_builder &bld, uint8_t regs[2],
brw_reg_type type = BRW_REGISTER_TYPE_F,
unsigned n = 1);
fs_reg
fetch_barycentric_reg(const brw::fs_builder &bld, uint8_t regs[2]);
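/* Returns the UD uniform register holding the intel_msaa_flags value pushed
 * at wm_prog_data->msaa_flags_param (cf. check_dynamic_msaa_flag() below). */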
inline fs_reg
dynamic_msaa_flags(const struct brw_wm_prog_data *wm_prog_data)
{
return fs_reg(UNIFORM, wm_prog_data->msaa_flags_param,
BRW_REGISTER_TYPE_UD);
}
void
check_dynamic_msaa_flag(const fs_builder &bld,
const struct brw_wm_prog_data *wm_prog_data,
enum intel_msaa_flags flag);
bool
lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i);
}
void shuffle_from_32bit_read(const brw::fs_builder &bld,
const fs_reg &dst,
const fs_reg &src,
uint32_t first_component,
uint32_t components);
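/* Helpers that build immediates of types lacking a convenient native
 * immediate encoding (double-float, signed and unsigned byte); they take a
 * builder because they may need to emit instructions to materialize the
 * value. This is an assumption based on the declarations; see the
 * definitions for the exact behavior. */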
fs_reg setup_imm_df(const brw::fs_builder &bld,
double v);
fs_reg setup_imm_b(const brw::fs_builder &bld,
int8_t v);
fs_reg setup_imm_ub(const brw::fs_builder &bld,
uint8_t v);
enum brw_barycentric_mode brw_barycentric_mode(nir_intrinsic_instr *intr);
uint32_t brw_fb_write_msg_control(const fs_inst *inst,
const struct brw_wm_prog_data *prog_data);
void brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data);
bool brw_nir_lower_simd(nir_shader *nir, unsigned dispatch_width);
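/* Rough summary: helpers returning the register holding the current sample
 * mask and predicating an instruction on it, related to the flag subregister
 * chosen by sample_mask_flag_subreg() above. */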
fs_reg brw_sample_mask_reg(const brw::fs_builder &bld);
void brw_emit_predicate_on_sample_mask(const brw::fs_builder &bld, fs_inst *inst);
int brw_get_subgroup_id_param_index(const intel_device_info *devinfo,
const brw_stage_prog_data *prog_data);
bool brw_lower_dpas(fs_visitor &v);
void nir_to_brw(fs_visitor *s);
#endif /* BRW_FS_H */