/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *
 */

#ifndef BRW_FS_H
#define BRW_FS_H

#include "brw_cfg.h"
#include "brw_compiler.h"
#include "brw_ir_allocator.h"
#include "brw_ir_fs.h"
#include "brw_fs_live_variables.h"
#include "brw_ir_performance.h"
#include "compiler/nir/nir.h"

struct bblock_t;
namespace {
   struct acp_entry;
}

struct fs_visitor;

namespace brw {
   /**
    * Register pressure analysis of a shader.  Estimates how many registers
    * are live at any point of the program in GRF units.
    */
   struct register_pressure {
      register_pressure(const fs_visitor *v);
      ~register_pressure();

      analysis_dependency_class
      dependency_class() const
      {
         return (DEPENDENCY_INSTRUCTION_IDENTITY |
                 DEPENDENCY_INSTRUCTION_DATA_FLOW |
                 DEPENDENCY_VARIABLES);
      }

      bool
      validate(const fs_visitor *) const
      {
         /* FINISHME */
         return true;
      }

      unsigned *regs_live_at_ip;
   };
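
   /*
    * Illustrative sketch, not part of the interface: this analysis is
    * normally obtained through fs_visitor::regpressure_analysis rather than
    * constructed directly, and regs_live_at_ip is indexed by instruction IP.
    * Assuming the brw_analysis require() accessor and the foreach macros
    * from brw_cfg.h, peak pressure could be computed roughly as:
    *
    *    const brw::register_pressure &rp = s.regpressure_analysis.require();
    *    unsigned ip = 0, peak = 0;
    *    foreach_block_and_inst(block, fs_inst, inst, s.cfg)
    *       peak = MAX2(peak, rp.regs_live_at_ip[ip++]);
    */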

   class def_analysis {
   public:
      def_analysis(const fs_visitor *v);
      ~def_analysis();

      fs_inst *
      get(const fs_reg &reg) const
      {
         return reg.file == VGRF && reg.nr < def_count ?
                def_insts[reg.nr] : NULL;
      }

      bblock_t *
      get_block(const fs_reg &reg) const
      {
         return reg.file == VGRF && reg.nr < def_count ?
                def_blocks[reg.nr] : NULL;
      }

      uint32_t
      get_use_count(const fs_reg &reg) const
      {
         return reg.file == VGRF && reg.nr < def_count ?
                def_use_counts[reg.nr] : 0;
      }

      unsigned count() const { return def_count; }

      void print_stats(const fs_visitor *) const;

      analysis_dependency_class
      dependency_class() const
      {
         return DEPENDENCY_INSTRUCTION_IDENTITY |
                DEPENDENCY_INSTRUCTION_DATA_FLOW |
                DEPENDENCY_VARIABLES |
                DEPENDENCY_BLOCKS;
      }

      bool validate(const fs_visitor *) const;

   private:
      void mark_invalid(int);
      bool fully_defines(const fs_visitor *v, fs_inst *);
      void update_for_reads(const idom_tree &idom, bblock_t *block, fs_inst *);
      void update_for_write(const fs_visitor *v, bblock_t *block, fs_inst *);

      fs_inst **def_insts;
      bblock_t **def_blocks;
      uint32_t *def_use_counts;
      unsigned def_count;
   };
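
   /*
    * Illustrative sketch: for a virtual GRF written exactly once (an
    * SSA-like "def"), get() returns the defining instruction, letting a
    * pass chase an operand back to its producer.  Assuming a visitor "s"
    * and the brw_analysis require() accessor:
    *
    *    const brw::def_analysis &defs = s.def_analysis.require();
    *    fs_inst *producer = defs.get(inst->src[0]);
    *    if (producer && defs.get_use_count(inst->src[0]) == 1) {
    *       // "inst" is the only reader of this def
    *    }
    */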
}

#define UBO_START ((1 << 16) - 4)

/**
 * Scratch data used when compiling a GLSL geometry shader.
 */
struct brw_gs_compile
{
   struct brw_gs_prog_key key;
   struct intel_vue_map input_vue_map;

   unsigned control_data_bits_per_vertex;
   unsigned control_data_header_size_bits;
};

namespace brw {
   class fs_builder;
}

struct shader_stats {
   const char *scheduler_mode;
   unsigned promoted_constants;
   unsigned spill_count;
   unsigned fill_count;
   unsigned max_register_pressure;
};

/** Register numbers for thread payload fields. */
struct thread_payload {
   /** The number of thread payload registers the hardware will supply. */
   uint8_t num_regs;

   virtual ~thread_payload() = default;

protected:
   thread_payload() : num_regs() {}
};
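
/*
 * Note (informal): each stage derives its payload layout from thread_payload,
 * and fs_visitor::payload_ points at the variant for the stage being
 * compiled.  The typed accessors on fs_visitor (fs_payload(), cs_payload(),
 * ...) assert the stage and downcast.  The uint8_t fields in fs_thread_payload
 * below are payload GRF numbers, so a fragment-shader sketch might look like
 * (brw_vec8_grf() from brw_reg.h is assumed here):
 *
 *    const fs_thread_payload &payload = s.fs_payload();
 *    struct brw_reg w = brw_vec8_grf(payload.source_w_reg[0], 0);
 */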

struct vs_thread_payload : public thread_payload {
   vs_thread_payload(const fs_visitor &v);

   fs_reg urb_handles;
};

struct tcs_thread_payload : public thread_payload {
   tcs_thread_payload(const fs_visitor &v);

   fs_reg patch_urb_output;
   fs_reg primitive_id;
   fs_reg icp_handle_start;
};

struct tes_thread_payload : public thread_payload {
   tes_thread_payload(const fs_visitor &v);

   fs_reg patch_urb_input;
   fs_reg primitive_id;
   fs_reg coords[3];
   fs_reg urb_output;
};

struct gs_thread_payload : public thread_payload {
   gs_thread_payload(fs_visitor &v);

   fs_reg urb_handles;
   fs_reg primitive_id;
   fs_reg instance_id;
   fs_reg icp_handle_start;
};

struct fs_thread_payload : public thread_payload {
   fs_thread_payload(const fs_visitor &v,
                     bool &source_depth_to_render_target);

   uint8_t subspan_coord_reg[2];
   uint8_t source_depth_reg[2];
   uint8_t source_w_reg[2];
   uint8_t aa_dest_stencil_reg[2];
   uint8_t dest_depth_reg[2];
   uint8_t sample_pos_reg[2];
   uint8_t sample_mask_in_reg[2];
   uint8_t barycentric_coord_reg[BRW_BARYCENTRIC_MODE_COUNT][2];

   uint8_t depth_w_coef_reg;
   uint8_t pc_bary_coef_reg;
   uint8_t npc_bary_coef_reg;
   uint8_t sample_offsets_reg;
};

struct cs_thread_payload : public thread_payload {
   cs_thread_payload(const fs_visitor &v);

   void load_subgroup_id(const brw::fs_builder &bld, fs_reg &dest) const;

   fs_reg local_invocation_id[3];

protected:
   fs_reg subgroup_id_;
};

struct task_mesh_thread_payload : public cs_thread_payload {
   task_mesh_thread_payload(fs_visitor &v);

   fs_reg extended_parameter_0;
   fs_reg local_index;
   fs_reg inline_parameter;

   fs_reg urb_output;

   /* URB to read Task memory inputs. Only valid for MESH stage. */
   fs_reg task_urb_input;
};

struct bs_thread_payload : public thread_payload {
   bs_thread_payload(const fs_visitor &v);

   fs_reg global_arg_ptr;
   fs_reg local_arg_ptr;

   void load_shader_type(const brw::fs_builder &bld, fs_reg &dest) const;
};

enum instruction_scheduler_mode {
   SCHEDULE_PRE,
   SCHEDULE_PRE_NON_LIFO,
   SCHEDULE_PRE_LIFO,
   SCHEDULE_POST,
   SCHEDULE_NONE,
};

class instruction_scheduler;

/**
 * The fragment shader front-end.
 *
 * Translates either GLSL IR or Mesa IR (for ARB_fragment_program) into FS IR.
 */
struct fs_visitor
{
public:
   fs_visitor(const struct brw_compiler *compiler,
              const struct brw_compile_params *params,
              const brw_base_prog_key *key,
              struct brw_stage_prog_data *prog_data,
              const nir_shader *shader,
              unsigned dispatch_width,
              bool needs_register_pressure,
              bool debug_enabled);
   fs_visitor(const struct brw_compiler *compiler,
              const struct brw_compile_params *params,
              const brw_wm_prog_key *key,
              struct brw_wm_prog_data *prog_data,
              const nir_shader *shader,
              unsigned dispatch_width,
              unsigned num_polygons,
              bool needs_register_pressure,
              bool debug_enabled);
   fs_visitor(const struct brw_compiler *compiler,
              const struct brw_compile_params *params,
              struct brw_gs_compile *gs_compile,
              struct brw_gs_prog_data *prog_data,
              const nir_shader *shader,
              bool needs_register_pressure,
              bool debug_enabled);
   void init();
   ~fs_visitor();

   void import_uniforms(fs_visitor *v);

   void VARYING_PULL_CONSTANT_LOAD(const brw::fs_builder &bld,
                                   const fs_reg &dst,
                                   const fs_reg &surface,
                                   const fs_reg &surface_handle,
                                   const fs_reg &varying_offset,
                                   uint32_t const_offset,
                                   uint8_t alignment,
                                   unsigned components);

   bool run_fs(bool allow_spilling, bool do_rep_send);
   bool run_vs();
   bool run_tcs();
   bool run_tes();
   bool run_gs();
   bool run_cs(bool allow_spilling);
   bool run_bs(bool allow_spilling);
   bool run_task(bool allow_spilling);
   bool run_mesh(bool allow_spilling);
   void allocate_registers(bool allow_spilling);
   uint32_t compute_max_register_pressure();
   void assign_curb_setup();
   void assign_urb_setup();
   void convert_attr_sources_to_hw_regs(fs_inst *inst);
   void assign_vs_urb_setup();
   void assign_tcs_urb_setup();
   void assign_tes_urb_setup();
   void assign_gs_urb_setup();
   bool assign_regs(bool allow_spilling, bool spill_all);
   void assign_regs_trivial();
   void calculate_payload_ranges(unsigned payload_node_count,
                                 int *payload_last_use_ip) const;
   void assign_constant_locations();
   bool get_pull_locs(const fs_reg &src, unsigned *out_surf_index,
                      unsigned *out_pull_index);
   void invalidate_analysis(brw::analysis_dependency_class c);

   instruction_scheduler *prepare_scheduler(void *mem_ctx);
   void schedule_instructions_pre_ra(instruction_scheduler *sched,
                                     instruction_scheduler_mode mode);
   void schedule_instructions_post_ra();

   void vfail(const char *msg, va_list args);
   void fail(const char *msg, ...);
   void limit_dispatch_width(unsigned n, const char *msg);

   void emit_repclear_shader();
   void emit_interpolation_setup();

   void set_tcs_invocation_id();

   fs_inst *emit_single_fb_write(const brw::fs_builder &bld,
                                 fs_reg color1, fs_reg color2,
                                 fs_reg src0_alpha, unsigned components);
   void do_emit_fb_writes(int nr_color_regions, bool replicate_alpha);
   void emit_fb_writes();
   void emit_urb_writes(const fs_reg &gs_vertex_count = fs_reg());
   void emit_gs_control_data_bits(const fs_reg &vertex_count);
   fs_reg gs_urb_channel_mask(const fs_reg &dword_index);
   fs_reg gs_urb_per_slot_dword_index(const fs_reg &vertex_count);
   void emit_gs_thread_end();
   bool mark_last_urb_write_with_eot();
   void emit_tcs_thread_end();
   void emit_urb_fence();
   void emit_cs_terminate();

   fs_reg interp_reg(const brw::fs_builder &bld, unsigned location,
                     unsigned channel, unsigned comp);
   fs_reg per_primitive_reg(const brw::fs_builder &bld,
                            int location, unsigned comp);

   void dump_instruction_to_file(const fs_inst *inst, FILE *file, const brw::def_analysis *defs) const;
   void dump_instructions_to_file(FILE *file) const;

   /* Convenience functions based on the above. */
   void dump_instruction(const fs_inst *inst, FILE *file = stderr, const brw::def_analysis *defs = nullptr) const {
      dump_instruction_to_file(inst, file, defs);
   }
   void dump_instructions(const char *name = nullptr) const;

   void calculate_cfg();

   const struct brw_compiler *compiler;
   void *log_data; /* Passed to compiler->*_log functions */

   const struct intel_device_info * const devinfo;
   const nir_shader *nir;

   /** ralloc context for temporary data used during compile */
   void *mem_ctx;

   /** List of fs_inst. */
   exec_list instructions;

   cfg_t *cfg;

   gl_shader_stage stage;
   bool debug_enabled;

   brw::simple_allocator alloc;

   const brw_base_prog_key *const key;

   struct brw_gs_compile *gs_compile;

   struct brw_stage_prog_data *prog_data;

   brw_analysis<brw::fs_live_variables, fs_visitor> live_analysis;
   brw_analysis<brw::register_pressure, fs_visitor> regpressure_analysis;
   brw_analysis<brw::performance, fs_visitor> performance_analysis;
   brw_analysis<brw::idom_tree, fs_visitor> idom_analysis;
   brw_analysis<brw::def_analysis, fs_visitor> def_analysis;

   /** Number of uniform variable components visited. */
   unsigned uniforms;

   /** Byte-offset for the next available spot in the scratch space buffer. */
   unsigned last_scratch;

   /**
    * Array mapping UNIFORM register numbers to the push parameter index,
    * or -1 if this uniform register isn't being uploaded as a push constant.
    */
   int *push_constant_loc;

   fs_reg frag_depth;
   fs_reg frag_stencil;
   fs_reg sample_mask;
   fs_reg outputs[VARYING_SLOT_MAX];
   fs_reg dual_src_output;
   int first_non_payload_grf;

   bool failed;
   char *fail_msg;

   thread_payload *payload_;

   thread_payload &payload() {
      return *this->payload_;
   }

   vs_thread_payload &vs_payload() {
      assert(stage == MESA_SHADER_VERTEX);
      return *static_cast<vs_thread_payload *>(this->payload_);
   }

   tcs_thread_payload &tcs_payload() {
      assert(stage == MESA_SHADER_TESS_CTRL);
      return *static_cast<tcs_thread_payload *>(this->payload_);
   }

   tes_thread_payload &tes_payload() {
      assert(stage == MESA_SHADER_TESS_EVAL);
      return *static_cast<tes_thread_payload *>(this->payload_);
   }

   gs_thread_payload &gs_payload() {
      assert(stage == MESA_SHADER_GEOMETRY);
      return *static_cast<gs_thread_payload *>(this->payload_);
   }

   fs_thread_payload &fs_payload() {
      assert(stage == MESA_SHADER_FRAGMENT);
      return *static_cast<fs_thread_payload *>(this->payload_);
   };

   const fs_thread_payload &fs_payload() const {
      assert(stage == MESA_SHADER_FRAGMENT);
      return *static_cast<const fs_thread_payload *>(this->payload_);
   };

   cs_thread_payload &cs_payload() {
      assert(gl_shader_stage_uses_workgroup(stage));
      return *static_cast<cs_thread_payload *>(this->payload_);
   }

   task_mesh_thread_payload &task_mesh_payload() {
      assert(stage == MESA_SHADER_TASK || stage == MESA_SHADER_MESH);
      return *static_cast<task_mesh_thread_payload *>(this->payload_);
   }

   bs_thread_payload &bs_payload() {
      assert(stage >= MESA_SHADER_RAYGEN && stage <= MESA_SHADER_CALLABLE);
      return *static_cast<bs_thread_payload *>(this->payload_);
   }

   bool source_depth_to_render_target;

   fs_reg pixel_x;
   fs_reg pixel_y;
   fs_reg pixel_z;
   fs_reg wpos_w;
   fs_reg pixel_w;
   fs_reg delta_xy[BRW_BARYCENTRIC_MODE_COUNT];
   fs_reg final_gs_vertex_count;
   fs_reg control_data_bits;
   fs_reg invocation_id;

   unsigned grf_used;
   bool spilled_any_registers;
   bool needs_register_pressure;

   const unsigned dispatch_width; /**< 8, 16 or 32 */
   const unsigned max_polygons;
   unsigned max_dispatch_width;

   /* The API selected subgroup size */
   unsigned api_subgroup_size; /**< 0, 8, 16, 32 */

   struct shader_stats shader_stats;

   unsigned workgroup_size() const;

   void debug_optimizer(const nir_shader *nir,
                        const char *pass_name,
                        int iteration, int pass_num) const;
};
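
/*
 * Informal sketch of how the pieces above fit together for a fragment
 * shader (the authoritative flow lives in the brw_compile_*() functions in
 * the .cpp files and may differ in detail):
 *
 *    fs_visitor v(compiler, params, key, prog_data, shader,
 *                 dispatch_width, num_polygons,
 *                 needs_register_pressure, debug_enabled);
 *    if (!v.run_fs(allow_spilling, false))
 *       return NULL;   // v.fail_msg explains what went wrong
 *
 * run_fs() builds the IR from NIR (see nir_to_brw() below), optimizes it
 * and allocates registers; the resulting v.cfg is then handed to the
 * fs_generator declared later in this header.
 */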

void brw_print_swsb(FILE *f, const struct intel_device_info *devinfo, const tgl_swsb swsb);

/**
 * Return the flag register used in fragment shaders to keep track of live
 * samples.  On Gfx7+ we use f1.0-f1.1 to allow discard jumps in SIMD32
 * dispatch mode.
 */
static inline unsigned
sample_mask_flag_subreg(const fs_visitor &s)
{
   assert(s.stage == MESA_SHADER_FRAGMENT);
   return 2;
}
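
/*
 * Illustration only: the return value is a flag subregister number, so
 * callers are expected to combine it with the channel-group offset to form
 * the actual flag register, roughly along the lines of (assuming
 * brw_flag_subreg() from brw_reg.h):
 *
 *    brw_flag_subreg(sample_mask_flag_subreg(s) + inst->group / 16)
 */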

/**
 * The fragment shader code generator.
 *
 * Translates FS IR to actual i965 assembly code.
 */
class fs_generator
{
public:
   fs_generator(const struct brw_compiler *compiler,
                const struct brw_compile_params *params,
                struct brw_stage_prog_data *prog_data,
                gl_shader_stage stage);
   ~fs_generator();

   void enable_debug(const char *shader_name);
   int generate_code(const cfg_t *cfg, int dispatch_width,
                     struct shader_stats shader_stats,
                     const brw::performance &perf,
                     struct brw_compile_stats *stats,
                     unsigned max_polygons = 0);
   void add_const_data(void *data, unsigned size);
   void add_resume_sbt(unsigned num_resume_shaders, uint64_t *sbt);
   const unsigned *get_assembly();

private:
   void generate_send(fs_inst *inst,
                      struct brw_reg dst,
                      struct brw_reg desc,
                      struct brw_reg ex_desc,
                      struct brw_reg payload,
                      struct brw_reg payload2);
   void generate_barrier(fs_inst *inst, struct brw_reg src);
   void generate_ddx(const fs_inst *inst,
                     struct brw_reg dst, struct brw_reg src);
   void generate_ddy(const fs_inst *inst,
                     struct brw_reg dst, struct brw_reg src);
   void generate_scratch_header(fs_inst *inst, struct brw_reg dst);

   void generate_halt(fs_inst *inst);

   void generate_mov_indirect(fs_inst *inst,
                              struct brw_reg dst,
                              struct brw_reg reg,
                              struct brw_reg indirect_byte_offset);

   void generate_shuffle(fs_inst *inst,
                         struct brw_reg dst,
                         struct brw_reg src,
                         struct brw_reg idx);

   void generate_quad_swizzle(const fs_inst *inst,
                              struct brw_reg dst, struct brw_reg src,
                              unsigned swiz);

   bool patch_halt_jumps();

   const struct brw_compiler *compiler;
   const struct brw_compile_params *params;

   const struct intel_device_info *devinfo;

   struct brw_codegen *p;
   struct brw_stage_prog_data * const prog_data;

   unsigned dispatch_width; /**< 8, 16 or 32 */

   exec_list discard_halt_patches;
   bool debug_flag;
   const char *shader_name;
   gl_shader_stage stage;
   void *mem_ctx;
};
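
/*
 * Informal sketch, assuming a successfully-run fs_visitor "v", prog_data
 * matching the stage, and the brw_analysis require() accessor (the real
 * call sites are the brw_compile_*() functions):
 *
 *    fs_generator g(compiler, params, prog_data, v.stage);
 *    g.enable_debug(shader_name);
 *    g.generate_code(v.cfg, v.dispatch_width, v.shader_stats,
 *                    v.performance_analysis.require(), stats);
 *    const unsigned *assembly = g.get_assembly();
 */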

namespace brw {
   fs_reg
   fetch_payload_reg(const brw::fs_builder &bld, uint8_t regs[2],
                     brw_reg_type type = BRW_TYPE_F,
                     unsigned n = 1);

   fs_reg
   fetch_barycentric_reg(const brw::fs_builder &bld, uint8_t regs[2]);

   inline fs_reg
   dynamic_msaa_flags(const struct brw_wm_prog_data *wm_prog_data)
   {
      return fs_reg(UNIFORM, wm_prog_data->msaa_flags_param, BRW_TYPE_UD);
   }

   void
   check_dynamic_msaa_flag(const fs_builder &bld,
                           const struct brw_wm_prog_data *wm_prog_data,
                           enum intel_msaa_flags flag);

   bool
   lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i);
}

void shuffle_from_32bit_read(const brw::fs_builder &bld,
                             const fs_reg &dst,
                             const fs_reg &src,
                             uint32_t first_component,
                             uint32_t components);

enum brw_barycentric_mode brw_barycentric_mode(const struct brw_wm_prog_key *key,
                                               nir_intrinsic_instr *intr);

uint32_t brw_fb_write_msg_control(const fs_inst *inst,
                                  const struct brw_wm_prog_data *prog_data);

void brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data);

bool brw_nir_lower_simd(nir_shader *nir, unsigned dispatch_width);

fs_reg brw_sample_mask_reg(const brw::fs_builder &bld);
void brw_emit_predicate_on_sample_mask(const brw::fs_builder &bld, fs_inst *inst);

int brw_get_subgroup_id_param_index(const intel_device_info *devinfo,
                                    const brw_stage_prog_data *prog_data);

void nir_to_brw(fs_visitor *s);

#ifndef NDEBUG
void brw_fs_validate(const fs_visitor &s);
#else
static inline void brw_fs_validate(const fs_visitor &s) {}
#endif

void brw_fs_optimize(fs_visitor &s);

bool brw_fs_lower_3src_null_dest(fs_visitor &s);
bool brw_fs_lower_alu_restrictions(fs_visitor &s);
bool brw_fs_lower_barycentrics(fs_visitor &s);
bool brw_fs_lower_constant_loads(fs_visitor &s);
bool brw_fs_lower_derivatives(fs_visitor &s);
bool brw_fs_lower_dpas(fs_visitor &s);
bool brw_fs_lower_find_live_channel(fs_visitor &s);
bool brw_fs_lower_integer_multiplication(fs_visitor &s);
bool brw_fs_lower_load_subgroup_invocation(fs_visitor &s);
bool brw_fs_lower_indirect_mov(fs_visitor &s);
bool brw_fs_lower_logical_sends(fs_visitor &s);
bool brw_fs_lower_pack(fs_visitor &s);
bool brw_fs_lower_load_payload(fs_visitor &s);
bool brw_fs_lower_regioning(fs_visitor &s);
bool brw_fs_lower_scoreboard(fs_visitor &s);
bool brw_fs_lower_sends_overlapping_payload(fs_visitor &s);
bool brw_fs_lower_simd_width(fs_visitor &s);
bool brw_fs_lower_csel(fs_visitor &s);
bool brw_fs_lower_sub_sat(fs_visitor &s);
bool brw_fs_lower_uniform_pull_constant_loads(fs_visitor &s);
void brw_fs_lower_vgrfs_to_fixed_grfs(fs_visitor &s);

bool brw_fs_opt_algebraic(fs_visitor &s);
bool brw_fs_opt_bank_conflicts(fs_visitor &s);
bool brw_fs_opt_cmod_propagation(fs_visitor &s);
bool brw_fs_opt_combine_constants(fs_visitor &s);
bool brw_fs_opt_compact_virtual_grfs(fs_visitor &s);
bool brw_fs_opt_copy_propagation(fs_visitor &s);
bool brw_fs_opt_copy_propagation_defs(fs_visitor &s);
bool brw_fs_opt_cse_defs(fs_visitor &s);
bool brw_fs_opt_dead_code_eliminate(fs_visitor &s);
bool brw_fs_opt_dead_control_flow_eliminate(fs_visitor &s);
bool brw_fs_opt_eliminate_find_live_channel(fs_visitor &s);
bool brw_fs_opt_peephole_sel(fs_visitor &s);
bool brw_fs_opt_predicated_break(fs_visitor &s);
bool brw_fs_opt_register_coalesce(fs_visitor &s);
bool brw_fs_opt_remove_extra_rounding_modes(fs_visitor &s);
bool brw_fs_opt_remove_redundant_halts(fs_visitor &s);
bool brw_fs_opt_saturate_propagation(fs_visitor &s);
bool brw_fs_opt_split_sends(fs_visitor &s);
bool brw_fs_opt_split_virtual_grfs(fs_visitor &s);
bool brw_fs_opt_zero_samples(fs_visitor &s);

bool brw_fs_workaround_emit_dummy_mov_instruction(fs_visitor &s);
bool brw_fs_workaround_memory_fence_before_eot(fs_visitor &s);
bool brw_fs_workaround_nomask_control_flow(fs_visitor &s);

/* Helpers. */
unsigned brw_fs_get_lowered_simd_width(const fs_visitor *shader,
                                       const fs_inst *inst);
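
/*
 * Informal note: most of the brw_fs_lower_*() and brw_fs_opt_*() entry
 * points above return true when they changed the program, so the
 * optimization driver (see brw_fs_optimize()) typically runs them with the
 * usual progress-loop idiom, roughly:
 *
 *    bool progress;
 *    do {
 *       progress = false;
 *       progress |= brw_fs_opt_copy_propagation(s);
 *       progress |= brw_fs_opt_dead_code_eliminate(s);
 *       // ...more passes...
 *    } while (progress);
 *
 * This is a sketch of the idiom, not the exact pass order.
 */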

#endif /* BRW_FS_H */