2015-10-08 17:09:54 -07:00
|
|
|
/*
|
|
|
|
|
* Copyright © 2010 - 2015 Intel Corporation
|
|
|
|
|
*
|
|
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
|
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
|
|
|
* to deal in the Software without restriction, including without limitation
|
|
|
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
|
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
|
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
|
|
|
*
|
|
|
|
|
* The above copyright notice and this permission notice (including the next
|
|
|
|
|
* paragraph) shall be included in all copies or substantial portions of the
|
|
|
|
|
* Software.
|
|
|
|
|
*
|
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
|
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
|
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
|
|
|
* IN THE SOFTWARE.
|
|
|
|
|
*/
|
|
|
|
|
|
2024-12-06 14:25:29 -08:00
|
|
|
#pragma once
|
2015-10-08 17:09:54 -07:00
|
|
|
|
2015-11-17 01:37:27 -08:00
|
|
|
#include <stdio.h>
|
2021-02-18 16:09:31 -06:00
|
|
|
#include "c11/threads.h"
|
2021-04-05 11:47:31 -07:00
|
|
|
#include "dev/intel_device_info.h"
|
2024-01-22 12:14:01 -08:00
|
|
|
#include "isl/isl.h"
|
2022-07-25 15:40:15 -07:00
|
|
|
#include "util/macros.h"
|
2024-02-09 15:30:57 -08:00
|
|
|
#include "util/mesa-sha1.h"
|
2021-11-19 16:32:24 -06:00
|
|
|
#include "util/enum_operators.h"
|
2017-09-29 11:05:55 -07:00
|
|
|
#include "util/ralloc.h"
|
2021-12-07 16:34:48 +10:00
|
|
|
#include "util/u_math.h"
|
2023-10-19 15:49:51 +03:00
|
|
|
#include "util/u_printf.h"
|
2022-06-29 14:13:31 -07:00
|
|
|
#include "brw_isa_info.h"
|
2024-02-01 13:17:42 -08:00
|
|
|
#include "intel_shader_enums.h"
|
2015-10-08 17:09:54 -07:00
|
|
|
|
|
|
|
|
#ifdef __cplusplus
|
|
|
|
|
extern "C" {
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
struct ra_regs;
|
|
|
|
|
struct nir_shader;
|
2021-12-07 16:41:19 +10:00
|
|
|
struct shader_info;
|
2015-10-08 17:09:54 -07:00
|
|
|
|
2021-12-07 15:53:49 +10:00
|
|
|
struct nir_shader_compiler_options;
|
2020-06-26 19:54:29 +02:00
|
|
|
typedef struct nir_shader nir_shader;
|
|
|
|
|
|
2024-07-22 19:35:13 -04:00
|
|
|
#define REG_CLASS_COUNT 20
|
|
|
|
|
|
2015-10-08 17:09:54 -07:00
|
|
|
struct brw_compiler {
   /** Description of the device this compiler targets. */
   const struct intel_device_info *devinfo;

   /* This lock must be taken if the compiler is to be modified in any way,
    * including adding something to the ralloc child list.
    */
   mtx_t mutex;

   /** ISA information derived from devinfo (see brw_isa_info.h). */
   struct brw_isa_info isa;

   /** Register-allocator state shared by all compilations. */
   struct {
      struct ra_regs *regs;

      /**
       * Array of the ra classes for the unaligned contiguous register
       * block sizes used, indexed by register size.
       */
      struct ra_class *classes[REG_CLASS_COUNT];
   } reg_set;

   /* Driver-provided logging callbacks.  The unsigned *id is a per-call-site
    * message identifier (see brw_shader_debug_log() below).
    */
   void (*shader_debug_log)(void *, unsigned *id, const char *str, ...) PRINTFLIKE(3, 4);
   void (*shader_perf_log)(void *, unsigned *id, const char *str, ...) PRINTFLIKE(3, 4);

   /* NOTE(review): presumably selects the multi-patch TCS dispatch mode —
    * confirm against the TCS compile code.
    */
   bool use_tcs_multi_patch;

   /** Per-stage NIR compiler options, indexed by gl_shader_stage. */
   struct nir_shader_compiler_options *nir_options[MESA_ALL_SHADER_STAGES];

   /**
    * Apply workarounds for SIN and COS output range problems.
    * This can negatively impact performance.
    */
   bool precise_trig;

   /**
    * Whether indirect UBO loads should use the sampler or go through the
    * data/constant cache. For the sampler, UBO surface states have to be set
    * up with VK_FORMAT_R32G32B32A32_FLOAT whereas if it's going through the
    * constant or data cache, UBOs must use VK_FORMAT_RAW.
    */
   bool indirect_ubos_use_sampler;

   /**
    * Gfx12.5+ has a bit in the SEND instruction extending the bindless
    * surface offset range from 20 to 26 bits, effectively giving us 4Gb of
    * bindless surface descriptors instead of 64Mb previously.
    */
   bool extended_bindless_surface_offset;

   /**
    * Gfx11+ has a bit in the dword 3 of the sampler message header that
    * indicates whether the sampler handle is relative to the dynamic state
    * base address (0) or the bindless sampler base address (1). The driver
    * can select this.
    */
   bool use_bindless_sampler_offset;

   /**
    * Should DPAS instructions be lowered?
    *
    * This will be set for all platforms before Gfx12.5. It may also be set
    * platforms that support DPAS for testing purposes.
    */
   bool lower_dpas;

   /**
    * Calling the ra_allocate function after each register spill can take
    * several minutes. This option speeds up shader compilation by spilling
    * more registers after the ra_allocate failure. Required for
    * Cyberpunk 2077, which uses a watchdog thread to terminate the process
    * in case the render thread hasn't responded within 2 minutes.
    */
   int spilling_rate;

   /* NOTE(review): looks like a NIR shader compiled from OpenCL C used as a
    * library of helper routines — confirm where it is linked in.
    */
   struct nir_shader *clc_shader;
};
|
|
|
|
|
|
2021-07-29 14:13:27 -07:00
|
|
|
/* Emit a debug message through compiler->shader_debug_log.  The
 * function-local static gives each call site a stable, lazily-assigned
 * message ID that the callback can fill in and reuse.
 */
#define brw_shader_debug_log(compiler, data, fmt, ... ) do {    \
      static unsigned id = 0;                                   \
      compiler->shader_debug_log(data, &id, fmt, ##__VA_ARGS__); \
   } while (0)

/* Emit a performance warning through compiler->shader_perf_log, with the
 * same per-call-site ID scheme as brw_shader_debug_log().
 */
#define brw_shader_perf_log(compiler, data, fmt, ... ) do {     \
      static unsigned id = 0;                                   \
      compiler->shader_perf_log(data, &id, fmt, ##__VA_ARGS__);  \
   } while (0)
|
|
|
|
|
|
2017-04-28 01:22:39 -07:00
|
|
|
/**
 * We use a constant subgroup size of 32. It really only needs to be a
 * maximum and, since we do SIMD32 for compute shaders in some cases, it
 * needs to be at least 32. SIMD8 and SIMD16 shaders will still claim a
 * subgroup size of 32 but will act as if 16 or 24 of those channels are
 * disabled.
 */
#define BRW_SUBGROUP_SIZE 32
|
2015-10-08 17:09:54 -07:00
|
|
|
|
2020-10-21 14:46:50 -05:00
|
|
|
static inline bool
|
|
|
|
|
brw_shader_stage_is_bindless(gl_shader_stage stage)
|
|
|
|
|
{
|
|
|
|
|
return stage >= MESA_SHADER_RAYGEN &&
|
|
|
|
|
stage <= MESA_SHADER_CALLABLE;
|
|
|
|
|
}
|
|
|
|
|
|
2021-07-12 13:46:31 +02:00
|
|
|
static inline bool
|
|
|
|
|
brw_shader_stage_requires_bindless_resources(gl_shader_stage stage)
|
|
|
|
|
{
|
|
|
|
|
return brw_shader_stage_is_bindless(stage) || gl_shader_stage_is_mesh(stage);
|
|
|
|
|
}
|
|
|
|
|
|
2024-09-30 08:45:21 +03:00
|
|
|
static inline bool
|
|
|
|
|
brw_shader_stage_has_inline_data(const struct intel_device_info *devinfo,
|
|
|
|
|
gl_shader_stage stage)
|
|
|
|
|
{
|
|
|
|
|
return stage == MESA_SHADER_MESH || stage == MESA_SHADER_TASK ||
|
|
|
|
|
(stage == MESA_SHADER_COMPUTE && devinfo->verx10 >= 125);
|
|
|
|
|
}
|
|
|
|
|
|
2015-10-08 17:09:54 -07:00
|
|
|
/**
|
|
|
|
|
* Program key structures.
|
|
|
|
|
*
|
|
|
|
|
* When drawing, we look for the currently bound shaders in the program
|
|
|
|
|
* cache. This is essentially a hash table lookup, and these are the keys.
|
|
|
|
|
*
|
|
|
|
|
* Sometimes OpenGL features specified as state need to be simulated via
|
|
|
|
|
* shader code, due to a mismatch between the API and the hardware. This
|
|
|
|
|
 * is often referred to as "non-orthogonal state" or "NOS". We store NOS
|
|
|
|
|
* in the program key so it's considered when searching for a program. If
|
|
|
|
|
* we haven't seen a particular combination before, we have to recompile a
|
|
|
|
|
* new specialized version.
|
|
|
|
|
*
|
|
|
|
|
* Shader compilation should not look up state in gl_context directly, but
|
|
|
|
|
* instead use the copy in the program key. This guarantees recompiles will
|
|
|
|
|
* happen correctly.
|
|
|
|
|
*
|
|
|
|
|
* @{
|
|
|
|
|
*/
|
|
|
|
|
|
2022-06-14 17:13:20 -07:00
|
|
|
/** Max number of sampler states referenced by a shader. */
#define BRW_MAX_SAMPLERS 32

/* Provide explicit padding for each member, to ensure that the compiler
 * initializes every bit in the shader cache keys. The keys will be compared
 * with memcmp.
 */
PRAGMA_DIAGNOSTIC_PUSH
PRAGMA_DIAGNOSTIC_ERROR(-Wpadded)
|
|
|
|
|
|
2022-06-21 18:06:04 -07:00
|
|
|
/** Which buffer access types get robustness (bounds-checking) treatment. */
enum brw_robustness_flags {
   BRW_ROBUSTNESS_UBO  = BITFIELD_BIT(0),
   BRW_ROBUSTNESS_SSBO = BITFIELD_BIT(1),
};
|
|
|
|
|
|
2019-02-21 17:20:39 -06:00
|
|
|
/** Fields common to all program keys (embedded as the first member). */
struct brw_base_prog_key {
   /** Identifier of the source program this key was built for. */
   unsigned program_string_id;

   /** Robustness handling requested for UBO/SSBO access. */
   enum brw_robustness_flags robust_flags:2;

   /* NOTE(review): presumably makes push-constant addresses come from
    * inline data — confirm against the driver setup code.
    */
   bool uses_inline_push_addr:1;

   /** Layout of the vertex URB entry (see intel_shader_enums.h). */
   enum intel_vue_layout vue_layout:2;

   /**
    * Apply workarounds for SIN and COS input range problems.
    * This limits input range for SIN and COS to [-2p : 2p] to
    * avoid precision issues.
    */
   bool limit_trig_input_range:1;

   /* Explicit padding so the whole key is initialized (memcmp'd). */
   unsigned padding:26;
};
|
|
|
|
|
|
2017-07-21 10:26:31 +02:00
|
|
|
/**
 * OpenGL attribute slots fall in [0, VERT_ATTRIB_MAX - 1] with the range
 * [VERT_ATTRIB_GENERIC0, VERT_ATTRIB_MAX - 1] reserved for up to 16 user
 * input vertex attributes. In Vulkan, we expose up to 29 user vertex input
 * attributes that are mapped to slots also starting at VERT_ATTRIB_GENERIC0.
 */
#define MAX_GL_VERT_ATTRIB VERT_ATTRIB_MAX
#define MAX_VK_VERT_ATTRIB (VERT_ATTRIB_GENERIC0 + 29)
#define MAX_HW_VERT_ATTRIB (VERT_ATTRIB_GENERIC0 + 34)

/**
 * Use the last 2 slots :
 * - slot 32: start vertex, vertex count, instance count, start instance
 * - slot 33: base vertex, base instance, draw id
 */
#define BRW_SVGS_VE_INDEX (32)
#define BRW_DRAWID_VE_INDEX (33)
|
|
|
|
|
|
2025-02-12 10:09:00 +02:00
|
|
|
/** The program key for Vertex Shaders.
|
|
|
|
|
*
|
|
|
|
|
* Notes about slot compaction & component packing:
|
|
|
|
|
*
|
|
|
|
|
* VF slot compaction is our default compiler behavior. The compiler looks at
|
|
|
|
|
* used inputs locations, for example [0, 2, 3], and will arrange things such
|
|
|
|
|
* that the payload only includes 3 vec4 in that case. Location 1 is
|
|
|
|
|
* completely dropped. The driver is expected to program
|
|
|
|
|
* 3DSTATE_VERTEX_ELEMENTS to match this. So even if the location 1 is
|
|
|
|
|
* described in the API input, the driver will not program it in
|
|
|
|
|
* 3DSTATE_VERTEX_ELEMENTS because it sees the compiler is not using it.
|
|
|
|
|
*
|
|
|
|
|
* Component compaction is a HW feature that removes unused components (for
|
|
|
|
|
* whatever slot [0, 31]) from the payload. Those values are stored in the URB
|
|
|
|
|
* by the VF but they get scrapped when the payload is generated. For example
|
|
|
|
|
* with input locations [ 0 vec2, 1 vec1, 2 vec4 ], the register payload for
|
|
|
|
|
* VF inputs will be made up of 7 GRFs (2 + 1 + 4). Without component
|
|
|
|
|
* compaction, the payload would be 12 GRFs (3 * 4).
|
|
|
|
|
*
|
|
|
|
|
* The HW component compaction feature only works on first 32 slots, so
|
|
|
|
|
* anything after that will deliver the full vec4.
|
|
|
|
|
*/
|
2015-10-08 17:09:54 -07:00
|
|
|
struct brw_vs_prog_key {
   struct brw_base_prog_key base;

   /** Enable component packing
    *
    * Using this option requires that the driver programs
    * 3DSTATE_VF_COMPONENT_PACKING with the values provided in
    * brw_vs_prog_data::vf_component_packing
    */
   bool vf_component_packing : 1;

   /** Prevent compaction of slots of VF inputs
    *
    * So that 3DSTATE_VERTEX_ELEMENTS programming remains independent of
    * shader inputs (essentially an unused location should have an associated
    * VERTEX_ELEMENT_STATE).
    */
   bool no_vf_slot_compaction : 1;

   /* Explicit padding so the whole key is initialized (memcmp'd). */
   uint32_t padding : 30;
};
|
|
|
|
|
|
2015-11-17 01:07:39 -08:00
|
|
|
/** The program key for Tessellation Control Shaders. */
struct brw_tcs_prog_key
{
   struct brw_base_prog_key base;

   /** A bitfield of per-vertex outputs written. */
   uint64_t outputs_written;

   /** Primitive mode taken from the paired evaluation shader. */
   enum tess_primitive_mode _tes_primitive_mode;

   /** Number of input vertices, 0 means dynamic */
   unsigned input_vertices;

   /** A bitfield of per-patch outputs written. */
   uint32_t patch_outputs_written;

   /* Explicit padding so the whole key is initialized (memcmp'd). */
   uint32_t padding;
};
|
|
|
|
|
|
2023-04-08 21:34:35 +03:00
|
|
|
#define BRW_MAX_TCS_INPUT_VERTICES (32)
|
|
|
|
|
|
|
|
|
|
static inline uint32_t
|
|
|
|
|
brw_tcs_prog_key_input_vertices(const struct brw_tcs_prog_key *key)
|
|
|
|
|
{
|
|
|
|
|
return key->input_vertices != 0 ?
|
|
|
|
|
key->input_vertices : BRW_MAX_TCS_INPUT_VERTICES;
|
|
|
|
|
}
|
|
|
|
|
|
2015-11-10 14:35:27 -08:00
|
|
|
/** The program key for Tessellation Evaluation Shaders. */
struct brw_tes_prog_key
{
   struct brw_base_prog_key base;

   /** A bitfield of per-vertex inputs read. */
   uint64_t inputs_read;

   /** A bitfield of per-patch inputs read. */
   uint32_t patch_inputs_read;

   /* Explicit padding so the whole key is initialized (memcmp'd). */
   uint32_t padding;
};
|
|
|
|
|
|
2015-10-08 17:09:54 -07:00
|
|
|
/** The program key for Geometry Shaders. */
struct brw_gs_prog_key
{
   struct brw_base_prog_key base;
};
|
|
|
|
|
|
2021-10-29 12:27:45 -07:00
|
|
|
/** The program key for Task Shaders. */
struct brw_task_prog_key
{
   struct brw_base_prog_key base;
};
|
|
|
|
|
|
|
|
|
|
/** The program key for Mesh Shaders. */
struct brw_mesh_prog_key
{
   struct brw_base_prog_key base;
};
|
|
|
|
|
|
2015-10-08 17:09:54 -07:00
|
|
|
/** The program key for Fragment/Pixel Shaders. */
struct brw_wm_prog_key {
   struct brw_base_prog_key base;

   /** Bitfield of FS input slots provided by the previous stage. */
   uint64_t input_slots_valid;
   /** Bitfield of color outputs bound to render targets. */
   uint8_t color_outputs_valid;

   /* Some collection of BRW_WM_IZ_* */
   bool flat_shade:1;
   unsigned nr_color_regions:5;
   bool alpha_test_replicate_alpha:1;
   enum intel_sometimes alpha_to_coverage:2;
   bool clamp_fragment_color:1;

   bool force_dual_color_blend:1;

   /** Whether or not inputs are interpolated at sample rate by default
    *
    * This corresponds to the sample shading API bit in Vulkan or OpenGL which
    * controls how inputs with no interpolation qualifier are interpolated.
    * This is distinct from the way that using gl_SampleID or similar requires
    * us to run per-sample. Even when running per-sample due to gl_SampleID,
    * we may still interpolate unqualified inputs at the pixel center.
    */
   enum intel_sometimes persample_interp:2;

   /* Whether or not we are running on a multisampled framebuffer */
   enum intel_sometimes multisample_fbo:2;

   /* Whether the shader is dispatched with a preceding mesh shader */
   enum intel_sometimes mesh_input:2;

   bool coherent_fb_fetch:1;
   bool ignore_sample_mask_out:1;
   bool coarse_pixel:1;
   bool null_push_constant_tbimr_workaround:1;

   /* Explicit padding so the whole key is initialized (memcmp'd). */
   uint64_t padding:35;
};
|
|
|
|
|
|
2025-02-24 13:14:19 +02:00
|
|
|
static inline bool
|
|
|
|
|
brw_wm_prog_key_is_dynamic(const struct brw_wm_prog_key *key)
|
|
|
|
|
{
|
|
|
|
|
return key->alpha_to_coverage == INTEL_SOMETIMES ||
|
|
|
|
|
key->persample_interp == INTEL_SOMETIMES ||
|
2025-02-24 13:21:06 +02:00
|
|
|
key->multisample_fbo == INTEL_SOMETIMES ||
|
2025-03-10 23:18:30 +02:00
|
|
|
key->base.vue_layout == INTEL_VUE_LAYOUT_SEPARATE_MESH;
|
2025-02-24 13:14:19 +02:00
|
|
|
}
|
|
|
|
|
|
2015-10-08 17:09:54 -07:00
|
|
|
/** The program key for Compute Shaders. */
struct brw_cs_prog_key {
   struct brw_base_prog_key base;
};
|
|
|
|
|
|
2020-10-21 14:46:50 -05:00
|
|
|
/** The program key for ray-tracing (bindless) shaders. */
struct brw_bs_prog_key {
   struct brw_base_prog_key base;

   /* Represents enum brw_rt_ray_flags values given at pipeline creation
    * to be combined with ray_flags handed to the traceRayEXT() calls by the
    * shader.
    */
   uint32_t pipeline_ray_flags;
};
|
|
|
|
|
|
2017-10-21 01:29:16 -07:00
|
|
|
/* brw_any_prog_key is any of the keys that map to an API stage */
union brw_any_prog_key {
   struct brw_base_prog_key base;
   struct brw_vs_prog_key vs;
   struct brw_tcs_prog_key tcs;
   struct brw_tes_prog_key tes;
   struct brw_gs_prog_key gs;
   struct brw_wm_prog_key wm;
   struct brw_cs_prog_key cs;
   struct brw_bs_prog_key bs;
   struct brw_task_prog_key task;
   struct brw_mesh_prog_key mesh;
};
|
|
|
|
|
|
2022-07-25 15:40:15 -07:00
|
|
|
/* End of the -Wpadded region covering the memcmp'd key structs. */
PRAGMA_DIAGNOSTIC_POP

/** Max number of render targets in a shader */
#define BRW_MAX_DRAW_BUFFERS 8
|
|
|
|
|
|
2016-01-02 03:21:28 -08:00
|
|
|
/** A range of a UBO selected for pushing into registers. */
struct brw_ubo_range
{
   /** UBO binding table index the range reads from. */
   uint16_t block;

   /* In units of 32-byte registers */
   uint8_t start;
   uint8_t length;
};
|
|
|
|
|
|
2017-09-28 16:25:31 -07:00
|
|
|
/* We reserve the first 2^16 values for builtins */
#define BRW_PARAM_IS_BUILTIN(param) (((param) & 0xffff0000) == 0)

/** Built-in push-constant parameters the driver fills in at draw time. */
enum brw_param_builtin {
   BRW_PARAM_BUILTIN_ZERO,

   /* User clip plane components; must stay contiguous and in (plane,
    * component) order — the BRW_PARAM_BUILTIN_CLIP_PLANE* macros below
    * rely on this layout.
    */
   BRW_PARAM_BUILTIN_CLIP_PLANE_0_X,
   BRW_PARAM_BUILTIN_CLIP_PLANE_0_Y,
   BRW_PARAM_BUILTIN_CLIP_PLANE_0_Z,
   BRW_PARAM_BUILTIN_CLIP_PLANE_0_W,
   BRW_PARAM_BUILTIN_CLIP_PLANE_1_X,
   BRW_PARAM_BUILTIN_CLIP_PLANE_1_Y,
   BRW_PARAM_BUILTIN_CLIP_PLANE_1_Z,
   BRW_PARAM_BUILTIN_CLIP_PLANE_1_W,
   BRW_PARAM_BUILTIN_CLIP_PLANE_2_X,
   BRW_PARAM_BUILTIN_CLIP_PLANE_2_Y,
   BRW_PARAM_BUILTIN_CLIP_PLANE_2_Z,
   BRW_PARAM_BUILTIN_CLIP_PLANE_2_W,
   BRW_PARAM_BUILTIN_CLIP_PLANE_3_X,
   BRW_PARAM_BUILTIN_CLIP_PLANE_3_Y,
   BRW_PARAM_BUILTIN_CLIP_PLANE_3_Z,
   BRW_PARAM_BUILTIN_CLIP_PLANE_3_W,
   BRW_PARAM_BUILTIN_CLIP_PLANE_4_X,
   BRW_PARAM_BUILTIN_CLIP_PLANE_4_Y,
   BRW_PARAM_BUILTIN_CLIP_PLANE_4_Z,
   BRW_PARAM_BUILTIN_CLIP_PLANE_4_W,
   BRW_PARAM_BUILTIN_CLIP_PLANE_5_X,
   BRW_PARAM_BUILTIN_CLIP_PLANE_5_Y,
   BRW_PARAM_BUILTIN_CLIP_PLANE_5_Z,
   BRW_PARAM_BUILTIN_CLIP_PLANE_5_W,
   BRW_PARAM_BUILTIN_CLIP_PLANE_6_X,
   BRW_PARAM_BUILTIN_CLIP_PLANE_6_Y,
   BRW_PARAM_BUILTIN_CLIP_PLANE_6_Z,
   BRW_PARAM_BUILTIN_CLIP_PLANE_6_W,
   BRW_PARAM_BUILTIN_CLIP_PLANE_7_X,
   BRW_PARAM_BUILTIN_CLIP_PLANE_7_Y,
   BRW_PARAM_BUILTIN_CLIP_PLANE_7_Z,
   BRW_PARAM_BUILTIN_CLIP_PLANE_7_W,

   BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X,
   BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_Y,
   BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_Z,
   BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_W,
   BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X,
   BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_Y,

   BRW_PARAM_BUILTIN_PATCH_VERTICES_IN,

   BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_X,
   BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Y,
   BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Z,
   BRW_PARAM_BUILTIN_SUBGROUP_ID,
   BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X,
   BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Y,
   BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Z,
   BRW_PARAM_BUILTIN_WORK_DIM,
};
|
|
|
|
|
|
|
|
|
|
/* Helpers for the contiguous clip-plane builtin block above: map a
 * (plane index, component) pair to an enum value and back.
 */
#define BRW_PARAM_BUILTIN_CLIP_PLANE(idx, comp) \
   (BRW_PARAM_BUILTIN_CLIP_PLANE_0_X + ((idx) << 2) + (comp))

#define BRW_PARAM_BUILTIN_IS_CLIP_PLANE(param)  \
   ((param) >= BRW_PARAM_BUILTIN_CLIP_PLANE_0_X && \
    (param) <= BRW_PARAM_BUILTIN_CLIP_PLANE_7_W)

#define BRW_PARAM_BUILTIN_CLIP_PLANE_IDX(param) \
   (((param) - BRW_PARAM_BUILTIN_CLIP_PLANE_0_X) >> 2)

#define BRW_PARAM_BUILTIN_CLIP_PLANE_COMP(param) \
   (((param) - BRW_PARAM_BUILTIN_CLIP_PLANE_0_X) & 0x3)

/** Max number of embedded samplers (sized the reloc-ID range below). */
#define BRW_MAX_EMBEDDED_SAMPLERS (4096)
|
|
|
|
|
|
2020-09-04 12:00:42 -05:00
|
|
|
/** Well-known relocation identifiers (see struct brw_shader_reloc). */
enum brw_shader_reloc_id {
   BRW_SHADER_RELOC_CONST_DATA_ADDR_LOW,
   BRW_SHADER_RELOC_CONST_DATA_ADDR_HIGH,
   BRW_SHADER_RELOC_SHADER_START_OFFSET,
   BRW_SHADER_RELOC_RESUME_SBT_ADDR_LOW,
   BRW_SHADER_RELOC_RESUME_SBT_ADDR_HIGH,
   BRW_SHADER_RELOC_DESCRIPTORS_ADDR_HIGH,
   BRW_SHADER_RELOC_DESCRIPTORS_BUFFER_ADDR_HIGH,
   /* One reloc ID per embedded sampler; this reserves a contiguous range
    * of BRW_MAX_EMBEDDED_SAMPLERS values.
    */
   BRW_SHADER_RELOC_EMBEDDED_SAMPLER_HANDLE,
   BRW_SHADER_RELOC_LAST_EMBEDDED_SAMPLER_HANDLE =
      BRW_SHADER_RELOC_EMBEDDED_SAMPLER_HANDLE + BRW_MAX_EMBEDDED_SAMPLERS - 1,
   BRW_SHADER_RELOC_PRINTF_BUFFER_ADDR_LOW,
   BRW_SHADER_RELOC_PRINTF_BUFFER_ADDR_HIGH,
   BRW_SHADER_RELOC_PRINTF_BUFFER_SIZE,
};
|
|
|
|
|
|
2020-09-04 12:09:11 -05:00
|
|
|
/** How a relocation is encoded in the shader binary. */
enum brw_shader_reloc_type {
   /** An arbitrary 32-bit value */
   BRW_SHADER_RELOC_TYPE_U32,
   /** A MOV instruction with an immediate source */
   BRW_SHADER_RELOC_TYPE_MOV_IMM,
};
|
|
|
|
|
|
2020-08-08 12:55:29 -05:00
|
|
|
/** Represents a code relocation
 *
 * Relocatable constants are immediates in the code which we want to be able
 * to replace post-compile with the actual value.
 */
struct brw_shader_reloc {
   /** The 32-bit ID of the relocatable constant */
   uint32_t id;

   /** Type of this relocation */
   enum brw_shader_reloc_type type;

   /** The offset in the shader to the relocated value
    *
    * For MOV_IMM relocs, this is an offset to the MOV instruction. This
    * allows us to do some sanity checking while we update the value.
    */
   uint32_t offset;

   /** Value to be added to the relocated value before it is written */
   uint32_t delta;
};
|
|
|
|
|
|
|
|
|
|
/** A value to write to a relocation */
struct brw_shader_reloc_value {
   /** The 32-bit ID of the relocatable constant */
   uint32_t id;

   /** The value with which to replace the relocated immediate */
   uint32_t value;
};
|
|
|
|
|
|
2015-10-08 17:09:54 -07:00
|
|
|
/* Compilation results common to every shader stage; stage-specific prog_data
 * structs embed this as their first member.
 */
struct brw_stage_prog_data {
   struct brw_ubo_range ubo_ranges[4];

   unsigned nr_params;       /**< number of float params/constants */

   gl_shader_stage stage;

   /* zero_push_reg is a bitfield which indicates what push registers (if any)
    * should be zeroed by SW at the start of the shader.  The corresponding
    * push_reg_mask_param specifies the param index (in 32-bit units) where
    * the actual runtime 64-bit mask will be pushed.  The shader will zero
    * push reg i if
    *
    *    reg_used & zero_push_reg & ~*push_reg_mask_param & (1ull << i)
    *
    * If this field is set, brw_compiler::compact_params must be false.
    */
   uint64_t zero_push_reg;
   unsigned push_reg_mask_param;

   unsigned curb_read_length;
   /* Per-thread scratch space required, in bytes — assumed; confirm against
    * the scratch allocation code.
    */
   unsigned total_scratch;
   /* Shared (local) memory used by the program, in bytes — presumably only
    * meaningful for compute; verify against callers.
    */
   unsigned total_shared;

   unsigned program_size;

   /* Size and offset of the constant data appended to the program binary. */
   unsigned const_data_size;
   unsigned const_data_offset;

   /* Relocations to patch post-compile (see struct brw_shader_reloc). */
   unsigned num_relocs;
   const struct brw_shader_reloc *relocs;

   /** Does this program pull from any UBO or other constant buffers? */
   bool has_ubo_pull;

   /** How many ray queries objects in this shader. */
   unsigned ray_queries;

   /**
    * Register where the thread expects to find input data from the URB
    * (typically uniforms, followed by vertex or fragment attributes).
    */
   unsigned dispatch_grf_start_reg;

   /** Number of GRF registers used. */
   unsigned grf_used;

   bool use_alt_mode; /**< Use ALT floating point mode?  Otherwise, IEEE. */

   uint32_t source_hash;

   /* 32-bit identifiers for all push/pull parameters.  These can be anything
    * the driver wishes them to be; the core of the back-end compiler simply
    * re-arranges them.  The one restriction is that the bottom 2^16 values
    * are reserved for builtins defined in the brw_param_builtin enum defined
    * above.
    */
   uint32_t *param;

   /* Whether shader uses atomic operations. */
   bool uses_atomic_load_store;

   /* Printf descriptions contained by the shader */
   uint32_t printf_info_count;
   u_printf_info *printf_info;
};
|
|
|
|
|
|
2024-09-18 14:39:10 -07:00
|
|
|
/**
 * Convert a number of GRF registers used (grf_used in prog_data) into
 * a number of GRF register blocks supported by the hardware on PTL+.
 */
static inline unsigned
ptl_register_blocks(unsigned grf_used)
{
   /* One block covers 32 GRFs; the encoding is zero-based. */
   const unsigned blocks = (grf_used + 31) / 32 - 1;
   /* Anything beyond six blocks uses the maximum encoding of 7. */
   return blocks < 6 ? blocks : 7;
}
|
|
|
|
|
|
2017-09-29 11:05:55 -07:00
|
|
|
static inline uint32_t *
|
|
|
|
|
brw_stage_prog_data_add_params(struct brw_stage_prog_data *prog_data,
|
|
|
|
|
unsigned nr_new_params)
|
|
|
|
|
{
|
|
|
|
|
unsigned old_nr_params = prog_data->nr_params;
|
|
|
|
|
prog_data->nr_params += nr_new_params;
|
|
|
|
|
prog_data->param = reralloc(ralloc_parent(prog_data->param),
|
|
|
|
|
prog_data->param, uint32_t,
|
|
|
|
|
prog_data->nr_params);
|
|
|
|
|
return prog_data->param + old_nr_params;
|
|
|
|
|
}
|
|
|
|
|
|
2023-10-19 15:49:51 +03:00
|
|
|
/* Record a printf description in prog_data (allocating from mem_ctx).
 * NOTE(review): presumably appends a copy of `print` to printf_info and bumps
 * printf_info_count — confirm against the implementation.
 */
void
brw_stage_prog_data_add_printf(struct brw_stage_prog_data *prog_data,
                               void *mem_ctx,
                               const u_printf_info *print);
|
|
|
|
|
|
2017-05-02 09:20:02 -07:00
|
|
|
/* How (if at all) the pixel shader computes depth; stored in
 * brw_wm_prog_data::computed_depth_mode.
 */
enum brw_pixel_shader_computed_depth_mode {
   BRW_PSCDEPTH_OFF   = 0, /* PS does not compute depth */
   BRW_PSCDEPTH_ON    = 1, /* PS computes depth; no guarantee about value */
   BRW_PSCDEPTH_ON_GE = 2, /* PS guarantees output depth >= source depth */
   BRW_PSCDEPTH_ON_LE = 3, /* PS guarantees output depth <= source depth */
};
|
|
|
|
|
|
2015-10-08 17:09:54 -07:00
|
|
|
/* Data about a particular attempt to compile a program.  Note that
 * there can be many of these, each in a different GL state
 * corresponding to a different brw_wm_prog_key struct, with different
 * compiled programs.
 */
struct brw_wm_prog_data {
   struct brw_stage_prog_data base;

   /**
    * Number of slots (16B) chunks dedicated to per primitive payload.
    */
   unsigned num_per_primitive_inputs;

   /**
    * Number of slots (16B) chunks dedicated to per vertex payload.
    */
   unsigned num_varying_inputs;

   /* Payload GRF starts and program offsets for the SIMD16/SIMD32 variants.
    * The SIMD8 equivalents are base.dispatch_grf_start_reg and offset 0
    * (see _brw_wm_prog_data_prog_offset below).
    */
   uint8_t dispatch_grf_start_reg_16;
   uint8_t dispatch_grf_start_reg_32;
   uint32_t prog_offset_16;
   uint32_t prog_offset_32;

   /* Presumably an enum brw_pixel_shader_computed_depth_mode value stored
    * narrow — confirm at the assignment sites.
    */
   uint8_t computed_depth_mode;

   /**
    * Number of polygons handled in parallel by the multi-polygon PS
    * kernel.
    */
   uint8_t max_polygons;

   /**
    * Dispatch width of the multi-polygon PS kernel, or 0 if no
    * multi-polygon kernel was built.
    */
   uint8_t dispatch_multi;

   bool computed_stencil;
   bool early_fragment_tests;
   bool post_depth_coverage;
   bool inner_coverage;
   /* Which SIMD width variants were compiled. */
   bool dispatch_8;
   bool dispatch_16;
   bool dispatch_32;
   bool dual_src_blend;
   bool uses_pos_offset;
   bool uses_omask;
   bool uses_kill;
   bool uses_src_depth;
   bool uses_src_w;
   bool uses_depth_w_coefficients;
   bool uses_pc_bary_coefficients;
   bool uses_npc_bary_coefficients;
   bool uses_sample_offsets;
   bool uses_sample_mask;
   bool uses_vmask;
   bool has_side_effects;
   bool pulls_bary;

   bool contains_flat_varying;
   bool contains_noperspective_varying;

   /** True if the shader wants sample shading
    *
    * This corresponds to whether or not a gl_SampleId, gl_SamplePosition, or
    * a sample-qualified input are used in the shader.  It is independent of
    * GL_MIN_SAMPLE_SHADING_VALUE in GL or minSampleShading in Vulkan.
    */
   bool sample_shading;

   /** Min sample shading value
    *
    * Not used by the compiler, but useful for restore from the cache. The
    * driver is expected to write the value it wants.
    */
   float min_sample_shading;

   /** Should this shader be dispatched per-sample */
   enum intel_sometimes persample_dispatch;

   /**
    * Shader is ran at the coarse pixel shading dispatch rate (3DSTATE_CPS).
    */
   enum intel_sometimes coarse_pixel_dispatch;

   /**
    * Shader writes the SampleMask and this is AND-ed with the API's
    * SampleMask to generate a new coverage mask.
    */
   enum intel_sometimes alpha_to_coverage;

   /**
    * Push constant location of intel_msaa_flags (dynamic configuration of the
    * pixel shader).
    */
   unsigned msaa_flags_param;

   /**
    * Mask of which interpolation modes are required by the fragment shader.
    * Those interpolations are delivered as part of the thread payload. Used
    * in hardware setup on gfx6+.
    */
   uint32_t barycentric_interp_modes;

   /**
    * Whether nonperspective interpolation modes are used by the
    * barycentric_interp_modes or fragment shader through interpolator messages.
    */
   bool uses_nonperspective_interp_modes;

   /**
    * Mask of which FS inputs are marked flat by the shader source.  This is
    * needed for setting up 3DSTATE_SF/SBE.
    */
   uint32_t flat_inputs;

   /**
    * The FS inputs
    */
   uint64_t inputs;

   /**
    * The FS per-primitive inputs (some bits can be in both inputs &
    * per_primitive_inputs if the shader is compiled without being linked to
    * the previous stage)
    */
   uint64_t per_primitive_inputs;

   /**
    * Map from gl_varying_slot to the position within the FS setup data
    * payload where the varying's attribute vertex deltas should be delivered.
    * For varying slots that are not used by the FS, the value is -1.
    */
   int urb_setup[VARYING_SLOT_MAX];
   /* Per-slot channel selection companion to urb_setup — assumed; confirm
    * against the setup code that fills it.
    */
   int urb_setup_channel[VARYING_SLOT_MAX];

   /**
    * Cache structure into the urb_setup array above that contains the
    * attribute numbers of active varyings out of urb_setup.
    * The actual count is stored in urb_setup_attribs_count.
    */
   uint8_t urb_setup_attribs[VARYING_SLOT_MAX];
   uint8_t urb_setup_attribs_count;
};
|
|
|
|
|
|
2025-02-24 13:14:19 +02:00
|
|
|
static inline bool
|
|
|
|
|
brw_wm_prog_data_is_dynamic(const struct brw_wm_prog_data *prog_data)
|
|
|
|
|
{
|
|
|
|
|
return prog_data->alpha_to_coverage == INTEL_SOMETIMES ||
|
|
|
|
|
prog_data->coarse_pixel_dispatch == INTEL_SOMETIMES ||
|
|
|
|
|
prog_data->persample_dispatch == INTEL_SOMETIMES;
|
|
|
|
|
}
|
|
|
|
|
|
2022-08-02 17:30:16 -07:00
|
|
|
#ifdef GFX_VERx10
|
|
|
|
|
|
|
|
|
|
#if GFX_VERx10 >= 200
|
|
|
|
|
|
|
|
|
|
/** Returns the SIMD width corresponding to a given KSP index
 *
 * The "Variable Pixel Dispatch" table in the PRM (which can be found, for
 * example in Vol. 7 of the SKL PRM) has a mapping from dispatch widths to
 * kernel start pointer (KSP) indices that is based on what dispatch widths
 * are enabled.  This function provides, effectively, the reverse mapping.
 *
 * If the given KSP is enabled, a SIMD width of 8, 16, or 32 is
 * returned.  Note that for a multipolygon dispatch kernel 8 is always
 * returned, since multipolygon kernels use the "_8" fields from
 * brw_wm_prog_data regardless of their SIMD width.  If the KSP is
 * invalid, 0 is returned.
 */
static inline unsigned
brw_fs_simd_width_for_ksp(unsigned ksp_idx, bool enabled, unsigned width_sel)
{
   /* Only KSP 0 and 1 exist in this hardware generation's layout. */
   assert(ksp_idx < 2);

   if (!enabled)
      return 0;

   /* width_sel chooses between the two supported widths. */
   return width_sel ? 32 : 16;
}
|
|
|
|
|
|
|
|
|
|
/* Map a kernel-start-pointer slot in packed WM/PS state to its dispatch
 * SIMD width (Gfx20+ two-KSP layout).  KSP 0 in multi-polygon mode always
 * reports 8, since multipolygon kernels use the "_8" prog_data fields (see
 * the brw_fs_simd_width_for_ksp comment above).
 */
#define brw_wm_state_simd_width_for_ksp(wm_state, ksp_idx) \
   (ksp_idx == 0 && (wm_state).Kernel0MaximumPolysperThread ? 8 : \
    ksp_idx == 0 ? brw_fs_simd_width_for_ksp(ksp_idx, (wm_state).Kernel0Enable, \
                                             (wm_state).Kernel0SIMDWidth): \
    brw_fs_simd_width_for_ksp(ksp_idx, (wm_state).Kernel1Enable, \
                              (wm_state).Kernel1SIMDWidth))
|
2022-08-02 17:30:16 -07:00
|
|
|
|
|
|
|
|
#else
|
|
|
|
|
|
2018-05-17 23:17:17 -07:00
|
|
|
/** Returns the SIMD width corresponding to a given KSP index
 *
 * The "Variable Pixel Dispatch" table in the PRM (which can be found, for
 * example in Vol. 7 of the SKL PRM) has a mapping from dispatch widths to
 * kernel start pointer (KSP) indices that is based on what dispatch widths
 * are enabled.  This function provides, effectively, the reverse mapping.
 *
 * If the given KSP is valid with respect to the SIMD8/16/32 enables, a SIMD
 * width of 8, 16, or 32 is returned.  If the KSP is invalid, 0 is returned.
 */
static inline unsigned
brw_fs_simd_width_for_ksp(unsigned ksp_idx, bool simd8_enabled,
                          bool simd16_enabled, bool simd32_enabled)
{
   /* This function strictly ignores contiguous dispatch */
   if (ksp_idx == 0) {
      if (simd8_enabled)
         return 8;
      if (simd16_enabled && !simd32_enabled)
         return 16;
      if (simd32_enabled && !simd16_enabled)
         return 32;
      return 0;
   }

   if (ksp_idx == 1)
      return (simd32_enabled && (simd16_enabled || simd8_enabled)) ? 32 : 0;

   if (ksp_idx == 2)
      return (simd16_enabled && (simd32_enabled || simd8_enabled)) ? 16 : 0;

   unreachable("Invalid KSP index");
}
|
|
|
|
|
|
2022-08-02 17:30:16 -07:00
|
|
|
/* Map a kernel-start-pointer slot in packed WM/PS state to its dispatch
 * SIMD width, based on the SIMD8/16/32 enable bits (pre-Gfx20 layout).
 */
#define brw_wm_state_simd_width_for_ksp(wm_state, ksp_idx) \
   brw_fs_simd_width_for_ksp((ksp_idx), (wm_state)._8PixelDispatchEnable, \
                             (wm_state)._16PixelDispatchEnable, \
                             (wm_state)._32PixelDispatchEnable)
|
|
|
|
|
|
2022-08-02 17:30:16 -07:00
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
|
2018-05-17 23:17:17 -07:00
|
|
|
/* True if the given kernel-start-pointer slot holds a valid kernel
 * (i.e. maps to a non-zero SIMD width).
 */
#define brw_wm_state_has_ksp(wm_state, ksp_idx) \
   (brw_wm_state_simd_width_for_ksp((wm_state), (ksp_idx)) != 0)
|
|
|
|
|
|
|
|
|
|
static inline uint32_t
|
|
|
|
|
_brw_wm_prog_data_prog_offset(const struct brw_wm_prog_data *prog_data,
|
2018-05-17 23:49:29 -07:00
|
|
|
unsigned simd_width)
|
2018-05-17 23:17:17 -07:00
|
|
|
{
|
2018-05-17 23:49:29 -07:00
|
|
|
switch (simd_width) {
|
|
|
|
|
case 8: return 0;
|
|
|
|
|
case 16: return prog_data->prog_offset_16;
|
2018-05-17 23:26:02 -07:00
|
|
|
case 32: return prog_data->prog_offset_32;
|
2018-05-17 23:49:29 -07:00
|
|
|
default: return 0;
|
2018-05-17 23:17:17 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Program offset for the kernel programmed in the given KSP slot. */
#define brw_wm_prog_data_prog_offset(prog_data, wm_state, ksp_idx) \
   _brw_wm_prog_data_prog_offset(prog_data, \
                                 brw_wm_state_simd_width_for_ksp(wm_state, ksp_idx))
|
2018-05-17 23:17:17 -07:00
|
|
|
|
|
|
|
|
static inline uint8_t
|
|
|
|
|
_brw_wm_prog_data_dispatch_grf_start_reg(const struct brw_wm_prog_data *prog_data,
|
2018-05-17 23:49:29 -07:00
|
|
|
unsigned simd_width)
|
2018-05-17 23:17:17 -07:00
|
|
|
{
|
2018-05-17 23:49:29 -07:00
|
|
|
switch (simd_width) {
|
|
|
|
|
case 8: return prog_data->base.dispatch_grf_start_reg;
|
|
|
|
|
case 16: return prog_data->dispatch_grf_start_reg_16;
|
2018-05-17 23:26:02 -07:00
|
|
|
case 32: return prog_data->dispatch_grf_start_reg_32;
|
2018-05-17 23:49:29 -07:00
|
|
|
default: return 0;
|
2018-05-17 23:17:17 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Payload start GRF for the kernel programmed in the given KSP slot. */
#define brw_wm_prog_data_dispatch_grf_start_reg(prog_data, wm_state, ksp_idx) \
   _brw_wm_prog_data_dispatch_grf_start_reg(prog_data, \
                                            brw_wm_state_simd_width_for_ksp(wm_state, ksp_idx))
|
2018-05-17 23:17:17 -07:00
|
|
|
|
2021-11-19 16:32:24 -06:00
|
|
|
static inline bool
|
|
|
|
|
brw_wm_prog_data_is_persample(const struct brw_wm_prog_data *prog_data,
|
2024-02-01 13:17:42 -08:00
|
|
|
enum intel_msaa_flags pushed_msaa_flags)
|
2021-11-19 16:32:24 -06:00
|
|
|
{
|
2024-11-18 11:49:07 +02:00
|
|
|
return intel_fs_is_persample(prog_data->persample_dispatch,
|
|
|
|
|
prog_data->sample_shading,
|
|
|
|
|
pushed_msaa_flags);
|
2021-11-19 16:32:24 -06:00
|
|
|
}
|
|
|
|
|
|
2021-11-19 16:34:19 -06:00
|
|
|
static inline uint32_t
|
|
|
|
|
wm_prog_data_barycentric_modes(const struct brw_wm_prog_data *prog_data,
|
2024-02-01 13:17:42 -08:00
|
|
|
enum intel_msaa_flags pushed_msaa_flags)
|
2021-11-19 16:34:19 -06:00
|
|
|
{
|
2024-11-18 11:49:07 +02:00
|
|
|
return intel_fs_barycentric_modes(prog_data->persample_dispatch,
|
|
|
|
|
prog_data->barycentric_interp_modes,
|
|
|
|
|
pushed_msaa_flags);
|
2021-11-19 16:34:19 -06:00
|
|
|
}
|
|
|
|
|
|
2021-11-19 16:32:24 -06:00
|
|
|
static inline bool
|
|
|
|
|
brw_wm_prog_data_is_coarse(const struct brw_wm_prog_data *prog_data,
|
2024-02-01 13:17:42 -08:00
|
|
|
enum intel_msaa_flags pushed_msaa_flags)
|
2021-11-19 16:32:24 -06:00
|
|
|
{
|
2024-11-18 11:49:07 +02:00
|
|
|
return intel_fs_is_coarse(prog_data->coarse_pixel_dispatch,
|
|
|
|
|
pushed_msaa_flags);
|
2021-11-19 16:32:24 -06:00
|
|
|
}
|
|
|
|
|
|
2016-05-22 21:46:28 -07:00
|
|
|
/* Size bookkeeping for one block of push constants. */
struct brw_push_const_block {
   unsigned dwords;     /* Dword count, not reg aligned */
   unsigned regs;
   unsigned size;       /* Bytes, register aligned */
};
|
|
|
|
|
|
2015-10-08 17:09:54 -07:00
|
|
|
/* Compilation results for a compute shader. */
struct brw_cs_prog_data {
   struct brw_stage_prog_data base;

   /* Workgroup dimensions (X, Y, Z). */
   unsigned local_size[3];

   /* Program offsets for the 8/16/32 SIMD variants. Multiple variants are
    * kept when using variable group size, and the right one can only be
    * decided at dispatch time.
    */
   unsigned prog_offset[3];

   /* Bitmask indicating which program offsets are valid. */
   unsigned prog_mask;

   /* Bitmask indicating which programs have spilled. */
   unsigned prog_spilled;

   bool uses_barrier;
   bool uses_num_work_groups;
   bool uses_inline_data;
   /** Whether inline push data is used to provide a 64bit pointer to push
    * constants
    */
   bool uses_inline_push_addr;
   bool uses_btd_stack_ids;
   bool uses_systolic;
   uint8_t generate_local_id;
   enum intel_compute_walk_order walk_order;

   /* True if shader has any sample operation */
   bool uses_sampler;

   /* Push constant layout, split into cross-thread and per-thread blocks. */
   struct {
      struct brw_push_const_block cross_thread;
      struct brw_push_const_block per_thread;
   } push;
};
|
|
|
|
|
|
2020-05-21 02:26:21 -07:00
|
|
|
static inline uint32_t
|
|
|
|
|
brw_cs_prog_data_prog_offset(const struct brw_cs_prog_data *prog_data,
|
|
|
|
|
unsigned dispatch_width)
|
|
|
|
|
{
|
2020-05-21 01:56:54 -07:00
|
|
|
assert(dispatch_width == 8 ||
|
|
|
|
|
dispatch_width == 16 ||
|
|
|
|
|
dispatch_width == 32);
|
|
|
|
|
const unsigned index = dispatch_width / 16;
|
|
|
|
|
assert(prog_data->prog_mask & (1 << index));
|
|
|
|
|
return prog_data->prog_offset[index];
|
2020-05-21 02:26:21 -07:00
|
|
|
}
|
|
|
|
|
|
2020-10-21 14:46:50 -05:00
|
|
|
/* Compilation results for a ray-tracing (bindless) shader. */
struct brw_bs_prog_data {
   struct brw_stage_prog_data base;

   /** Whether inline push data is used to provide a 64bit pointer to push
    * constants
    */
   bool uses_inline_push_addr;

   /** SIMD size of the root shader */
   uint8_t simd_size;

   /** Maximum stack size of all shaders */
   uint32_t max_stack_size;

   /** Offset into the shader where the resume SBT is located */
   uint32_t resume_sbt_offset;

   /** Number of resume shaders */
   uint32_t num_resume_shaders;
};
|
|
|
|
|
|
2015-10-08 17:09:54 -07:00
|
|
|
/**
 * Enum representing the i965-specific vertex results that don't correspond
 * exactly to any element of gl_varying_slot.  The values of this enum are
 * assigned such that they don't conflict with gl_varying_slot.
 */
typedef enum
{
   /* Padding slot with no associated varying. */
   BRW_VARYING_SLOT_PAD = VARYING_SLOT_MAX,
   BRW_VARYING_SLOT_COUNT
} brw_varying_slot;
|
|
|
|
|
|
2025-03-19 12:11:53 +02:00
|
|
|
|
|
|
|
|
/* Varyings that are delivered in the VUE header rather than in generic
 * varying slots.
 */
#define BRW_VUE_HEADER_VARYING_MASK \
   (VARYING_BIT_VIEWPORT | \
    VARYING_BIT_LAYER | \
    VARYING_BIT_PRIMITIVE_SHADING_RATE | \
    VARYING_BIT_PSIZ)
|
|
|
|
|
|
2017-02-28 16:09:58 -08:00
|
|
|
/**
 * Bitmask indicating which fragment shader inputs represent varyings (and
 * hence have to be delivered to the fragment shader by the SF/SBE stage).
 * Position and face come from fixed-function payload fields instead.
 */
#define BRW_FS_VARYING_INPUT_MASK \
   (BITFIELD64_RANGE(0, VARYING_SLOT_MAX) & \
    ~VARYING_BIT_POS & ~VARYING_BIT_FACE)
|
|
|
|
|
|
2024-02-01 15:39:52 -08:00
|
|
|
/* Debug helper: print a human-readable description of a VUE map to fp. */
void brw_print_vue_map(FILE *fp, const struct intel_vue_map *vue_map,
                       gl_shader_stage stage);
|
2015-11-10 00:48:33 -08:00
|
|
|
|
2015-10-08 17:09:54 -07:00
|
|
|
/**
 * Convert a VUE slot number into a byte offset within the VUE.
 */
static inline unsigned brw_vue_slot_to_offset(unsigned slot)
{
   /* Each VUE slot is a 16-byte vec4. */
   return slot * 16;
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Convert a vertex output (brw_varying_slot) into a byte offset within the
|
|
|
|
|
* VUE.
|
|
|
|
|
*/
|
2021-12-09 16:47:24 -06:00
|
|
|
static inline unsigned
|
2024-02-01 15:39:52 -08:00
|
|
|
brw_varying_to_offset(const struct intel_vue_map *vue_map, unsigned varying)
|
2015-10-08 17:09:54 -07:00
|
|
|
{
|
|
|
|
|
return brw_vue_slot_to_offset(vue_map->varying_to_slot[varying]);
|
|
|
|
|
}
|
|
|
|
|
|
2025-03-10 23:18:30 +02:00
|
|
|
/* Compute the layout of per-primitive data: fills out_per_primitive_map with
 * per-slot offsets, and returns the overall stride and first offset through
 * the out parameters.  NOTE(review): exact slot-selection semantics
 * (variables_mode, slots_valid, separate_shader interaction) live in the
 * implementation — confirm there.
 */
void
brw_compute_per_primitive_map(int *out_per_primitive_map,
                              uint32_t *out_per_primitive_stride,
                              uint32_t *out_first_offset,
                              uint32_t base_offset,
                              nir_shader *nir,
                              uint32_t variables_mode,
                              uint64_t slots_valid,
                              bool separate_shader);
|
|
|
|
|
|
2021-04-05 13:19:39 -07:00
|
|
|
/* Build the VUE map (varying -> slot assignment) for the output varyings in
 * slots_valid, with the given VUE layout.  NOTE(review): pos_slots presumably
 * is the number of position slots to reserve — confirm in the implementation.
 */
void brw_compute_vue_map(const struct intel_device_info *devinfo,
                         struct intel_vue_map *vue_map,
                         uint64_t slots_valid,
                         enum intel_vue_layout layout,
                         uint32_t pos_slots);
|
2015-10-08 17:09:54 -07:00
|
|
|
|
2024-02-01 15:39:52 -08:00
|
|
|
/* VUE map variant for tessellation stages; is_patch presumably selects the
 * per-patch layout — confirm against the implementation.
 */
void brw_compute_tess_vue_map(struct intel_vue_map *const vue_map,
                              uint64_t slots_valid,
                              uint32_t is_patch);
|
2015-11-10 01:17:04 -08:00
|
|
|
|
2015-10-08 17:09:54 -07:00
|
|
|
/* Compilation results common to all VUE-producing stages (VS/TCS/TES/GS). */
struct brw_vue_prog_data {
   struct brw_stage_prog_data base;
   struct intel_vue_map vue_map;

   /** Should the hardware deliver input VUE handles for URB pull loads? */
   bool include_vue_handles;

   unsigned urb_read_length;
   unsigned total_grf;

   uint32_t clip_distance_mask;
   uint32_t cull_distance_mask;

   /* Used for calculating urb partitions.  In the VS, this is the size of the
    * URB entry used for both input and output to the thread.  In the GS, this
    * is the size of the URB entry used for output.
    */
   unsigned urb_entry_size;

   enum intel_shader_dispatch_mode dispatch_mode;
};
|
|
|
|
|
|
|
|
|
|
/* Compilation results for a vertex shader. */
struct brw_vs_prog_data {
   struct brw_vue_prog_data base;

   /* Bitmasks of the vertex attributes read (double_inputs_read for 64-bit
    * attributes).
    */
   uint64_t inputs_read;
   uint64_t double_inputs_read;

   /* Which system values the shader uses. */
   bool uses_vertexid;
   bool uses_instanceid;
   bool uses_is_indexed_draw;
   bool uses_firstvertex;
   bool uses_baseinstance;
   bool uses_drawid;
   bool no_vf_slot_compaction;

   uint32_t vf_component_packing[4];
};
|
|
|
|
|
|
2014-09-09 21:25:00 +12:00
|
|
|
/* Compilation results for a tessellation control shader. */
struct brw_tcs_prog_data
{
   struct brw_vue_prog_data base;

   /** Number of input vertices, 0 means dynamic */
   unsigned input_vertices;

   /** Should the non-SINGLE_PATCH payload provide primitive ID? */
   bool include_primitive_id;

   /** Number vertices in output patch */
   int instances;

   /** Track patch count threshold */
   int patch_count_threshold;
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/* Compilation results for a tessellation evaluation shader. */
struct brw_tes_prog_data
{
   struct brw_vue_prog_data base;

   enum intel_tess_partitioning partitioning;
   enum intel_tess_output_topology output_topology;
   enum intel_tess_domain domain;
   bool include_primitive_id;
};
|
|
|
|
|
|
2015-10-08 17:09:54 -07:00
|
|
|
/* Compilation results for a geometry shader. */
struct brw_gs_prog_data
{
   struct brw_vue_prog_data base;

   unsigned vertices_in;

   /**
    * Size of an output vertex, measured in HWORDS (32 bytes).
    */
   unsigned output_vertex_size_hwords;

   unsigned output_topology;

   /**
    * Size of the control data (cut bits or StreamID bits), in hwords (32
    * bytes).  0 if there is no control data.
    */
   unsigned control_data_header_size_hwords;

   /**
    * Format of the control data (either GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID
    * if the control data is StreamID bits, or
    * GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT if the control data is cut bits).
    * Ignored if control_data_header_size is 0.
    */
   unsigned control_data_format;

   bool include_primitive_id;

   /**
    * The number of vertices emitted, if constant - otherwise -1.
    */
   int static_vertex_count;

   int invocations;
};
|
|
|
|
|
|
2021-05-18 11:06:59 -07:00
|
|
|
/** Layout of the Task URB Entry (TUE). */
struct brw_tue_map {
   /* Total TUE size in dwords. */
   uint32_t size_dw;

   /* Dword offset at which the per-task data section begins. */
   uint32_t per_task_data_start_dw;
};
|
|
|
|
|
|
|
|
|
|
/** Layout of the Mesh URB Entry (MUE) written by a mesh shader. */
struct brw_mue_map {
   /* Total size in bytes of the MUE (32B aligned) */
   uint32_t size;

   /* Maximum primitive/vertex counts this MUE layout can hold. */
   uint32_t max_primitives;
   uint32_t max_vertices;

   /* Stride in bytes between sets of primitive indices */
   uint32_t per_primitive_indices_stride;

   /* Per primitive offset from the start of the MUE (32B aligned) */
   uint32_t per_primitive_offset;

   /* Per primitive stride in bytes (32B aligned) */
   uint32_t per_primitive_stride;

   /* Whether the per primitive block includes a header */
   bool has_per_primitive_header;

   /* Per vertex offset in bytes from the start of the MUE (32B aligned) */
   uint32_t per_vertex_offset;

   /* Size of the per vertex header (32B aligned) */
   uint32_t per_vertex_header_size;

   /* Per vertex stride in bytes (32B aligned) */
   uint32_t per_vertex_stride;

   /* VUE map for the per vertex attributes */
   struct intel_vue_map vue_map;

   /* Offset in bytes of each per-primitive slot relative to
    * per_primitive_offset (-1 if unused)
    */
   int per_primitive_offsets[VARYING_SLOT_MAX];
};
|
|
|
|
|
|
2021-10-29 12:27:45 -07:00
|
|
|
/** Program data for a task shader (compiled as a compute-like stage). */
struct brw_task_prog_data {
   struct brw_cs_prog_data base;
   /* Layout of the Task URB Entry this shader writes. */
   struct brw_tue_map map;
   /* Whether the shader reads the draw ID. */
   bool uses_drawid;
};
|
|
|
|
|
|
|
|
|
|
/** Format of the primitive indices written by a mesh shader. */
enum brw_mesh_index_format {
   BRW_INDEX_FORMAT_U32,
   BRW_INDEX_FORMAT_U888X,
};
|
|
|
|
|
|
|
|
|
|
/** Program data for a mesh shader (compiled as a compute-like stage). */
struct brw_mesh_prog_data {
   struct brw_cs_prog_data base;
   /* Layout of the Mesh URB Entry this shader writes. */
   struct brw_mue_map map;

   /* Bitmasks of the clip/cull distances written by the shader.
    * NOTE(review): inferred from the names -- confirm at the write site.
    */
   uint32_t clip_distance_mask;
   uint32_t cull_distance_mask;
   /* Hardware encoding of the output primitive type. */
   uint16_t primitive_type;

   /* Format of the primitive indices written into the MUE. */
   enum brw_mesh_index_format index_format;

   /* Whether the shader reads the draw ID. */
   bool uses_drawid;
   bool autostrip_enable;
};
|
|
|
|
|
|
2017-10-21 01:29:16 -07:00
|
|
|
/* brw_any_prog_data is prog_data for any stage that maps to an API stage.
 * Every member starts with (or is) a struct brw_stage_prog_data, so `base`
 * may be used to inspect the common fields regardless of stage.
 */
union brw_any_prog_data {
   struct brw_stage_prog_data base;
   struct brw_vue_prog_data vue;
   struct brw_vs_prog_data vs;
   struct brw_tcs_prog_data tcs;
   struct brw_tes_prog_data tes;
   struct brw_gs_prog_data gs;
   struct brw_wm_prog_data wm;
   struct brw_cs_prog_data cs;
   struct brw_bs_prog_data bs;
   struct brw_task_prog_data task;
   struct brw_mesh_prog_data mesh;
};
|
|
|
|
|
|
2020-11-10 13:11:31 -09:00
|
|
|
/* Defines a pair of downcast helpers (mutable and const) from
 * brw_stage_prog_data to the stage-specific brw_<STAGE>_prog_data,
 * asserting CHECK when the input pointer is non-NULL.  NULL passes
 * through unchanged.
 */
#define DEFINE_PROG_DATA_DOWNCAST(STAGE, CHECK)                            \
static inline struct brw_##STAGE##_prog_data *                             \
brw_##STAGE##_prog_data(struct brw_stage_prog_data *prog_data)             \
{                                                                          \
   if (prog_data)                                                          \
      assert(CHECK);                                                       \
   return (struct brw_##STAGE##_prog_data *) prog_data;                    \
}                                                                          \
static inline const struct brw_##STAGE##_prog_data *                       \
brw_##STAGE##_prog_data_const(const struct brw_stage_prog_data *prog_data) \
{                                                                          \
   if (prog_data)                                                          \
      assert(CHECK);                                                       \
   return (const struct brw_##STAGE##_prog_data *) prog_data;              \
}

DEFINE_PROG_DATA_DOWNCAST(vs, prog_data->stage == MESA_SHADER_VERTEX)
DEFINE_PROG_DATA_DOWNCAST(tcs, prog_data->stage == MESA_SHADER_TESS_CTRL)
DEFINE_PROG_DATA_DOWNCAST(tes, prog_data->stage == MESA_SHADER_TESS_EVAL)
DEFINE_PROG_DATA_DOWNCAST(gs, prog_data->stage == MESA_SHADER_GEOMETRY)
DEFINE_PROG_DATA_DOWNCAST(wm, prog_data->stage == MESA_SHADER_FRAGMENT)
DEFINE_PROG_DATA_DOWNCAST(cs, gl_shader_stage_uses_workgroup(prog_data->stage))
DEFINE_PROG_DATA_DOWNCAST(bs, brw_shader_stage_is_bindless(prog_data->stage))

DEFINE_PROG_DATA_DOWNCAST(vue, prog_data->stage == MESA_SHADER_VERTEX ||
                               prog_data->stage == MESA_SHADER_TESS_CTRL ||
                               prog_data->stage == MESA_SHADER_TESS_EVAL ||
                               prog_data->stage == MESA_SHADER_GEOMETRY)

DEFINE_PROG_DATA_DOWNCAST(task, prog_data->stage == MESA_SHADER_TASK)
DEFINE_PROG_DATA_DOWNCAST(mesh, prog_data->stage == MESA_SHADER_MESH)

#undef DEFINE_PROG_DATA_DOWNCAST
|
2015-10-08 17:09:54 -07:00
|
|
|
|
2019-04-23 23:19:56 -05:00
|
|
|
/** Statistics collected for a single compiled shader variant. */
struct brw_compile_stats {
   uint32_t dispatch_width; /**< 0 for vec4 */
   uint32_t max_polygons;
   uint32_t max_dispatch_width;
   uint32_t instructions;
   uint32_t sends;
   uint32_t loops;
   uint32_t cycles;
   /* Register spill/fill counts introduced by register allocation. */
   uint32_t spills;
   uint32_t fills;
   uint32_t max_live_registers;
   uint32_t non_ssa_registers_after_nir;
};
|
|
|
|
|
|
2015-10-08 17:09:54 -07:00
|
|
|
/** @} */
|
|
|
|
|
|
2016-01-21 09:19:53 -08:00
|
|
|
/** Creates a compiler instance for the given device, allocated on mem_ctx. */
struct brw_compiler *
brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo);

/**
 * Returns a compiler configuration for use with disk shader cache
 *
 * This value only needs to change for settings that can cause different
 * program generation between two runs on the same hardware.
 *
 * For example, it doesn't need to be different for gen 8 and gen 9 hardware,
 * but it does need to be different if INTEL_DEBUG=nocompact is or isn't used.
 */
uint64_t
brw_get_compiler_config_value(const struct brw_compiler *compiler);

/* Provides a string sha1 hash of all device information fields that could
 * affect shader compilation.
 */
void
brw_device_sha1(char *hex, const struct intel_device_info *devinfo);

/* For callers computing their own UUID or hash. Hashes all device
 * information fields that could affect shader compilation into the provided
 * sha1_ctx.
 */
void
brw_device_sha1_update(struct mesa_sha1 *sha1_ctx,
                       const struct intel_device_info *devinfo);

/* Size in bytes of the prog_data / prog_key structure for the given stage.
 * NOTE(review): inferred from the names -- confirm against the definitions.
 */
unsigned
brw_prog_data_size(gl_shader_stage stage);

unsigned
brw_prog_key_size(gl_shader_stage stage);
|
|
|
|
|
|
2023-07-14 02:10:20 +03:00
|
|
|
/** Parameters common to compiling any shader stage. */
struct brw_compile_params {
   /* Memory context used for allocations made during compilation. */
   void *mem_ctx;

   /* The NIR shader to compile. */
   nir_shader *nir;

   /* Optional; filled with per-variant statistics when non-NULL.
    * NOTE(review): presumably may be NULL -- confirm at call sites.
    */
   struct brw_compile_stats *stats;

   /* Opaque data forwarded to the compiler's logging callbacks. */
   void *log_data;

   /* Set to a human-readable message on compile failure. */
   char *error_str;

   uint64_t debug_flag;

   uint32_t source_hash;
};
|
|
|
|
|
|
2021-03-22 23:07:18 -07:00
|
|
|
/**
 * Parameters for compiling a vertex shader.
 *
 * Some of these will be modified during the shader compilation.
 */
struct brw_compile_vs_params {
   struct brw_compile_params base;

   const struct brw_vs_prog_key *key;
   struct brw_vs_prog_data *prog_data;
};

/**
 * Compile a vertex shader.
 *
 * Returns the final assembly and updates the parameters structure.
 */
const unsigned *
brw_compile_vs(const struct brw_compiler *compiler,
               struct brw_compile_vs_params *params);
|
2015-10-08 17:09:54 -07:00
|
|
|
|
2021-03-23 14:34:23 -07:00
|
|
|
/**
 * Parameters for compiling a tessellation control shader.
 *
 * Some of these will be modified during the shader compilation.
 */
struct brw_compile_tcs_params {
   struct brw_compile_params base;

   const struct brw_tcs_prog_key *key;
   struct brw_tcs_prog_data *prog_data;
};

/**
 * Compile a tessellation control shader.
 *
 * Returns the final assembly and updates the parameters structure.
 */
const unsigned *
brw_compile_tcs(const struct brw_compiler *compiler,
                struct brw_compile_tcs_params *params);
|
2015-11-17 01:07:39 -08:00
|
|
|
|
2021-03-23 15:03:50 -07:00
|
|
|
/**
 * Parameters for compiling a tessellation evaluation shader.
 *
 * Some of these will be modified during the shader compilation.
 */
struct brw_compile_tes_params {
   struct brw_compile_params base;

   const struct brw_tes_prog_key *key;
   struct brw_tes_prog_data *prog_data;
   /* VUE map describing the layout of the TES's inputs (the TCS outputs). */
   const struct intel_vue_map *input_vue_map;
};

/**
 * Compile a tessellation evaluation shader.
 *
 * Returns the final assembly and updates the parameters structure.
 */
const unsigned *
brw_compile_tes(const struct brw_compiler *compiler,
                struct brw_compile_tes_params *params);
|
2015-11-10 14:35:27 -08:00
|
|
|
|
2015-10-08 17:09:54 -07:00
|
|
|
/**
 * Parameters for compiling a geometry shader.
 *
 * Some of these will be modified during the shader compilation.
 */
struct brw_compile_gs_params {
   struct brw_compile_params base;

   const struct brw_gs_prog_key *key;
   struct brw_gs_prog_data *prog_data;
};

/**
 * Compile a geometry shader.
 *
 * Returns the final assembly and updates the parameters structure.
 */
const unsigned *
brw_compile_gs(const struct brw_compiler *compiler,
               struct brw_compile_gs_params *params);
|
2015-10-08 17:09:54 -07:00
|
|
|
|
2021-10-29 12:27:45 -07:00
|
|
|
/** Parameters for compiling a task shader; updated during compilation. */
struct brw_compile_task_params {
   struct brw_compile_params base;

   const struct brw_task_prog_key *key;
   struct brw_task_prog_data *prog_data;
};

/** Compile a task shader; returns the final assembly. */
const unsigned *
brw_compile_task(const struct brw_compiler *compiler,
                 struct brw_compile_task_params *params);

/** Parameters for compiling a mesh shader; updated during compilation. */
struct brw_compile_mesh_params {
   struct brw_compile_params base;

   const struct brw_mesh_prog_key *key;
   struct brw_mesh_prog_data *prog_data;
   /* TUE layout of the preceding task shader, if any. */
   const struct brw_tue_map *tue_map;
};

/** Compile a mesh shader; returns the final assembly. */
const unsigned *
brw_compile_mesh(const struct brw_compiler *compiler,
                 struct brw_compile_mesh_params *params);
|
|
|
|
|
|
2021-03-22 22:13:09 -07:00
|
|
|
/**
 * Parameters for compiling a fragment shader.
 *
 * Some of these will be modified during the shader compilation.
 */
struct brw_compile_fs_params {
   struct brw_compile_params base;

   const struct brw_wm_prog_key *key;
   struct brw_wm_prog_data *prog_data;

   /* Layout of the inputs from the previous geometry stage: a VUE map for
    * the traditional pipeline, or an MUE map when fed by a mesh shader.
    */
   const struct intel_vue_map *vue_map;
   const struct brw_mue_map *mue_map;

   bool allow_spilling;
   bool use_rep_send;
   uint8_t max_polygons;
};

/**
 * Compile a fragment shader.
 *
 * Returns the final assembly and updates the parameters structure.
 */
const unsigned *
brw_compile_fs(const struct brw_compiler *compiler,
               struct brw_compile_fs_params *params);
|
2015-10-08 17:09:54 -07:00
|
|
|
|
2021-03-23 21:01:21 -07:00
|
|
|
/**
 * Parameters for compiling a compute shader.
 *
 * Some of these will be modified during the shader compilation.
 */
struct brw_compile_cs_params {
   struct brw_compile_params base;

   const struct brw_cs_prog_key *key;
   struct brw_cs_prog_data *prog_data;
};

/**
 * Compile a compute shader.
 *
 * Returns the final assembly and updates the parameters structure.
 */
const unsigned *
brw_compile_cs(const struct brw_compiler *compiler,
               struct brw_compile_cs_params *params);
|
2015-10-08 17:09:54 -07:00
|
|
|
|
2020-10-21 14:46:50 -05:00
|
|
|
/**
 * Parameters for compiling a Bindless shader.
 *
 * Some of these will be modified during the shader compilation.
 */
struct brw_compile_bs_params {
   struct brw_compile_params base;

   const struct brw_bs_prog_key *key;
   struct brw_bs_prog_data *prog_data;

   /* Ray-tracing resume shaders compiled alongside the main shader. */
   unsigned num_resume_shaders;
   struct nir_shader **resume_shaders;
};

/**
 * Compile a Bindless shader.
 *
 * Returns the final assembly and updates the parameters structure.
 */
const unsigned *
brw_compile_bs(const struct brw_compiler *compiler,
               struct brw_compile_bs_params *params);
|
2020-10-21 14:46:50 -05:00
|
|
|
|
2019-04-15 21:59:50 -07:00
|
|
|
/* Logs which fields differ between old_key and key.
 * NOTE(review): inferred from the name (recompile debugging) -- confirm.
 */
void brw_debug_key_recompile(const struct brw_compiler *c, void *log,
                             gl_shader_stage stage,
                             const struct brw_base_prog_key *old_key,
                             const struct brw_base_prog_key *key);

/* Total size of the CS push-constant data for the given thread count. */
unsigned
brw_cs_push_const_total_size(const struct brw_cs_prog_data *cs_prog_data,
                             unsigned threads);

/* Patches the relocations recorded in prog_data into the assembled
 * program using the supplied values.
 */
void
brw_write_shader_relocs(const struct brw_isa_info *isa,
                        void *program,
                        const struct brw_stage_prog_data *prog_data,
                        struct brw_shader_reloc_value *values,
                        unsigned num_values);
|
|
|
|
|
|
2021-04-28 10:54:53 -07:00
|
|
|
/**
 * Get the dispatch information for a shader to be used with GPGPU_WALKER and
 * similar instructions.
 *
 * If override_local_size is not NULL, it must point to a 3-element array that
 * will override the value from prog_data->local_size. This is used by
 * ARB_compute_variable_group_size, where the size is set only at dispatch
 * time (so prog_data is outdated).
 */
struct intel_cs_dispatch_info
brw_cs_get_dispatch_info(const struct intel_device_info *devinfo,
                         const struct brw_cs_prog_data *prog_data,
                         const unsigned *override_local_size);
|
|
|
|
|
|
2016-09-15 17:20:23 -07:00
|
|
|
/**
|
|
|
|
|
* Return true if the given shader stage is dispatched contiguously by the
|
|
|
|
|
* relevant fixed function starting from channel 0 of the SIMD thread, which
|
|
|
|
|
* implies that the dispatch mask of a thread can be assumed to have the form
|
|
|
|
|
* '2^n - 1' for some n.
|
|
|
|
|
*/
|
|
|
|
|
static inline bool
|
2021-04-05 13:19:39 -07:00
|
|
|
brw_stage_has_packed_dispatch(ASSERTED const struct intel_device_info *devinfo,
|
2023-12-07 19:38:02 -08:00
|
|
|
gl_shader_stage stage, unsigned max_polygons,
|
2016-09-15 17:20:23 -07:00
|
|
|
const struct brw_stage_prog_data *prog_data)
|
|
|
|
|
{
|
|
|
|
|
/* The code below makes assumptions about the hardware's thread dispatch
|
|
|
|
|
* behavior that could be proven wrong in future generations -- Make sure
|
2025-02-10 08:55:26 -08:00
|
|
|
* to do a full test run with brw_test_dispatch_packing() hooked up to
|
2024-05-17 01:20:33 -07:00
|
|
|
* the NIR front-end before changing this assertion. It can be temporarily
|
|
|
|
|
* enabled by setting the macro below to true.
|
2016-09-15 17:20:23 -07:00
|
|
|
*/
|
2025-02-10 08:55:26 -08:00
|
|
|
#define ENABLE_TEST_DISPATCH_PACKING false
|
2023-08-02 01:54:09 -07:00
|
|
|
assert(devinfo->ver <= 30);
|
2016-09-15 17:20:23 -07:00
|
|
|
|
|
|
|
|
switch (stage) {
|
|
|
|
|
case MESA_SHADER_FRAGMENT: {
|
|
|
|
|
/* The PSD discards subspans coming in with no lit samples, which in the
|
|
|
|
|
* per-pixel shading case implies that each subspan will either be fully
|
|
|
|
|
* lit (due to the VMask being used to allow derivative computations),
|
|
|
|
|
* or not dispatched at all. In per-sample dispatch mode individual
|
|
|
|
|
* samples from the same subspan have a fixed relative location within
|
|
|
|
|
* the SIMD thread, so dispatch of unlit samples cannot be avoided in
|
|
|
|
|
* general and we should return false.
|
|
|
|
|
*/
|
|
|
|
|
const struct brw_wm_prog_data *wm_prog_data =
|
|
|
|
|
(const struct brw_wm_prog_data *)prog_data;
|
2019-06-07 18:17:36 -05:00
|
|
|
return devinfo->verx10 < 125 &&
|
|
|
|
|
!wm_prog_data->persample_dispatch &&
|
2023-12-07 19:38:02 -08:00
|
|
|
wm_prog_data->uses_vmask &&
|
|
|
|
|
max_polygons < 2;
|
2016-09-15 17:20:23 -07:00
|
|
|
}
|
|
|
|
|
case MESA_SHADER_COMPUTE:
|
|
|
|
|
/* Compute shaders will be spawned with either a fully enabled dispatch
|
|
|
|
|
* mask or with whatever bottom/right execution mask was given to the
|
|
|
|
|
* GPGPU walker command to be used along the workgroup edges -- In both
|
|
|
|
|
* cases the dispatch mask is required to be tightly packed for our
|
|
|
|
|
* invocation index calculations to work.
|
|
|
|
|
*/
|
|
|
|
|
return true;
|
|
|
|
|
default:
|
|
|
|
|
/* Most remaining fixed functions are limited to use a packed dispatch
|
|
|
|
|
* mask due to the hardware representation of the dispatch mask as a
|
|
|
|
|
* single counter representing the number of enabled channels.
|
|
|
|
|
*/
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
i965: skip reading unused slots at the begining of the URB for the FS
We can start reading the URB at the first offset that contains varyings
that are actually read in the URB. We still need to make sure that we
read at least one varying to honor hardware requirements.
This helps alleviate a problem introduced with 99df02ca26f61 for
separate shader objects: without separate shader objects we assign
locations sequentially, however, since that commit we have changed the
method for SSO so that the VUE slot assigned depends on the number of
builtin slots plus the location assigned to the varying. This fixed
layout is intended to help SSO programs by avoiding on-the-fly recompiles
when swapping out shaders, however, it also means that if a varying uses
a large location number close to the maximum allowed by the SF/FS units
(31), then the offset introduced by the number of builtin slots can push
the location outside the range and trigger an assertion.
This problem is affecting at least the following CTS tests for
enhanced layouts:
KHR-GL45.enhanced_layouts.varying_array_components
KHR-GL45.enhanced_layouts.varying_array_locations
KHR-GL45.enhanced_layouts.varying_components
KHR-GL45.enhanced_layouts.varying_locations
which use SSO and the the location layout qualifier to select such
location numbers explicitly.
This change helps these tests because for SSO we always have to include
things such as VARYING_SLOT_CLIP_DIST{0,1} even if the fragment shader is
very unlikely to read them, so by doing this we free builtin slots from
the fixed VUE layout and we avoid the tests to crash in this scenario.
Of course, this is not a proper fix, we'd still run into problems if someone
tries to use an explicit max location and read gl_ViewportIndex, gl_LayerID or
gl_CullDistancein in the FS, but that would be a much less common bug and we
can probably wait to see if anyone actually runs into that situation in a real
world scenario before making the decision that more aggresive changes are
required to support this without reverting 99df02ca26f61.
v2:
- Add a debug message when we skip clip distances (Ilia)
- we also need to account for this when we compute the urb setup
for the fragment shader stage, so add a compiler util to compute
the first slot that we need to read from the URB instead of
replicating the logic in both places.
v3:
- Make the util more generic so it can account for all unused slots
at the beginning of the URB, that will make it more useful (Ken).
- Drop the debug message, it was not what Ilia was asking for.
Suggested-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2017-09-20 09:22:51 +02:00
|
|
|
/**
 * Computes the first varying slot in the URB produced by the previous stage
 * that is used in the next stage. We do this by testing the varying slots in
 * the previous stage's vue map against the inputs read in the next stage.
 *
 * Note that each URB offset contains two varying slots and we can only skip a
 * full offset if both slots are unused, so the value we return here is always
 * rounded down to the closest multiple of two.
 */
int
brw_compute_first_fs_urb_slot_required(uint64_t inputs_read,
                                       const struct intel_vue_map *prev_stage_vue_map,
                                       bool mesh);
|
|
|
|
|
|
|
|
|
|
/**
 * Computes the per-vertex URB read configuration that SBE should use for the
 * fragment shader inputs (first slot, slot count, varying count and the
 * offset of primitive ID, reported through the out_* parameters).
 * NOTE(review): inferred from the name and signature -- confirm against the
 * definition.
 */
void
brw_compute_sbe_per_vertex_urb_read(const struct intel_vue_map *prev_stage_vue_map,
                                    bool mesh,
                                    const struct brw_wm_prog_data *wm_prog_data,
                                    uint32_t *out_first_slot,
                                    uint32_t *num_slots,
                                    uint32_t *out_num_varyings,
                                    uint32_t *out_primitive_id_offset);

/**
 * Computes the URB offset at which SBE should read the per primitive data
 * written by the mesh shader.
 */
void
brw_compute_sbe_per_primitive_urb_read(uint64_t inputs_read,
                                       uint32_t num_varyings,
                                       const struct brw_mue_map *mue_map,
                                       uint32_t *out_read_offset,
                                       uint32_t *out_read_length);
|
i965: skip reading unused slots at the begining of the URB for the FS
We can start reading the URB at the first offset that contains varyings
that are actually read in the URB. We still need to make sure that we
read at least one varying to honor hardware requirements.
This helps alleviate a problem introduced with 99df02ca26f61 for
separate shader objects: without separate shader objects we assign
locations sequentially, however, since that commit we have changed the
method for SSO so that the VUE slot assigned depends on the number of
builtin slots plus the location assigned to the varying. This fixed
layout is intended to help SSO programs by avoiding on-the-fly recompiles
when swapping out shaders, however, it also means that if a varying uses
a large location number close to the maximum allowed by the SF/FS units
(31), then the offset introduced by the number of builtin slots can push
the location outside the range and trigger an assertion.
This problem is affecting at least the following CTS tests for
enhanced layouts:
KHR-GL45.enhanced_layouts.varying_array_components
KHR-GL45.enhanced_layouts.varying_array_locations
KHR-GL45.enhanced_layouts.varying_components
KHR-GL45.enhanced_layouts.varying_locations
which use SSO and the the location layout qualifier to select such
location numbers explicitly.
This change helps these tests because for SSO we always have to include
things such as VARYING_SLOT_CLIP_DIST{0,1} even if the fragment shader is
very unlikely to read them, so by doing this we free builtin slots from
the fixed VUE layout and we avoid the tests to crash in this scenario.
Of course, this is not a proper fix, we'd still run into problems if someone
tries to use an explicit max location and read gl_ViewportIndex, gl_LayerID or
gl_CullDistancein in the FS, but that would be a much less common bug and we
can probably wait to see if anyone actually runs into that situation in a real
world scenario before making the decision that more aggresive changes are
required to support this without reverting 99df02ca26f61.
v2:
- Add a debug message when we skip clip distances (Ilia)
- we also need to account for this when we compute the urb setup
for the fragment shader stage, so add a compiler util to compute
the first slot that we need to read from the URB instead of
replicating the logic in both places.
v3:
- Make the util more generic so it can account for all unused slots
at the beginning of the URB, that will make it more useful (Ken).
- Drop the debug message, it was not what Ilia was asking for.
Suggested-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2017-09-20 09:22:51 +02:00
|
|
|
|
2021-12-13 14:11:27 +01:00
|
|
|
/* From InlineData in 3DSTATE_TASK_SHADER_DATA and 3DSTATE_MESH_SHADER_DATA. */
#define BRW_TASK_MESH_INLINE_DATA_SIZE_DW 8

/* InlineData[0-1] is used for Vulkan descriptor. */
#define BRW_TASK_MESH_PUSH_CONSTANTS_START_DW 2

/* Inline-data dwords remaining for push constants after the descriptor. */
#define BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW \
   (BRW_TASK_MESH_INLINE_DATA_SIZE_DW - BRW_TASK_MESH_PUSH_CONSTANTS_START_DW)
|
|
|
|
|
|
2021-06-18 11:52:31 +03:00
|
|
|
/**
 * This enum is used as the base index of the nir_load_topology_id_intel
 * intrinsic. This is used to return different values based on some aspect of
 * the topology of the device.
 */
enum brw_topology_id
{
   /* A value based on the DSS identifier the shader is currently running on.
    * Be mindful that the DSS ID can be higher than the total number of DSS on
    * the device. This is because of the fusing that can occur on different
    * parts.
    */
   BRW_TOPOLOGY_ID_DSS,

   /* A value composed of EU ID, thread ID & SIMD lane ID. */
   BRW_TOPOLOGY_ID_EU_THREAD_SIMD,
};
|
|
|
|
|
|
2015-10-08 17:09:54 -07:00
|
|
|
#ifdef __cplusplus
|
|
|
|
|
} /* extern "C" */
|
|
|
|
|
#endif
|