2014-08-15 10:32:07 -07:00
|
|
|
/*
|
|
|
|
|
* Copyright © 2010 Intel Corporation
|
|
|
|
|
*
|
|
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
|
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
|
|
|
* to deal in the Software without restriction, including without limitation
|
|
|
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
|
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
|
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
|
|
|
*
|
|
|
|
|
* The above copyright notice and this permission notice (including the next
|
|
|
|
|
* paragraph) shall be included in all copies or substantial portions of the
|
|
|
|
|
* Software.
|
|
|
|
|
*
|
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
|
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
|
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
|
|
|
* IN THE SOFTWARE.
|
|
|
|
|
*/
|
|
|
|
|
|
2025-02-05 14:25:15 -08:00
|
|
|
#include "brw_shader.h"
|
2025-01-15 08:20:46 -08:00
|
|
|
#include "brw_builder.h"
|
2015-03-17 11:49:04 -07:00
|
|
|
#include "brw_nir.h"
|
intel/fs: Improve discard_if code generation
Previously we would blindly emit an sequence like:
mov(1) f0.1<1>UW g1.14<0,1,0>UW
...
cmp.l.f0(16) g7<1>F g5<8,8,1>F 0x41700000F /* 15F */
(+f0.1) cmp.z.f0.1(16) null<1>D g7<8,8,1>D 0D
The first move sets the flags based on the initial execution mask.
Later discard sequences contain a predicated compare that can only
remove more SIMD channels. Often times the only user of the result from
the first compare is the second compare. Instead, generate a sequence
like
mov(1) f0.1<1>UW g1.14<0,1,0>UW
...
cmp.l.f0(16) g7<1>F g5<8,8,1>F 0x41700000F /* 15F */
(+f0.1) cmp.ge.f0.1(8) null<1>F g5<8,8,1>F 0x41700000F /* 15F */
If the results stored in g7 and f0.0 are not used, the comparison will
be eliminated. This removes an instruction and potentially reduces
register pressure.
v2: Major re-write of the commit message (including fixing the assembly
code). Suggested by Matt.
All Gen8+ platforms had similar results. (Ice Lake shown)
total instructions in shared programs: 17224434 -> 17198659 (-0.15%)
instructions in affected programs: 2908125 -> 2882350 (-0.89%)
helped: 18891
HURT: 5
helped stats (abs) min: 1 max: 12 x̄: 1.38 x̃: 1
helped stats (rel) min: 0.03% max: 25.00% x̄: 1.76% x̃: 1.02%
HURT stats (abs) min: 9 max: 105 x̄: 51.40 x̃: 35
HURT stats (rel) min: 0.43% max: 4.92% x̄: 2.34% x̃: 1.56%
95% mean confidence interval for instructions value: -1.39 -1.34
95% mean confidence interval for instructions %-change: -1.79% -1.73%
Instructions are helped.
total cycles in shared programs: 361468458 -> 361170679 (-0.08%)
cycles in affected programs: 38470116 -> 38172337 (-0.77%)
helped: 16202
HURT: 1456
helped stats (abs) min: 1 max: 4473 x̄: 26.24 x̃: 18
helped stats (rel) min: <.01% max: 28.44% x̄: 2.90% x̃: 2.18%
HURT stats (abs) min: 1 max: 5982 x̄: 87.51 x̃: 28
HURT stats (rel) min: <.01% max: 51.29% x̄: 5.48% x̃: 1.64%
95% mean confidence interval for cycles value: -18.24 -15.49
95% mean confidence interval for cycles %-change: -2.26% -2.14%
Cycles are helped.
total spills in shared programs: 12147 -> 12176 (0.24%)
spills in affected programs: 175 -> 204 (16.57%)
helped: 8
HURT: 5
total fills in shared programs: 25262 -> 25292 (0.12%)
fills in affected programs: 269 -> 299 (11.15%)
helped: 8
HURT: 5
Haswell
total instructions in shared programs: 13530316 -> 13502647 (-0.20%)
instructions in affected programs: 2507824 -> 2480155 (-1.10%)
helped: 18859
HURT: 10
helped stats (abs) min: 1 max: 12 x̄: 1.48 x̃: 1
helped stats (rel) min: 0.03% max: 27.78% x̄: 2.38% x̃: 1.41%
HURT stats (abs) min: 5 max: 39 x̄: 25.70 x̃: 31
HURT stats (rel) min: 0.22% max: 1.66% x̄: 1.09% x̃: 1.31%
95% mean confidence interval for instructions value: -1.49 -1.44
95% mean confidence interval for instructions %-change: -2.42% -2.34%
Instructions are helped.
total cycles in shared programs: 377865412 -> 377639034 (-0.06%)
cycles in affected programs: 40169572 -> 39943194 (-0.56%)
helped: 15550
HURT: 1938
helped stats (abs) min: 1 max: 2482 x̄: 25.67 x̃: 18
helped stats (rel) min: <.01% max: 37.77% x̄: 3.00% x̃: 2.25%
HURT stats (abs) min: 1 max: 4862 x̄: 89.17 x̃: 35
HURT stats (rel) min: <.01% max: 67.67% x̄: 6.16% x̃: 2.75%
95% mean confidence interval for cycles value: -14.42 -11.47
95% mean confidence interval for cycles %-change: -2.05% -1.91%
Cycles are helped.
total spills in shared programs: 26769 -> 26814 (0.17%)
spills in affected programs: 826 -> 871 (5.45%)
helped: 9
HURT: 10
total fills in shared programs: 38383 -> 38425 (0.11%)
fills in affected programs: 834 -> 876 (5.04%)
helped: 9
HURT: 10
LOST: 5
GAINED: 10
Ivy Bridge
total instructions in shared programs: 12079250 -> 12044139 (-0.29%)
instructions in affected programs: 2409680 -> 2374569 (-1.46%)
helped: 16135
HURT: 0
helped stats (abs) min: 1 max: 23 x̄: 2.18 x̃: 2
helped stats (rel) min: 0.07% max: 37.50% x̄: 2.72% x̃: 1.68%
95% mean confidence interval for instructions value: -2.21 -2.14
95% mean confidence interval for instructions %-change: -2.76% -2.67%
Instructions are helped.
total cycles in shared programs: 180116747 -> 179900405 (-0.12%)
cycles in affected programs: 25439823 -> 25223481 (-0.85%)
helped: 13817
HURT: 1499
helped stats (abs) min: 1 max: 1886 x̄: 26.40 x̃: 18
helped stats (rel) min: <.01% max: 38.84% x̄: 2.57% x̃: 1.97%
HURT stats (abs) min: 1 max: 3684 x̄: 98.99 x̃: 52
HURT stats (rel) min: <.01% max: 97.01% x̄: 6.37% x̃: 3.42%
95% mean confidence interval for cycles value: -15.68 -12.57
95% mean confidence interval for cycles %-change: -1.77% -1.63%
Cycles are helped.
LOST: 8
GAINED: 10
Sandy Bridge
total instructions in shared programs: 10878990 -> 10863659 (-0.14%)
instructions in affected programs: 1806702 -> 1791371 (-0.85%)
helped: 13023
HURT: 0
helped stats (abs) min: 1 max: 5 x̄: 1.18 x̃: 1
helped stats (rel) min: 0.07% max: 13.79% x̄: 1.65% x̃: 1.10%
95% mean confidence interval for instructions value: -1.18 -1.17
95% mean confidence interval for instructions %-change: -1.68% -1.62%
Instructions are helped.
total cycles in shared programs: 154082878 -> 153862810 (-0.14%)
cycles in affected programs: 20199374 -> 19979306 (-1.09%)
helped: 12048
HURT: 510
helped stats (abs) min: 1 max: 323 x̄: 20.57 x̃: 18
helped stats (rel) min: 0.03% max: 17.78% x̄: 2.05% x̃: 1.52%
HURT stats (abs) min: 1 max: 448 x̄: 54.39 x̃: 16
HURT stats (rel) min: 0.02% max: 37.98% x̄: 4.13% x̃: 1.17%
95% mean confidence interval for cycles value: -17.97 -17.08
95% mean confidence interval for cycles %-change: -1.84% -1.75%
Cycles are helped.
LOST: 1
GAINED: 0
Iron Lake
total instructions in shared programs: 8155075 -> 8142729 (-0.15%)
instructions in affected programs: 949495 -> 937149 (-1.30%)
helped: 5810
HURT: 0
helped stats (abs) min: 1 max: 8 x̄: 2.12 x̃: 2
helped stats (rel) min: 0.10% max: 16.67% x̄: 2.53% x̃: 1.85%
95% mean confidence interval for instructions value: -2.14 -2.11
95% mean confidence interval for instructions %-change: -2.59% -2.48%
Instructions are helped.
total cycles in shared programs: 188584610 -> 188549632 (-0.02%)
cycles in affected programs: 17274446 -> 17239468 (-0.20%)
helped: 3881
HURT: 90
helped stats (abs) min: 2 max: 168 x̄: 9.08 x̃: 6
helped stats (rel) min: <.01% max: 23.53% x̄: 0.83% x̃: 0.30%
HURT stats (abs) min: 2 max: 10 x̄: 2.80 x̃: 2
HURT stats (rel) min: <.01% max: 0.60% x̄: 0.10% x̃: 0.07%
95% mean confidence interval for cycles value: -9.35 -8.27
95% mean confidence interval for cycles %-change: -0.85% -0.77%
Cycles are helped.
GM45
total instructions in shared programs: 5019308 -> 5013119 (-0.12%)
instructions in affected programs: 489028 -> 482839 (-1.27%)
helped: 2912
HURT: 0
helped stats (abs) min: 1 max: 8 x̄: 2.13 x̃: 2
helped stats (rel) min: 0.10% max: 16.67% x̄: 2.46% x̃: 1.81%
95% mean confidence interval for instructions value: -2.14 -2.11
95% mean confidence interval for instructions %-change: -2.54% -2.39%
Instructions are helped.
total cycles in shared programs: 129002592 -> 128977804 (-0.02%)
cycles in affected programs: 12669152 -> 12644364 (-0.20%)
helped: 2759
HURT: 37
helped stats (abs) min: 2 max: 168 x̄: 9.03 x̃: 4
helped stats (rel) min: <.01% max: 21.43% x̄: 0.75% x̃: 0.31%
HURT stats (abs) min: 2 max: 10 x̄: 3.62 x̃: 4
HURT stats (rel) min: <.01% max: 0.41% x̄: 0.10% x̃: 0.04%
95% mean confidence interval for cycles value: -9.53 -8.20
95% mean confidence interval for cycles %-change: -0.79% -0.70%
Cycles are helped.
Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
2019-05-20 17:25:01 -07:00
|
|
|
#include "brw_eu.h"
|
2023-05-18 14:14:04 -05:00
|
|
|
#include "nir.h"
|
|
|
|
|
#include "nir_intrinsics.h"
|
2018-06-25 19:55:31 -07:00
|
|
|
#include "nir_search_helpers.h"
|
2024-08-23 10:46:13 -07:00
|
|
|
#include "dev/intel_debug.h"
|
2018-08-21 09:46:46 -07:00
|
|
|
#include "util/u_math.h"
|
2018-11-12 18:48:10 -06:00
|
|
|
#include "util/bitscan.h"
|
2025-06-09 16:05:19 -04:00
|
|
|
#include "compiler/glsl_types.h"
|
2014-08-15 10:32:07 -07:00
|
|
|
|
2025-08-21 16:20:49 -07:00
|
|
|
#include <optional>
|
|
|
|
|
|
2024-12-07 09:36:03 -08:00
|
|
|
/* Per-SSA-def descriptor-binding information (stored in
 * nir_to_brw_state::ssa_bind_infos, one entry per NIR SSA def).
 *
 * NOTE(review): field semantics are inferred from names only — no code in
 * this chunk reads them; confirm against the users of ssa_bind_infos.
 */
struct brw_bind_info {
   bool valid;        /* entry actually carries binding information */
   bool bindless;     /* presumably bindless vs. binding-table access — TODO confirm */
   unsigned block;
   unsigned set;      /* presumably descriptor set index — TODO confirm */
   unsigned binding;  /* presumably binding index within the set — TODO confirm */
};
|
|
|
|
|
|
2023-11-20 21:21:54 -08:00
|
|
|
/* Transient state threaded through the NIR -> BRW translation of one
 * shader.  Owns (via mem_ctx) the per-SSA-def lookup arrays allocated in
 * brw_from_nir_emit_impl() / brw_from_nir_emit_system_values().
 */
struct nir_to_brw_state {
   brw_shader &s;                     /* shader being generated */
   const nir_shader *nir;             /* NIR shader being translated */
   const intel_device_info *devinfo;  /* target device description */
   void *mem_ctx;                     /* ralloc context for the arrays below */

   /* Points to the end of the program. Annotated with the current NIR
    * instruction when applicable.
    */
   brw_builder bld;

   brw_reg *ssa_values;               /* result reg per SSA def, indexed by def index */
   struct brw_bind_info *ssa_bind_infos; /* binding info per SSA def, same indexing */
   brw_reg *system_values;            /* cached regs indexed by SYSTEM_VALUE_* */

   /* NOTE(review): not read in this chunk — presumably enables the
    * bld.annotate() debug annotations; confirm against later code.
    */
   bool annotate;
};
|
|
|
|
|
|
2025-01-15 13:27:05 -08:00
|
|
|
static brw_reg get_nir_src(nir_to_brw_state &ntb, const nir_src &src, int channel);
|
brw/nir: Treat some ALU results as convergent
v2: Fix for Xe2.
v3: Fix handling of 64-bit CMP results.
v4: Scalarize 16-bit comparison temporary destination when used as a
source (as was already done for 64-bit). Suggested by Ken.
shader-db:
Lunar Lake
total instructions in shared programs: 18096500 -> 18096549 (<.01%)
instructions in affected programs: 15919 -> 15968 (0.31%)
helped: 8 / HURT: 21
total cycles in shared programs: 921841300 -> 922073090 (0.03%)
cycles in affected programs: 115946336 -> 116178126 (0.20%)
helped: 386 / HURT: 135
Meteor Lake and DG2 (Meteor Lake shown)
total instructions in shared programs: 19836053 -> 19836016 (<.01%)
instructions in affected programs: 19547 -> 19510 (-0.19%)
helped: 21 / HURT: 18
total cycles in shared programs: 906713777 -> 906588541 (-0.01%)
cycles in affected programs: 96914584 -> 96789348 (-0.13%)
helped: 335 / HURT: 134
total fills in shared programs: 6712 -> 6710 (-0.03%)
fills in affected programs: 52 -> 50 (-3.85%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Tiger Lake
total instructions in shared programs: 19641284 -> 19641278 (<.01%)
instructions in affected programs: 12358 -> 12352 (-0.05%)
helped: 10 / HURT: 19
total cycles in shared programs: 865413131 -> 865460513 (<.01%)
cycles in affected programs: 74641489 -> 74688871 (0.06%)
helped: 388 / HURT: 100
total spills in shared programs: 3899 -> 3898 (-0.03%)
spills in affected programs: 17 -> 16 (-5.88%)
helped: 1 / HURT: 0
total fills in shared programs: 3249 -> 3245 (-0.12%)
fills in affected programs: 51 -> 47 (-7.84%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20495826 -> 20496111 (<.01%)
instructions in affected programs: 53220 -> 53505 (0.54%)
helped: 28 / HURT: 16
total cycles in shared programs: 875173550 -> 875243910 (<.01%)
cycles in affected programs: 51700652 -> 51771012 (0.14%)
helped: 400 / HURT: 39
total spills in shared programs: 4546 -> 4546 (0.00%)
spills in affected programs: 288 -> 288 (0.00%)
helped: 1 / HURT: 2
total fills in shared programs: 5224 -> 5280 (1.07%)
fills in affected programs: 795 -> 851 (7.04%)
helped: 0 / HURT: 4
LOST: 1
GAINED: 1
fossil-db:
Lunar Lake
Totals:
Instrs: 141811551 -> 141807640 (-0.00%); split: -0.00%, +0.00%
Cycle count: 22183128332 -> 22181285594 (-0.01%); split: -0.06%, +0.05%
Spill count: 69890 -> 69859 (-0.04%); split: -0.09%, +0.04%
Fill count: 128877 -> 128344 (-0.41%); split: -0.42%, +0.00%
Max live registers: 48053415 -> 48051613 (-0.00%); split: -0.00%, +0.00%
Totals from 6817 (1.24% of 551443) affected shaders:
Instrs: 4300169 -> 4296258 (-0.09%); split: -0.14%, +0.05%
Cycle count: 17263755610 -> 17261912872 (-0.01%); split: -0.08%, +0.07%
Spill count: 41822 -> 41791 (-0.07%); split: -0.15%, +0.07%
Fill count: 75523 -> 74990 (-0.71%); split: -0.71%, +0.01%
Max live registers: 733647 -> 731845 (-0.25%); split: -0.29%, +0.04%
Meteor Lake and all older Intel platforms had similar results. (Meteor Lake shown)
Totals:
Instrs: 152735305 -> 152735801 (+0.00%); split: -0.00%, +0.00%
Subgroup size: 7733536 -> 7733616 (+0.00%)
Cycle count: 17398725539 -> 17400873100 (+0.01%); split: -0.00%, +0.02%
Max live registers: 31887018 -> 31885742 (-0.00%); split: -0.00%, +0.00%
Max dispatch width: 5561696 -> 5561712 (+0.00%)
Totals from 5672 (0.90% of 633314) affected shaders:
Instrs: 2817606 -> 2818102 (+0.02%); split: -0.05%, +0.07%
Subgroup size: 81128 -> 81208 (+0.10%)
Cycle count: 10021470543 -> 10023618104 (+0.02%); split: -0.01%, +0.03%
Max live registers: 306520 -> 305244 (-0.42%); split: -0.43%, +0.01%
Max dispatch width: 74136 -> 74152 (+0.02%)
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-01-30 18:53:05 -08:00
|
|
|
static brw_reg get_nir_def(nir_to_brw_state &ntb, const nir_def &def, bool all_sources_uniform = false);
|
2023-11-20 15:21:11 -08:00
|
|
|
static nir_component_mask_t get_nir_write_mask(const nir_def &def);
|
|
|
|
|
|
2024-12-07 09:36:03 -08:00
|
|
|
static void brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb, const brw_builder &bld, nir_intrinsic_instr *instr);
|
2024-06-18 23:42:59 -07:00
|
|
|
static brw_reg emit_samplepos_setup(nir_to_brw_state &ntb);
|
|
|
|
|
static brw_reg emit_sampleid_setup(nir_to_brw_state &ntb);
|
|
|
|
|
static brw_reg emit_samplemaskin_setup(nir_to_brw_state &ntb);
|
|
|
|
|
static brw_reg emit_shading_rate_setup(nir_to_brw_state &ntb);
|
2023-11-20 12:13:47 -08:00
|
|
|
|
2024-12-07 09:36:03 -08:00
|
|
|
static void brw_from_nir_emit_impl(nir_to_brw_state &ntb, nir_function_impl *impl);
|
|
|
|
|
static void brw_from_nir_emit_cf_list(nir_to_brw_state &ntb, exec_list *list);
|
|
|
|
|
static void brw_from_nir_emit_if(nir_to_brw_state &ntb, nir_if *if_stmt);
|
|
|
|
|
static void brw_from_nir_emit_loop(nir_to_brw_state &ntb, nir_loop *loop);
|
|
|
|
|
static void brw_from_nir_emit_block(nir_to_brw_state &ntb, nir_block *block);
|
|
|
|
|
static void brw_from_nir_emit_instr(nir_to_brw_state &ntb, nir_instr *instr);
|
2023-11-20 14:42:06 -08:00
|
|
|
|
2024-12-07 09:36:03 -08:00
|
|
|
static void brw_from_nir_emit_memory_access(nir_to_brw_state &ntb,
|
2024-12-29 15:41:04 -08:00
|
|
|
const brw_builder &bld,
|
|
|
|
|
const brw_builder &xbld,
|
2023-11-20 15:05:03 -08:00
|
|
|
nir_intrinsic_instr *instr);
|
|
|
|
|
|
2024-12-29 15:41:04 -08:00
|
|
|
static void brw_combine_with_vec(const brw_builder &bld, const brw_reg &dst,
|
2024-02-12 08:43:34 -08:00
|
|
|
const brw_reg &src, unsigned n);
|
|
|
|
|
|
2025-08-28 14:28:53 +03:00
|
|
|
static bool
|
|
|
|
|
brw_texture_offset(const nir_tex_instr *tex, unsigned src,
|
|
|
|
|
uint32_t *offset_bits_out)
|
|
|
|
|
{
|
|
|
|
|
if (!nir_src_is_const(tex->src[src].src))
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
const unsigned num_components = nir_tex_instr_src_size(tex, src);
|
|
|
|
|
|
|
|
|
|
/* Combine all three offsets into a single unsigned dword:
|
|
|
|
|
*
|
|
|
|
|
* bits 11:8 - U Offset (X component)
|
|
|
|
|
* bits 7:4 - V Offset (Y component)
|
|
|
|
|
* bits 3:0 - R Offset (Z component)
|
|
|
|
|
*/
|
|
|
|
|
uint32_t offset_bits = 0;
|
|
|
|
|
for (unsigned i = 0; i < num_components; i++) {
|
|
|
|
|
int offset = nir_src_comp_as_int(tex->src[src].src, i);
|
|
|
|
|
|
|
|
|
|
/* offset out of bounds; caller will handle it. */
|
|
|
|
|
if (offset > 7 || offset < -8)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
const unsigned shift = 4 * (2 - i);
|
|
|
|
|
offset_bits |= (offset & 0xF) << shift;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
*offset_bits_out = offset_bits;
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
/* Materialize an 8-bit immediate into a byte-typed VGRF.
 *
 * The value is emitted as a word immediate and MOVed into a B-typed
 * register; the caller receives the resulting VGRF.
 */
static brw_reg
setup_imm_b(const brw_builder &bld, int8_t v)
{
   const brw_reg dst = bld.vgrf(BRW_TYPE_B);
   bld.MOV(dst, brw_imm_w(v));
   return dst;
}
|
|
|
|
|
|
2023-11-20 14:50:48 -08:00
|
|
|
static void
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_setup_outputs(nir_to_brw_state &ntb)
|
2014-08-15 10:32:07 -07:00
|
|
|
{
|
2024-12-07 10:25:45 -08:00
|
|
|
brw_shader &s = ntb.s;
|
2023-11-20 22:00:28 -08:00
|
|
|
|
2023-12-05 17:16:34 -08:00
|
|
|
if (s.stage == MESA_SHADER_TESS_CTRL ||
|
|
|
|
|
s.stage == MESA_SHADER_TASK ||
|
|
|
|
|
s.stage == MESA_SHADER_MESH ||
|
2024-06-08 02:11:31 -07:00
|
|
|
s.stage == MESA_SHADER_FRAGMENT ||
|
|
|
|
|
s.stage == MESA_SHADER_COMPUTE)
|
2015-11-14 17:40:43 -08:00
|
|
|
return;
|
|
|
|
|
|
2017-10-10 01:02:44 -07:00
|
|
|
unsigned vec4s[VARYING_SLOT_TESS_MAX] = { 0, };
|
|
|
|
|
|
|
|
|
|
/* Calculate the size of output registers in a separate pass, before
|
|
|
|
|
* allocating them. With ARB_enhanced_layouts, multiple output variables
|
|
|
|
|
* may occupy the same slot, but have different type sizes.
|
|
|
|
|
*/
|
2023-12-05 17:16:34 -08:00
|
|
|
nir_foreach_shader_out_variable(var, s.nir) {
|
2017-10-10 01:02:44 -07:00
|
|
|
const int loc = var->data.driver_location;
|
2023-07-14 12:24:51 -04:00
|
|
|
const unsigned var_vec4s = nir_variable_count_slots(var, var->type);
|
2017-10-10 01:02:44 -07:00
|
|
|
vec4s[loc] = MAX2(vec4s[loc], var_vec4s);
|
|
|
|
|
}
|
|
|
|
|
|
2018-05-18 13:39:13 +02:00
|
|
|
for (unsigned loc = 0; loc < ARRAY_SIZE(vec4s);) {
|
|
|
|
|
if (vec4s[loc] == 0) {
|
|
|
|
|
loc++;
|
|
|
|
|
continue;
|
2016-10-12 22:41:09 -07:00
|
|
|
}
|
2018-05-18 13:39:13 +02:00
|
|
|
|
|
|
|
|
unsigned reg_size = vec4s[loc];
|
|
|
|
|
|
|
|
|
|
/* Check if there are any ranges that start within this range and extend
|
|
|
|
|
* past it. If so, include them in this allocation.
|
|
|
|
|
*/
|
2020-09-09 19:01:49 +02:00
|
|
|
for (unsigned i = 1; i < reg_size; i++) {
|
|
|
|
|
assert(i + loc < ARRAY_SIZE(vec4s));
|
2018-05-18 13:39:13 +02:00
|
|
|
reg_size = MAX2(vec4s[i + loc] + i, reg_size);
|
2020-09-09 19:01:49 +02:00
|
|
|
}
|
2018-05-18 13:39:13 +02:00
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg reg = ntb.bld.vgrf(BRW_TYPE_F, 4 * reg_size);
|
2020-09-09 19:01:49 +02:00
|
|
|
for (unsigned i = 0; i < reg_size; i++) {
|
2023-12-05 17:16:34 -08:00
|
|
|
assert(loc + i < ARRAY_SIZE(s.outputs));
|
|
|
|
|
s.outputs[loc + i] = offset(reg, ntb.bld, 4 * i);
|
2020-09-09 19:01:49 +02:00
|
|
|
}
|
2018-05-18 13:39:13 +02:00
|
|
|
|
|
|
|
|
loc += reg_size;
|
2014-08-15 10:32:07 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
/* Build the 3-component workgroup ID from the compute thread payload.
 *
 * The components are copied from payload registers r0.1, r0.6 and r0.7
 * into a scalar UD VGRF of 3 components, which is returned.
 */
static brw_reg
emit_work_group_id_setup(nir_to_brw_state &ntb)
{
   brw_shader &s = ntb.s;
   const brw_builder &bld = ntb.bld.scalar_group();

   assert(mesa_shader_stage_is_compute(s.stage));

   brw_reg id = bld.vgrf(BRW_TYPE_UD, 3);
   id.is_scalar = true;

   /* Payload locations of the X, Y and Z components, in order. */
   const struct brw_reg payload[] = {
      retype(brw_vec1_grf(0, 1), BRW_TYPE_UD),
      retype(brw_vec1_grf(0, 6), BRW_TYPE_UD),
      retype(brw_vec1_grf(0, 7), BRW_TYPE_UD),
   };

   for (unsigned i = 0; i < ARRAY_SIZE(payload); i++)
      bld.MOV(offset(id, bld, i), payload[i]);

   return id;
}
|
|
|
|
|
|
2014-12-17 12:34:27 -08:00
|
|
|
/* Walk one NIR block and pre-populate ntb.system_values[] for every
 * system-value intrinsic it contains, emitting any setup code the value
 * needs.  Each value is set up at most once: entries still at BAD_FILE
 * are uninitialized.  Always returns true.
 */
static bool
emit_system_values_block(nir_to_brw_state &ntb, nir_block *block)
{
   brw_shader &s = ntb.s;
   brw_reg *reg;

   nir_foreach_instr(instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
      switch (intrin->intrinsic) {
      case nir_intrinsic_load_vertex_id:
      case nir_intrinsic_load_base_vertex:
         UNREACHABLE("should be lowered by nir_lower_system_values().");

      case nir_intrinsic_load_vertex_id_zero_base:
      case nir_intrinsic_load_is_indexed_draw:
      case nir_intrinsic_load_first_vertex:
      case nir_intrinsic_load_instance_id:
      case nir_intrinsic_load_base_instance:
         UNREACHABLE("should be lowered by brw_nir_lower_vs_inputs().");
         break;

      case nir_intrinsic_load_draw_id:
         /* For Task/Mesh, draw_id will be handled later in
          * nir_emit_mesh_task_intrinsic().
          */
         if (!mesa_shader_stage_is_mesh(s.stage))
            UNREACHABLE("should be lowered by brw_nir_lower_vs_inputs().");
         break;

      case nir_intrinsic_load_invocation_id:
         /* TCS reads the invocation ID elsewhere; only GS caches it here,
          * straight from the GS payload.
          */
         if (s.stage == MESA_SHADER_TESS_CTRL)
            break;
         assert(s.stage == MESA_SHADER_GEOMETRY);
         reg = &ntb.system_values[SYSTEM_VALUE_INVOCATION_ID];
         if (reg->file == BAD_FILE) {
            *reg = s.gs_payload().instance_id;
         }
         break;

      case nir_intrinsic_load_sample_pos:
      case nir_intrinsic_load_sample_pos_or_center:
         assert(s.stage == MESA_SHADER_FRAGMENT);
         reg = &ntb.system_values[SYSTEM_VALUE_SAMPLE_POS];
         if (reg->file == BAD_FILE)
            *reg = emit_samplepos_setup(ntb);
         break;

      case nir_intrinsic_load_sample_id:
         assert(s.stage == MESA_SHADER_FRAGMENT);
         reg = &ntb.system_values[SYSTEM_VALUE_SAMPLE_ID];
         if (reg->file == BAD_FILE)
            *reg = emit_sampleid_setup(ntb);
         break;

      case nir_intrinsic_load_sample_mask_in:
         assert(s.stage == MESA_SHADER_FRAGMENT);
         reg = &ntb.system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
         if (reg->file == BAD_FILE)
            *reg = emit_samplemaskin_setup(ntb);
         break;

      case nir_intrinsic_load_workgroup_id:
         if (mesa_shader_stage_is_mesh(s.stage))
            UNREACHABLE("should be lowered by nir_lower_compute_system_values().");
         assert(mesa_shader_stage_is_compute(s.stage));
         reg = &ntb.system_values[SYSTEM_VALUE_WORKGROUP_ID];
         if (reg->file == BAD_FILE)
            *reg = emit_work_group_id_setup(ntb);
         break;

      case nir_intrinsic_load_helper_invocation:
         assert(s.stage == MESA_SHADER_FRAGMENT);
         reg = &ntb.system_values[SYSTEM_VALUE_HELPER_INVOCATION];
         if (reg->file == BAD_FILE) {
            const brw_builder abld =
               ntb.bld.annotate("gl_HelperInvocation");

            /* On Gfx6+ (gl_HelperInvocation is only exposed on Gfx7+) the
             * pixel mask is in g1.7 of the thread payload.
             *
             * We move the per-channel pixel enable bit to the low bit of each
             * channel by shifting the byte containing the pixel mask by the
             * vector immediate 0x76543210UV.
             *
             * The region of <1,8,0> reads only 1 byte (the pixel masks for
             * subspans 0 and 1) in SIMD8 and an additional byte (the pixel
             * masks for 2 and 3) in SIMD16.
             */
            brw_reg shifted = abld.vgrf(BRW_TYPE_UW);

            /* One 16-wide group per 16 channels of dispatch width. */
            for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
               const brw_builder hbld = abld.group(MIN2(16, s.dispatch_width), i);
               /* According to the "PS Thread Payload for Normal
                * Dispatch" pages on the BSpec, the dispatch mask is
                * stored in R0.15/R1.15 on gfx20+ and in R1.7/R2.7 on
                * gfx6+.
                */
               const struct brw_reg reg = s.devinfo->ver >= 20 ?
                  xe2_vec1_grf(i, 15) : brw_vec1_grf(i + 1, 7);
               hbld.SHR(offset(shifted, hbld, i),
                        stride(retype(reg, BRW_TYPE_UB), 1, 8, 0),
                        brw_imm_v(0x76543210));
            }

            /* A set bit in the pixel mask means the channel is enabled, but
             * that is the opposite of gl_HelperInvocation so we need to invert
             * the mask.
             *
             * The negate source-modifier bit of logical instructions on Gfx8+
             * performs 1's complement negation, so we can use that instead of
             * a NOT instruction.
             */
            brw_reg inverted = negate(shifted);

            /* We then resolve the 0/1 result to 0/~0 boolean values by ANDing
             * with 1 and negating.
             */
            brw_reg anded = abld.vgrf(BRW_TYPE_UD);
            abld.AND(anded, inverted, brw_imm_uw(1));

            *reg = abld.MOV(negate(retype(anded, BRW_TYPE_D)));
         }
         break;

      case nir_intrinsic_load_frag_shading_rate:
         reg = &ntb.system_values[SYSTEM_VALUE_FRAG_SHADING_RATE];
         if (reg->file == BAD_FILE)
            *reg = emit_shading_rate_setup(ntb);
         break;

      default:
         break;
      }
   }

   return true;
}
|
|
|
|
|
|
2023-11-20 14:50:48 -08:00
|
|
|
static void
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_system_values(nir_to_brw_state &ntb)
|
2014-12-17 12:34:27 -08:00
|
|
|
{
|
2024-12-07 10:25:45 -08:00
|
|
|
brw_shader &s = ntb.s;
|
2023-11-20 14:50:48 -08:00
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
ntb.system_values = ralloc_array(ntb.mem_ctx, brw_reg, SYSTEM_VALUE_MAX);
|
2015-10-30 13:53:38 -07:00
|
|
|
for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) {
|
2024-06-18 23:42:59 -07:00
|
|
|
ntb.system_values[i] = brw_reg();
|
2015-10-30 13:53:38 -07:00
|
|
|
}
|
|
|
|
|
|
2023-12-05 17:16:34 -08:00
|
|
|
nir_function_impl *impl = nir_shader_get_entrypoint((nir_shader *)s.nir);
|
2018-10-29 12:08:29 -05:00
|
|
|
nir_foreach_block(block, impl)
|
2023-11-20 21:36:14 -08:00
|
|
|
emit_system_values_block(ntb, block);
|
2014-12-17 12:34:27 -08:00
|
|
|
}
|
|
|
|
|
|
2023-11-20 14:42:06 -08:00
|
|
|
static void
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_impl(nir_to_brw_state &ntb, nir_function_impl *impl)
|
2014-08-15 10:32:07 -07:00
|
|
|
{
|
2024-06-18 23:42:59 -07:00
|
|
|
ntb.ssa_values = rzalloc_array(ntb.mem_ctx, brw_reg, impl->ssa_alloc);
|
2024-12-07 09:36:03 -08:00
|
|
|
ntb.ssa_bind_infos = rzalloc_array(ntb.mem_ctx, struct brw_bind_info, impl->ssa_alloc);
|
2023-02-09 15:07:36 +02:00
|
|
|
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_cf_list(ntb, &impl->body);
|
2014-08-15 10:32:07 -07:00
|
|
|
}
|
|
|
|
|
|
2023-11-20 14:42:06 -08:00
|
|
|
static void
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_cf_list(nir_to_brw_state &ntb, exec_list *list)
|
2014-08-15 10:32:07 -07:00
|
|
|
{
|
2015-01-21 16:00:55 -08:00
|
|
|
exec_list_validate(list);
|
2014-08-15 10:32:07 -07:00
|
|
|
foreach_list_typed(nir_cf_node, node, node, list) {
|
|
|
|
|
switch (node->type) {
|
|
|
|
|
case nir_cf_node_if:
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_if(ntb, nir_cf_node_as_if(node));
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_cf_node_loop:
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_loop(ntb, nir_cf_node_as_loop(node));
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_cf_node_block:
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_block(ntb, nir_cf_node_as_block(node));
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
default:
|
2025-07-23 09:17:35 +02:00
|
|
|
UNREACHABLE("Invalid CFG node block");
|
2014-08-15 10:32:07 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-04-01 17:52:58 -07:00
|
|
|
/**
 * Emit the BRW control-flow instruction for a NIR jump.
 *
 * Returns the emitted instruction so callers (e.g. the predicated-jump
 * peephole in brw_from_nir_emit_if) can attach a predicate to it.
 * nir_jump_return is not expected here and falls through to the
 * unreachable default.
 */
static brw_inst *
brw_from_nir_emit_jump(nir_to_brw_state &ntb, nir_jump_instr *instr)
{
   switch (instr->type) {
   case nir_jump_break:
      return ntb.bld.emit(BRW_OPCODE_BREAK);

   case nir_jump_continue:
      return ntb.bld.emit(BRW_OPCODE_CONTINUE);

   case nir_jump_halt:
      return ntb.bld.emit(BRW_OPCODE_HALT);

   case nir_jump_return:
   default:
      UNREACHABLE("unknown jump");
   }
}
|
|
|
|
|
|
2023-11-20 14:42:06 -08:00
|
|
|
static void
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_if(nir_to_brw_state &ntb, nir_if *if_stmt)
|
2014-08-15 10:32:07 -07:00
|
|
|
{
|
2024-12-29 15:41:04 -08:00
|
|
|
const brw_builder &bld = ntb.bld;
|
2023-11-20 14:42:06 -08:00
|
|
|
|
2018-12-03 12:06:50 -08:00
|
|
|
bool invert;
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg cond_reg;
|
2018-12-03 12:06:50 -08:00
|
|
|
|
|
|
|
|
/* If the condition has the form !other_condition, use other_condition as
|
|
|
|
|
* the source, but invert the predicate on the if instruction.
|
|
|
|
|
*/
|
2019-04-17 17:10:18 -05:00
|
|
|
nir_alu_instr *cond = nir_src_as_alu_instr(if_stmt->condition);
|
2018-12-03 12:06:50 -08:00
|
|
|
if (cond != NULL && cond->op == nir_op_inot) {
|
|
|
|
|
invert = true;
|
2024-02-12 08:43:34 -08:00
|
|
|
cond_reg = get_nir_src(ntb, cond->src[0].src, cond->src[0].swizzle[0]);
|
2018-12-03 12:06:50 -08:00
|
|
|
} else {
|
|
|
|
|
invert = false;
|
2025-01-15 13:27:05 -08:00
|
|
|
cond_reg = get_nir_src(ntb, if_stmt->condition, 0);
|
2018-12-03 12:06:50 -08:00
|
|
|
}
|
|
|
|
|
|
2014-08-15 10:32:07 -07:00
|
|
|
/* first, put the condition into f0 */
|
2024-12-07 00:23:07 -08:00
|
|
|
brw_inst *inst = bld.MOV(bld.null_reg_d(),
|
2024-04-20 17:08:02 -07:00
|
|
|
retype(cond_reg, BRW_TYPE_D));
|
2014-08-15 10:32:07 -07:00
|
|
|
inst->conditional_mod = BRW_CONDITIONAL_NZ;
|
|
|
|
|
|
2025-04-01 17:52:58 -07:00
|
|
|
/* Peephole: replace IF-JUMP-ENDIF with predicated jump */
|
|
|
|
|
if (nir_cf_list_is_empty_block(&if_stmt->else_list) &&
|
|
|
|
|
exec_list_is_singular(&if_stmt->then_list)) {
|
|
|
|
|
struct exec_node *head = exec_list_get_head(&if_stmt->then_list);
|
|
|
|
|
nir_block *block =
|
|
|
|
|
nir_cf_node_as_block(exec_node_data(nir_cf_node, head, node));
|
|
|
|
|
|
|
|
|
|
if (exec_list_is_singular(&block->instr_list) &&
|
|
|
|
|
nir_block_ends_in_jump(block)) {
|
|
|
|
|
nir_jump_instr *jump = nir_instr_as_jump(nir_block_first_instr(block));
|
|
|
|
|
if (jump->type == nir_jump_break ||
|
|
|
|
|
jump->type == nir_jump_continue) {
|
|
|
|
|
|
|
|
|
|
brw_inst *inst = brw_from_nir_emit_jump(ntb, jump);
|
|
|
|
|
inst->predicate = BRW_PREDICATE_NORMAL;
|
|
|
|
|
inst->predicate_inverse = invert;
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2024-12-07 00:23:07 -08:00
|
|
|
brw_inst *iff = bld.IF(BRW_PREDICATE_NORMAL);
|
intel/brw: Replace predicated break optimization with a simple peephole
We can achieve most of what brw_fs_opt_predicated_break() does with
simple peepholes at NIR -> BRW conversion time.
For predicated break and continue, we can simply look at an IF ... ENDIF
sequence after emitting it. If there's a single instruction between the
two, and it's a BREAK or CONTINUE, then we can move the predicate from
the IF onto the jump, and delete the IF/ENDIF. Because we haven't built
the CFG at this stage, we only need to remove them from the linked list
of instructions, which is trivial to do.
For the predicated while optimization, we can rely on the fact that we
already did the predicated break optimization, and simply look for a
predicated BREAK just before the WHILE. If so, we move the predicate
onto the WHILE, invert it, and remove the BREAK.
There are a few cases where this approach does a worse job than the old
one: nir_convert_from_ssa may introduce load_reg and store_reg in blocks
containing break, and nir_trivialize_registers may decide it needs to
insert movs into those blocks. So, at NIR -> BRW time, we'll actually
emit some MOVs there, which might have been possible to copy propagate
out after later optimizations.
However, the fossil-db results show that it's still pretty competitive.
For instructions, 1017 shaders were helped (average -1.87 instructions),
while only 62 were hurt (average +2.19 instructions). In affected
shaders, it was -0.08% for instructions.
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30498>
2024-08-01 15:33:36 -07:00
|
|
|
iff->predicate_inverse = invert;
|
2014-08-15 10:32:07 -07:00
|
|
|
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_cf_list(ntb, &if_stmt->then_list);
|
2014-08-15 10:32:07 -07:00
|
|
|
|
2019-04-03 14:24:31 -07:00
|
|
|
if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
|
|
|
|
|
bld.emit(BRW_OPCODE_ELSE);
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_cf_list(ntb, &if_stmt->else_list);
|
2019-04-03 14:24:31 -07:00
|
|
|
}
|
2014-08-15 10:32:07 -07:00
|
|
|
|
2025-04-01 17:52:58 -07:00
|
|
|
bld.emit(BRW_OPCODE_ENDIF);
|
2014-08-15 10:32:07 -07:00
|
|
|
}
|
|
|
|
|
|
2023-11-20 14:42:06 -08:00
|
|
|
static void
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_loop(nir_to_brw_state &ntb, nir_loop *loop)
|
2014-08-15 10:32:07 -07:00
|
|
|
{
|
2024-12-29 15:41:04 -08:00
|
|
|
const brw_builder &bld = ntb.bld;
|
2023-11-20 14:42:06 -08:00
|
|
|
|
2021-12-02 10:31:56 +01:00
|
|
|
assert(!nir_loop_has_continue_construct(loop));
|
2025-02-13 19:20:38 -08:00
|
|
|
bld.DO();
|
2014-08-15 10:32:07 -07:00
|
|
|
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_cf_list(ntb, &loop->body);
|
2014-08-15 10:32:07 -07:00
|
|
|
|
2024-12-07 00:23:07 -08:00
|
|
|
brw_inst *peep_while = bld.emit(BRW_OPCODE_WHILE);
|
intel/brw: Replace predicated break optimization with a simple peephole
We can achieve most of what brw_fs_opt_predicated_break() does with
simple peepholes at NIR -> BRW conversion time.
For predicated break and continue, we can simply look at an IF ... ENDIF
sequence after emitting it. If there's a single instruction between the
two, and it's a BREAK or CONTINUE, then we can move the predicate from
the IF onto the jump, and delete the IF/ENDIF. Because we haven't built
the CFG at this stage, we only need to remove them from the linked list
of instructions, which is trivial to do.
For the predicated while optimization, we can rely on the fact that we
already did the predicated break optimization, and simply look for a
predicated BREAK just before the WHILE. If so, we move the predicate
onto the WHILE, invert it, and remove the BREAK.
There are a few cases where this approach does a worse job than the old
one: nir_convert_from_ssa may introduce load_reg and store_reg in blocks
containing break, and nir_trivialize_registers may decide it needs to
insert movs into those blocks. So, at NIR -> BRW time, we'll actually
emit some MOVs there, which might have been possible to copy propagate
out after later optimizations.
However, the fossil-db results show that it's still pretty competitive.
For instructions, 1017 shaders were helped (average -1.87 instructions),
while only 62 were hurt (average +2.19 instructions). In affected
shaders, it was -0.08% for instructions.
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30498>
2024-08-01 15:33:36 -07:00
|
|
|
|
|
|
|
|
/* Peephole: replace (+f0) break; while with (-f0) while */
|
2024-12-07 00:23:07 -08:00
|
|
|
brw_inst *peep_break = (brw_inst *) peep_while->prev;
|
intel/brw: Replace predicated break optimization with a simple peephole
We can achieve most of what brw_fs_opt_predicated_break() does with
simple peepholes at NIR -> BRW conversion time.
For predicated break and continue, we can simply look at an IF ... ENDIF
sequence after emitting it. If there's a single instruction between the
two, and it's a BREAK or CONTINUE, then we can move the predicate from
the IF onto the jump, and delete the IF/ENDIF. Because we haven't built
the CFG at this stage, we only need to remove them from the linked list
of instructions, which is trivial to do.
For the predicated while optimization, we can rely on the fact that we
already did the predicated break optimization, and simply look for a
predicated BREAK just before the WHILE. If so, we move the predicate
onto the WHILE, invert it, and remove the BREAK.
There are a few cases where this approach does a worse job than the old
one: nir_convert_from_ssa may introduce load_reg and store_reg in blocks
containing break, and nir_trivialize_registers may decide it needs to
insert movs into those blocks. So, at NIR -> BRW time, we'll actually
emit some MOVs there, which might have been possible to copy propagate
out after later optimizations.
However, the fossil-db results show that it's still pretty competitive.
For instructions, 1017 shaders were helped (average -1.87 instructions),
while only 62 were hurt (average +2.19 instructions). In affected
shaders, it was -0.08% for instructions.
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30498>
2024-08-01 15:33:36 -07:00
|
|
|
|
|
|
|
|
if (peep_break->opcode == BRW_OPCODE_BREAK &&
|
|
|
|
|
peep_break->predicate != BRW_PREDICATE_NONE) {
|
|
|
|
|
peep_while->predicate = peep_break->predicate;
|
|
|
|
|
peep_while->predicate_inverse = !peep_break->predicate_inverse;
|
2025-07-28 16:07:44 -04:00
|
|
|
peep_break->brw_exec_node::remove();
|
intel/brw: Replace predicated break optimization with a simple peephole
We can achieve most of what brw_fs_opt_predicated_break() does with
simple peepholes at NIR -> BRW conversion time.
For predicated break and continue, we can simply look at an IF ... ENDIF
sequence after emitting it. If there's a single instruction between the
two, and it's a BREAK or CONTINUE, then we can move the predicate from
the IF onto the jump, and delete the IF/ENDIF. Because we haven't built
the CFG at this stage, we only need to remove them from the linked list
of instructions, which is trivial to do.
For the predicated while optimization, we can rely on the fact that we
already did the predicated break optimization, and simply look for a
predicated BREAK just before the WHILE. If so, we move the predicate
onto the WHILE, invert it, and remove the BREAK.
There are a few cases where this approach does a worse job than the old
one: nir_convert_from_ssa may introduce load_reg and store_reg in blocks
containing break, and nir_trivialize_registers may decide it needs to
insert movs into those blocks. So, at NIR -> BRW time, we'll actually
emit some MOVs there, which might have been possible to copy propagate
out after later optimizations.
However, the fossil-db results show that it's still pretty competitive.
For instructions, 1017 shaders were helped (average -1.87 instructions),
while only 62 were hurt (average +2.19 instructions). In affected
shaders, it was -0.08% for instructions.
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30498>
2024-08-01 15:33:36 -07:00
|
|
|
}
|
2014-08-15 10:32:07 -07:00
|
|
|
}
|
|
|
|
|
|
2023-11-20 14:42:06 -08:00
|
|
|
static void
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_block(nir_to_brw_state &ntb, nir_block *block)
|
2014-08-15 10:32:07 -07:00
|
|
|
{
|
2024-12-29 15:41:04 -08:00
|
|
|
brw_builder bld = ntb.bld;
|
2023-11-20 22:11:23 -08:00
|
|
|
|
2016-04-26 18:34:19 -07:00
|
|
|
nir_foreach_instr(instr, block) {
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_instr(ntb, instr);
|
2014-08-15 10:32:07 -07:00
|
|
|
}
|
2023-11-20 22:11:23 -08:00
|
|
|
|
2023-12-05 15:27:29 -08:00
|
|
|
ntb.bld = bld;
|
2014-08-15 10:32:07 -07:00
|
|
|
}
|
|
|
|
|
|
2016-01-21 09:10:09 -08:00
|
|
|
/**
|
|
|
|
|
* Recognizes a parent instruction of nir_op_extract_* and changes the type to
|
|
|
|
|
* match instr.
|
|
|
|
|
*/
|
2023-11-20 13:25:36 -08:00
|
|
|
static bool
|
2024-12-29 15:41:04 -08:00
|
|
|
optimize_extract_to_float(nir_to_brw_state &ntb, const brw_builder &bld,
|
brw/nir: Treat some ALU results as convergent
v2: Fix for Xe2.
v3: Fix handling of 64-bit CMP results.
v4: Scalarize 16-bit comparison temporary destination when used as a
source (as was already done for 64-bit). Suggested by Ken.
shader-db:
Lunar Lake
total instructions in shared programs: 18096500 -> 18096549 (<.01%)
instructions in affected programs: 15919 -> 15968 (0.31%)
helped: 8 / HURT: 21
total cycles in shared programs: 921841300 -> 922073090 (0.03%)
cycles in affected programs: 115946336 -> 116178126 (0.20%)
helped: 386 / HURT: 135
Meteor Lake and DG2 (Meteor Lake shown)
total instructions in shared programs: 19836053 -> 19836016 (<.01%)
instructions in affected programs: 19547 -> 19510 (-0.19%)
helped: 21 / HURT: 18
total cycles in shared programs: 906713777 -> 906588541 (-0.01%)
cycles in affected programs: 96914584 -> 96789348 (-0.13%)
helped: 335 / HURT: 134
total fills in shared programs: 6712 -> 6710 (-0.03%)
fills in affected programs: 52 -> 50 (-3.85%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Tiger Lake
total instructions in shared programs: 19641284 -> 19641278 (<.01%)
instructions in affected programs: 12358 -> 12352 (-0.05%)
helped: 10 / HURT: 19
total cycles in shared programs: 865413131 -> 865460513 (<.01%)
cycles in affected programs: 74641489 -> 74688871 (0.06%)
helped: 388 / HURT: 100
total spills in shared programs: 3899 -> 3898 (-0.03%)
spills in affected programs: 17 -> 16 (-5.88%)
helped: 1 / HURT: 0
total fills in shared programs: 3249 -> 3245 (-0.12%)
fills in affected programs: 51 -> 47 (-7.84%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20495826 -> 20496111 (<.01%)
instructions in affected programs: 53220 -> 53505 (0.54%)
helped: 28 / HURT: 16
total cycles in shared programs: 875173550 -> 875243910 (<.01%)
cycles in affected programs: 51700652 -> 51771012 (0.14%)
helped: 400 / HURT: 39
total spills in shared programs: 4546 -> 4546 (0.00%)
spills in affected programs: 288 -> 288 (0.00%)
helped: 1 / HURT: 2
total fills in shared programs: 5224 -> 5280 (1.07%)
fills in affected programs: 795 -> 851 (7.04%)
helped: 0 / HURT: 4
LOST: 1
GAINED: 1
fossil-db:
Lunar Lake
Totals:
Instrs: 141811551 -> 141807640 (-0.00%); split: -0.00%, +0.00%
Cycle count: 22183128332 -> 22181285594 (-0.01%); split: -0.06%, +0.05%
Spill count: 69890 -> 69859 (-0.04%); split: -0.09%, +0.04%
Fill count: 128877 -> 128344 (-0.41%); split: -0.42%, +0.00%
Max live registers: 48053415 -> 48051613 (-0.00%); split: -0.00%, +0.00%
Totals from 6817 (1.24% of 551443) affected shaders:
Instrs: 4300169 -> 4296258 (-0.09%); split: -0.14%, +0.05%
Cycle count: 17263755610 -> 17261912872 (-0.01%); split: -0.08%, +0.07%
Spill count: 41822 -> 41791 (-0.07%); split: -0.15%, +0.07%
Fill count: 75523 -> 74990 (-0.71%); split: -0.71%, +0.01%
Max live registers: 733647 -> 731845 (-0.25%); split: -0.29%, +0.04%
Meteor Lake and all older Intel platforms had similar results. (Meteor Lake shown)
Totals:
Instrs: 152735305 -> 152735801 (+0.00%); split: -0.00%, +0.00%
Subgroup size: 7733536 -> 7733616 (+0.00%)
Cycle count: 17398725539 -> 17400873100 (+0.01%); split: -0.00%, +0.02%
Max live registers: 31887018 -> 31885742 (-0.00%); split: -0.00%, +0.00%
Max dispatch width: 5561696 -> 5561712 (+0.00%)
Totals from 5672 (0.90% of 633314) affected shaders:
Instrs: 2817606 -> 2818102 (+0.02%); split: -0.05%, +0.07%
Subgroup size: 81128 -> 81208 (+0.10%)
Cycle count: 10021470543 -> 10023618104 (+0.02%); split: -0.01%, +0.03%
Max live registers: 306520 -> 305244 (-0.42%); split: -0.43%, +0.01%
Max dispatch width: 74136 -> 74152 (+0.02%)
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-01-30 18:53:05 -08:00
|
|
|
nir_alu_instr *instr, const brw_reg &result)
|
2016-01-21 09:10:09 -08:00
|
|
|
{
|
2023-12-05 15:27:29 -08:00
|
|
|
const intel_device_info *devinfo = ntb.devinfo;
|
2023-11-20 13:25:36 -08:00
|
|
|
|
2024-02-28 16:49:05 -08:00
|
|
|
/* No fast path for f16 (yet) or f64. */
|
|
|
|
|
assert(instr->op == nir_op_i2f32 || instr->op == nir_op_u2f32);
|
|
|
|
|
|
2023-07-24 17:33:58 -05:00
|
|
|
if (!instr->src[0].src.ssa->parent_instr)
|
2016-01-21 09:10:09 -08:00
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
nir_alu_instr *src0 =
|
2025-07-31 09:49:36 -04:00
|
|
|
nir_def_as_alu(instr->src[0].src.ssa);
|
2016-01-21 09:10:09 -08:00
|
|
|
|
2024-02-28 16:49:05 -08:00
|
|
|
unsigned bytes;
|
|
|
|
|
bool is_signed;
|
|
|
|
|
|
|
|
|
|
switch (src0->op) {
|
|
|
|
|
case nir_op_extract_u8:
|
|
|
|
|
case nir_op_extract_u16:
|
|
|
|
|
bytes = src0->op == nir_op_extract_u8 ? 1 : 2;
|
|
|
|
|
|
|
|
|
|
/* i2f(extract_u8(a, b)) and u2f(extract_u8(a, b)) produce the same
|
|
|
|
|
* result. Ditto for extract_u16.
|
|
|
|
|
*/
|
|
|
|
|
is_signed = false;
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_op_extract_i8:
|
|
|
|
|
case nir_op_extract_i16:
|
|
|
|
|
bytes = src0->op == nir_op_extract_i8 ? 1 : 2;
|
|
|
|
|
|
|
|
|
|
/* The fast path can't handle u2f(extract_i8(a, b)) because the implicit
|
|
|
|
|
* sign extension of the extract_i8 is lost. For example,
|
|
|
|
|
* u2f(extract_i8(0x0000ff00, 1)) should produce 4294967295.0, but a
|
|
|
|
|
* fast path could either give 255.0 (by implementing the fast path as
|
|
|
|
|
* u2f(extract_u8(x))) or -1.0 (by implementing the fast path as
|
|
|
|
|
* i2f(extract_i8(x))). At one point in time, we incorrectly implemented
|
|
|
|
|
* the former.
|
|
|
|
|
*/
|
|
|
|
|
if (instr->op != nir_op_i2f32)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
is_signed = true;
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
default:
|
2016-01-21 09:10:09 -08:00
|
|
|
return false;
|
2024-02-28 16:49:05 -08:00
|
|
|
}
|
2016-01-21 09:10:09 -08:00
|
|
|
|
2018-10-20 09:55:28 -05:00
|
|
|
unsigned element = nir_src_as_uint(src0->src[1].src);
|
2016-01-21 09:10:09 -08:00
|
|
|
|
2016-05-18 18:43:54 -07:00
|
|
|
/* Element type to extract.*/
|
2024-02-28 16:49:05 -08:00
|
|
|
const brw_reg_type type = brw_int_type(bytes, is_signed);
|
2016-01-21 09:10:09 -08:00
|
|
|
|
2024-02-12 08:43:34 -08:00
|
|
|
brw_reg op0 = get_nir_src(ntb, src0->src[0].src, -1);
|
2017-01-20 19:03:21 -08:00
|
|
|
op0.type = brw_type_for_nir_type(devinfo,
|
2015-07-29 09:11:03 -07:00
|
|
|
(nir_alu_type)(nir_op_infos[src0->op].input_types[0] |
|
|
|
|
|
nir_src_bit_size(src0->src[0].src)));
|
brw/nir: Don't generate scalar byte to float conversions on DG2+ in optimize_extract_to_float
The lowering code does not generate efficient code. It is better to
just not emit the bad thing in the first place. The shaders that I
examined had blocks of NIR like:
con 32 %527 = extract_u8 %456.o, %5 (0x0)
con 32 %528 = extract_u8 %456.o, %35 (0x1)
con 32 %529 = extract_u8 %456.o, %14 (0x2)
con 32 %530 = extract_u8 %456.o, %11 (0x3)
con 32 %531 = u2f32 %527
con 32 %532 = u2f32 %528
con 32 %533 = u2f32 %529
con 32 %534 = u2f32 %530
In some cases the u2f results are multiplied with 1/255. There may be
a slightly more efficient way to do this by doing something like
mov(8) g40<1>UW g12.1<32,8,4>UB
mov(8) g41<1>UW g12.2<32,8,4>UB
mov(8) g42<1>UW g12.3<32,8,4>UB
mov(8) g60<1>F g12<32,8,4>UB
mov(8) g61<1>F g40<1,1,0>UW
mov(8) g62<1>F g41<1,1,0>UW
mov(8) g63<1>F g42<1,1,0>UW
In SIMD16 and SIMD32 that would save temporary register space. It could
save a register in SIMD8 by using g40.8 instead of g42. Making that
happen might be tricky. Maybe we should just add a special NIR opcode
that converts a packed uint32 to a vec4?
v2: Add a bunch of documentation explaining what's going on. Suggested
by Ken.
shader-db:
Lunar Lake, Meteor Lake, and DG2 had similar results. (Lunar Lake shown)
total instructions in shared programs: 18228689 -> 18228720 (<.01%)
instructions in affected programs: 43091 -> 43122 (0.07%)
helped: 0 / HURT: 30
total cycles in shared programs: 932542994 -> 932544290 (<.01%)
cycles in affected programs: 8150758 -> 8152054 (0.02%)
helped: 15 / HURT: 17
fossil-db:
Lunar Lake, Meteor Lake, and DG2 had similar results. (Lunar Lake shown)
Totals:
Instrs: 142890605 -> 142890392 (-0.00%); split: -0.00%, +0.00%
Cycle count: 21655049536 -> 21654693720 (-0.00%); split: -0.00%, +0.00%
Totals from 181 (0.03% of 553251) affected shaders:
Instrs: 188022 -> 187809 (-0.11%); split: -0.12%, +0.01%
Cycle count: 85291658 -> 84935842 (-0.42%); split: -0.47%, +0.05%
Tiger Lake, Ice Lake, and Skylake had similar results. (Tiger Lake shown)
Totals:
Instrs: 154438050 -> 154436980 (-0.00%)
Cycle count: 15334650326 -> 15334644375 (-0.00%); split: -0.00%, +0.00%
Spill count: 56754 -> 56706 (-0.08%)
Fill count: 95919 -> 95808 (-0.12%)
Scratch Memory Size: 2306048 -> 2304000 (-0.09%)
Max live registers: 32469924 -> 32469899 (-0.00%)
Totals from 112 (0.02% of 642922) affected shaders:
Instrs: 156186 -> 155116 (-0.69%)
Cycle count: 11111478 -> 11105527 (-0.05%); split: -0.62%, +0.56%
Spill count: 1766 -> 1718 (-2.72%)
Fill count: 2815 -> 2704 (-3.94%)
Scratch Memory Size: 78848 -> 76800 (-2.60%)
Max live registers: 11526 -> 11501 (-0.22%)
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-07-29 12:59:28 -07:00
|
|
|
|
|
|
|
|
/* It is not documented in the Bspec, but DG2 and newer platforms cannot do
|
|
|
|
|
* direct byte-to-float conversions from scalars. MR !30140 has more
|
|
|
|
|
* details. If the optimization is applied in cases that would require
|
|
|
|
|
* lower_regioning to do some lowering, the code generated will be much,
|
|
|
|
|
* much worse.
|
|
|
|
|
*/
|
|
|
|
|
if (devinfo->verx10 >= 125 && bytes == 1) {
|
|
|
|
|
/* If the source truly scalar, for example from the UNIFORM file, skip
|
|
|
|
|
* the optimize_extract_to_float optimization.
|
|
|
|
|
*
|
|
|
|
|
* Note: is_scalar values won't have zero stride until after the call to
|
|
|
|
|
* offset() below that applies the swizzle.
|
|
|
|
|
*/
|
|
|
|
|
if (is_uniform(op0))
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
/* If the dispatch width matches the scalar allocation width, then
|
|
|
|
|
* is_scalar can be demoted to non-is_scalar. This prevents offset() and
|
|
|
|
|
* component() (both called below) from setting the stride to zero, and
|
|
|
|
|
* that avoids the awful code generated by lower_regioning.
|
|
|
|
|
*/
|
|
|
|
|
if (op0.is_scalar) {
|
|
|
|
|
const unsigned allocation_width = 8 * reg_unit(ntb.devinfo);
|
|
|
|
|
if (ntb.bld.dispatch_width() != allocation_width)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
assert(bld.dispatch_width() == allocation_width);
|
|
|
|
|
op0.is_scalar = false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-20 21:21:54 -08:00
|
|
|
op0 = offset(op0, bld, src0->src[0].swizzle[0]);
|
2016-01-21 09:10:09 -08:00
|
|
|
|
brw/nir: Don't generate scalar byte to float conversions on DG2+ in optimize_extract_to_float
The lowering code does not generate efficient code. It is better to
just not emit the bad thing in the first place. The shaders that I
examined had blocks of NIR like:
con 32 %527 = extract_u8 %456.o, %5 (0x0)
con 32 %528 = extract_u8 %456.o, %35 (0x1)
con 32 %529 = extract_u8 %456.o, %14 (0x2)
con 32 %530 = extract_u8 %456.o, %11 (0x3)
con 32 %531 = u2f32 %527
con 32 %532 = u2f32 %528
con 32 %533 = u2f32 %529
con 32 %534 = u2f32 %530
In some cases the u2f results are multiplied with 1/255. There may be
a slightly more efficient way to do this by doing something like
mov(8) g40<1>UW g12.1<32,8,4>UB
mov(8) g41<1>UW g12.2<32,8,4>UB
mov(8) g42<1>UW g12.3<32,8,4>UB
mov(8) g60<1>F g12<32,8,4>UB
mov(8) g61<1>F g40<1,1,0>UW
mov(8) g62<1>F g41<1,1,0>UW
mov(8) g63<1>F g42<1,1,0>UW
In SIMD16 and SIMD32 that would save temporary register space. It could
save a register in SIMD8 by using g40.8 instead of g42. Making that
happen might be tricky. Maybe we should just add a special NIR opcode
that converts a packed uint32 to a vec4?
v2: Add a bunch of documentation explaining what's going on. Suggested
by Ken.
shader-db:
Lunar Lake, Meteor Lake, and DG2 had similar results. (Lunar Lake shown)
total instructions in shared programs: 18228689 -> 18228720 (<.01%)
instructions in affected programs: 43091 -> 43122 (0.07%)
helped: 0 / HURT: 30
total cycles in shared programs: 932542994 -> 932544290 (<.01%)
cycles in affected programs: 8150758 -> 8152054 (0.02%)
helped: 15 / HURT: 17
fossil-db:
Lunar Lake, Meteor Lake, and DG2 had similar results. (Lunar Lake shown)
Totals:
Instrs: 142890605 -> 142890392 (-0.00%); split: -0.00%, +0.00%
Cycle count: 21655049536 -> 21654693720 (-0.00%); split: -0.00%, +0.00%
Totals from 181 (0.03% of 553251) affected shaders:
Instrs: 188022 -> 187809 (-0.11%); split: -0.12%, +0.01%
Cycle count: 85291658 -> 84935842 (-0.42%); split: -0.47%, +0.05%
Tiger Lake, Ice Lake, and Skylake had similar results. (Tiger Lake shown)
Totals:
Instrs: 154438050 -> 154436980 (-0.00%)
Cycle count: 15334650326 -> 15334644375 (-0.00%); split: -0.00%, +0.00%
Spill count: 56754 -> 56706 (-0.08%)
Fill count: 95919 -> 95808 (-0.12%)
Scratch Memory Size: 2306048 -> 2304000 (-0.09%)
Max live registers: 32469924 -> 32469899 (-0.00%)
Totals from 112 (0.02% of 642922) affected shaders:
Instrs: 156186 -> 155116 (-0.69%)
Cycle count: 11111478 -> 11105527 (-0.05%); split: -0.62%, +0.56%
Spill count: 1766 -> 1718 (-2.72%)
Fill count: 2815 -> 2704 (-3.94%)
Scratch Memory Size: 78848 -> 76800 (-2.60%)
Max live registers: 11526 -> 11501 (-0.22%)
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-07-29 12:59:28 -07:00
|
|
|
/* If the dispatch width matches the scalar allocation width, offset() will
|
|
|
|
|
* not modify the stride, but having source stride <0;1,0> is advantageous.
|
|
|
|
|
*/
|
|
|
|
|
if (op0.is_scalar)
|
|
|
|
|
op0 = component(op0, 0);
|
|
|
|
|
|
intel/brw: Avoid optimize_extract_to_float when it will just be undone later
v2: Add bspec quotation. Suggested by Caio. With better understand of
the restriction, only apply on DG2 and newer platforms.
shader-db:
DG2 and Meteor Lake had similar results. (DG2 shown)
total instructions in shared programs: 19659363 -> 19659360 (<.01%)
instructions in affected programs: 2484 -> 2481 (-0.12%)
helped: 6 / HURT: 1
total cycles in shared programs: 823445738 -> 823432524 (<.01%)
cycles in affected programs: 2619836 -> 2606622 (-0.50%)
helped: 48 / HURT: 63
fossil-db:
DG2 and Meteor Lake had similar results. (DG2 shown)
Totals:
Instrs: 154015863 -> 153987806 (-0.02%); split: -0.02%, +0.00%
Cycle count: 17552172994 -> 17562047866 (+0.06%); split: -0.13%, +0.19%
Spill count: 142124 -> 141544 (-0.41%); split: -0.54%, +0.13%
Fill count: 266803 -> 266046 (-0.28%); split: -0.38%, +0.09%
Scratch Memory Size: 10266624 -> 10271744 (+0.05%); split: -0.02%, +0.07%
Max live registers: 32592428 -> 32592393 (-0.00%); split: -0.00%, +0.00%
Max dispatch width: 5535944 -> 5535912 (-0.00%); split: +0.00%, -0.00%
Totals from 41887 (6.63% of 631367) affected shaders:
Instrs: 32971032 -> 32942975 (-0.09%); split: -0.10%, +0.01%
Cycle count: 3892086217 -> 3901961089 (+0.25%); split: -0.60%, +0.85%
Spill count: 105669 -> 105089 (-0.55%); split: -0.72%, +0.18%
Fill count: 206459 -> 205702 (-0.37%); split: -0.49%, +0.12%
Scratch Memory Size: 7766016 -> 7771136 (+0.07%); split: -0.03%, +0.09%
Max live registers: 3230515 -> 3230480 (-0.00%); split: -0.00%, +0.00%
Max dispatch width: 337232 -> 337200 (-0.01%); split: +0.00%, -0.01%
No shader-db or fossil-db changes on any earlier Intel platforms.
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27891>
2024-02-28 17:34:19 -08:00
|
|
|
/* Bspec "Register Region Restrictions" for Xe says:
|
|
|
|
|
*
|
|
|
|
|
* "In case of all float point data types used in destination
|
|
|
|
|
*
|
|
|
|
|
* 1. Register Regioning patterns where register data bit location of
|
|
|
|
|
* the LSB of the channels are changed between source and destination
|
|
|
|
|
* are not supported on Src0 and Src1 except for broadcast of a
|
|
|
|
|
* scalar."
|
|
|
|
|
*
|
2024-12-06 11:37:57 -08:00
|
|
|
* This restriction is enfored in brw_lower_regioning. There is no
|
|
|
|
|
* reason to generate an optimized instruction that brw_lower_regioning
|
intel/brw: Avoid optimize_extract_to_float when it will just be undone later
v2: Add bspec quotation. Suggested by Caio. With better understand of
the restriction, only apply on DG2 and newer platforms.
shader-db:
DG2 and Meteor Lake had similar results. (DG2 shown)
total instructions in shared programs: 19659363 -> 19659360 (<.01%)
instructions in affected programs: 2484 -> 2481 (-0.12%)
helped: 6 / HURT: 1
total cycles in shared programs: 823445738 -> 823432524 (<.01%)
cycles in affected programs: 2619836 -> 2606622 (-0.50%)
helped: 48 / HURT: 63
fossil-db:
DG2 and Meteor Lake had similar results. (DG2 shown)
Totals:
Instrs: 154015863 -> 153987806 (-0.02%); split: -0.02%, +0.00%
Cycle count: 17552172994 -> 17562047866 (+0.06%); split: -0.13%, +0.19%
Spill count: 142124 -> 141544 (-0.41%); split: -0.54%, +0.13%
Fill count: 266803 -> 266046 (-0.28%); split: -0.38%, +0.09%
Scratch Memory Size: 10266624 -> 10271744 (+0.05%); split: -0.02%, +0.07%
Max live registers: 32592428 -> 32592393 (-0.00%); split: -0.00%, +0.00%
Max dispatch width: 5535944 -> 5535912 (-0.00%); split: +0.00%, -0.00%
Totals from 41887 (6.63% of 631367) affected shaders:
Instrs: 32971032 -> 32942975 (-0.09%); split: -0.10%, +0.01%
Cycle count: 3892086217 -> 3901961089 (+0.25%); split: -0.60%, +0.85%
Spill count: 105669 -> 105089 (-0.55%); split: -0.72%, +0.18%
Fill count: 206459 -> 205702 (-0.37%); split: -0.49%, +0.12%
Scratch Memory Size: 7766016 -> 7771136 (+0.07%); split: -0.03%, +0.09%
Max live registers: 3230515 -> 3230480 (-0.00%); split: -0.00%, +0.00%
Max dispatch width: 337232 -> 337200 (-0.01%); split: +0.00%, -0.01%
No shader-db or fossil-db changes on any earlier Intel platforms.
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27891>
2024-02-28 17:34:19 -08:00
|
|
|
* will have to break up later.
|
|
|
|
|
*/
|
|
|
|
|
if (devinfo->verx10 >= 125 && element != 0 && !is_uniform(op0))
|
|
|
|
|
return false;
|
|
|
|
|
|
2023-11-20 21:21:54 -08:00
|
|
|
bld.MOV(result, subscript(op0, type, element));
|
2016-01-21 09:10:09 -08:00
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-20 13:25:36 -08:00
|
|
|
/* Try to turn bcsel(load_front_face, +/-1.0, -/+1.0) into a two-instruction
 * bit-twiddling sequence instead of a compare + select.
 *
 * The hardware stores the front/back-facing flag as a single payload bit
 * (location varies by hardware generation, see the branches below).  Since
 * the only two possible results are +1.0f (0x3f800000) and -1.0f
 * (0xbf800000) — which differ only in the sign bit — we can OR the facing
 * bit's word with 0x3f80 into the high word of a temporary and then AND
 * with 0xbf800000 to produce the float directly.
 *
 * Returns true (and writes the final value into `result`) if the pattern
 * matched and the optimized sequence was emitted; returns false to fall
 * back to normal bcsel handling.
 */
static bool
optimize_frontfacing_ternary(nir_to_brw_state &ntb,
                             nir_alu_instr *instr,
                             const brw_reg &result)
{
   const intel_device_info *devinfo = ntb.devinfo;
   brw_shader &s = ntb.s;

   /* Only handle bcsel whose condition is exactly gl_FrontFacing. */
   nir_intrinsic_instr *src0 = nir_src_as_intrinsic(instr->src[0].src);
   if (src0 == NULL || src0->intrinsic != nir_intrinsic_load_front_face)
      return false;

   /* Both select arms must be compile-time constants... */
   if (!nir_src_is_const(instr->src[1].src) ||
       !nir_src_is_const(instr->src[2].src))
      return false;

   /* ...and must be +/-1.0, so the only difference is the sign bit. */
   const float value1 = nir_src_as_float(instr->src[1].src);
   const float value2 = nir_src_as_float(instr->src[2].src);
   if (fabsf(value1) != 1.0f || fabsf(value2) != 1.0f)
      return false;

   /* nir_opt_algebraic should have gotten rid of bcsel(b, a, a) */
   assert(value1 == -value2);

   brw_reg tmp = ntb.bld.vgrf(BRW_TYPE_D);

   if (devinfo->ver >= 20) {
      /* Gfx20+ has separate back-facing bits for each pair of
       * subspans in order to support multiple polygons, so we need to
       * use a <1;8,0> region in order to select the correct word for
       * each channel.  Unfortunately they're no longer aligned to the
       * sign bit of a 16-bit word, so a left shift is necessary.
       */
      brw_reg ff = ntb.bld.vgrf(BRW_TYPE_UW);

      /* One 16-wide group at a time; each group reads its facing bits
       * from the per-group payload word (xe2_vec1_grf(i, 9)).
       */
      for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
         const brw_builder hbld = ntb.bld.group(16, i);
         const struct brw_reg gi_uw = retype(xe2_vec1_grf(i, 9),
                                             BRW_TYPE_UW);
         /* Shift left by 4 to move the facing bit into the sign bit of
          * the 16-bit word.
          */
         hbld.SHL(offset(ff, hbld, i), stride(gi_uw, 1, 8, 0), brw_imm_ud(4));
      }

      /* Flip the sense of the bit for the (b ? -1.0 : 1.0) variant. */
      if (value1 == -1.0f)
         ff.negate = true;

      ntb.bld.OR(subscript(tmp, BRW_TYPE_UW, 1), ff,
                 brw_imm_uw(0x3f80));
   } else if (devinfo->ver >= 12 && s.max_polygons == 2) {
      /* According to the BSpec "PS Thread Payload for Normal
       * Dispatch", the front/back facing interpolation bit is stored
       * as bit 15 of either the R1.1 or R1.6 poly info field, for the
       * first and second polygons respectively in multipolygon PS
       * dispatch mode.
       */
      assert(s.dispatch_width == 16);

      for (unsigned i = 0; i < s.max_polygons; i++) {
         /* 8-wide group per polygon, each reading its own poly info word. */
         const brw_builder hbld = ntb.bld.group(8, i);
         struct brw_reg g1 = retype(brw_vec1_grf(1, 1 + 5 * i),
                                    BRW_TYPE_UW);

         if (value1 == -1.0f)
            g1.negate = true;

         hbld.OR(subscript(offset(tmp, hbld, i), BRW_TYPE_UW, 1),
                 g1, brw_imm_uw(0x3f80));
      }
   } else if (devinfo->ver >= 12) {
      /* Bit 15 of g1.1 is 0 if the polygon is front facing. */
      brw_reg g1 = brw_reg(retype(brw_vec1_grf(1, 1), BRW_TYPE_W));

      /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
       *
       *    or(8)  tmp.1<2>W  g1.1<0,1,0>W  0x00003f80W
       *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
       *
       * and negate g1.1<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
       */
      if (value1 == -1.0f)
         g1.negate = true;

      ntb.bld.OR(subscript(tmp, BRW_TYPE_W, 1),
                 g1, brw_imm_uw(0x3f80));
   } else {
      /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
      brw_reg g0 = brw_reg(retype(brw_vec1_grf(0, 0), BRW_TYPE_W));

      /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
       *
       *    or(8)  tmp.1<2>W  g0.0<0,1,0>W  0x00003f80W
       *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
       *
       * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
       *
       * This negation looks like it's safe in practice, because bits 0:4 will
       * surely be TRIANGLES
       */

      if (value1 == -1.0f) {
         g0.negate = true;
      }

      ntb.bld.OR(subscript(tmp, BRW_TYPE_W, 1),
                 g0, brw_imm_uw(0x3f80));
   }

   /* Mask tmp down to either 0x3f800000 (+1.0) or 0xbf800000 (-1.0). */
   ntb.bld.AND(retype(result, BRW_TYPE_D), tmp, brw_imm_d(0xbf800000));

   return true;
}
|
|
|
|
|
|
2017-07-01 08:14:09 +02:00
|
|
|
/* Map an explicit-rounding-mode NIR conversion opcode to the hardware
 * rounding mode it requires.  Aborts on any opcode that does not carry a
 * rounding mode.
 */
static brw_rnd_mode
brw_rnd_mode_from_nir_op(const nir_op op)
{
   if (op == nir_op_f2f16_rtz)
      return BRW_RND_MODE_RTZ;

   if (op == nir_op_f2f16_rtne)
      return BRW_RND_MODE_RTNE;

   UNREACHABLE("Operation doesn't support rounding mode");
}
|
|
|
|
|
|
2019-02-12 16:13:59 +01:00
|
|
|
/* Derive the hardware rounding mode from a shader's float-controls
 * execution mode bitmask.  RTNE takes precedence if both rounding modes
 * are somehow requested; returns UNSPECIFIED when neither is.
 */
static brw_rnd_mode
brw_rnd_mode_from_execution_mode(unsigned execution_mode)
{
   brw_rnd_mode mode = BRW_RND_MODE_UNSPECIFIED;

   if (nir_has_any_rounding_mode_rtz(execution_mode))
      mode = BRW_RND_MODE_RTZ;

   /* RTNE wins over RTZ, matching the original early-return priority. */
   if (nir_has_any_rounding_mode_rtne(execution_mode))
      mode = BRW_RND_MODE_RTNE;

   return mode;
}
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
static brw_reg
|
2023-12-05 15:27:29 -08:00
|
|
|
prepare_alu_destination_and_sources(nir_to_brw_state &ntb,
|
2024-12-29 15:41:04 -08:00
|
|
|
const brw_builder &bld,
|
2023-11-20 13:25:36 -08:00
|
|
|
nir_alu_instr *instr,
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg *op,
|
2023-11-20 13:25:36 -08:00
|
|
|
bool need_dest)
|
2014-08-15 10:32:07 -07:00
|
|
|
{
|
2023-12-05 15:27:29 -08:00
|
|
|
const intel_device_info *devinfo = ntb.devinfo;
|
2023-11-20 13:25:36 -08:00
|
|
|
|
brw/nir: Treat some ALU results as convergent
v2: Fix for Xe2.
v3: Fix handling of 64-bit CMP results.
v4: Scalarize 16-bit comparison temporary destination when used as a
source (as was already done for 64-bit). Suggested by Ken.
shader-db:
Lunar Lake
total instructions in shared programs: 18096500 -> 18096549 (<.01%)
instructions in affected programs: 15919 -> 15968 (0.31%)
helped: 8 / HURT: 21
total cycles in shared programs: 921841300 -> 922073090 (0.03%)
cycles in affected programs: 115946336 -> 116178126 (0.20%)
helped: 386 / HURT: 135
Meteor Lake and DG2 (Meteor Lake shown)
total instructions in shared programs: 19836053 -> 19836016 (<.01%)
instructions in affected programs: 19547 -> 19510 (-0.19%)
helped: 21 / HURT: 18
total cycles in shared programs: 906713777 -> 906588541 (-0.01%)
cycles in affected programs: 96914584 -> 96789348 (-0.13%)
helped: 335 / HURT: 134
total fills in shared programs: 6712 -> 6710 (-0.03%)
fills in affected programs: 52 -> 50 (-3.85%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Tiger Lake
total instructions in shared programs: 19641284 -> 19641278 (<.01%)
instructions in affected programs: 12358 -> 12352 (-0.05%)
helped: 10 / HURT: 19
total cycles in shared programs: 865413131 -> 865460513 (<.01%)
cycles in affected programs: 74641489 -> 74688871 (0.06%)
helped: 388 / HURT: 100
total spills in shared programs: 3899 -> 3898 (-0.03%)
spills in affected programs: 17 -> 16 (-5.88%)
helped: 1 / HURT: 0
total fills in shared programs: 3249 -> 3245 (-0.12%)
fills in affected programs: 51 -> 47 (-7.84%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20495826 -> 20496111 (<.01%)
instructions in affected programs: 53220 -> 53505 (0.54%)
helped: 28 / HURT: 16
total cycles in shared programs: 875173550 -> 875243910 (<.01%)
cycles in affected programs: 51700652 -> 51771012 (0.14%)
helped: 400 / HURT: 39
total spills in shared programs: 4546 -> 4546 (0.00%)
spills in affected programs: 288 -> 288 (0.00%)
helped: 1 / HURT: 2
total fills in shared programs: 5224 -> 5280 (1.07%)
fills in affected programs: 795 -> 851 (7.04%)
helped: 0 / HURT: 4
LOST: 1
GAINED: 1
fossil-db:
Lunar Lake
Totals:
Instrs: 141811551 -> 141807640 (-0.00%); split: -0.00%, +0.00%
Cycle count: 22183128332 -> 22181285594 (-0.01%); split: -0.06%, +0.05%
Spill count: 69890 -> 69859 (-0.04%); split: -0.09%, +0.04%
Fill count: 128877 -> 128344 (-0.41%); split: -0.42%, +0.00%
Max live registers: 48053415 -> 48051613 (-0.00%); split: -0.00%, +0.00%
Totals from 6817 (1.24% of 551443) affected shaders:
Instrs: 4300169 -> 4296258 (-0.09%); split: -0.14%, +0.05%
Cycle count: 17263755610 -> 17261912872 (-0.01%); split: -0.08%, +0.07%
Spill count: 41822 -> 41791 (-0.07%); split: -0.15%, +0.07%
Fill count: 75523 -> 74990 (-0.71%); split: -0.71%, +0.01%
Max live registers: 733647 -> 731845 (-0.25%); split: -0.29%, +0.04%
Meteor Lake and all older Intel platforms had similar results. (Meteor Lake shown)
Totals:
Instrs: 152735305 -> 152735801 (+0.00%); split: -0.00%, +0.00%
Subgroup size: 7733536 -> 7733616 (+0.00%)
Cycle count: 17398725539 -> 17400873100 (+0.01%); split: -0.00%, +0.02%
Max live registers: 31887018 -> 31885742 (-0.00%); split: -0.00%, +0.00%
Max dispatch width: 5561696 -> 5561712 (+0.00%)
Totals from 5672 (0.90% of 633314) affected shaders:
Instrs: 2817606 -> 2818102 (+0.02%); split: -0.05%, +0.07%
Subgroup size: 81128 -> 81208 (+0.10%)
Cycle count: 10021470543 -> 10023618104 (+0.02%); split: -0.01%, +0.03%
Max live registers: 306520 -> 305244 (-0.42%); split: -0.43%, +0.01%
Max dispatch width: 74136 -> 74152 (+0.02%)
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-01-30 18:53:05 -08:00
|
|
|
bool all_sources_uniform = true;
|
2015-01-21 16:00:55 -08:00
|
|
|
for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
|
2024-02-12 08:43:34 -08:00
|
|
|
op[i] = get_nir_src(ntb, instr->src[i].src, -1);
|
2017-01-20 19:03:21 -08:00
|
|
|
op[i].type = brw_type_for_nir_type(devinfo,
|
2015-07-29 09:11:03 -07:00
|
|
|
(nir_alu_type)(nir_op_infos[instr->op].input_types[i] |
|
|
|
|
|
nir_src_bit_size(instr->src[i].src)));
|
brw/nir: Treat some ALU results as convergent
v2: Fix for Xe2.
v3: Fix handling of 64-bit CMP results.
v4: Scalarize 16-bit comparison temporary destination when used as a
source (as was already done for 64-bit). Suggested by Ken.
shader-db:
Lunar Lake
total instructions in shared programs: 18096500 -> 18096549 (<.01%)
instructions in affected programs: 15919 -> 15968 (0.31%)
helped: 8 / HURT: 21
total cycles in shared programs: 921841300 -> 922073090 (0.03%)
cycles in affected programs: 115946336 -> 116178126 (0.20%)
helped: 386 / HURT: 135
Meteor Lake and DG2 (Meteor Lake shown)
total instructions in shared programs: 19836053 -> 19836016 (<.01%)
instructions in affected programs: 19547 -> 19510 (-0.19%)
helped: 21 / HURT: 18
total cycles in shared programs: 906713777 -> 906588541 (-0.01%)
cycles in affected programs: 96914584 -> 96789348 (-0.13%)
helped: 335 / HURT: 134
total fills in shared programs: 6712 -> 6710 (-0.03%)
fills in affected programs: 52 -> 50 (-3.85%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Tiger Lake
total instructions in shared programs: 19641284 -> 19641278 (<.01%)
instructions in affected programs: 12358 -> 12352 (-0.05%)
helped: 10 / HURT: 19
total cycles in shared programs: 865413131 -> 865460513 (<.01%)
cycles in affected programs: 74641489 -> 74688871 (0.06%)
helped: 388 / HURT: 100
total spills in shared programs: 3899 -> 3898 (-0.03%)
spills in affected programs: 17 -> 16 (-5.88%)
helped: 1 / HURT: 0
total fills in shared programs: 3249 -> 3245 (-0.12%)
fills in affected programs: 51 -> 47 (-7.84%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20495826 -> 20496111 (<.01%)
instructions in affected programs: 53220 -> 53505 (0.54%)
helped: 28 / HURT: 16
total cycles in shared programs: 875173550 -> 875243910 (<.01%)
cycles in affected programs: 51700652 -> 51771012 (0.14%)
helped: 400 / HURT: 39
total spills in shared programs: 4546 -> 4546 (0.00%)
spills in affected programs: 288 -> 288 (0.00%)
helped: 1 / HURT: 2
total fills in shared programs: 5224 -> 5280 (1.07%)
fills in affected programs: 795 -> 851 (7.04%)
helped: 0 / HURT: 4
LOST: 1
GAINED: 1
fossil-db:
Lunar Lake
Totals:
Instrs: 141811551 -> 141807640 (-0.00%); split: -0.00%, +0.00%
Cycle count: 22183128332 -> 22181285594 (-0.01%); split: -0.06%, +0.05%
Spill count: 69890 -> 69859 (-0.04%); split: -0.09%, +0.04%
Fill count: 128877 -> 128344 (-0.41%); split: -0.42%, +0.00%
Max live registers: 48053415 -> 48051613 (-0.00%); split: -0.00%, +0.00%
Totals from 6817 (1.24% of 551443) affected shaders:
Instrs: 4300169 -> 4296258 (-0.09%); split: -0.14%, +0.05%
Cycle count: 17263755610 -> 17261912872 (-0.01%); split: -0.08%, +0.07%
Spill count: 41822 -> 41791 (-0.07%); split: -0.15%, +0.07%
Fill count: 75523 -> 74990 (-0.71%); split: -0.71%, +0.01%
Max live registers: 733647 -> 731845 (-0.25%); split: -0.29%, +0.04%
Meteor Lake and all older Intel platforms had similar results. (Meteor Lake shown)
Totals:
Instrs: 152735305 -> 152735801 (+0.00%); split: -0.00%, +0.00%
Subgroup size: 7733536 -> 7733616 (+0.00%)
Cycle count: 17398725539 -> 17400873100 (+0.01%); split: -0.00%, +0.02%
Max live registers: 31887018 -> 31885742 (-0.00%); split: -0.00%, +0.00%
Max dispatch width: 5561696 -> 5561712 (+0.00%)
Totals from 5672 (0.90% of 633314) affected shaders:
Instrs: 2817606 -> 2818102 (+0.02%); split: -0.05%, +0.07%
Subgroup size: 81128 -> 81208 (+0.10%)
Cycle count: 10021470543 -> 10023618104 (+0.02%); split: -0.01%, +0.03%
Max live registers: 306520 -> 305244 (-0.42%); split: -0.43%, +0.01%
Max dispatch width: 74136 -> 74152 (+0.02%)
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-01-30 18:53:05 -08:00
|
|
|
|
|
|
|
|
/* is_scalar sources won't be is_uniform because get_nir_src was passed
|
|
|
|
|
* -1 as the channel.
|
|
|
|
|
*/
|
|
|
|
|
if (!is_uniform(op[i]) && !op[i].is_scalar)
|
|
|
|
|
all_sources_uniform = false;
|
2015-01-21 16:00:55 -08:00
|
|
|
}
|
|
|
|
|
|
brw/nir: Treat some ALU results as convergent
v2: Fix for Xe2.
v3: Fix handling of 64-bit CMP results.
v4: Scalarize 16-bit comparison temporary destination when used as a
source (as was already done for 64-bit). Suggested by Ken.
shader-db:
Lunar Lake
total instructions in shared programs: 18096500 -> 18096549 (<.01%)
instructions in affected programs: 15919 -> 15968 (0.31%)
helped: 8 / HURT: 21
total cycles in shared programs: 921841300 -> 922073090 (0.03%)
cycles in affected programs: 115946336 -> 116178126 (0.20%)
helped: 386 / HURT: 135
Meteor Lake and DG2 (Meteor Lake shown)
total instructions in shared programs: 19836053 -> 19836016 (<.01%)
instructions in affected programs: 19547 -> 19510 (-0.19%)
helped: 21 / HURT: 18
total cycles in shared programs: 906713777 -> 906588541 (-0.01%)
cycles in affected programs: 96914584 -> 96789348 (-0.13%)
helped: 335 / HURT: 134
total fills in shared programs: 6712 -> 6710 (-0.03%)
fills in affected programs: 52 -> 50 (-3.85%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Tiger Lake
total instructions in shared programs: 19641284 -> 19641278 (<.01%)
instructions in affected programs: 12358 -> 12352 (-0.05%)
helped: 10 / HURT: 19
total cycles in shared programs: 865413131 -> 865460513 (<.01%)
cycles in affected programs: 74641489 -> 74688871 (0.06%)
helped: 388 / HURT: 100
total spills in shared programs: 3899 -> 3898 (-0.03%)
spills in affected programs: 17 -> 16 (-5.88%)
helped: 1 / HURT: 0
total fills in shared programs: 3249 -> 3245 (-0.12%)
fills in affected programs: 51 -> 47 (-7.84%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20495826 -> 20496111 (<.01%)
instructions in affected programs: 53220 -> 53505 (0.54%)
helped: 28 / HURT: 16
total cycles in shared programs: 875173550 -> 875243910 (<.01%)
cycles in affected programs: 51700652 -> 51771012 (0.14%)
helped: 400 / HURT: 39
total spills in shared programs: 4546 -> 4546 (0.00%)
spills in affected programs: 288 -> 288 (0.00%)
helped: 1 / HURT: 2
total fills in shared programs: 5224 -> 5280 (1.07%)
fills in affected programs: 795 -> 851 (7.04%)
helped: 0 / HURT: 4
LOST: 1
GAINED: 1
fossil-db:
Lunar Lake
Totals:
Instrs: 141811551 -> 141807640 (-0.00%); split: -0.00%, +0.00%
Cycle count: 22183128332 -> 22181285594 (-0.01%); split: -0.06%, +0.05%
Spill count: 69890 -> 69859 (-0.04%); split: -0.09%, +0.04%
Fill count: 128877 -> 128344 (-0.41%); split: -0.42%, +0.00%
Max live registers: 48053415 -> 48051613 (-0.00%); split: -0.00%, +0.00%
Totals from 6817 (1.24% of 551443) affected shaders:
Instrs: 4300169 -> 4296258 (-0.09%); split: -0.14%, +0.05%
Cycle count: 17263755610 -> 17261912872 (-0.01%); split: -0.08%, +0.07%
Spill count: 41822 -> 41791 (-0.07%); split: -0.15%, +0.07%
Fill count: 75523 -> 74990 (-0.71%); split: -0.71%, +0.01%
Max live registers: 733647 -> 731845 (-0.25%); split: -0.29%, +0.04%
Meteor Lake and all older Intel platforms had similar results. (Meteor Lake shown)
Totals:
Instrs: 152735305 -> 152735801 (+0.00%); split: -0.00%, +0.00%
Subgroup size: 7733536 -> 7733616 (+0.00%)
Cycle count: 17398725539 -> 17400873100 (+0.01%); split: -0.00%, +0.02%
Max live registers: 31887018 -> 31885742 (-0.00%); split: -0.00%, +0.00%
Max dispatch width: 5561696 -> 5561712 (+0.00%)
Totals from 5672 (0.90% of 633314) affected shaders:
Instrs: 2817606 -> 2818102 (+0.02%); split: -0.05%, +0.07%
Subgroup size: 81128 -> 81208 (+0.10%)
Cycle count: 10021470543 -> 10023618104 (+0.02%); split: -0.01%, +0.03%
Max live registers: 306520 -> 305244 (-0.42%); split: -0.43%, +0.01%
Max dispatch width: 74136 -> 74152 (+0.02%)
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-01-30 18:53:05 -08:00
|
|
|
brw_reg result =
|
|
|
|
|
need_dest ? get_nir_def(ntb, instr->def, all_sources_uniform) : bld.null_reg_ud();
|
|
|
|
|
|
|
|
|
|
result.type = brw_type_for_nir_type(devinfo,
|
|
|
|
|
(nir_alu_type)(nir_op_infos[instr->op].output_type |
|
|
|
|
|
instr->def.bit_size));
|
|
|
|
|
|
2018-12-05 11:35:37 -08:00
|
|
|
/* Move and vecN instrutions may still be vectored. Return the raw,
|
2024-12-07 10:25:45 -08:00
|
|
|
* vectored source and destination so that brw_shader::nir_emit_alu can
|
2018-12-05 11:35:37 -08:00
|
|
|
* handle it. Other callers should not have to handle these kinds of
|
|
|
|
|
* instructions.
|
|
|
|
|
*/
|
|
|
|
|
switch (instr->op) {
|
2019-05-06 11:45:46 -05:00
|
|
|
case nir_op_mov:
|
2018-12-05 11:35:37 -08:00
|
|
|
case nir_op_vec2:
|
|
|
|
|
case nir_op_vec3:
|
|
|
|
|
case nir_op_vec4:
|
2020-08-27 17:42:43 -05:00
|
|
|
case nir_op_vec8:
|
|
|
|
|
case nir_op_vec16:
|
2018-12-05 11:35:37 -08:00
|
|
|
return result;
|
|
|
|
|
default:
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
brw/nir: Treat some ALU results as convergent
v2: Fix for Xe2.
v3: Fix handling of 64-bit CMP results.
v4: Scalarize 16-bit comparison temporary destination when used as a
source (as was already done for 64-bit). Suggested by Ken.
shader-db:
Lunar Lake
total instructions in shared programs: 18096500 -> 18096549 (<.01%)
instructions in affected programs: 15919 -> 15968 (0.31%)
helped: 8 / HURT: 21
total cycles in shared programs: 921841300 -> 922073090 (0.03%)
cycles in affected programs: 115946336 -> 116178126 (0.20%)
helped: 386 / HURT: 135
Meteor Lake and DG2 (Meteor Lake shown)
total instructions in shared programs: 19836053 -> 19836016 (<.01%)
instructions in affected programs: 19547 -> 19510 (-0.19%)
helped: 21 / HURT: 18
total cycles in shared programs: 906713777 -> 906588541 (-0.01%)
cycles in affected programs: 96914584 -> 96789348 (-0.13%)
helped: 335 / HURT: 134
total fills in shared programs: 6712 -> 6710 (-0.03%)
fills in affected programs: 52 -> 50 (-3.85%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Tiger Lake
total instructions in shared programs: 19641284 -> 19641278 (<.01%)
instructions in affected programs: 12358 -> 12352 (-0.05%)
helped: 10 / HURT: 19
total cycles in shared programs: 865413131 -> 865460513 (<.01%)
cycles in affected programs: 74641489 -> 74688871 (0.06%)
helped: 388 / HURT: 100
total spills in shared programs: 3899 -> 3898 (-0.03%)
spills in affected programs: 17 -> 16 (-5.88%)
helped: 1 / HURT: 0
total fills in shared programs: 3249 -> 3245 (-0.12%)
fills in affected programs: 51 -> 47 (-7.84%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20495826 -> 20496111 (<.01%)
instructions in affected programs: 53220 -> 53505 (0.54%)
helped: 28 / HURT: 16
total cycles in shared programs: 875173550 -> 875243910 (<.01%)
cycles in affected programs: 51700652 -> 51771012 (0.14%)
helped: 400 / HURT: 39
total spills in shared programs: 4546 -> 4546 (0.00%)
spills in affected programs: 288 -> 288 (0.00%)
helped: 1 / HURT: 2
total fills in shared programs: 5224 -> 5280 (1.07%)
fills in affected programs: 795 -> 851 (7.04%)
helped: 0 / HURT: 4
LOST: 1
GAINED: 1
fossil-db:
Lunar Lake
Totals:
Instrs: 141811551 -> 141807640 (-0.00%); split: -0.00%, +0.00%
Cycle count: 22183128332 -> 22181285594 (-0.01%); split: -0.06%, +0.05%
Spill count: 69890 -> 69859 (-0.04%); split: -0.09%, +0.04%
Fill count: 128877 -> 128344 (-0.41%); split: -0.42%, +0.00%
Max live registers: 48053415 -> 48051613 (-0.00%); split: -0.00%, +0.00%
Totals from 6817 (1.24% of 551443) affected shaders:
Instrs: 4300169 -> 4296258 (-0.09%); split: -0.14%, +0.05%
Cycle count: 17263755610 -> 17261912872 (-0.01%); split: -0.08%, +0.07%
Spill count: 41822 -> 41791 (-0.07%); split: -0.15%, +0.07%
Fill count: 75523 -> 74990 (-0.71%); split: -0.71%, +0.01%
Max live registers: 733647 -> 731845 (-0.25%); split: -0.29%, +0.04%
Meteor Lake and all older Intel platforms had similar results. (Meteor Lake shown)
Totals:
Instrs: 152735305 -> 152735801 (+0.00%); split: -0.00%, +0.00%
Subgroup size: 7733536 -> 7733616 (+0.00%)
Cycle count: 17398725539 -> 17400873100 (+0.01%); split: -0.00%, +0.02%
Max live registers: 31887018 -> 31885742 (-0.00%); split: -0.00%, +0.00%
Max dispatch width: 5561696 -> 5561712 (+0.00%)
Totals from 5672 (0.90% of 633314) affected shaders:
Instrs: 2817606 -> 2818102 (+0.02%); split: -0.05%, +0.07%
Subgroup size: 81128 -> 81208 (+0.10%)
Cycle count: 10021470543 -> 10023618104 (+0.02%); split: -0.01%, +0.03%
Max live registers: 306520 -> 305244 (-0.42%); split: -0.43%, +0.01%
Max dispatch width: 74136 -> 74152 (+0.02%)
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-01-30 18:53:05 -08:00
|
|
|
const bool is_scalar = result.is_scalar || (!need_dest && all_sources_uniform);
|
2024-12-29 15:41:04 -08:00
|
|
|
const brw_builder xbld = is_scalar ? bld.scalar_group() : bld;
|
brw/nir: Treat some ALU results as convergent
v2: Fix for Xe2.
v3: Fix handling of 64-bit CMP results.
v4: Scalarize 16-bit comparison temporary destination when used as a
source (as was already done for 64-bit). Suggested by Ken.
shader-db:
Lunar Lake
total instructions in shared programs: 18096500 -> 18096549 (<.01%)
instructions in affected programs: 15919 -> 15968 (0.31%)
helped: 8 / HURT: 21
total cycles in shared programs: 921841300 -> 922073090 (0.03%)
cycles in affected programs: 115946336 -> 116178126 (0.20%)
helped: 386 / HURT: 135
Meteor Lake and DG2 (Meteor Lake shown)
total instructions in shared programs: 19836053 -> 19836016 (<.01%)
instructions in affected programs: 19547 -> 19510 (-0.19%)
helped: 21 / HURT: 18
total cycles in shared programs: 906713777 -> 906588541 (-0.01%)
cycles in affected programs: 96914584 -> 96789348 (-0.13%)
helped: 335 / HURT: 134
total fills in shared programs: 6712 -> 6710 (-0.03%)
fills in affected programs: 52 -> 50 (-3.85%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Tiger Lake
total instructions in shared programs: 19641284 -> 19641278 (<.01%)
instructions in affected programs: 12358 -> 12352 (-0.05%)
helped: 10 / HURT: 19
total cycles in shared programs: 865413131 -> 865460513 (<.01%)
cycles in affected programs: 74641489 -> 74688871 (0.06%)
helped: 388 / HURT: 100
total spills in shared programs: 3899 -> 3898 (-0.03%)
spills in affected programs: 17 -> 16 (-5.88%)
helped: 1 / HURT: 0
total fills in shared programs: 3249 -> 3245 (-0.12%)
fills in affected programs: 51 -> 47 (-7.84%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20495826 -> 20496111 (<.01%)
instructions in affected programs: 53220 -> 53505 (0.54%)
helped: 28 / HURT: 16
total cycles in shared programs: 875173550 -> 875243910 (<.01%)
cycles in affected programs: 51700652 -> 51771012 (0.14%)
helped: 400 / HURT: 39
total spills in shared programs: 4546 -> 4546 (0.00%)
spills in affected programs: 288 -> 288 (0.00%)
helped: 1 / HURT: 2
total fills in shared programs: 5224 -> 5280 (1.07%)
fills in affected programs: 795 -> 851 (7.04%)
helped: 0 / HURT: 4
LOST: 1
GAINED: 1
fossil-db:
Lunar Lake
Totals:
Instrs: 141811551 -> 141807640 (-0.00%); split: -0.00%, +0.00%
Cycle count: 22183128332 -> 22181285594 (-0.01%); split: -0.06%, +0.05%
Spill count: 69890 -> 69859 (-0.04%); split: -0.09%, +0.04%
Fill count: 128877 -> 128344 (-0.41%); split: -0.42%, +0.00%
Max live registers: 48053415 -> 48051613 (-0.00%); split: -0.00%, +0.00%
Totals from 6817 (1.24% of 551443) affected shaders:
Instrs: 4300169 -> 4296258 (-0.09%); split: -0.14%, +0.05%
Cycle count: 17263755610 -> 17261912872 (-0.01%); split: -0.08%, +0.07%
Spill count: 41822 -> 41791 (-0.07%); split: -0.15%, +0.07%
Fill count: 75523 -> 74990 (-0.71%); split: -0.71%, +0.01%
Max live registers: 733647 -> 731845 (-0.25%); split: -0.29%, +0.04%
Meteor Lake and all older Intel platforms had similar results. (Meteor Lake shown)
Totals:
Instrs: 152735305 -> 152735801 (+0.00%); split: -0.00%, +0.00%
Subgroup size: 7733536 -> 7733616 (+0.00%)
Cycle count: 17398725539 -> 17400873100 (+0.01%); split: -0.00%, +0.02%
Max live registers: 31887018 -> 31885742 (-0.00%); split: -0.00%, +0.00%
Max dispatch width: 5561696 -> 5561712 (+0.00%)
Totals from 5672 (0.90% of 633314) affected shaders:
Instrs: 2817606 -> 2818102 (+0.02%); split: -0.05%, +0.07%
Subgroup size: 81128 -> 81208 (+0.10%)
Cycle count: 10021470543 -> 10023618104 (+0.02%); split: -0.01%, +0.03%
Max live registers: 306520 -> 305244 (-0.42%); split: -0.43%, +0.01%
Max dispatch width: 74136 -> 74152 (+0.02%)
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-01-30 18:53:05 -08:00
|
|
|
|
2018-12-05 11:35:37 -08:00
|
|
|
/* At this point, we have dealt with any instruction that operates on
|
|
|
|
|
* more than a single channel. Therefore, we can just adjust the source
|
|
|
|
|
* and destination registers for that channel and emit the instruction.
|
2015-01-21 16:00:55 -08:00
|
|
|
*/
|
2018-12-05 11:35:37 -08:00
|
|
|
unsigned channel = 0;
|
|
|
|
|
if (nir_op_infos[instr->op].output_size == 0) {
|
|
|
|
|
/* Since NIR is doing the scalarizing for us, we should only ever see
|
|
|
|
|
* vectorized operations with a single channel.
|
|
|
|
|
*/
|
2023-11-20 15:21:11 -08:00
|
|
|
nir_component_mask_t write_mask = get_nir_write_mask(instr->def);
|
2023-07-24 17:06:32 -05:00
|
|
|
assert(util_bitcount(write_mask) == 1);
|
|
|
|
|
channel = ffs(write_mask) - 1;
|
2018-12-05 11:35:37 -08:00
|
|
|
|
brw/nir: Treat some ALU results as convergent
v2: Fix for Xe2.
v3: Fix handling of 64-bit CMP results.
v4: Scalarize 16-bit comparison temporary destination when used as a
source (as was already done for 64-bit). Suggested by Ken.
shader-db:
Lunar Lake
total instructions in shared programs: 18096500 -> 18096549 (<.01%)
instructions in affected programs: 15919 -> 15968 (0.31%)
helped: 8 / HURT: 21
total cycles in shared programs: 921841300 -> 922073090 (0.03%)
cycles in affected programs: 115946336 -> 116178126 (0.20%)
helped: 386 / HURT: 135
Meteor Lake and DG2 (Meteor Lake shown)
total instructions in shared programs: 19836053 -> 19836016 (<.01%)
instructions in affected programs: 19547 -> 19510 (-0.19%)
helped: 21 / HURT: 18
total cycles in shared programs: 906713777 -> 906588541 (-0.01%)
cycles in affected programs: 96914584 -> 96789348 (-0.13%)
helped: 335 / HURT: 134
total fills in shared programs: 6712 -> 6710 (-0.03%)
fills in affected programs: 52 -> 50 (-3.85%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Tiger Lake
total instructions in shared programs: 19641284 -> 19641278 (<.01%)
instructions in affected programs: 12358 -> 12352 (-0.05%)
helped: 10 / HURT: 19
total cycles in shared programs: 865413131 -> 865460513 (<.01%)
cycles in affected programs: 74641489 -> 74688871 (0.06%)
helped: 388 / HURT: 100
total spills in shared programs: 3899 -> 3898 (-0.03%)
spills in affected programs: 17 -> 16 (-5.88%)
helped: 1 / HURT: 0
total fills in shared programs: 3249 -> 3245 (-0.12%)
fills in affected programs: 51 -> 47 (-7.84%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20495826 -> 20496111 (<.01%)
instructions in affected programs: 53220 -> 53505 (0.54%)
helped: 28 / HURT: 16
total cycles in shared programs: 875173550 -> 875243910 (<.01%)
cycles in affected programs: 51700652 -> 51771012 (0.14%)
helped: 400 / HURT: 39
total spills in shared programs: 4546 -> 4546 (0.00%)
spills in affected programs: 288 -> 288 (0.00%)
helped: 1 / HURT: 2
total fills in shared programs: 5224 -> 5280 (1.07%)
fills in affected programs: 795 -> 851 (7.04%)
helped: 0 / HURT: 4
LOST: 1
GAINED: 1
fossil-db:
Lunar Lake
Totals:
Instrs: 141811551 -> 141807640 (-0.00%); split: -0.00%, +0.00%
Cycle count: 22183128332 -> 22181285594 (-0.01%); split: -0.06%, +0.05%
Spill count: 69890 -> 69859 (-0.04%); split: -0.09%, +0.04%
Fill count: 128877 -> 128344 (-0.41%); split: -0.42%, +0.00%
Max live registers: 48053415 -> 48051613 (-0.00%); split: -0.00%, +0.00%
Totals from 6817 (1.24% of 551443) affected shaders:
Instrs: 4300169 -> 4296258 (-0.09%); split: -0.14%, +0.05%
Cycle count: 17263755610 -> 17261912872 (-0.01%); split: -0.08%, +0.07%
Spill count: 41822 -> 41791 (-0.07%); split: -0.15%, +0.07%
Fill count: 75523 -> 74990 (-0.71%); split: -0.71%, +0.01%
Max live registers: 733647 -> 731845 (-0.25%); split: -0.29%, +0.04%
Meteor Lake and all older Intel platforms had similar results. (Meteor Lake shown)
Totals:
Instrs: 152735305 -> 152735801 (+0.00%); split: -0.00%, +0.00%
Subgroup size: 7733536 -> 7733616 (+0.00%)
Cycle count: 17398725539 -> 17400873100 (+0.01%); split: -0.00%, +0.02%
Max live registers: 31887018 -> 31885742 (-0.00%); split: -0.00%, +0.00%
Max dispatch width: 5561696 -> 5561712 (+0.00%)
Totals from 5672 (0.90% of 633314) affected shaders:
Instrs: 2817606 -> 2818102 (+0.02%); split: -0.05%, +0.07%
Subgroup size: 81128 -> 81208 (+0.10%)
Cycle count: 10021470543 -> 10023618104 (+0.02%); split: -0.01%, +0.03%
Max live registers: 306520 -> 305244 (-0.42%); split: -0.43%, +0.01%
Max dispatch width: 74136 -> 74152 (+0.02%)
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-01-30 18:53:05 -08:00
|
|
|
result = offset(result, xbld, channel);
|
2018-12-05 11:35:37 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
|
|
|
|
|
assert(nir_op_infos[instr->op].input_sizes[i] < 2);
|
brw/nir: Treat some ALU results as convergent
v2: Fix for Xe2.
v3: Fix handling of 64-bit CMP results.
v4: Scalarize 16-bit comparison temporary destination when used as a
source (as was already done for 64-bit). Suggested by Ken.
shader-db:
Lunar Lake
total instructions in shared programs: 18096500 -> 18096549 (<.01%)
instructions in affected programs: 15919 -> 15968 (0.31%)
helped: 8 / HURT: 21
total cycles in shared programs: 921841300 -> 922073090 (0.03%)
cycles in affected programs: 115946336 -> 116178126 (0.20%)
helped: 386 / HURT: 135
Meteor Lake and DG2 (Meteor Lake shown)
total instructions in shared programs: 19836053 -> 19836016 (<.01%)
instructions in affected programs: 19547 -> 19510 (-0.19%)
helped: 21 / HURT: 18
total cycles in shared programs: 906713777 -> 906588541 (-0.01%)
cycles in affected programs: 96914584 -> 96789348 (-0.13%)
helped: 335 / HURT: 134
total fills in shared programs: 6712 -> 6710 (-0.03%)
fills in affected programs: 52 -> 50 (-3.85%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Tiger Lake
total instructions in shared programs: 19641284 -> 19641278 (<.01%)
instructions in affected programs: 12358 -> 12352 (-0.05%)
helped: 10 / HURT: 19
total cycles in shared programs: 865413131 -> 865460513 (<.01%)
cycles in affected programs: 74641489 -> 74688871 (0.06%)
helped: 388 / HURT: 100
total spills in shared programs: 3899 -> 3898 (-0.03%)
spills in affected programs: 17 -> 16 (-5.88%)
helped: 1 / HURT: 0
total fills in shared programs: 3249 -> 3245 (-0.12%)
fills in affected programs: 51 -> 47 (-7.84%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20495826 -> 20496111 (<.01%)
instructions in affected programs: 53220 -> 53505 (0.54%)
helped: 28 / HURT: 16
total cycles in shared programs: 875173550 -> 875243910 (<.01%)
cycles in affected programs: 51700652 -> 51771012 (0.14%)
helped: 400 / HURT: 39
total spills in shared programs: 4546 -> 4546 (0.00%)
spills in affected programs: 288 -> 288 (0.00%)
helped: 1 / HURT: 2
total fills in shared programs: 5224 -> 5280 (1.07%)
fills in affected programs: 795 -> 851 (7.04%)
helped: 0 / HURT: 4
LOST: 1
GAINED: 1
fossil-db:
Lunar Lake
Totals:
Instrs: 141811551 -> 141807640 (-0.00%); split: -0.00%, +0.00%
Cycle count: 22183128332 -> 22181285594 (-0.01%); split: -0.06%, +0.05%
Spill count: 69890 -> 69859 (-0.04%); split: -0.09%, +0.04%
Fill count: 128877 -> 128344 (-0.41%); split: -0.42%, +0.00%
Max live registers: 48053415 -> 48051613 (-0.00%); split: -0.00%, +0.00%
Totals from 6817 (1.24% of 551443) affected shaders:
Instrs: 4300169 -> 4296258 (-0.09%); split: -0.14%, +0.05%
Cycle count: 17263755610 -> 17261912872 (-0.01%); split: -0.08%, +0.07%
Spill count: 41822 -> 41791 (-0.07%); split: -0.15%, +0.07%
Fill count: 75523 -> 74990 (-0.71%); split: -0.71%, +0.01%
Max live registers: 733647 -> 731845 (-0.25%); split: -0.29%, +0.04%
Meteor Lake and all older Intel platforms had similar results. (Meteor Lake shown)
Totals:
Instrs: 152735305 -> 152735801 (+0.00%); split: -0.00%, +0.00%
Subgroup size: 7733536 -> 7733616 (+0.00%)
Cycle count: 17398725539 -> 17400873100 (+0.01%); split: -0.00%, +0.02%
Max live registers: 31887018 -> 31885742 (-0.00%); split: -0.00%, +0.00%
Max dispatch width: 5561696 -> 5561712 (+0.00%)
Totals from 5672 (0.90% of 633314) affected shaders:
Instrs: 2817606 -> 2818102 (+0.02%); split: -0.05%, +0.07%
Subgroup size: 81128 -> 81208 (+0.10%)
Cycle count: 10021470543 -> 10023618104 (+0.02%); split: -0.01%, +0.03%
Max live registers: 306520 -> 305244 (-0.42%); split: -0.43%, +0.01%
Max dispatch width: 74136 -> 74152 (+0.02%)
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-01-30 18:53:05 -08:00
|
|
|
op[i] = offset(op[i], xbld, instr->src[i].swizzle[channel]);
|
|
|
|
|
|
|
|
|
|
/* If the dispatch width matches the scalar allocation width, offset()
|
|
|
|
|
* won't set the stride to zero. Force that here.
|
|
|
|
|
*/
|
|
|
|
|
if (op[i].is_scalar)
|
|
|
|
|
op[i] = component(op[i], 0);
|
2018-12-05 11:35:37 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return result;
|
|
|
|
|
}
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
/**
 * Return a register whose value equals \p src but carries no source
 * modifiers.
 *
 * If \p src has an abs or negate modifier, the modified value is copied
 * into a fresh temporary with a MOV so it can be used where source
 * modifiers are not permitted; otherwise \p src is returned unchanged.
 */
static brw_reg
resolve_source_modifiers(const brw_builder &bld, const brw_reg &src)
{
   if (!src.abs && !src.negate)
      return src;

   return bld.MOV(src);
}
|
|
|
|
|
|
|
|
|
|
static void
|
2024-12-29 15:41:04 -08:00
|
|
|
resolve_inot_sources(nir_to_brw_state &ntb, const brw_builder &bld, nir_alu_instr *instr,
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg *op)
|
2017-02-09 15:20:04 +00:00
|
|
|
{
|
|
|
|
|
for (unsigned i = 0; i < 2; i++) {
|
2019-04-17 17:10:18 -05:00
|
|
|
nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[i].src);
|
2017-02-09 15:20:04 +00:00
|
|
|
|
intel/compiler: Drop nir_lower_to_source_mods() and related handling.
I think we're unanimous in wanting to drop nir_lower_to_source_mods.
It's a bit of complexity to handle in the backend, but perhaps more
importantly, would be even more complexity to handle in nir_search.
And, it turns out that since we made other compiler improvements in the
last few years, they no longer appear to buy us anything of value.
Summarizing the results from shader-db from this patch:
- Icelake (scalar mode)
Instruction counts:
- 411 helped, 598 hurt (out of 139,470 shaders)
- 99.2% of shaders remain unaffected. The average increase in
instruction count in hurt programs is 1.78 instructions.
- total instructions in shared programs: 17214951 -> 17215206 (<.01%)
- instructions in affected programs: 1143879 -> 1144134 (0.02%)
Cycles:
- 1042 helped, 1357 hurt
- total cycles in shared programs: 365613294 -> 365882263 (0.07%)
- cycles in affected programs: 138155497 -> 138424466 (0.19%)
- Haswell (both scalar and vector modes)
Instruction counts:
- 73 helped, 1680 hurt (out of 139,470 shaders)
- 98.7% of shaders remain unaffected. The average increase in
instruction count in hurt programs is 1.9 instructions.
- total instructions in shared programs: 14199527 -> 14202262 (0.02%)
- instructions in affected programs: 446499 -> 449234 (0.61%)
Cycles:
- 5253 helped, 5559 hurt
- total cycles in shared programs: 359996545 -> 360038731 (0.01%)
- cycles in affected programs: 155897127 -> 155939313 (0.03%)
Given that ~99% of shader-db remains unaffected, and the affected
programs are hurt by about 1-2 instructions - which are all cheap
ALU instructions - this is unlikely to be measurable in terms of
any real performance impact that would affect users.
So, drop them and simplify the backend, and hopefully enable other
future simplifications in NIR.
Reviewed-by: Eric Anholt <eric@anholt.net> [v1]
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4616>
2020-04-18 01:20:42 -07:00
|
|
|
if (inot_instr != NULL && inot_instr->op == nir_op_inot) {
|
2017-02-09 15:20:04 +00:00
|
|
|
/* The source of the inot is now the source of instr. */
|
2023-11-20 21:21:54 -08:00
|
|
|
prepare_alu_destination_and_sources(ntb, bld, inot_instr, &op[i], false);
|
2017-02-09 15:20:04 +00:00
|
|
|
|
|
|
|
|
assert(!op[i].negate);
|
|
|
|
|
op[i].negate = true;
|
|
|
|
|
} else {
|
2021-07-05 16:01:41 +03:00
|
|
|
op[i] = resolve_source_modifiers(bld, op[i]);
|
2017-02-09 15:20:04 +00:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-20 13:25:36 -08:00
|
|
|
static bool
|
2024-12-29 15:41:04 -08:00
|
|
|
try_emit_b2fi_of_inot(nir_to_brw_state &ntb, const brw_builder &bld,
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg result,
|
2023-11-20 13:25:36 -08:00
|
|
|
nir_alu_instr *instr)
|
2018-12-03 15:53:36 -08:00
|
|
|
{
|
2019-04-17 17:10:18 -05:00
|
|
|
nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[0].src);
|
2018-12-03 15:53:36 -08:00
|
|
|
|
|
|
|
|
if (inot_instr == NULL || inot_instr->op != nir_op_inot)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
/* HF is also possible as a destination on BDW+. For nir_op_b2i, the set
|
|
|
|
|
* of valid size-changing combinations is a bit more complex.
|
|
|
|
|
*
|
|
|
|
|
* The source restriction is just because I was lazy about generating the
|
|
|
|
|
* constant below.
|
|
|
|
|
*/
|
2024-11-12 14:15:26 -08:00
|
|
|
if (nir_src_bit_size(inot_instr->src[0].src) != 32)
|
2018-12-03 15:53:36 -08:00
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
/* b2[fi](inot(a)) maps a=0 => 1, a=-1 => 0. Since a can only be 0 or -1,
|
2024-11-12 14:15:26 -08:00
|
|
|
* this is either intBitsToFloat(~a & ONE_POINT_ZERO) or (~a & 1).
|
2018-12-03 15:53:36 -08:00
|
|
|
*/
|
2024-11-12 14:15:26 -08:00
|
|
|
brw_reg one;
|
|
|
|
|
switch (instr->op) {
|
|
|
|
|
case nir_op_b2f32:
|
|
|
|
|
one = brw_imm_ud(0x3f800000);
|
|
|
|
|
break;
|
|
|
|
|
case nir_op_b2i32:
|
|
|
|
|
one = brw_imm_ud(1);
|
|
|
|
|
break;
|
|
|
|
|
default:
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg op;
|
2018-12-03 15:53:36 -08:00
|
|
|
|
2023-11-20 21:21:54 -08:00
|
|
|
prepare_alu_destination_and_sources(ntb, bld, inot_instr, &op, false);
|
2018-12-03 15:53:36 -08:00
|
|
|
|
2024-11-12 14:15:26 -08:00
|
|
|
op.negate = true;
|
|
|
|
|
bld.AND(retype(result, one.type), op, one);
|
2018-12-03 15:53:36 -08:00
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
intel/fs: Emit better code for bfi(..., 0)
DG2, Tiger Lake, Ice Lake, and Skylake had similar results (Ice Lake shown)
total instructions in shared programs: 20570141 -> 20570063 (<.01%)
instructions in affected programs: 30679 -> 30601 (-0.25%)
helped: 77 / HURT: 0
total cycles in shared programs: 902113977 -> 902118723 (<.01%)
cycles in affected programs: 3255958 -> 3260704 (0.15%)
helped: 60 / HURT: 19
Broadwell
total instructions in shared programs: 18524633 -> 18524547 (<.01%)
instructions in affected programs: 34095 -> 34009 (-0.25%)
helped: 75 / HURT: 2
total cycles in shared programs: 949532394 -> 949543761 (<.01%)
cycles in affected programs: 3419107 -> 3430474 (0.33%)
helped: 57 / HURT: 24
total spills in shared programs: 22484 -> 22484 (0.00%)
spills in affected programs: 516 -> 516 (0.00%)
helped: 2 / HURT: 2
total fills in shared programs: 29346 -> 29338 (-0.03%)
fills in affected programs: 572 -> 564 (-1.40%)
helped: 4 / HURT: 0
Haswell
total instructions in shared programs: 17331356 -> 17331523 (<.01%)
instructions in affected programs: 27920 -> 28087 (0.60%)
helped: 41 / HURT: 4
total cycles in shared programs: 936603192 -> 936574664 (<.01%)
cycles in affected programs: 3417695 -> 3389167 (-0.83%)
helped: 28 / HURT: 21
total spills in shared programs: 19718 -> 19756 (0.19%)
spills in affected programs: 436 -> 474 (8.72%)
helped: 0 / HURT: 4
total fills in shared programs: 22547 -> 22607 (0.27%)
fills in affected programs: 444 -> 504 (13.51%)
helped: 0 / HURT: 4
Ivy Bridge
total cycles in shared programs: 463451277 -> 463451273 (<.01%)
cycles in affected programs: 95870 -> 95866 (<.01%)
helped: 3 / HURT: 2
DG2, Tiger Lake, Ice Lake, and Skylake had similar results (Ice Lake shown)
Totals:
Instrs: 152825278 -> 152819969 (-0.00%); split: -0.00%, +0.00%
Cycles: 15014075626 -> 15014628652 (+0.00%); split: -0.01%, +0.01%
Subgroup size: 8528536 -> 8528560 (+0.00%)
Send messages: 7711431 -> 7711464 (+0.00%)
Spill count: 99907 -> 99509 (-0.40%); split: -0.40%, +0.00%
Fill count: 202459 -> 201598 (-0.43%); split: -0.43%, +0.00%
Scratch Memory Size: 4376576 -> 4371456 (-0.12%)
Totals from 2915 (0.44% of 662497) affected shaders:
Instrs: 2288842 -> 2283533 (-0.23%); split: -0.24%, +0.01%
Cycles: 471633295 -> 472186321 (+0.12%); split: -0.27%, +0.39%
Subgroup size: 27488 -> 27512 (+0.09%)
Send messages: 151344 -> 151377 (+0.02%)
Spill count: 48091 -> 47693 (-0.83%); split: -0.83%, +0.00%
Fill count: 59053 -> 58192 (-1.46%); split: -1.46%, +0.00%
Scratch Memory Size: 1827840 -> 1822720 (-0.28%)
Reviewed-by: Matt Turner <mattst88@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19968>
2022-11-16 13:12:50 -08:00
|
|
|
static bool
|
|
|
|
|
is_const_zero(const nir_src &src)
|
|
|
|
|
{
|
|
|
|
|
return nir_src_is_const(src) && nir_src_as_int(src) == 0;
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-20 14:55:21 -08:00
|
|
|
static void
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
|
2023-11-20 14:55:21 -08:00
|
|
|
bool need_dest)
|
2018-12-05 11:35:37 -08:00
|
|
|
{
|
2023-12-05 15:27:29 -08:00
|
|
|
const intel_device_info *devinfo = ntb.devinfo;
|
2023-11-20 14:55:21 -08:00
|
|
|
|
2024-12-07 00:23:07 -08:00
|
|
|
brw_inst *inst;
|
2019-02-12 16:13:59 +01:00
|
|
|
unsigned execution_mode =
|
brw/nir: Treat some ALU results as convergent
v2: Fix for Xe2.
v3: Fix handling of 64-bit CMP results.
v4: Scalarize 16-bit comparison temporary destination when used as a
source (as was already done for 64-bit). Suggested by Ken.
shader-db:
Lunar Lake
total instructions in shared programs: 18096500 -> 18096549 (<.01%)
instructions in affected programs: 15919 -> 15968 (0.31%)
helped: 8 / HURT: 21
total cycles in shared programs: 921841300 -> 922073090 (0.03%)
cycles in affected programs: 115946336 -> 116178126 (0.20%)
helped: 386 / HURT: 135
Meteor Lake and DG2 (Meteor Lake shown)
total instructions in shared programs: 19836053 -> 19836016 (<.01%)
instructions in affected programs: 19547 -> 19510 (-0.19%)
helped: 21 / HURT: 18
total cycles in shared programs: 906713777 -> 906588541 (-0.01%)
cycles in affected programs: 96914584 -> 96789348 (-0.13%)
helped: 335 / HURT: 134
total fills in shared programs: 6712 -> 6710 (-0.03%)
fills in affected programs: 52 -> 50 (-3.85%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Tiger Lake
total instructions in shared programs: 19641284 -> 19641278 (<.01%)
instructions in affected programs: 12358 -> 12352 (-0.05%)
helped: 10 / HURT: 19
total cycles in shared programs: 865413131 -> 865460513 (<.01%)
cycles in affected programs: 74641489 -> 74688871 (0.06%)
helped: 388 / HURT: 100
total spills in shared programs: 3899 -> 3898 (-0.03%)
spills in affected programs: 17 -> 16 (-5.88%)
helped: 1 / HURT: 0
total fills in shared programs: 3249 -> 3245 (-0.12%)
fills in affected programs: 51 -> 47 (-7.84%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20495826 -> 20496111 (<.01%)
instructions in affected programs: 53220 -> 53505 (0.54%)
helped: 28 / HURT: 16
total cycles in shared programs: 875173550 -> 875243910 (<.01%)
cycles in affected programs: 51700652 -> 51771012 (0.14%)
helped: 400 / HURT: 39
total spills in shared programs: 4546 -> 4546 (0.00%)
spills in affected programs: 288 -> 288 (0.00%)
helped: 1 / HURT: 2
total fills in shared programs: 5224 -> 5280 (1.07%)
fills in affected programs: 795 -> 851 (7.04%)
helped: 0 / HURT: 4
LOST: 1
GAINED: 1
fossil-db:
Lunar Lake
Totals:
Instrs: 141811551 -> 141807640 (-0.00%); split: -0.00%, +0.00%
Cycle count: 22183128332 -> 22181285594 (-0.01%); split: -0.06%, +0.05%
Spill count: 69890 -> 69859 (-0.04%); split: -0.09%, +0.04%
Fill count: 128877 -> 128344 (-0.41%); split: -0.42%, +0.00%
Max live registers: 48053415 -> 48051613 (-0.00%); split: -0.00%, +0.00%
Totals from 6817 (1.24% of 551443) affected shaders:
Instrs: 4300169 -> 4296258 (-0.09%); split: -0.14%, +0.05%
Cycle count: 17263755610 -> 17261912872 (-0.01%); split: -0.08%, +0.07%
Spill count: 41822 -> 41791 (-0.07%); split: -0.15%, +0.07%
Fill count: 75523 -> 74990 (-0.71%); split: -0.71%, +0.01%
Max live registers: 733647 -> 731845 (-0.25%); split: -0.29%, +0.04%
Meteor Lake and all older Intel platforms had similar results. (Meteor Lake shown)
Totals:
Instrs: 152735305 -> 152735801 (+0.00%); split: -0.00%, +0.00%
Subgroup size: 7733536 -> 7733616 (+0.00%)
Cycle count: 17398725539 -> 17400873100 (+0.01%); split: -0.00%, +0.02%
Max live registers: 31887018 -> 31885742 (-0.00%); split: -0.00%, +0.00%
Max dispatch width: 5561696 -> 5561712 (+0.00%)
Totals from 5672 (0.90% of 633314) affected shaders:
Instrs: 2817606 -> 2818102 (+0.02%); split: -0.05%, +0.07%
Subgroup size: 81128 -> 81208 (+0.10%)
Cycle count: 10021470543 -> 10023618104 (+0.02%); split: -0.01%, +0.03%
Max live registers: 306520 -> 305244 (-0.42%); split: -0.43%, +0.01%
Max dispatch width: 74136 -> 74152 (+0.02%)
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-01-30 18:53:05 -08:00
|
|
|
ntb.bld.shader->nir->info.float_controls_execution_mode;
|
2018-12-05 11:35:37 -08:00
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg op[NIR_MAX_VEC_COMPONENTS];
|
brw/nir: Treat some ALU results as convergent
v2: Fix for Xe2.
v3: Fix handling of 64-bit CMP results.
v4: Scalarize 16-bit comparison temporary destination when used as a
source (as was already done for 64-bit). Suggested by Ken.
shader-db:
Lunar Lake
total instructions in shared programs: 18096500 -> 18096549 (<.01%)
instructions in affected programs: 15919 -> 15968 (0.31%)
helped: 8 / HURT: 21
total cycles in shared programs: 921841300 -> 922073090 (0.03%)
cycles in affected programs: 115946336 -> 116178126 (0.20%)
helped: 386 / HURT: 135
Meteor Lake and DG2 (Meteor Lake shown)
total instructions in shared programs: 19836053 -> 19836016 (<.01%)
instructions in affected programs: 19547 -> 19510 (-0.19%)
helped: 21 / HURT: 18
total cycles in shared programs: 906713777 -> 906588541 (-0.01%)
cycles in affected programs: 96914584 -> 96789348 (-0.13%)
helped: 335 / HURT: 134
total fills in shared programs: 6712 -> 6710 (-0.03%)
fills in affected programs: 52 -> 50 (-3.85%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Tiger Lake
total instructions in shared programs: 19641284 -> 19641278 (<.01%)
instructions in affected programs: 12358 -> 12352 (-0.05%)
helped: 10 / HURT: 19
total cycles in shared programs: 865413131 -> 865460513 (<.01%)
cycles in affected programs: 74641489 -> 74688871 (0.06%)
helped: 388 / HURT: 100
total spills in shared programs: 3899 -> 3898 (-0.03%)
spills in affected programs: 17 -> 16 (-5.88%)
helped: 1 / HURT: 0
total fills in shared programs: 3249 -> 3245 (-0.12%)
fills in affected programs: 51 -> 47 (-7.84%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20495826 -> 20496111 (<.01%)
instructions in affected programs: 53220 -> 53505 (0.54%)
helped: 28 / HURT: 16
total cycles in shared programs: 875173550 -> 875243910 (<.01%)
cycles in affected programs: 51700652 -> 51771012 (0.14%)
helped: 400 / HURT: 39
total spills in shared programs: 4546 -> 4546 (0.00%)
spills in affected programs: 288 -> 288 (0.00%)
helped: 1 / HURT: 2
total fills in shared programs: 5224 -> 5280 (1.07%)
fills in affected programs: 795 -> 851 (7.04%)
helped: 0 / HURT: 4
LOST: 1
GAINED: 1
fossil-db:
Lunar Lake
Totals:
Instrs: 141811551 -> 141807640 (-0.00%); split: -0.00%, +0.00%
Cycle count: 22183128332 -> 22181285594 (-0.01%); split: -0.06%, +0.05%
Spill count: 69890 -> 69859 (-0.04%); split: -0.09%, +0.04%
Fill count: 128877 -> 128344 (-0.41%); split: -0.42%, +0.00%
Max live registers: 48053415 -> 48051613 (-0.00%); split: -0.00%, +0.00%
Totals from 6817 (1.24% of 551443) affected shaders:
Instrs: 4300169 -> 4296258 (-0.09%); split: -0.14%, +0.05%
Cycle count: 17263755610 -> 17261912872 (-0.01%); split: -0.08%, +0.07%
Spill count: 41822 -> 41791 (-0.07%); split: -0.15%, +0.07%
Fill count: 75523 -> 74990 (-0.71%); split: -0.71%, +0.01%
Max live registers: 733647 -> 731845 (-0.25%); split: -0.29%, +0.04%
Meteor Lake and all older Intel platforms had similar results. (Meteor Lake shown)
Totals:
Instrs: 152735305 -> 152735801 (+0.00%); split: -0.00%, +0.00%
Subgroup size: 7733536 -> 7733616 (+0.00%)
Cycle count: 17398725539 -> 17400873100 (+0.01%); split: -0.00%, +0.02%
Max live registers: 31887018 -> 31885742 (-0.00%); split: -0.00%, +0.00%
Max dispatch width: 5561696 -> 5561712 (+0.00%)
Totals from 5672 (0.90% of 633314) affected shaders:
Instrs: 2817606 -> 2818102 (+0.02%); split: -0.05%, +0.07%
Subgroup size: 81128 -> 81208 (+0.10%)
Cycle count: 10021470543 -> 10023618104 (+0.02%); split: -0.01%, +0.03%
Max live registers: 306520 -> 305244 (-0.42%); split: -0.43%, +0.01%
Max dispatch width: 74136 -> 74152 (+0.02%)
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-01-30 18:53:05 -08:00
|
|
|
brw_reg result = prepare_alu_destination_and_sources(ntb, ntb.bld, instr, op, need_dest);
|
2018-12-05 11:35:37 -08:00
|
|
|
|
2021-01-22 14:54:02 -08:00
|
|
|
#ifndef NDEBUG
|
|
|
|
|
/* Everything except raw moves, some type conversions, iabs, and ineg
|
|
|
|
|
* should have 8-bit sources lowered by nir_lower_bit_size in
|
|
|
|
|
* brw_preprocess_nir or by brw_nir_lower_conversions in
|
|
|
|
|
* brw_postprocess_nir.
|
|
|
|
|
*/
|
|
|
|
|
switch (instr->op) {
|
|
|
|
|
case nir_op_mov:
|
|
|
|
|
case nir_op_vec2:
|
|
|
|
|
case nir_op_vec3:
|
|
|
|
|
case nir_op_vec4:
|
|
|
|
|
case nir_op_vec8:
|
|
|
|
|
case nir_op_vec16:
|
|
|
|
|
case nir_op_i2f16:
|
|
|
|
|
case nir_op_i2f32:
|
|
|
|
|
case nir_op_i2i16:
|
|
|
|
|
case nir_op_i2i32:
|
|
|
|
|
case nir_op_u2f16:
|
|
|
|
|
case nir_op_u2f32:
|
|
|
|
|
case nir_op_u2u16:
|
|
|
|
|
case nir_op_u2u32:
|
|
|
|
|
case nir_op_iabs:
|
|
|
|
|
case nir_op_ineg:
|
2021-01-25 16:31:17 -08:00
|
|
|
case nir_op_pack_32_4x8_split:
|
2021-01-22 14:54:02 -08:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
default:
|
|
|
|
|
for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
|
2024-04-21 00:57:59 -07:00
|
|
|
assert(brw_type_size_bytes(op[i].type) > 1);
|
2021-01-22 14:54:02 -08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
|
2024-12-29 15:41:04 -08:00
|
|
|
const brw_builder &bld = result.is_scalar ? ntb.bld.scalar_group() : ntb.bld;
|
brw/nir: Treat some ALU results as convergent
v2: Fix for Xe2.
v3: Fix handling of 64-bit CMP results.
v4: Scalarize 16-bit comparison temporary destination when used as a
source (as was already done for 64-bit). Suggested by Ken.
shader-db:
Lunar Lake
total instructions in shared programs: 18096500 -> 18096549 (<.01%)
instructions in affected programs: 15919 -> 15968 (0.31%)
helped: 8 / HURT: 21
total cycles in shared programs: 921841300 -> 922073090 (0.03%)
cycles in affected programs: 115946336 -> 116178126 (0.20%)
helped: 386 / HURT: 135
Meteor Lake and DG2 (Meteor Lake shown)
total instructions in shared programs: 19836053 -> 19836016 (<.01%)
instructions in affected programs: 19547 -> 19510 (-0.19%)
helped: 21 / HURT: 18
total cycles in shared programs: 906713777 -> 906588541 (-0.01%)
cycles in affected programs: 96914584 -> 96789348 (-0.13%)
helped: 335 / HURT: 134
total fills in shared programs: 6712 -> 6710 (-0.03%)
fills in affected programs: 52 -> 50 (-3.85%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Tiger Lake
total instructions in shared programs: 19641284 -> 19641278 (<.01%)
instructions in affected programs: 12358 -> 12352 (-0.05%)
helped: 10 / HURT: 19
total cycles in shared programs: 865413131 -> 865460513 (<.01%)
cycles in affected programs: 74641489 -> 74688871 (0.06%)
helped: 388 / HURT: 100
total spills in shared programs: 3899 -> 3898 (-0.03%)
spills in affected programs: 17 -> 16 (-5.88%)
helped: 1 / HURT: 0
total fills in shared programs: 3249 -> 3245 (-0.12%)
fills in affected programs: 51 -> 47 (-7.84%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20495826 -> 20496111 (<.01%)
instructions in affected programs: 53220 -> 53505 (0.54%)
helped: 28 / HURT: 16
total cycles in shared programs: 875173550 -> 875243910 (<.01%)
cycles in affected programs: 51700652 -> 51771012 (0.14%)
helped: 400 / HURT: 39
total spills in shared programs: 4546 -> 4546 (0.00%)
spills in affected programs: 288 -> 288 (0.00%)
helped: 1 / HURT: 2
total fills in shared programs: 5224 -> 5280 (1.07%)
fills in affected programs: 795 -> 851 (7.04%)
helped: 0 / HURT: 4
LOST: 1
GAINED: 1
fossil-db:
Lunar Lake
Totals:
Instrs: 141811551 -> 141807640 (-0.00%); split: -0.00%, +0.00%
Cycle count: 22183128332 -> 22181285594 (-0.01%); split: -0.06%, +0.05%
Spill count: 69890 -> 69859 (-0.04%); split: -0.09%, +0.04%
Fill count: 128877 -> 128344 (-0.41%); split: -0.42%, +0.00%
Max live registers: 48053415 -> 48051613 (-0.00%); split: -0.00%, +0.00%
Totals from 6817 (1.24% of 551443) affected shaders:
Instrs: 4300169 -> 4296258 (-0.09%); split: -0.14%, +0.05%
Cycle count: 17263755610 -> 17261912872 (-0.01%); split: -0.08%, +0.07%
Spill count: 41822 -> 41791 (-0.07%); split: -0.15%, +0.07%
Fill count: 75523 -> 74990 (-0.71%); split: -0.71%, +0.01%
Max live registers: 733647 -> 731845 (-0.25%); split: -0.29%, +0.04%
Meteor Lake and all older Intel platforms had similar results. (Meteor Lake shown)
Totals:
Instrs: 152735305 -> 152735801 (+0.00%); split: -0.00%, +0.00%
Subgroup size: 7733536 -> 7733616 (+0.00%)
Cycle count: 17398725539 -> 17400873100 (+0.01%); split: -0.00%, +0.02%
Max live registers: 31887018 -> 31885742 (-0.00%); split: -0.00%, +0.00%
Max dispatch width: 5561696 -> 5561712 (+0.00%)
Totals from 5672 (0.90% of 633314) affected shaders:
Instrs: 2817606 -> 2818102 (+0.02%); split: -0.05%, +0.07%
Subgroup size: 81128 -> 81208 (+0.10%)
Cycle count: 10021470543 -> 10023618104 (+0.02%); split: -0.01%, +0.03%
Max live registers: 306520 -> 305244 (-0.42%); split: -0.43%, +0.01%
Max dispatch width: 74136 -> 74152 (+0.02%)
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-01-30 18:53:05 -08:00
|
|
|
|
2015-01-21 16:00:55 -08:00
|
|
|
switch (instr->op) {
|
2019-05-06 11:45:46 -05:00
|
|
|
case nir_op_mov:
|
2015-01-21 16:00:55 -08:00
|
|
|
case nir_op_vec2:
|
|
|
|
|
case nir_op_vec3:
|
2020-08-27 17:42:43 -05:00
|
|
|
case nir_op_vec4:
|
|
|
|
|
case nir_op_vec8:
|
|
|
|
|
case nir_op_vec16: {
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg temp = result;
|
2015-01-21 16:00:55 -08:00
|
|
|
bool need_extra_copy = false;
|
2023-07-24 17:32:01 -05:00
|
|
|
|
|
|
|
|
nir_intrinsic_instr *store_reg =
|
2023-08-14 11:43:35 -05:00
|
|
|
nir_store_reg_for_def(&instr->def);
|
2023-07-24 17:32:01 -05:00
|
|
|
if (store_reg != NULL) {
|
2023-08-12 16:17:15 -04:00
|
|
|
nir_def *dest_reg = store_reg->src[1].ssa;
|
2023-07-24 17:32:01 -05:00
|
|
|
for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
|
|
|
|
|
nir_intrinsic_instr *load_reg =
|
|
|
|
|
nir_load_reg_for_def(instr->src[i].src.ssa);
|
|
|
|
|
if (load_reg == NULL)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
if (load_reg->src[0].ssa == dest_reg) {
|
|
|
|
|
need_extra_copy = true;
|
|
|
|
|
temp = bld.vgrf(result.type, 4);
|
|
|
|
|
break;
|
|
|
|
|
}
|
2015-01-21 16:00:55 -08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-20 15:21:11 -08:00
|
|
|
nir_component_mask_t write_mask = get_nir_write_mask(instr->def);
|
2023-07-24 17:06:32 -05:00
|
|
|
unsigned last_bit = util_last_bit(write_mask);
|
2023-01-12 16:06:42 +01:00
|
|
|
|
2024-08-16 00:11:04 -07:00
|
|
|
assert(last_bit <= NIR_MAX_VEC_COMPONENTS);
|
|
|
|
|
brw_reg comps[NIR_MAX_VEC_COMPONENTS];
|
2024-01-31 22:07:15 -08:00
|
|
|
|
2023-01-12 16:06:42 +01:00
|
|
|
for (unsigned i = 0; i < last_bit; i++) {
|
2024-01-31 22:07:15 -08:00
|
|
|
if (instr->op == nir_op_mov)
|
|
|
|
|
comps[i] = offset(op[0], bld, instr->src[0].swizzle[i]);
|
|
|
|
|
else
|
|
|
|
|
comps[i] = offset(op[i], bld, instr->src[i].swizzle[0]);
|
|
|
|
|
}
|
2015-01-21 16:00:55 -08:00
|
|
|
|
2024-01-31 22:07:15 -08:00
|
|
|
if (write_mask == (1u << last_bit) - 1) {
|
|
|
|
|
bld.VEC(temp, comps, last_bit);
|
|
|
|
|
} else {
|
|
|
|
|
for (unsigned i = 0; i < last_bit; i++) {
|
|
|
|
|
if (write_mask & (1 << i))
|
|
|
|
|
bld.MOV(offset(temp, bld, i), comps[i]);
|
2015-01-21 16:00:55 -08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* In this case the source and destination registers were the same,
|
|
|
|
|
* so we need to insert an extra set of moves in order to deal with
|
|
|
|
|
* any swizzling.
|
|
|
|
|
*/
|
|
|
|
|
if (need_extra_copy) {
|
2023-01-12 16:06:42 +01:00
|
|
|
for (unsigned i = 0; i < last_bit; i++) {
|
2023-07-24 17:06:32 -05:00
|
|
|
if (!(write_mask & (1 << i)))
|
2015-01-21 16:00:55 -08:00
|
|
|
continue;
|
|
|
|
|
|
2015-06-18 12:07:27 -07:00
|
|
|
bld.MOV(offset(result, bld, i), offset(temp, bld, i));
|
2015-01-21 16:00:55 -08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return;
|
|
|
|
|
}
|
2014-12-23 14:44:19 -08:00
|
|
|
|
2017-03-07 19:54:37 -08:00
|
|
|
case nir_op_i2f32:
|
|
|
|
|
case nir_op_u2f32:
|
brw/nir: Treat some ALU results as convergent
v2: Fix for Xe2.
v3: Fix handling of 64-bit CMP results.
v4: Scalarize 16-bit comparison temporary destination when used as a
source (as was already done for 64-bit). Suggested by Ken.
shader-db:
Lunar Lake
total instructions in shared programs: 18096500 -> 18096549 (<.01%)
instructions in affected programs: 15919 -> 15968 (0.31%)
helped: 8 / HURT: 21
total cycles in shared programs: 921841300 -> 922073090 (0.03%)
cycles in affected programs: 115946336 -> 116178126 (0.20%)
helped: 386 / HURT: 135
Meteor Lake and DG2 (Meteor Lake shown)
total instructions in shared programs: 19836053 -> 19836016 (<.01%)
instructions in affected programs: 19547 -> 19510 (-0.19%)
helped: 21 / HURT: 18
total cycles in shared programs: 906713777 -> 906588541 (-0.01%)
cycles in affected programs: 96914584 -> 96789348 (-0.13%)
helped: 335 / HURT: 134
total fills in shared programs: 6712 -> 6710 (-0.03%)
fills in affected programs: 52 -> 50 (-3.85%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Tiger Lake
total instructions in shared programs: 19641284 -> 19641278 (<.01%)
instructions in affected programs: 12358 -> 12352 (-0.05%)
helped: 10 / HURT: 19
total cycles in shared programs: 865413131 -> 865460513 (<.01%)
cycles in affected programs: 74641489 -> 74688871 (0.06%)
helped: 388 / HURT: 100
total spills in shared programs: 3899 -> 3898 (-0.03%)
spills in affected programs: 17 -> 16 (-5.88%)
helped: 1 / HURT: 0
total fills in shared programs: 3249 -> 3245 (-0.12%)
fills in affected programs: 51 -> 47 (-7.84%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20495826 -> 20496111 (<.01%)
instructions in affected programs: 53220 -> 53505 (0.54%)
helped: 28 / HURT: 16
total cycles in shared programs: 875173550 -> 875243910 (<.01%)
cycles in affected programs: 51700652 -> 51771012 (0.14%)
helped: 400 / HURT: 39
total spills in shared programs: 4546 -> 4546 (0.00%)
spills in affected programs: 288 -> 288 (0.00%)
helped: 1 / HURT: 2
total fills in shared programs: 5224 -> 5280 (1.07%)
fills in affected programs: 795 -> 851 (7.04%)
helped: 0 / HURT: 4
LOST: 1
GAINED: 1
fossil-db:
Lunar Lake
Totals:
Instrs: 141811551 -> 141807640 (-0.00%); split: -0.00%, +0.00%
Cycle count: 22183128332 -> 22181285594 (-0.01%); split: -0.06%, +0.05%
Spill count: 69890 -> 69859 (-0.04%); split: -0.09%, +0.04%
Fill count: 128877 -> 128344 (-0.41%); split: -0.42%, +0.00%
Max live registers: 48053415 -> 48051613 (-0.00%); split: -0.00%, +0.00%
Totals from 6817 (1.24% of 551443) affected shaders:
Instrs: 4300169 -> 4296258 (-0.09%); split: -0.14%, +0.05%
Cycle count: 17263755610 -> 17261912872 (-0.01%); split: -0.08%, +0.07%
Spill count: 41822 -> 41791 (-0.07%); split: -0.15%, +0.07%
Fill count: 75523 -> 74990 (-0.71%); split: -0.71%, +0.01%
Max live registers: 733647 -> 731845 (-0.25%); split: -0.29%, +0.04%
Meteor Lake and all older Intel platforms had similar results. (Meteor Lake shown)
Totals:
Instrs: 152735305 -> 152735801 (+0.00%); split: -0.00%, +0.00%
Subgroup size: 7733536 -> 7733616 (+0.00%)
Cycle count: 17398725539 -> 17400873100 (+0.01%); split: -0.00%, +0.02%
Max live registers: 31887018 -> 31885742 (-0.00%); split: -0.00%, +0.00%
Max dispatch width: 5561696 -> 5561712 (+0.00%)
Totals from 5672 (0.90% of 633314) affected shaders:
Instrs: 2817606 -> 2818102 (+0.02%); split: -0.05%, +0.07%
Subgroup size: 81128 -> 81208 (+0.10%)
Cycle count: 10021470543 -> 10023618104 (+0.02%); split: -0.01%, +0.03%
Max live registers: 306520 -> 305244 (-0.42%); split: -0.43%, +0.01%
Max dispatch width: 74136 -> 74152 (+0.02%)
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-01-30 18:53:05 -08:00
|
|
|
if (optimize_extract_to_float(ntb, bld, instr, result))
|
2016-01-21 09:10:09 -08:00
|
|
|
return;
|
2024-04-12 15:17:06 -07:00
|
|
|
bld.MOV(result, op[0]);
|
2016-06-13 03:13:23 -04:00
|
|
|
break;
|
2016-01-21 09:10:09 -08:00
|
|
|
|
2017-07-01 08:14:09 +02:00
|
|
|
case nir_op_f2f16_rtne:
|
|
|
|
|
case nir_op_f2f16_rtz:
|
2019-02-13 10:42:05 +01:00
|
|
|
case nir_op_f2f16: {
|
|
|
|
|
brw_rnd_mode rnd = BRW_RND_MODE_UNSPECIFIED;
|
|
|
|
|
|
|
|
|
|
if (nir_op_f2f16 == instr->op)
|
|
|
|
|
rnd = brw_rnd_mode_from_execution_mode(execution_mode);
|
|
|
|
|
else
|
|
|
|
|
rnd = brw_rnd_mode_from_nir_op(instr->op);
|
|
|
|
|
|
|
|
|
|
if (BRW_RND_MODE_UNSPECIFIED != rnd)
|
2023-04-05 15:38:34 +03:00
|
|
|
bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), brw_imm_d(rnd));
|
2019-02-13 10:42:05 +01:00
|
|
|
|
2024-04-21 00:57:59 -07:00
|
|
|
assert(brw_type_size_bytes(op[0].type) < 8); /* brw_nir_lower_conversions */
|
2024-04-12 15:17:06 -07:00
|
|
|
bld.MOV(result, op[0]);
|
2017-07-01 08:11:05 +02:00
|
|
|
break;
|
2019-02-13 10:42:05 +01:00
|
|
|
}
|
2017-07-01 08:11:05 +02:00
|
|
|
|
2018-11-07 13:43:40 -06:00
|
|
|
case nir_op_b2i8:
|
|
|
|
|
case nir_op_b2i16:
|
|
|
|
|
case nir_op_b2i32:
|
|
|
|
|
case nir_op_b2i64:
|
|
|
|
|
case nir_op_b2f16:
|
|
|
|
|
case nir_op_b2f32:
|
|
|
|
|
case nir_op_b2f64:
|
2023-11-20 21:21:54 -08:00
|
|
|
if (try_emit_b2fi_of_inot(ntb, bld, result, instr))
|
2018-12-03 15:53:36 -08:00
|
|
|
break;
|
2024-04-20 17:08:02 -07:00
|
|
|
op[0].type = BRW_TYPE_D;
|
2018-10-10 15:17:11 -07:00
|
|
|
op[0].negate = !op[0].negate;
|
2021-04-10 17:11:58 +02:00
|
|
|
FALLTHROUGH;
|
2017-03-07 19:54:37 -08:00
|
|
|
case nir_op_i2f64:
|
2017-11-08 15:14:19 -08:00
|
|
|
case nir_op_i2i64:
|
2017-03-07 19:54:37 -08:00
|
|
|
case nir_op_u2f64:
|
2017-11-08 15:14:19 -08:00
|
|
|
case nir_op_u2u64:
|
2018-07-17 09:02:27 +02:00
|
|
|
case nir_op_f2f64:
|
|
|
|
|
case nir_op_f2i64:
|
|
|
|
|
case nir_op_f2u64:
|
|
|
|
|
case nir_op_i2i32:
|
|
|
|
|
case nir_op_u2u32:
|
2017-03-07 19:54:37 -08:00
|
|
|
case nir_op_f2i32:
|
|
|
|
|
case nir_op_f2u32:
|
2018-07-17 09:02:27 +02:00
|
|
|
case nir_op_i2f16:
|
|
|
|
|
case nir_op_u2f16:
|
|
|
|
|
case nir_op_f2i16:
|
|
|
|
|
case nir_op_f2u16:
|
|
|
|
|
case nir_op_f2i8:
|
|
|
|
|
case nir_op_f2u8:
|
2024-04-20 17:08:02 -07:00
|
|
|
if (result.type == BRW_TYPE_B ||
|
|
|
|
|
result.type == BRW_TYPE_UB ||
|
|
|
|
|
result.type == BRW_TYPE_HF)
|
2024-04-21 00:57:59 -07:00
|
|
|
assert(brw_type_size_bytes(op[0].type) < 8); /* brw_nir_lower_conversions */
|
2018-07-17 09:02:27 +02:00
|
|
|
|
2024-04-20 17:08:02 -07:00
|
|
|
if (op[0].type == BRW_TYPE_B ||
|
|
|
|
|
op[0].type == BRW_TYPE_UB ||
|
|
|
|
|
op[0].type == BRW_TYPE_HF)
|
2024-04-21 00:57:59 -07:00
|
|
|
assert(brw_type_size_bytes(result.type) < 8); /* brw_nir_lower_conversions */
|
2018-07-17 09:02:27 +02:00
|
|
|
|
2024-04-12 15:17:06 -07:00
|
|
|
bld.MOV(result, op[0]);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
|
2021-01-26 19:52:50 -08:00
|
|
|
case nir_op_i2i8:
|
|
|
|
|
case nir_op_u2u8:
|
2024-04-21 00:57:59 -07:00
|
|
|
assert(brw_type_size_bytes(op[0].type) < 8); /* brw_nir_lower_conversions */
|
2021-01-26 19:52:50 -08:00
|
|
|
FALLTHROUGH;
|
|
|
|
|
case nir_op_i2i16:
|
|
|
|
|
case nir_op_u2u16: {
|
|
|
|
|
/* Emit better code for u2u8(extract_u8(a, b)) and similar patterns.
|
|
|
|
|
* Emitting the instructions one by one results in two MOV instructions
|
|
|
|
|
* that won't be propagated. By handling both instructions here, a
|
|
|
|
|
* single MOV is emitted.
|
|
|
|
|
*/
|
|
|
|
|
nir_alu_instr *extract_instr = nir_src_as_alu_instr(instr->src[0].src);
|
|
|
|
|
if (extract_instr != NULL) {
|
|
|
|
|
if (extract_instr->op == nir_op_extract_u8 ||
|
|
|
|
|
extract_instr->op == nir_op_extract_i8) {
|
brw/nir: Treat some ALU results as convergent
v2: Fix for Xe2.
v3: Fix handling of 64-bit CMP results.
v4: Scalarize 16-bit comparison temporary destination when used as a
source (as was already done for 64-bit). Suggested by Ken.
shader-db:
Lunar Lake
total instructions in shared programs: 18096500 -> 18096549 (<.01%)
instructions in affected programs: 15919 -> 15968 (0.31%)
helped: 8 / HURT: 21
total cycles in shared programs: 921841300 -> 922073090 (0.03%)
cycles in affected programs: 115946336 -> 116178126 (0.20%)
helped: 386 / HURT: 135
Meteor Lake and DG2 (Meteor Lake shown)
total instructions in shared programs: 19836053 -> 19836016 (<.01%)
instructions in affected programs: 19547 -> 19510 (-0.19%)
helped: 21 / HURT: 18
total cycles in shared programs: 906713777 -> 906588541 (-0.01%)
cycles in affected programs: 96914584 -> 96789348 (-0.13%)
helped: 335 / HURT: 134
total fills in shared programs: 6712 -> 6710 (-0.03%)
fills in affected programs: 52 -> 50 (-3.85%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Tiger Lake
total instructions in shared programs: 19641284 -> 19641278 (<.01%)
instructions in affected programs: 12358 -> 12352 (-0.05%)
helped: 10 / HURT: 19
total cycles in shared programs: 865413131 -> 865460513 (<.01%)
cycles in affected programs: 74641489 -> 74688871 (0.06%)
helped: 388 / HURT: 100
total spills in shared programs: 3899 -> 3898 (-0.03%)
spills in affected programs: 17 -> 16 (-5.88%)
helped: 1 / HURT: 0
total fills in shared programs: 3249 -> 3245 (-0.12%)
fills in affected programs: 51 -> 47 (-7.84%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20495826 -> 20496111 (<.01%)
instructions in affected programs: 53220 -> 53505 (0.54%)
helped: 28 / HURT: 16
total cycles in shared programs: 875173550 -> 875243910 (<.01%)
cycles in affected programs: 51700652 -> 51771012 (0.14%)
helped: 400 / HURT: 39
total spills in shared programs: 4546 -> 4546 (0.00%)
spills in affected programs: 288 -> 288 (0.00%)
helped: 1 / HURT: 2
total fills in shared programs: 5224 -> 5280 (1.07%)
fills in affected programs: 795 -> 851 (7.04%)
helped: 0 / HURT: 4
LOST: 1
GAINED: 1
fossil-db:
Lunar Lake
Totals:
Instrs: 141811551 -> 141807640 (-0.00%); split: -0.00%, +0.00%
Cycle count: 22183128332 -> 22181285594 (-0.01%); split: -0.06%, +0.05%
Spill count: 69890 -> 69859 (-0.04%); split: -0.09%, +0.04%
Fill count: 128877 -> 128344 (-0.41%); split: -0.42%, +0.00%
Max live registers: 48053415 -> 48051613 (-0.00%); split: -0.00%, +0.00%
Totals from 6817 (1.24% of 551443) affected shaders:
Instrs: 4300169 -> 4296258 (-0.09%); split: -0.14%, +0.05%
Cycle count: 17263755610 -> 17261912872 (-0.01%); split: -0.08%, +0.07%
Spill count: 41822 -> 41791 (-0.07%); split: -0.15%, +0.07%
Fill count: 75523 -> 74990 (-0.71%); split: -0.71%, +0.01%
Max live registers: 733647 -> 731845 (-0.25%); split: -0.29%, +0.04%
Meteor Lake and all older Intel platforms had similar results. (Meteor Lake shown)
Totals:
Instrs: 152735305 -> 152735801 (+0.00%); split: -0.00%, +0.00%
Subgroup size: 7733536 -> 7733616 (+0.00%)
Cycle count: 17398725539 -> 17400873100 (+0.01%); split: -0.00%, +0.02%
Max live registers: 31887018 -> 31885742 (-0.00%); split: -0.00%, +0.00%
Max dispatch width: 5561696 -> 5561712 (+0.00%)
Totals from 5672 (0.90% of 633314) affected shaders:
Instrs: 2817606 -> 2818102 (+0.02%); split: -0.05%, +0.07%
Subgroup size: 81128 -> 81208 (+0.10%)
Cycle count: 10021470543 -> 10023618104 (+0.02%); split: -0.01%, +0.03%
Max live registers: 306520 -> 305244 (-0.42%); split: -0.43%, +0.01%
Max dispatch width: 74136 -> 74152 (+0.02%)
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-01-30 18:53:05 -08:00
|
|
|
prepare_alu_destination_and_sources(ntb, ntb.bld, extract_instr, op, false);
|
2021-01-26 19:52:50 -08:00
|
|
|
|
|
|
|
|
const unsigned byte = nir_src_as_uint(extract_instr->src[1].src);
|
|
|
|
|
const brw_reg_type type =
|
|
|
|
|
brw_int_type(1, extract_instr->op == nir_op_extract_i8);
|
|
|
|
|
|
|
|
|
|
op[0] = subscript(op[0], type, byte);
|
|
|
|
|
} else if (extract_instr->op == nir_op_extract_u16 ||
|
|
|
|
|
extract_instr->op == nir_op_extract_i16) {
|
brw/nir: Treat some ALU results as convergent
v2: Fix for Xe2.
v3: Fix handling of 64-bit CMP results.
v4: Scalarize 16-bit comparison temporary destination when used as a
source (as was already done for 64-bit). Suggested by Ken.
shader-db:
Lunar Lake
total instructions in shared programs: 18096500 -> 18096549 (<.01%)
instructions in affected programs: 15919 -> 15968 (0.31%)
helped: 8 / HURT: 21
total cycles in shared programs: 921841300 -> 922073090 (0.03%)
cycles in affected programs: 115946336 -> 116178126 (0.20%)
helped: 386 / HURT: 135
Meteor Lake and DG2 (Meteor Lake shown)
total instructions in shared programs: 19836053 -> 19836016 (<.01%)
instructions in affected programs: 19547 -> 19510 (-0.19%)
helped: 21 / HURT: 18
total cycles in shared programs: 906713777 -> 906588541 (-0.01%)
cycles in affected programs: 96914584 -> 96789348 (-0.13%)
helped: 335 / HURT: 134
total fills in shared programs: 6712 -> 6710 (-0.03%)
fills in affected programs: 52 -> 50 (-3.85%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Tiger Lake
total instructions in shared programs: 19641284 -> 19641278 (<.01%)
instructions in affected programs: 12358 -> 12352 (-0.05%)
helped: 10 / HURT: 19
total cycles in shared programs: 865413131 -> 865460513 (<.01%)
cycles in affected programs: 74641489 -> 74688871 (0.06%)
helped: 388 / HURT: 100
total spills in shared programs: 3899 -> 3898 (-0.03%)
spills in affected programs: 17 -> 16 (-5.88%)
helped: 1 / HURT: 0
total fills in shared programs: 3249 -> 3245 (-0.12%)
fills in affected programs: 51 -> 47 (-7.84%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20495826 -> 20496111 (<.01%)
instructions in affected programs: 53220 -> 53505 (0.54%)
helped: 28 / HURT: 16
total cycles in shared programs: 875173550 -> 875243910 (<.01%)
cycles in affected programs: 51700652 -> 51771012 (0.14%)
helped: 400 / HURT: 39
total spills in shared programs: 4546 -> 4546 (0.00%)
spills in affected programs: 288 -> 288 (0.00%)
helped: 1 / HURT: 2
total fills in shared programs: 5224 -> 5280 (1.07%)
fills in affected programs: 795 -> 851 (7.04%)
helped: 0 / HURT: 4
LOST: 1
GAINED: 1
fossil-db:
Lunar Lake
Totals:
Instrs: 141811551 -> 141807640 (-0.00%); split: -0.00%, +0.00%
Cycle count: 22183128332 -> 22181285594 (-0.01%); split: -0.06%, +0.05%
Spill count: 69890 -> 69859 (-0.04%); split: -0.09%, +0.04%
Fill count: 128877 -> 128344 (-0.41%); split: -0.42%, +0.00%
Max live registers: 48053415 -> 48051613 (-0.00%); split: -0.00%, +0.00%
Totals from 6817 (1.24% of 551443) affected shaders:
Instrs: 4300169 -> 4296258 (-0.09%); split: -0.14%, +0.05%
Cycle count: 17263755610 -> 17261912872 (-0.01%); split: -0.08%, +0.07%
Spill count: 41822 -> 41791 (-0.07%); split: -0.15%, +0.07%
Fill count: 75523 -> 74990 (-0.71%); split: -0.71%, +0.01%
Max live registers: 733647 -> 731845 (-0.25%); split: -0.29%, +0.04%
Meteor Lake and all older Intel platforms had similar results. (Meteor Lake shown)
Totals:
Instrs: 152735305 -> 152735801 (+0.00%); split: -0.00%, +0.00%
Subgroup size: 7733536 -> 7733616 (+0.00%)
Cycle count: 17398725539 -> 17400873100 (+0.01%); split: -0.00%, +0.02%
Max live registers: 31887018 -> 31885742 (-0.00%); split: -0.00%, +0.00%
Max dispatch width: 5561696 -> 5561712 (+0.00%)
Totals from 5672 (0.90% of 633314) affected shaders:
Instrs: 2817606 -> 2818102 (+0.02%); split: -0.05%, +0.07%
Subgroup size: 81128 -> 81208 (+0.10%)
Cycle count: 10021470543 -> 10023618104 (+0.02%); split: -0.01%, +0.03%
Max live registers: 306520 -> 305244 (-0.42%); split: -0.43%, +0.01%
Max dispatch width: 74136 -> 74152 (+0.02%)
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-01-30 18:53:05 -08:00
|
|
|
prepare_alu_destination_and_sources(ntb, ntb.bld, extract_instr, op, false);
|
2021-01-26 19:52:50 -08:00
|
|
|
|
|
|
|
|
const unsigned word = nir_src_as_uint(extract_instr->src[1].src);
|
|
|
|
|
const brw_reg_type type =
|
|
|
|
|
brw_int_type(2, extract_instr->op == nir_op_extract_i16);
|
|
|
|
|
|
|
|
|
|
op[0] = subscript(op[0], type, word);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2024-10-23 00:01:52 -07:00
|
|
|
/* If narrowing (e.g. 32 -> 16), don't do a D -> W or UD -> UW mov,
|
|
|
|
|
* instead just read the source as W/UW with a stride (discarding
|
|
|
|
|
* the top bits). This avoids the need for the destination to be
|
|
|
|
|
* DWord aligned due to regioning restrictions.
|
|
|
|
|
*/
|
|
|
|
|
if (brw_type_size_bits(result.type) < brw_type_size_bits(op[0].type)) {
|
|
|
|
|
const unsigned bits = brw_type_size_bits(result.type);
|
|
|
|
|
op[0] = subscript(op[0], brw_type_with_size(op[0].type, bits), 0);
|
|
|
|
|
}
|
|
|
|
|
|
2024-04-12 15:17:06 -07:00
|
|
|
bld.MOV(result, op[0]);
|
2021-01-26 19:52:50 -08:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2019-05-06 11:16:25 -05:00
|
|
|
case nir_op_fsat:
|
|
|
|
|
inst = bld.MOV(result, op[0]);
|
|
|
|
|
inst->saturate = true;
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_op_fneg:
|
|
|
|
|
case nir_op_ineg:
|
|
|
|
|
op[0].negate = true;
|
2024-04-12 15:17:06 -07:00
|
|
|
bld.MOV(result, op[0]);
|
2019-05-06 11:16:25 -05:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_op_fabs:
|
|
|
|
|
case nir_op_iabs:
|
|
|
|
|
op[0].negate = false;
|
|
|
|
|
op[0].abs = true;
|
2024-04-12 15:17:06 -07:00
|
|
|
bld.MOV(result, op[0]);
|
2019-05-06 11:16:25 -05:00
|
|
|
break;
|
|
|
|
|
|
2019-02-13 10:42:05 +01:00
|
|
|
case nir_op_f2f32:
|
|
|
|
|
if (nir_has_any_rounding_mode_enabled(execution_mode)) {
|
|
|
|
|
brw_rnd_mode rnd =
|
|
|
|
|
brw_rnd_mode_from_execution_mode(execution_mode);
|
2023-04-05 15:38:34 +03:00
|
|
|
bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
|
|
|
|
|
brw_imm_d(rnd));
|
2019-02-13 10:42:05 +01:00
|
|
|
}
|
|
|
|
|
|
2024-04-20 17:08:02 -07:00
|
|
|
if (op[0].type == BRW_TYPE_HF)
|
2024-04-21 00:57:59 -07:00
|
|
|
assert(brw_type_size_bytes(result.type) < 8); /* brw_nir_lower_conversions */
|
2019-02-13 10:42:05 +01:00
|
|
|
|
2024-04-12 15:17:06 -07:00
|
|
|
bld.MOV(result, op[0]);
|
2019-02-13 10:42:05 +01:00
|
|
|
break;
|
|
|
|
|
|
2018-06-25 19:50:56 -07:00
|
|
|
case nir_op_fsign:
|
2025-07-23 09:17:35 +02:00
|
|
|
UNREACHABLE("Should have been lowered by brw_nir_lower_fsign.");
|
2014-08-15 10:32:07 -07:00
|
|
|
|
|
|
|
|
case nir_op_frcp:
|
2024-04-12 17:57:33 -07:00
|
|
|
bld.RCP(result, op[0]);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_op_fexp2:
|
2024-04-12 17:57:33 -07:00
|
|
|
bld.EXP2(result, op[0]);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_op_flog2:
|
2024-04-12 17:57:33 -07:00
|
|
|
bld.LOG2(result, op[0]);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_op_fsin:
|
2024-04-12 17:57:33 -07:00
|
|
|
bld.SIN(result, op[0]);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_op_fcos:
|
2024-04-12 17:57:33 -07:00
|
|
|
bld.COS(result, op[0]);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
|
2015-11-10 10:18:55 +01:00
|
|
|
case nir_op_fadd:
|
2019-02-12 16:13:59 +01:00
|
|
|
if (nir_has_any_rounding_mode_enabled(execution_mode)) {
|
|
|
|
|
brw_rnd_mode rnd =
|
|
|
|
|
brw_rnd_mode_from_execution_mode(execution_mode);
|
2023-04-05 15:38:34 +03:00
|
|
|
bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
|
|
|
|
|
brw_imm_d(rnd));
|
2019-02-12 16:13:59 +01:00
|
|
|
}
|
2021-04-10 17:11:58 +02:00
|
|
|
FALLTHROUGH;
|
2019-02-12 16:13:59 +01:00
|
|
|
case nir_op_iadd:
|
2024-04-12 15:17:06 -07:00
|
|
|
bld.ADD(result, op[0], op[1]);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
|
2020-06-05 22:40:26 -07:00
|
|
|
case nir_op_iadd3:
|
2023-08-14 12:58:51 -07:00
|
|
|
assert(instr->def.bit_size < 64);
|
2024-04-12 15:17:06 -07:00
|
|
|
bld.ADD3(result, op[0], op[1], op[2]);
|
2020-06-05 22:40:26 -07:00
|
|
|
break;
|
|
|
|
|
|
2018-09-11 16:49:51 -07:00
|
|
|
case nir_op_iadd_sat:
|
2018-10-05 21:04:47 -05:00
|
|
|
case nir_op_uadd_sat:
|
|
|
|
|
inst = bld.ADD(result, op[0], op[1]);
|
2020-04-21 16:06:54 -07:00
|
|
|
inst->saturate = true;
|
2018-10-05 21:04:47 -05:00
|
|
|
break;
|
|
|
|
|
|
2018-09-11 16:49:51 -07:00
|
|
|
case nir_op_isub_sat:
|
|
|
|
|
bld.emit(SHADER_OPCODE_ISUB_SAT, result, op[0], op[1]);
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_op_usub_sat:
|
|
|
|
|
bld.emit(SHADER_OPCODE_USUB_SAT, result, op[0], op[1]);
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_op_irhadd:
|
|
|
|
|
case nir_op_urhadd:
|
2023-08-14 11:43:35 -05:00
|
|
|
assert(instr->def.bit_size < 64);
|
2024-04-12 15:17:06 -07:00
|
|
|
bld.AVG(result, op[0], op[1]);
|
2018-09-11 16:49:51 -07:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_op_ihadd:
|
|
|
|
|
case nir_op_uhadd: {
|
2023-08-14 11:43:35 -05:00
|
|
|
assert(instr->def.bit_size < 64);
|
2018-09-11 16:49:51 -07:00
|
|
|
|
2024-02-15 02:51:39 -08:00
|
|
|
op[0] = resolve_source_modifiers(bld, op[0]);
|
|
|
|
|
op[1] = resolve_source_modifiers(bld, op[1]);
|
2018-09-11 16:49:51 -07:00
|
|
|
|
|
|
|
|
/* AVG(x, y) - ((x ^ y) & 1) */
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg one = retype(brw_imm_ud(1), result.type);
|
2024-04-12 17:43:22 -07:00
|
|
|
bld.ADD(result, bld.AVG(op[0], op[1]),
|
|
|
|
|
negate(bld.AND(bld.XOR(op[0], op[1]), one)));
|
2018-09-11 16:49:51 -07:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2014-12-23 14:44:19 -08:00
|
|
|
case nir_op_fmul:
|
2019-02-12 16:13:59 +01:00
|
|
|
if (nir_has_any_rounding_mode_enabled(execution_mode)) {
|
|
|
|
|
brw_rnd_mode rnd =
|
|
|
|
|
brw_rnd_mode_from_execution_mode(execution_mode);
|
2023-04-05 15:38:34 +03:00
|
|
|
bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
|
|
|
|
|
brw_imm_d(rnd));
|
2019-02-12 16:13:59 +01:00
|
|
|
}
|
|
|
|
|
|
2024-04-12 15:17:06 -07:00
|
|
|
bld.MUL(result, op[0], op[1]);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
|
2019-02-14 23:08:39 -08:00
|
|
|
case nir_op_imul_2x32_64:
|
|
|
|
|
case nir_op_umul_2x32_64:
|
|
|
|
|
bld.MUL(result, op[0], op[1]);
|
|
|
|
|
break;
|
|
|
|
|
|
2018-09-11 16:49:51 -07:00
|
|
|
case nir_op_imul_32x16:
|
|
|
|
|
case nir_op_umul_32x16: {
|
|
|
|
|
const bool ud = instr->op == nir_op_umul_32x16;
|
2024-04-20 17:30:23 -07:00
|
|
|
const enum brw_reg_type word_type = ud ? BRW_TYPE_UW : BRW_TYPE_W;
|
|
|
|
|
const enum brw_reg_type dword_type = ud ? BRW_TYPE_UD : BRW_TYPE_D;
|
2018-09-11 16:49:51 -07:00
|
|
|
|
2023-08-14 11:43:35 -05:00
|
|
|
assert(instr->def.bit_size == 32);
|
2018-09-11 16:49:51 -07:00
|
|
|
|
2021-02-08 18:49:06 -08:00
|
|
|
/* Before copy propagation there are no immediate values. */
|
|
|
|
|
assert(op[0].file != IMM && op[1].file != IMM);
|
2018-09-11 16:49:51 -07:00
|
|
|
|
2021-02-08 18:49:06 -08:00
|
|
|
op[1] = subscript(op[1], word_type, 0);
|
2021-02-08 16:45:08 -08:00
|
|
|
|
2024-02-15 02:51:39 -08:00
|
|
|
bld.MUL(result, retype(op[0], dword_type), op[1]);
|
2021-02-08 16:45:08 -08:00
|
|
|
|
2018-09-11 16:49:51 -07:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2015-05-11 09:29:56 -07:00
|
|
|
case nir_op_imul:
|
2023-08-14 11:43:35 -05:00
|
|
|
assert(instr->def.bit_size < 64);
|
2015-06-03 20:59:26 +03:00
|
|
|
bld.MUL(result, op[0], op[1]);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_op_imul_high:
|
2015-08-04 19:08:45 +03:00
|
|
|
case nir_op_umul_high:
|
2023-08-14 11:43:35 -05:00
|
|
|
assert(instr->def.bit_size < 64);
|
|
|
|
|
if (instr->def.bit_size == 32) {
|
2022-04-08 15:17:12 -05:00
|
|
|
bld.emit(SHADER_OPCODE_MULH, result, op[0], op[1]);
|
|
|
|
|
} else {
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg tmp = bld.vgrf(brw_type_with_size(op[0].type, 32));
|
2022-04-08 15:17:12 -05:00
|
|
|
bld.MUL(tmp, op[0], op[1]);
|
|
|
|
|
bld.MOV(result, subscript(tmp, result.type, 1));
|
|
|
|
|
}
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_op_idiv:
|
|
|
|
|
case nir_op_udiv:
|
2023-08-14 11:43:35 -05:00
|
|
|
assert(instr->def.bit_size < 64);
|
2024-04-12 17:57:33 -07:00
|
|
|
bld.INT_QUOTIENT(result, op[0], op[1]);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
|
2015-07-09 21:42:28 +03:00
|
|
|
case nir_op_uadd_carry:
|
2025-07-23 09:17:35 +02:00
|
|
|
UNREACHABLE("Should have been lowered by carry_to_arith().");
|
2014-08-15 10:32:07 -07:00
|
|
|
|
2015-07-09 21:42:28 +03:00
|
|
|
case nir_op_usub_borrow:
|
2025-07-23 09:17:35 +02:00
|
|
|
UNREACHABLE("Should have been lowered by borrow_to_arith().");
|
2014-08-15 10:32:07 -07:00
|
|
|
|
|
|
|
|
case nir_op_umod:
|
2016-03-25 11:17:53 -07:00
|
|
|
case nir_op_irem:
|
|
|
|
|
/* According to the sign table for INT DIV in the Ivy Bridge PRM, it
|
|
|
|
|
* appears that our hardware just does the right thing for signed
|
|
|
|
|
* remainder.
|
|
|
|
|
*/
|
2023-08-14 11:43:35 -05:00
|
|
|
assert(instr->def.bit_size < 64);
|
2024-04-12 17:57:33 -07:00
|
|
|
bld.INT_REMAINDER(result, op[0], op[1]);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
|
2016-03-25 11:17:53 -07:00
|
|
|
case nir_op_imod: {
|
|
|
|
|
/* Get a regular C-style remainder. If a % b == 0, set the predicate. */
|
2024-04-12 17:57:33 -07:00
|
|
|
bld.INT_REMAINDER(result, op[0], op[1]);
|
2016-03-25 11:17:53 -07:00
|
|
|
|
|
|
|
|
/* Math instructions don't support conditional mod */
|
|
|
|
|
inst = bld.MOV(bld.null_reg_d(), result);
|
|
|
|
|
inst->conditional_mod = BRW_CONDITIONAL_NZ;
|
|
|
|
|
|
|
|
|
|
/* Now, we need to determine if signs of the sources are different.
|
|
|
|
|
* When we XOR the sources, the top bit is 0 if they are the same and 1
|
|
|
|
|
* if they are different. We can then use a conditional modifier to
|
|
|
|
|
* turn that into a predicate. This leads us to an XOR.l instruction.
|
|
|
|
|
*
|
|
|
|
|
* Technically, according to the PRM, you're not allowed to use .l on a
|
2022-06-22 18:31:08 +02:00
|
|
|
* XOR instruction. However, empirical experiments and Curro's reading
|
2016-03-25 11:17:53 -07:00
|
|
|
* of the simulator source both indicate that it's safe.
|
|
|
|
|
*/
|
2024-04-12 17:43:22 -07:00
|
|
|
bld.XOR(op[0], op[1], &inst);
|
2016-03-25 11:17:53 -07:00
|
|
|
inst->predicate = BRW_PREDICATE_NORMAL;
|
|
|
|
|
inst->conditional_mod = BRW_CONDITIONAL_L;
|
|
|
|
|
|
|
|
|
|
/* If the result of the initial remainder operation is non-zero and the
|
|
|
|
|
* two sources have different signs, add in a copy of op[1] to get the
|
|
|
|
|
* final integer modulus value.
|
|
|
|
|
*/
|
|
|
|
|
inst = bld.ADD(result, result, op[1]);
|
|
|
|
|
inst->predicate = BRW_PREDICATE_NORMAL;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2018-10-18 11:44:38 -05:00
|
|
|
case nir_op_flt32:
|
|
|
|
|
case nir_op_fge32:
|
|
|
|
|
case nir_op_feq32:
|
2020-08-18 19:51:57 +02:00
|
|
|
case nir_op_fneu32: {
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg dest = result;
|
2018-04-19 10:06:43 +02:00
|
|
|
|
|
|
|
|
const uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
|
intel/compiler: UNDEF comparisons with smaller than 32-bit
Comparisons which produce 32-bit boolean results (0 or 0xFFFFFFFF)
but operate on 16-bit types would first generate a CMP instruction
with W or HF types, before expanding it out. This CMP is a partial
write, which leads us to think the register may contain some prior
contents still. When placed in a loop, this causes its live range
to extend beyond its real life time.
Mark the register with UNDEF first so that we know that no prior
contents exist and need to be preserved.
This affects:
flt32, fge32, feq32, fneu32, ilt32, ult32, ige32, uge32, ieq32, ine32
On one of Cyberpunk 2077's most complex compute shaders, this reduces
the maximum live registers from 696 to 537 (22.8%). Together with the
next patch, Cyberpunk's spills and fills are cut by 10.23% and 9.19%,
respectively.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22835>
2023-05-03 01:49:56 -07:00
|
|
|
if (bit_size != 32) {
|
2024-04-11 14:52:56 -07:00
|
|
|
dest = bld.vgrf(op[0].type);
|
2024-10-23 01:34:40 -07:00
|
|
|
bld.emit_undef_for_partial_reg(dest);
|
intel/compiler: UNDEF comparisons with smaller than 32-bit
Comparisons which produce 32-bit boolean results (0 or 0xFFFFFFFF)
but operate on 16-bit types would first generate a CMP instruction
with W or HF types, before expanding it out. This CMP is a partial
write, which leads us to think the register may contain some prior
contents still. When placed in a loop, this causes its live range
to extend beyond its real life time.
Mark the register with UNDEF first so that we know that no prior
contents exist and need to be preserved.
This affects:
flt32, fge32, feq32, fneu32, ilt32, ult32, ige32, uge32, ieq32, ine32
On one of Cyberpunk 2077's most complex compute shaders, this reduces
the maximum live registers from 696 to 537 (22.8%). Together with the
next patch, Cyberpunk's spills and fills are cut by 10.23% and 9.19%,
respectively.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22835>
2023-05-03 01:49:56 -07:00
|
|
|
}
|
2018-04-19 10:06:43 +02:00
|
|
|
|
2019-08-02 15:19:16 -05:00
|
|
|
bld.CMP(dest, op[0], op[1], brw_cmod_for_nir_comparison(instr->op));
|
2018-04-19 10:06:43 +02:00
|
|
|
|
brw/nir: Treat some ALU results as convergent
v2: Fix for Xe2.
v3: Fix handling of 64-bit CMP results.
v4: Scalarize 16-bit comparison temporary destination when used as a
source (as was already done for 64-bit). Suggested by Ken.
shader-db:
Lunar Lake
total instructions in shared programs: 18096500 -> 18096549 (<.01%)
instructions in affected programs: 15919 -> 15968 (0.31%)
helped: 8 / HURT: 21
total cycles in shared programs: 921841300 -> 922073090 (0.03%)
cycles in affected programs: 115946336 -> 116178126 (0.20%)
helped: 386 / HURT: 135
Meteor Lake and DG2 (Meteor Lake shown)
total instructions in shared programs: 19836053 -> 19836016 (<.01%)
instructions in affected programs: 19547 -> 19510 (-0.19%)
helped: 21 / HURT: 18
total cycles in shared programs: 906713777 -> 906588541 (-0.01%)
cycles in affected programs: 96914584 -> 96789348 (-0.13%)
helped: 335 / HURT: 134
total fills in shared programs: 6712 -> 6710 (-0.03%)
fills in affected programs: 52 -> 50 (-3.85%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Tiger Lake
total instructions in shared programs: 19641284 -> 19641278 (<.01%)
instructions in affected programs: 12358 -> 12352 (-0.05%)
helped: 10 / HURT: 19
total cycles in shared programs: 865413131 -> 865460513 (<.01%)
cycles in affected programs: 74641489 -> 74688871 (0.06%)
helped: 388 / HURT: 100
total spills in shared programs: 3899 -> 3898 (-0.03%)
spills in affected programs: 17 -> 16 (-5.88%)
helped: 1 / HURT: 0
total fills in shared programs: 3249 -> 3245 (-0.12%)
fills in affected programs: 51 -> 47 (-7.84%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20495826 -> 20496111 (<.01%)
instructions in affected programs: 53220 -> 53505 (0.54%)
helped: 28 / HURT: 16
total cycles in shared programs: 875173550 -> 875243910 (<.01%)
cycles in affected programs: 51700652 -> 51771012 (0.14%)
helped: 400 / HURT: 39
total spills in shared programs: 4546 -> 4546 (0.00%)
spills in affected programs: 288 -> 288 (0.00%)
helped: 1 / HURT: 2
total fills in shared programs: 5224 -> 5280 (1.07%)
fills in affected programs: 795 -> 851 (7.04%)
helped: 0 / HURT: 4
LOST: 1
GAINED: 1
fossil-db:
Lunar Lake
Totals:
Instrs: 141811551 -> 141807640 (-0.00%); split: -0.00%, +0.00%
Cycle count: 22183128332 -> 22181285594 (-0.01%); split: -0.06%, +0.05%
Spill count: 69890 -> 69859 (-0.04%); split: -0.09%, +0.04%
Fill count: 128877 -> 128344 (-0.41%); split: -0.42%, +0.00%
Max live registers: 48053415 -> 48051613 (-0.00%); split: -0.00%, +0.00%
Totals from 6817 (1.24% of 551443) affected shaders:
Instrs: 4300169 -> 4296258 (-0.09%); split: -0.14%, +0.05%
Cycle count: 17263755610 -> 17261912872 (-0.01%); split: -0.08%, +0.07%
Spill count: 41822 -> 41791 (-0.07%); split: -0.15%, +0.07%
Fill count: 75523 -> 74990 (-0.71%); split: -0.71%, +0.01%
Max live registers: 733647 -> 731845 (-0.25%); split: -0.29%, +0.04%
Meteor Lake and all older Intel platforms had similar results. (Meteor Lake shown)
Totals:
Instrs: 152735305 -> 152735801 (+0.00%); split: -0.00%, +0.00%
Subgroup size: 7733536 -> 7733616 (+0.00%)
Cycle count: 17398725539 -> 17400873100 (+0.01%); split: -0.00%, +0.02%
Max live registers: 31887018 -> 31885742 (-0.00%); split: -0.00%, +0.00%
Max dispatch width: 5561696 -> 5561712 (+0.00%)
Totals from 5672 (0.90% of 633314) affected shaders:
Instrs: 2817606 -> 2818102 (+0.02%); split: -0.05%, +0.07%
Subgroup size: 81128 -> 81208 (+0.10%)
Cycle count: 10021470543 -> 10023618104 (+0.02%); split: -0.01%, +0.03%
Max live registers: 306520 -> 305244 (-0.42%); split: -0.43%, +0.01%
Max dispatch width: 74136 -> 74152 (+0.02%)
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-01-30 18:53:05 -08:00
|
|
|
/* The destination will now be used as a source, so select component 0
|
|
|
|
|
* if it's is_scalar (as is done in get_nir_src).
|
|
|
|
|
*/
|
|
|
|
|
if (bit_size != 32 && result.is_scalar)
|
|
|
|
|
dest = component(dest, 0);
|
|
|
|
|
|
2018-04-19 10:06:43 +02:00
|
|
|
if (bit_size > 32) {
|
2024-04-20 17:08:02 -07:00
|
|
|
bld.MOV(result, subscript(dest, BRW_TYPE_UD, 0));
|
2018-04-19 10:06:43 +02:00
|
|
|
} else if(bit_size < 32) {
|
|
|
|
|
/* When we convert the result to 32-bit we need to be careful and do
|
|
|
|
|
* it as a signed conversion to get sign extension (for 32-bit true)
|
|
|
|
|
*/
|
|
|
|
|
const brw_reg_type src_type =
|
2024-04-21 00:33:52 -07:00
|
|
|
brw_type_with_size(BRW_TYPE_D, bit_size);
|
2018-04-19 10:06:43 +02:00
|
|
|
|
2024-04-20 17:08:02 -07:00
|
|
|
bld.MOV(retype(result, BRW_TYPE_D), retype(dest, src_type));
|
2015-08-03 18:08:58 -07:00
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2018-10-18 11:44:38 -05:00
|
|
|
case nir_op_ilt32:
|
|
|
|
|
case nir_op_ult32:
|
|
|
|
|
case nir_op_ige32:
|
|
|
|
|
case nir_op_uge32:
|
|
|
|
|
case nir_op_ieq32:
|
|
|
|
|
case nir_op_ine32: {
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg dest = result;
|
2018-04-19 10:06:43 +02:00
|
|
|
|
2024-04-21 00:57:59 -07:00
|
|
|
const uint32_t bit_size = brw_type_size_bits(op[0].type);
|
intel/compiler: UNDEF comparisons with smaller than 32-bit
Comparisons which produce 32-bit boolean results (0 or 0xFFFFFFFF)
but operate on 16-bit types would first generate a CMP instruction
with W or HF types, before expanding it out. This CMP is a partial
write, which leads us to think the register may contain some prior
contents still. When placed in a loop, this causes its live range
to extend beyond its real life time.
Mark the register with UNDEF first so that we know that no prior
contents exist and need to be preserved.
This affects:
flt32, fge32, feq32, fneu32, ilt32, ult32, ige32, uge32, ieq32, ine32
On one of Cyberpunk 2077's most complex compute shaders, this reduces
the maximum live registers from 696 to 537 (22.8%). Together with the
next patch, Cyberpunk's spills and fills are cut by 10.23% and 9.19%,
respectively.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22835>
2023-05-03 01:49:56 -07:00
|
|
|
if (bit_size != 32) {
|
2024-04-11 14:52:56 -07:00
|
|
|
dest = bld.vgrf(op[0].type);
|
2024-10-23 01:34:40 -07:00
|
|
|
bld.emit_undef_for_partial_reg(dest);
|
intel/compiler: UNDEF comparisons with smaller than 32-bit
Comparisons which produce 32-bit boolean results (0 or 0xFFFFFFFF)
but operate on 16-bit types would first generate a CMP instruction
with W or HF types, before expanding it out. This CMP is a partial
write, which leads us to think the register may contain some prior
contents still. When placed in a loop, this causes its live range
to extend beyond its real life time.
Mark the register with UNDEF first so that we know that no prior
contents exist and need to be preserved.
This affects:
flt32, fge32, feq32, fneu32, ilt32, ult32, ige32, uge32, ieq32, ine32
On one of Cyberpunk 2077's most complex compute shaders, this reduces
the maximum live registers from 696 to 537 (22.8%). Together with the
next patch, Cyberpunk's spills and fills are cut by 10.23% and 9.19%,
respectively.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22835>
2023-05-03 01:49:56 -07:00
|
|
|
}
|
2014-08-15 10:32:07 -07:00
|
|
|
|
2020-11-05 23:19:31 -06:00
|
|
|
bld.CMP(dest, op[0], op[1],
|
2019-08-02 15:19:16 -05:00
|
|
|
brw_cmod_for_nir_comparison(instr->op));
|
2018-04-19 10:06:43 +02:00
|
|
|
|
brw/nir: Treat some ALU results as convergent
v2: Fix for Xe2.
v3: Fix handling of 64-bit CMP results.
v4: Scalarize 16-bit comparison temporary destination when used as a
source (as was already done for 64-bit). Suggested by Ken.
shader-db:
Lunar Lake
total instructions in shared programs: 18096500 -> 18096549 (<.01%)
instructions in affected programs: 15919 -> 15968 (0.31%)
helped: 8 / HURT: 21
total cycles in shared programs: 921841300 -> 922073090 (0.03%)
cycles in affected programs: 115946336 -> 116178126 (0.20%)
helped: 386 / HURT: 135
Meteor Lake and DG2 (Meteor Lake shown)
total instructions in shared programs: 19836053 -> 19836016 (<.01%)
instructions in affected programs: 19547 -> 19510 (-0.19%)
helped: 21 / HURT: 18
total cycles in shared programs: 906713777 -> 906588541 (-0.01%)
cycles in affected programs: 96914584 -> 96789348 (-0.13%)
helped: 335 / HURT: 134
total fills in shared programs: 6712 -> 6710 (-0.03%)
fills in affected programs: 52 -> 50 (-3.85%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Tiger Lake
total instructions in shared programs: 19641284 -> 19641278 (<.01%)
instructions in affected programs: 12358 -> 12352 (-0.05%)
helped: 10 / HURT: 19
total cycles in shared programs: 865413131 -> 865460513 (<.01%)
cycles in affected programs: 74641489 -> 74688871 (0.06%)
helped: 388 / HURT: 100
total spills in shared programs: 3899 -> 3898 (-0.03%)
spills in affected programs: 17 -> 16 (-5.88%)
helped: 1 / HURT: 0
total fills in shared programs: 3249 -> 3245 (-0.12%)
fills in affected programs: 51 -> 47 (-7.84%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20495826 -> 20496111 (<.01%)
instructions in affected programs: 53220 -> 53505 (0.54%)
helped: 28 / HURT: 16
total cycles in shared programs: 875173550 -> 875243910 (<.01%)
cycles in affected programs: 51700652 -> 51771012 (0.14%)
helped: 400 / HURT: 39
total spills in shared programs: 4546 -> 4546 (0.00%)
spills in affected programs: 288 -> 288 (0.00%)
helped: 1 / HURT: 2
total fills in shared programs: 5224 -> 5280 (1.07%)
fills in affected programs: 795 -> 851 (7.04%)
helped: 0 / HURT: 4
LOST: 1
GAINED: 1
fossil-db:
Lunar Lake
Totals:
Instrs: 141811551 -> 141807640 (-0.00%); split: -0.00%, +0.00%
Cycle count: 22183128332 -> 22181285594 (-0.01%); split: -0.06%, +0.05%
Spill count: 69890 -> 69859 (-0.04%); split: -0.09%, +0.04%
Fill count: 128877 -> 128344 (-0.41%); split: -0.42%, +0.00%
Max live registers: 48053415 -> 48051613 (-0.00%); split: -0.00%, +0.00%
Totals from 6817 (1.24% of 551443) affected shaders:
Instrs: 4300169 -> 4296258 (-0.09%); split: -0.14%, +0.05%
Cycle count: 17263755610 -> 17261912872 (-0.01%); split: -0.08%, +0.07%
Spill count: 41822 -> 41791 (-0.07%); split: -0.15%, +0.07%
Fill count: 75523 -> 74990 (-0.71%); split: -0.71%, +0.01%
Max live registers: 733647 -> 731845 (-0.25%); split: -0.29%, +0.04%
Meteor Lake and all older Intel platforms had similar results. (Meteor Lake shown)
Totals:
Instrs: 152735305 -> 152735801 (+0.00%); split: -0.00%, +0.00%
Subgroup size: 7733536 -> 7733616 (+0.00%)
Cycle count: 17398725539 -> 17400873100 (+0.01%); split: -0.00%, +0.02%
Max live registers: 31887018 -> 31885742 (-0.00%); split: -0.00%, +0.00%
Max dispatch width: 5561696 -> 5561712 (+0.00%)
Totals from 5672 (0.90% of 633314) affected shaders:
Instrs: 2817606 -> 2818102 (+0.02%); split: -0.05%, +0.07%
Subgroup size: 81128 -> 81208 (+0.10%)
Cycle count: 10021470543 -> 10023618104 (+0.02%); split: -0.01%, +0.03%
Max live registers: 306520 -> 305244 (-0.42%); split: -0.43%, +0.01%
Max dispatch width: 74136 -> 74152 (+0.02%)
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-01-30 18:53:05 -08:00
|
|
|
/* The destination will now be used as a source, so select component 0
|
|
|
|
|
* if it's is_scalar (as is done in get_nir_src).
|
|
|
|
|
*/
|
|
|
|
|
if (bit_size != 32 && result.is_scalar)
|
|
|
|
|
dest = component(dest, 0);
|
|
|
|
|
|
2018-04-19 10:06:43 +02:00
|
|
|
if (bit_size > 32) {
|
2024-04-20 17:08:02 -07:00
|
|
|
bld.MOV(result, subscript(dest, BRW_TYPE_UD, 0));
|
2018-04-19 10:06:43 +02:00
|
|
|
} else if (bit_size < 32) {
|
|
|
|
|
/* When we convert the result to 32-bit we need to be careful and do
|
|
|
|
|
* it as a signed conversion to get sign extension (for 32-bit true)
|
|
|
|
|
*/
|
|
|
|
|
const brw_reg_type src_type =
|
2024-04-21 00:33:52 -07:00
|
|
|
brw_type_with_size(BRW_TYPE_D, bit_size);
|
2018-04-19 10:06:43 +02:00
|
|
|
|
2024-04-20 17:08:02 -07:00
|
|
|
bld.MOV(retype(result, BRW_TYPE_D), retype(dest, src_type));
|
2016-10-24 20:24:56 -07:00
|
|
|
}
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
2016-10-24 20:24:56 -07:00
|
|
|
}
|
2014-08-15 10:32:07 -07:00
|
|
|
|
2024-02-15 02:51:39 -08:00
|
|
|
case nir_op_inot: {
|
|
|
|
|
nir_alu_instr *inot_src_instr = nir_src_as_alu_instr(instr->src[0].src);
|
2017-02-09 15:21:47 +00:00
|
|
|
|
2024-02-15 02:51:39 -08:00
|
|
|
if (inot_src_instr != NULL &&
|
|
|
|
|
(inot_src_instr->op == nir_op_ior ||
|
|
|
|
|
inot_src_instr->op == nir_op_ixor ||
|
|
|
|
|
inot_src_instr->op == nir_op_iand)) {
|
|
|
|
|
/* The sources of the source logical instruction are now the
|
|
|
|
|
* sources of the instruction that will be generated.
|
|
|
|
|
*/
|
brw/nir: Treat some ALU results as convergent
v2: Fix for Xe2.
v3: Fix handling of 64-bit CMP results.
v4: Scalarize 16-bit comparison temporary destination when used as a
source (as was already done for 64-bit). Suggested by Ken.
shader-db:
Lunar Lake
total instructions in shared programs: 18096500 -> 18096549 (<.01%)
instructions in affected programs: 15919 -> 15968 (0.31%)
helped: 8 / HURT: 21
total cycles in shared programs: 921841300 -> 922073090 (0.03%)
cycles in affected programs: 115946336 -> 116178126 (0.20%)
helped: 386 / HURT: 135
Meteor Lake and DG2 (Meteor Lake shown)
total instructions in shared programs: 19836053 -> 19836016 (<.01%)
instructions in affected programs: 19547 -> 19510 (-0.19%)
helped: 21 / HURT: 18
total cycles in shared programs: 906713777 -> 906588541 (-0.01%)
cycles in affected programs: 96914584 -> 96789348 (-0.13%)
helped: 335 / HURT: 134
total fills in shared programs: 6712 -> 6710 (-0.03%)
fills in affected programs: 52 -> 50 (-3.85%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Tiger Lake
total instructions in shared programs: 19641284 -> 19641278 (<.01%)
instructions in affected programs: 12358 -> 12352 (-0.05%)
helped: 10 / HURT: 19
total cycles in shared programs: 865413131 -> 865460513 (<.01%)
cycles in affected programs: 74641489 -> 74688871 (0.06%)
helped: 388 / HURT: 100
total spills in shared programs: 3899 -> 3898 (-0.03%)
spills in affected programs: 17 -> 16 (-5.88%)
helped: 1 / HURT: 0
total fills in shared programs: 3249 -> 3245 (-0.12%)
fills in affected programs: 51 -> 47 (-7.84%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20495826 -> 20496111 (<.01%)
instructions in affected programs: 53220 -> 53505 (0.54%)
helped: 28 / HURT: 16
total cycles in shared programs: 875173550 -> 875243910 (<.01%)
cycles in affected programs: 51700652 -> 51771012 (0.14%)
helped: 400 / HURT: 39
total spills in shared programs: 4546 -> 4546 (0.00%)
spills in affected programs: 288 -> 288 (0.00%)
helped: 1 / HURT: 2
total fills in shared programs: 5224 -> 5280 (1.07%)
fills in affected programs: 795 -> 851 (7.04%)
helped: 0 / HURT: 4
LOST: 1
GAINED: 1
fossil-db:
Lunar Lake
Totals:
Instrs: 141811551 -> 141807640 (-0.00%); split: -0.00%, +0.00%
Cycle count: 22183128332 -> 22181285594 (-0.01%); split: -0.06%, +0.05%
Spill count: 69890 -> 69859 (-0.04%); split: -0.09%, +0.04%
Fill count: 128877 -> 128344 (-0.41%); split: -0.42%, +0.00%
Max live registers: 48053415 -> 48051613 (-0.00%); split: -0.00%, +0.00%
Totals from 6817 (1.24% of 551443) affected shaders:
Instrs: 4300169 -> 4296258 (-0.09%); split: -0.14%, +0.05%
Cycle count: 17263755610 -> 17261912872 (-0.01%); split: -0.08%, +0.07%
Spill count: 41822 -> 41791 (-0.07%); split: -0.15%, +0.07%
Fill count: 75523 -> 74990 (-0.71%); split: -0.71%, +0.01%
Max live registers: 733647 -> 731845 (-0.25%); split: -0.29%, +0.04%
Meteor Lake and all older Intel platforms had similar results. (Meteor Lake shown)
Totals:
Instrs: 152735305 -> 152735801 (+0.00%); split: -0.00%, +0.00%
Subgroup size: 7733536 -> 7733616 (+0.00%)
Cycle count: 17398725539 -> 17400873100 (+0.01%); split: -0.00%, +0.02%
Max live registers: 31887018 -> 31885742 (-0.00%); split: -0.00%, +0.00%
Max dispatch width: 5561696 -> 5561712 (+0.00%)
Totals from 5672 (0.90% of 633314) affected shaders:
Instrs: 2817606 -> 2818102 (+0.02%); split: -0.05%, +0.07%
Subgroup size: 81128 -> 81208 (+0.10%)
Cycle count: 10021470543 -> 10023618104 (+0.02%); split: -0.01%, +0.03%
Max live registers: 306520 -> 305244 (-0.42%); split: -0.43%, +0.01%
Max dispatch width: 74136 -> 74152 (+0.02%)
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-01-30 18:53:05 -08:00
|
|
|
prepare_alu_destination_and_sources(ntb, ntb.bld, inot_src_instr, op, false);
|
2024-02-15 02:51:39 -08:00
|
|
|
resolve_inot_sources(ntb, bld, inot_src_instr, op);
|
2017-02-09 15:21:47 +00:00
|
|
|
|
2024-02-15 02:51:39 -08:00
|
|
|
/* Smash all of the sources and destination to be signed. This
|
|
|
|
|
* doesn't matter for the operation of the instruction, but cmod
|
|
|
|
|
* propagation fails on unsigned sources with negation (due to
|
2024-12-07 00:23:07 -08:00
|
|
|
* brw_inst::can_do_cmod returning false).
|
2024-02-15 02:51:39 -08:00
|
|
|
*/
|
|
|
|
|
result.type =
|
|
|
|
|
brw_type_for_nir_type(devinfo,
|
|
|
|
|
(nir_alu_type)(nir_type_int |
|
|
|
|
|
instr->def.bit_size));
|
|
|
|
|
op[0].type =
|
|
|
|
|
brw_type_for_nir_type(devinfo,
|
|
|
|
|
(nir_alu_type)(nir_type_int |
|
|
|
|
|
nir_src_bit_size(inot_src_instr->src[0].src)));
|
|
|
|
|
op[1].type =
|
|
|
|
|
brw_type_for_nir_type(devinfo,
|
|
|
|
|
(nir_alu_type)(nir_type_int |
|
|
|
|
|
nir_src_bit_size(inot_src_instr->src[1].src)));
|
|
|
|
|
|
|
|
|
|
/* For XOR, only invert one of the sources. Arbitrarily choose
|
|
|
|
|
* the first source.
|
|
|
|
|
*/
|
|
|
|
|
op[0].negate = !op[0].negate;
|
|
|
|
|
if (inot_src_instr->op != nir_op_ixor)
|
|
|
|
|
op[1].negate = !op[1].negate;
|
2017-02-09 15:21:47 +00:00
|
|
|
|
2024-02-15 02:51:39 -08:00
|
|
|
switch (inot_src_instr->op) {
|
|
|
|
|
case nir_op_ior:
|
|
|
|
|
bld.AND(result, op[0], op[1]);
|
|
|
|
|
return;
|
2017-02-09 15:21:47 +00:00
|
|
|
|
2024-02-15 02:51:39 -08:00
|
|
|
case nir_op_iand:
|
|
|
|
|
bld.OR(result, op[0], op[1]);
|
|
|
|
|
return;
|
2017-02-09 15:21:47 +00:00
|
|
|
|
2024-02-15 02:51:39 -08:00
|
|
|
case nir_op_ixor:
|
|
|
|
|
bld.XOR(result, op[0], op[1]);
|
|
|
|
|
return;
|
2017-02-09 15:21:47 +00:00
|
|
|
|
2024-02-15 02:51:39 -08:00
|
|
|
default:
|
2025-07-23 09:17:35 +02:00
|
|
|
UNREACHABLE("impossible opcode");
|
2017-02-09 15:21:47 +00:00
|
|
|
}
|
2015-03-05 20:39:49 -08:00
|
|
|
}
|
2024-02-15 02:51:39 -08:00
|
|
|
op[0] = resolve_source_modifiers(bld, op[0]);
|
2015-06-03 20:59:26 +03:00
|
|
|
bld.NOT(result, op[0]);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
2024-02-15 02:51:39 -08:00
|
|
|
}
|
|
|
|
|
|
2014-08-15 10:32:07 -07:00
|
|
|
case nir_op_ixor:
|
2024-02-15 02:51:39 -08:00
|
|
|
resolve_inot_sources(ntb, bld, instr, op);
|
2015-06-03 20:59:26 +03:00
|
|
|
bld.XOR(result, op[0], op[1]);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
case nir_op_ior:
|
2024-02-15 02:51:39 -08:00
|
|
|
resolve_inot_sources(ntb, bld, instr, op);
|
2015-06-03 20:59:26 +03:00
|
|
|
bld.OR(result, op[0], op[1]);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
case nir_op_iand:
|
2024-02-15 02:51:39 -08:00
|
|
|
resolve_inot_sources(ntb, bld, instr, op);
|
2015-06-03 20:59:26 +03:00
|
|
|
bld.AND(result, op[0], op[1]);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_op_fdot2:
|
|
|
|
|
case nir_op_fdot3:
|
2014-12-23 14:44:19 -08:00
|
|
|
case nir_op_fdot4:
|
2018-10-18 11:44:38 -05:00
|
|
|
case nir_op_b32all_fequal2:
|
|
|
|
|
case nir_op_b32all_iequal2:
|
|
|
|
|
case nir_op_b32all_fequal3:
|
|
|
|
|
case nir_op_b32all_iequal3:
|
|
|
|
|
case nir_op_b32all_fequal4:
|
|
|
|
|
case nir_op_b32all_iequal4:
|
|
|
|
|
case nir_op_b32any_fnequal2:
|
|
|
|
|
case nir_op_b32any_inequal2:
|
|
|
|
|
case nir_op_b32any_fnequal3:
|
|
|
|
|
case nir_op_b32any_inequal3:
|
|
|
|
|
case nir_op_b32any_fnequal4:
|
|
|
|
|
case nir_op_b32any_inequal4:
|
2025-07-23 09:17:35 +02:00
|
|
|
UNREACHABLE("Lowered by nir_lower_alu_reductions");
|
2014-08-15 10:32:07 -07:00
|
|
|
|
|
|
|
|
case nir_op_ldexp:
|
2025-07-23 09:17:35 +02:00
|
|
|
UNREACHABLE("not reached: should be handled by ldexp_to_arith()");
|
2014-08-15 10:32:07 -07:00
|
|
|
|
|
|
|
|
case nir_op_fsqrt:
|
2024-04-12 17:57:33 -07:00
|
|
|
bld.SQRT(result, op[0]);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_op_frsq:
|
2024-04-12 17:57:33 -07:00
|
|
|
bld.RSQ(result, op[0]);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
|
2014-12-23 14:44:19 -08:00
|
|
|
case nir_op_ftrunc:
|
2024-04-12 15:17:06 -07:00
|
|
|
bld.RNDZ(result, op[0]);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
2014-12-23 14:44:19 -08:00
|
|
|
|
2024-04-12 17:43:22 -07:00
|
|
|
case nir_op_fceil:
|
|
|
|
|
bld.MOV(result, negate(bld.RNDD(negate(op[0]))));
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
2014-12-23 14:44:19 -08:00
|
|
|
case nir_op_ffloor:
|
2024-04-12 15:17:06 -07:00
|
|
|
bld.RNDD(result, op[0]);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
2014-12-23 14:44:19 -08:00
|
|
|
case nir_op_ffract:
|
2024-04-12 15:17:06 -07:00
|
|
|
bld.FRC(result, op[0]);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
2014-12-23 14:44:19 -08:00
|
|
|
case nir_op_fround_even:
|
2024-04-12 15:17:06 -07:00
|
|
|
bld.RNDE(result, op[0]);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_op_imin:
|
|
|
|
|
case nir_op_umin:
|
2015-11-10 10:18:55 +01:00
|
|
|
case nir_op_fmin:
|
2024-04-12 15:17:06 -07:00
|
|
|
bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_L);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_op_imax:
|
|
|
|
|
case nir_op_umax:
|
2015-11-10 10:18:55 +01:00
|
|
|
case nir_op_fmax:
|
2024-04-12 15:17:06 -07:00
|
|
|
bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_GE);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_op_pack_snorm_2x16:
|
|
|
|
|
case nir_op_pack_snorm_4x8:
|
|
|
|
|
case nir_op_pack_unorm_2x16:
|
|
|
|
|
case nir_op_pack_unorm_4x8:
|
|
|
|
|
case nir_op_unpack_snorm_2x16:
|
|
|
|
|
case nir_op_unpack_snorm_4x8:
|
|
|
|
|
case nir_op_unpack_unorm_2x16:
|
|
|
|
|
case nir_op_unpack_unorm_4x8:
|
|
|
|
|
case nir_op_unpack_half_2x16:
|
|
|
|
|
case nir_op_pack_half_2x16:
|
2025-07-23 09:17:35 +02:00
|
|
|
UNREACHABLE("not reached: should be handled by lower_packing_builtins");
|
2014-08-15 10:32:07 -07:00
|
|
|
|
|
|
|
|
case nir_op_unpack_half_2x16_split_x:
|
2024-04-12 15:17:06 -07:00
|
|
|
bld.MOV(result, subscript(op[0], BRW_TYPE_HF, 0));
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
2019-09-18 09:04:39 -07:00
|
|
|
|
2014-08-15 10:32:07 -07:00
|
|
|
case nir_op_unpack_half_2x16_split_y:
|
2024-04-12 15:17:06 -07:00
|
|
|
bld.MOV(result, subscript(op[0], BRW_TYPE_HF, 1));
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
|
2017-02-14 22:15:16 -08:00
|
|
|
case nir_op_pack_64_2x32_split:
|
2018-04-17 10:23:47 +02:00
|
|
|
case nir_op_pack_32_2x16_split:
|
2015-08-14 12:29:31 -07:00
|
|
|
bld.emit(FS_OPCODE_PACK, result, op[0], op[1]);
|
|
|
|
|
break;
|
|
|
|
|
|
2021-01-25 16:31:17 -08:00
|
|
|
case nir_op_pack_32_4x8_split:
|
|
|
|
|
bld.emit(FS_OPCODE_PACK, result, op, 4);
|
|
|
|
|
break;
|
|
|
|
|
|
2017-02-14 22:15:16 -08:00
|
|
|
case nir_op_unpack_64_2x32_split_x:
|
|
|
|
|
case nir_op_unpack_64_2x32_split_y: {
|
|
|
|
|
if (instr->op == nir_op_unpack_64_2x32_split_x)
|
2024-04-20 17:08:02 -07:00
|
|
|
bld.MOV(result, subscript(op[0], BRW_TYPE_UD, 0));
|
2016-09-02 18:49:20 -07:00
|
|
|
else
|
2024-04-20 17:08:02 -07:00
|
|
|
bld.MOV(result, subscript(op[0], BRW_TYPE_UD, 1));
|
2016-09-02 18:49:20 -07:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2018-04-17 10:23:47 +02:00
|
|
|
case nir_op_unpack_32_2x16_split_x:
|
|
|
|
|
case nir_op_unpack_32_2x16_split_y: {
|
|
|
|
|
if (instr->op == nir_op_unpack_32_2x16_split_x)
|
2024-04-20 17:08:02 -07:00
|
|
|
bld.MOV(result, subscript(op[0], BRW_TYPE_UW, 0));
|
2018-04-17 10:23:47 +02:00
|
|
|
else
|
2024-04-20 17:08:02 -07:00
|
|
|
bld.MOV(result, subscript(op[0], BRW_TYPE_UW, 1));
|
2018-04-17 10:23:47 +02:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2014-08-15 10:32:07 -07:00
|
|
|
case nir_op_fpow:
|
2024-04-12 17:57:33 -07:00
|
|
|
bld.POW(result, op[0], op[1]);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_op_bitfield_reverse:
|
2023-08-14 11:43:35 -05:00
|
|
|
assert(instr->def.bit_size == 32);
|
2022-10-12 15:32:01 -07:00
|
|
|
assert(nir_src_bit_size(instr->src[0].src) == 32);
|
2015-06-03 20:59:26 +03:00
|
|
|
bld.BFREV(result, op[0]);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_op_bit_count:
|
2023-08-14 11:43:35 -05:00
|
|
|
assert(instr->def.bit_size == 32);
|
2022-10-12 15:32:01 -07:00
|
|
|
assert(nir_src_bit_size(instr->src[0].src) < 64);
|
2015-06-03 20:59:26 +03:00
|
|
|
bld.CBIT(result, op[0]);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
|
2018-09-11 16:49:51 -07:00
|
|
|
case nir_op_uclz:
|
2023-08-14 11:43:35 -05:00
|
|
|
assert(instr->def.bit_size == 32);
|
2022-10-12 15:32:01 -07:00
|
|
|
assert(nir_src_bit_size(instr->src[0].src) == 32);
|
2024-04-20 17:08:02 -07:00
|
|
|
bld.LZD(retype(result, BRW_TYPE_UD), op[0]);
|
2018-09-11 16:49:51 -07:00
|
|
|
break;
|
|
|
|
|
|
2014-11-07 10:59:16 -08:00
|
|
|
case nir_op_ifind_msb: {
|
2023-08-14 11:43:35 -05:00
|
|
|
assert(instr->def.bit_size == 32);
|
2022-10-12 15:32:01 -07:00
|
|
|
assert(nir_src_bit_size(instr->src[0].src) == 32);
|
2014-08-15 10:32:07 -07:00
|
|
|
|
intel/brw: Make ifind_msb SSA friendly
No shader-db changes on any Intel platform.
v2: Use negate(tmp) instead of creating a new temporary. Suggested by
Ken.
fossil-db:
Meteor Lake, DG2, and Skylake had similar results. (Meteor Lake shown)
Totals:
Instrs: 152535897 -> 152535883 (-0.00%); split: -0.00%, +0.00%
Cycle count: 17112329592 -> 17112406110 (+0.00%); split: -0.06%, +0.06%
Totals from 40 (0.01% of 633223) affected shaders:
Instrs: 458813 -> 458799 (-0.00%); split: -0.01%, +0.00%
Cycle count: 4358016282 -> 4358092800 (+0.00%); split: -0.23%, +0.24%
Tiger Lake and Ice Lake had similar results. (Tiger Lake shown)
Totals:
Instrs: 150560511 -> 150560465 (-0.00%); split: -0.00%, +0.00%
Cycle count: 15484534441 -> 15482372893 (-0.01%); split: -0.12%, +0.11%
Spill count: 59795 -> 59794 (-0.00%)
Fill count: 103513 -> 103509 (-0.00%)
Totals from 40 (0.01% of 632445) affected shaders:
Instrs: 368877 -> 368831 (-0.01%); split: -0.01%, +0.00%
Cycle count: 3918398264 -> 3916236716 (-0.06%); split: -0.49%, +0.43%
Spill count: 16896 -> 16895 (-0.01%)
Fill count: 27819 -> 27815 (-0.01%)
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30650>
2024-08-09 18:25:36 -07:00
|
|
|
brw_reg tmp = bld.FBH(retype(op[0], BRW_TYPE_D));
|
2015-10-26 11:35:57 -07:00
|
|
|
|
2022-10-10 13:35:01 -07:00
|
|
|
/* FBH counts from the MSB side, while GLSL's findMSB() wants the count
|
|
|
|
|
* from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
|
|
|
|
|
* subtract the result from 31 to convert the MSB count into an LSB
|
|
|
|
|
* count.
|
|
|
|
|
*/
|
intel/brw: Make ifind_msb SSA friendly
No shader-db changes on any Intel platform.
v2: Use negate(tmp) instead of creating a new temporary. Suggested by
Ken.
fossil-db:
Meteor Lake, DG2, and Skylake had similar results. (Meteor Lake shown)
Totals:
Instrs: 152535897 -> 152535883 (-0.00%); split: -0.00%, +0.00%
Cycle count: 17112329592 -> 17112406110 (+0.00%); split: -0.06%, +0.06%
Totals from 40 (0.01% of 633223) affected shaders:
Instrs: 458813 -> 458799 (-0.00%); split: -0.01%, +0.00%
Cycle count: 4358016282 -> 4358092800 (+0.00%); split: -0.23%, +0.24%
Tiger Lake and Ice Lake had similar results. (Tiger Lake shown)
Totals:
Instrs: 150560511 -> 150560465 (-0.00%); split: -0.00%, +0.00%
Cycle count: 15484534441 -> 15482372893 (-0.01%); split: -0.12%, +0.11%
Spill count: 59795 -> 59794 (-0.00%)
Fill count: 103513 -> 103509 (-0.00%)
Totals from 40 (0.01% of 632445) affected shaders:
Instrs: 368877 -> 368831 (-0.01%); split: -0.01%, +0.00%
Cycle count: 3918398264 -> 3916236716 (-0.06%); split: -0.49%, +0.43%
Spill count: 16896 -> 16895 (-0.01%)
Fill count: 27819 -> 27815 (-0.01%)
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30650>
2024-08-09 18:25:36 -07:00
|
|
|
brw_reg count_from_lsb = bld.ADD(negate(tmp), brw_imm_w(31));
|
2016-06-21 17:18:04 -07:00
|
|
|
|
intel/brw: Make ifind_msb SSA friendly
No shader-db changes on any Intel platform.
v2: Use negate(tmp) instead of creating a new temporary. Suggested by
Ken.
fossil-db:
Meteor Lake, DG2, and Skylake had similar results. (Meteor Lake shown)
Totals:
Instrs: 152535897 -> 152535883 (-0.00%); split: -0.00%, +0.00%
Cycle count: 17112329592 -> 17112406110 (+0.00%); split: -0.06%, +0.06%
Totals from 40 (0.01% of 633223) affected shaders:
Instrs: 458813 -> 458799 (-0.00%); split: -0.01%, +0.00%
Cycle count: 4358016282 -> 4358092800 (+0.00%); split: -0.23%, +0.24%
Tiger Lake and Ice Lake had similar results. (Tiger Lake shown)
Totals:
Instrs: 150560511 -> 150560465 (-0.00%); split: -0.00%, +0.00%
Cycle count: 15484534441 -> 15482372893 (-0.01%); split: -0.12%, +0.11%
Spill count: 59795 -> 59794 (-0.00%)
Fill count: 103513 -> 103509 (-0.00%)
Totals from 40 (0.01% of 632445) affected shaders:
Instrs: 368877 -> 368831 (-0.01%); split: -0.01%, +0.00%
Cycle count: 3918398264 -> 3916236716 (-0.06%); split: -0.49%, +0.43%
Spill count: 16896 -> 16895 (-0.01%)
Fill count: 27819 -> 27815 (-0.01%)
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30650>
2024-08-09 18:25:36 -07:00
|
|
|
/* The high word of the FBH result will be 0xffff or 0x0000. After
|
|
|
|
|
* calculating 31 - fbh, we can obtain the correct result for
|
|
|
|
|
* ifind_msb(0) by ORing the (sign extended) upper word of the
|
|
|
|
|
* intermediate result.
|
|
|
|
|
*/
|
|
|
|
|
bld.OR(result, count_from_lsb, subscript(tmp, BRW_TYPE_W, 1));
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
case nir_op_find_lsb:
|
2023-08-14 11:43:35 -05:00
|
|
|
assert(instr->def.bit_size == 32);
|
2022-10-12 15:32:01 -07:00
|
|
|
assert(nir_src_bit_size(instr->src[0].src) == 32);
|
2022-10-10 13:41:59 -07:00
|
|
|
bld.FBL(result, op[0]);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_op_ubitfield_extract:
|
|
|
|
|
case nir_op_ibitfield_extract:
|
2025-07-23 09:17:35 +02:00
|
|
|
UNREACHABLE("should have been lowered");
|
2016-01-13 11:09:11 -08:00
|
|
|
case nir_op_ubfe:
|
|
|
|
|
case nir_op_ibfe:
|
2023-08-14 11:43:35 -05:00
|
|
|
assert(instr->def.bit_size < 64);
|
2015-06-03 20:59:26 +03:00
|
|
|
bld.BFE(result, op[2], op[1], op[0]);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
case nir_op_bfm:
|
2023-08-14 11:43:35 -05:00
|
|
|
assert(instr->def.bit_size < 64);
|
2015-06-03 20:59:26 +03:00
|
|
|
bld.BFI1(result, op[0], op[1]);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
case nir_op_bfi:
|
2023-08-14 11:43:35 -05:00
|
|
|
assert(instr->def.bit_size < 64);
|
intel/fs: Emit better code for bfi(..., 0)
DG2, Tiger Lake, Ice Lake, and Skylake had similar results (Ice Lake shown)
total instructions in shared programs: 20570141 -> 20570063 (<.01%)
instructions in affected programs: 30679 -> 30601 (-0.25%)
helped: 77 / HURT: 0
total cycles in shared programs: 902113977 -> 902118723 (<.01%)
cycles in affected programs: 3255958 -> 3260704 (0.15%)
helped: 60 / HURT: 19
Broadwell
total instructions in shared programs: 18524633 -> 18524547 (<.01%)
instructions in affected programs: 34095 -> 34009 (-0.25%)
helped: 75 / HURT: 2
total cycles in shared programs: 949532394 -> 949543761 (<.01%)
cycles in affected programs: 3419107 -> 3430474 (0.33%)
helped: 57 / HURT: 24
total spills in shared programs: 22484 -> 22484 (0.00%)
spills in affected programs: 516 -> 516 (0.00%)
helped: 2 / HURT: 2
total fills in shared programs: 29346 -> 29338 (-0.03%)
fills in affected programs: 572 -> 564 (-1.40%)
helped: 4 / HURT: 0
Haswell
total instructions in shared programs: 17331356 -> 17331523 (<.01%)
instructions in affected programs: 27920 -> 28087 (0.60%)
helped: 41 / HURT: 4
total cycles in shared programs: 936603192 -> 936574664 (<.01%)
cycles in affected programs: 3417695 -> 3389167 (-0.83%)
helped: 28 / HURT: 21
total spills in shared programs: 19718 -> 19756 (0.19%)
spills in affected programs: 436 -> 474 (8.72%)
helped: 0 / HURT: 4
total fills in shared programs: 22547 -> 22607 (0.27%)
fills in affected programs: 444 -> 504 (13.51%)
helped: 0 / HURT: 4
Ivy Bridge
total cycles in shared programs: 463451277 -> 463451273 (<.01%)
cycles in affected programs: 95870 -> 95866 (<.01%)
helped: 3 / HURT: 2
DG2, Tiger Lake, Ice Lake, and Skylake had similar results (Ice Lake shown)
Totals:
Instrs: 152825278 -> 152819969 (-0.00%); split: -0.00%, +0.00%
Cycles: 15014075626 -> 15014628652 (+0.00%); split: -0.01%, +0.01%
Subgroup size: 8528536 -> 8528560 (+0.00%)
Send messages: 7711431 -> 7711464 (+0.00%)
Spill count: 99907 -> 99509 (-0.40%); split: -0.40%, +0.00%
Fill count: 202459 -> 201598 (-0.43%); split: -0.43%, +0.00%
Scratch Memory Size: 4376576 -> 4371456 (-0.12%)
Totals from 2915 (0.44% of 662497) affected shaders:
Instrs: 2288842 -> 2283533 (-0.23%); split: -0.24%, +0.01%
Cycles: 471633295 -> 472186321 (+0.12%); split: -0.27%, +0.39%
Subgroup size: 27488 -> 27512 (+0.09%)
Send messages: 151344 -> 151377 (+0.02%)
Spill count: 48091 -> 47693 (-0.83%); split: -0.83%, +0.00%
Fill count: 59053 -> 58192 (-1.46%); split: -1.46%, +0.00%
Scratch Memory Size: 1827840 -> 1822720 (-0.28%)
Reviewed-by: Matt Turner <mattst88@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19968>
2022-11-16 13:12:50 -08:00
|
|
|
|
|
|
|
|
/* bfi is ((...) | (~src0 & src2)). The second part is zero when src2 is
|
|
|
|
|
* either 0 or src0. Replacing the 0 with another value can eliminate a
|
|
|
|
|
* temporary register.
|
|
|
|
|
*/
|
|
|
|
|
if (is_const_zero(instr->src[2].src))
|
|
|
|
|
bld.BFI2(result, op[0], op[1], op[0]);
|
|
|
|
|
else
|
|
|
|
|
bld.BFI2(result, op[0], op[1], op[2]);
|
|
|
|
|
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_op_bitfield_insert:
|
2025-07-23 09:17:35 +02:00
|
|
|
UNREACHABLE("not reached: should have been lowered");
|
2014-08-15 10:32:07 -07:00
|
|
|
|
2023-05-11 15:22:44 -07:00
|
|
|
/* With regards to implicit masking of the shift counts for 8- and 16-bit
|
|
|
|
|
* types, the PRMs are **incorrect**. They falsely state that on Gen9+ only
|
|
|
|
|
* the low bits of src1 matching the size of src0 (e.g., 4-bits for W or UW
|
|
|
|
|
* src0) are used. The Bspec (backed by data from experimentation) state
|
|
|
|
|
* that 0x3f is used for Q and UQ types, and 0x1f is used for **all** other
|
|
|
|
|
* types.
|
2021-01-23 14:28:07 -08:00
|
|
|
*
|
2023-05-11 15:22:44 -07:00
|
|
|
* The match the behavior expected for the NIR opcodes, explicit masks for
|
|
|
|
|
* 8- and 16-bit types must be added.
|
2021-01-23 14:28:07 -08:00
|
|
|
*/
|
2014-08-15 10:32:07 -07:00
|
|
|
case nir_op_ishl:
|
2023-05-11 15:22:44 -07:00
|
|
|
if (instr->def.bit_size < 32) {
|
2024-10-22 23:07:44 -07:00
|
|
|
bld.SHL(result, op[0],
|
|
|
|
|
bld.AND(subscript(op[1], BRW_TYPE_UW, 0),
|
|
|
|
|
brw_imm_uw(instr->def.bit_size - 1)));
|
2023-05-11 15:22:44 -07:00
|
|
|
} else {
|
|
|
|
|
bld.SHL(result, op[0], op[1]);
|
|
|
|
|
}
|
|
|
|
|
|
2018-12-07 15:40:43 -08:00
|
|
|
break;
|
2014-08-15 10:32:07 -07:00
|
|
|
case nir_op_ishr:
|
2023-05-11 15:22:44 -07:00
|
|
|
if (instr->def.bit_size < 32) {
|
2024-10-22 23:07:44 -07:00
|
|
|
bld.ASR(result, op[0],
|
|
|
|
|
bld.AND(subscript(op[1], BRW_TYPE_UW, 0),
|
|
|
|
|
brw_imm_uw(instr->def.bit_size - 1)));
|
2023-05-11 15:22:44 -07:00
|
|
|
} else {
|
|
|
|
|
bld.ASR(result, op[0], op[1]);
|
|
|
|
|
}
|
|
|
|
|
|
2018-12-07 15:40:43 -08:00
|
|
|
break;
|
|
|
|
|
case nir_op_ushr:
|
2023-05-11 15:22:44 -07:00
|
|
|
if (instr->def.bit_size < 32) {
|
2024-10-22 23:07:44 -07:00
|
|
|
bld.SHR(result, op[0],
|
|
|
|
|
bld.AND(subscript(op[1], BRW_TYPE_UW, 0),
|
|
|
|
|
brw_imm_uw(instr->def.bit_size - 1)));
|
2023-05-11 15:22:44 -07:00
|
|
|
} else {
|
|
|
|
|
bld.SHR(result, op[0], op[1]);
|
|
|
|
|
}
|
|
|
|
|
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
|
2019-05-30 14:14:52 -07:00
|
|
|
case nir_op_urol:
|
|
|
|
|
bld.ROL(result, op[0], op[1]);
|
|
|
|
|
break;
|
|
|
|
|
case nir_op_uror:
|
|
|
|
|
bld.ROR(result, op[0], op[1]);
|
|
|
|
|
break;
|
|
|
|
|
|
2014-08-15 10:32:07 -07:00
|
|
|
case nir_op_pack_half_2x16_split:
|
2015-06-03 20:59:26 +03:00
|
|
|
bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
|
2021-02-23 18:46:53 -08:00
|
|
|
case nir_op_sdot_4x8_iadd:
|
|
|
|
|
case nir_op_sdot_4x8_iadd_sat:
|
2024-04-20 17:08:02 -07:00
|
|
|
inst = bld.DP4A(retype(result, BRW_TYPE_D),
|
|
|
|
|
retype(op[2], BRW_TYPE_D),
|
|
|
|
|
retype(op[0], BRW_TYPE_D),
|
|
|
|
|
retype(op[1], BRW_TYPE_D));
|
2021-02-23 18:46:53 -08:00
|
|
|
|
|
|
|
|
if (instr->op == nir_op_sdot_4x8_iadd_sat)
|
|
|
|
|
inst->saturate = true;
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_op_udot_4x8_uadd:
|
|
|
|
|
case nir_op_udot_4x8_uadd_sat:
|
2024-04-20 17:08:02 -07:00
|
|
|
inst = bld.DP4A(retype(result, BRW_TYPE_UD),
|
|
|
|
|
retype(op[2], BRW_TYPE_UD),
|
|
|
|
|
retype(op[0], BRW_TYPE_UD),
|
|
|
|
|
retype(op[1], BRW_TYPE_UD));
|
2021-02-23 18:46:53 -08:00
|
|
|
|
|
|
|
|
if (instr->op == nir_op_udot_4x8_uadd_sat)
|
|
|
|
|
inst->saturate = true;
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_op_sudot_4x8_iadd:
|
|
|
|
|
case nir_op_sudot_4x8_iadd_sat:
|
2024-04-20 17:08:02 -07:00
|
|
|
inst = bld.DP4A(retype(result, BRW_TYPE_D),
|
|
|
|
|
retype(op[2], BRW_TYPE_D),
|
|
|
|
|
retype(op[0], BRW_TYPE_D),
|
|
|
|
|
retype(op[1], BRW_TYPE_UD));
|
2021-02-23 18:46:53 -08:00
|
|
|
|
|
|
|
|
if (instr->op == nir_op_sudot_4x8_iadd_sat)
|
|
|
|
|
inst->saturate = true;
|
|
|
|
|
break;
|
|
|
|
|
|
2014-08-15 10:32:07 -07:00
|
|
|
case nir_op_ffma:
|
2019-02-12 16:13:59 +01:00
|
|
|
if (nir_has_any_rounding_mode_enabled(execution_mode)) {
|
|
|
|
|
brw_rnd_mode rnd =
|
|
|
|
|
brw_rnd_mode_from_execution_mode(execution_mode);
|
2023-04-05 15:38:34 +03:00
|
|
|
bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
|
|
|
|
|
brw_imm_d(rnd));
|
2019-02-12 16:13:59 +01:00
|
|
|
}
|
|
|
|
|
|
2024-04-12 15:17:06 -07:00
|
|
|
bld.MAD(result, op[2], op[1], op[0]);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_op_flrp:
|
2019-09-24 01:37:57 +03:00
|
|
|
if (nir_has_any_rounding_mode_enabled(execution_mode)) {
|
|
|
|
|
brw_rnd_mode rnd =
|
|
|
|
|
brw_rnd_mode_from_execution_mode(execution_mode);
|
2023-04-05 15:38:34 +03:00
|
|
|
bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
|
|
|
|
|
brw_imm_d(rnd));
|
2019-09-24 01:37:57 +03:00
|
|
|
}
|
|
|
|
|
|
2024-04-12 15:17:06 -07:00
|
|
|
bld.LRP(result, op[0], op[1], op[2]);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
|
2018-10-18 11:44:38 -05:00
|
|
|
case nir_op_b32csel:
|
2023-11-20 22:00:28 -08:00
|
|
|
if (optimize_frontfacing_ternary(ntb, instr, result))
|
2015-02-15 13:45:04 -08:00
|
|
|
return;
|
|
|
|
|
|
2015-11-02 11:26:16 -08:00
|
|
|
bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ);
|
2015-06-03 20:59:26 +03:00
|
|
|
inst = bld.SEL(result, op[1], op[2]);
|
2014-12-23 14:44:19 -08:00
|
|
|
inst->predicate = BRW_PREDICATE_NORMAL;
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
|
2019-06-18 18:21:44 -07:00
|
|
|
case nir_op_fcsel:
|
|
|
|
|
bld.CSEL(result, op[1], op[2], op[0], BRW_CONDITIONAL_NZ);
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_op_fcsel_gt:
|
|
|
|
|
bld.CSEL(result, op[1], op[2], op[0], BRW_CONDITIONAL_G);
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_op_fcsel_ge:
|
|
|
|
|
bld.CSEL(result, op[1], op[2], op[0], BRW_CONDITIONAL_GE);
|
|
|
|
|
break;
|
|
|
|
|
|
2016-01-20 18:56:37 -08:00
|
|
|
case nir_op_extract_u8:
|
|
|
|
|
case nir_op_extract_i8: {
|
2024-02-16 14:24:56 +01:00
|
|
|
const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
|
2018-10-20 09:55:28 -05:00
|
|
|
unsigned byte = nir_src_as_uint(instr->src[1].src);
|
2017-11-10 14:00:24 -08:00
|
|
|
|
|
|
|
|
/* The PRMs say:
|
|
|
|
|
*
|
|
|
|
|
* BDW+
|
|
|
|
|
* There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB.
|
|
|
|
|
* Use two instructions and a word or DWord intermediate integer type.
|
|
|
|
|
*/
|
2023-08-14 11:43:35 -05:00
|
|
|
if (instr->def.bit_size == 64) {
|
2017-11-10 14:00:24 -08:00
|
|
|
if (instr->op == nir_op_extract_i8) {
|
|
|
|
|
/* If we need to sign extend, extract to a word first */
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg w_temp = bld.vgrf(BRW_TYPE_W);
|
2018-10-20 09:55:28 -05:00
|
|
|
bld.MOV(w_temp, subscript(op[0], type, byte));
|
2017-11-10 14:00:24 -08:00
|
|
|
bld.MOV(result, w_temp);
|
2019-02-27 15:53:55 -08:00
|
|
|
} else if (byte & 1) {
|
|
|
|
|
/* Extract the high byte from the word containing the desired byte
|
|
|
|
|
* offset.
|
|
|
|
|
*/
|
|
|
|
|
bld.SHR(result,
|
2024-04-20 17:08:02 -07:00
|
|
|
subscript(op[0], BRW_TYPE_UW, byte / 2),
|
2019-02-27 15:53:55 -08:00
|
|
|
brw_imm_uw(8));
|
2017-11-10 14:00:24 -08:00
|
|
|
} else {
|
|
|
|
|
/* Otherwise use an AND with 0xff and a word type */
|
2019-02-27 15:52:18 -08:00
|
|
|
bld.AND(result,
|
2024-04-20 17:08:02 -07:00
|
|
|
subscript(op[0], BRW_TYPE_UW, byte / 2),
|
2019-02-27 15:52:18 -08:00
|
|
|
brw_imm_uw(0xff));
|
2017-11-10 14:00:24 -08:00
|
|
|
}
|
|
|
|
|
} else {
|
2018-10-20 09:55:28 -05:00
|
|
|
bld.MOV(result, subscript(op[0], type, byte));
|
2017-11-10 14:00:24 -08:00
|
|
|
}
|
2016-01-20 18:56:37 -08:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
case nir_op_extract_u16:
|
|
|
|
|
case nir_op_extract_i16: {
|
2016-05-18 18:43:54 -07:00
|
|
|
const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i16);
|
2018-10-20 09:55:28 -05:00
|
|
|
unsigned word = nir_src_as_uint(instr->src[1].src);
|
|
|
|
|
bld.MOV(result, subscript(op[0], type, word));
|
2016-01-20 18:56:37 -08:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2025-02-28 01:03:50 -08:00
|
|
|
/* BFloat16 values in NIR are represented by uint16_t,
|
|
|
|
|
* but BRW can handle them natively.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
case nir_op_bf2f:
|
|
|
|
|
bld.MOV(result, retype(op[0], BRW_TYPE_BF));
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_op_f2bf:
|
|
|
|
|
bld.MOV(retype(result, BRW_TYPE_BF), op[0]);
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_op_bfmul:
|
|
|
|
|
bld.MUL(retype(result, BRW_TYPE_BF),
|
|
|
|
|
retype(op[0], BRW_TYPE_BF),
|
|
|
|
|
retype(op[1], BRW_TYPE_BF));
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_op_bffma:
|
|
|
|
|
bld.MAD(retype(result, BRW_TYPE_BF),
|
|
|
|
|
retype(op[2], BRW_TYPE_BF),
|
|
|
|
|
retype(op[1], BRW_TYPE_BF),
|
|
|
|
|
retype(op[0], BRW_TYPE_BF));
|
|
|
|
|
break;
|
|
|
|
|
|
2014-08-15 10:32:07 -07:00
|
|
|
default:
|
2025-07-23 09:17:35 +02:00
|
|
|
UNREACHABLE("unhandled instruction");
|
2014-08-15 10:32:07 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-20 14:55:21 -08:00
|
|
|
static void
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_load_const(nir_to_brw_state &ntb,
|
2023-11-20 14:55:21 -08:00
|
|
|
nir_load_const_instr *instr)
|
2015-06-25 16:22:26 -07:00
|
|
|
{
|
2023-12-05 15:27:29 -08:00
|
|
|
const intel_device_info *devinfo = ntb.devinfo;
|
2024-12-29 15:41:04 -08:00
|
|
|
const brw_builder &bld = ntb.bld.scalar_group();
|
2023-11-20 14:55:21 -08:00
|
|
|
|
2015-07-29 14:16:51 -07:00
|
|
|
const brw_reg_type reg_type =
|
2024-04-21 00:33:52 -07:00
|
|
|
brw_type_with_size(BRW_TYPE_D, instr->def.bit_size);
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg reg = bld.vgrf(reg_type, instr->def.num_components);
|
2015-06-25 16:22:26 -07:00
|
|
|
|
brw/nir: Treat load_const as convergent
opt_combine_constants goes to great effort to pack 8 constants into a
single register, this can't have much effect.
There is a lot of fossil-db variation among platforms, but the results
are generally positive.
v2: Fix for Xe2.
shader-db:
Lunar Lake
total instructions in shared programs: 18095100 -> 18092845 (-0.01%)
instructions in affected programs: 158931 -> 156676 (-1.42%)
helped: 423 / HURT: 0
total cycles in shared programs: 921523326 -> 921522784 (<.01%)
cycles in affected programs: 7522774 -> 7522232 (<.01%)
helped: 225 / HURT: 228
LOST: 1
GAINED: 7
Meteor Lake and all older Intel platforms had similar results. (Meteor Lake shown)
total instructions in shared programs: 19820211 -> 19820303 (<.01%)
instructions in affected programs: 53087 -> 53179 (0.17%)
helped: 135 / HURT: 1
total cycles in shared programs: 906380523 -> 906383031 (<.01%)
cycles in affected programs: 1402315 -> 1404823 (0.18%)
helped: 156 / HURT: 100
LOST: 1
GAINED: 16
fossil-db:
Lunar Lake
Totals:
Instrs: 141876801 -> 141783010 (-0.07%); split: -0.07%, +0.00%
Subgroup size: 10994624 -> 10994704 (+0.00%)
Cycle count: 22173441950 -> 22172949188 (-0.00%); split: -0.01%, +0.01%
Spill count: 69850 -> 69890 (+0.06%); split: -0.00%, +0.06%
Fill count: 129285 -> 128877 (-0.32%)
Max live registers: 48047900 -> 48043650 (-0.01%); split: -0.01%, +0.00%
Totals from 29837 (5.41% of 551396) affected shaders:
Instrs: 7842512 -> 7748721 (-1.20%); split: -1.23%, +0.03%
Subgroup size: 940320 -> 940400 (+0.01%)
Cycle count: 3444846368 -> 3444353606 (-0.01%); split: -0.09%, +0.08%
Spill count: 23358 -> 23398 (+0.17%); split: -0.01%, +0.18%
Fill count: 52296 -> 51888 (-0.78%)
Max live registers: 3183481 -> 3179231 (-0.13%); split: -0.16%, +0.03%
Meteor Lake
Totals:
Instrs: 152709353 -> 152666543 (-0.03%); split: -0.03%, +0.00%
Cycle count: 17397176906 -> 17397668904 (+0.00%); split: -0.00%, +0.01%
Fill count: 147896 -> 147893 (-0.00%)
Max live registers: 31862891 -> 31861888 (-0.00%); split: -0.00%, +0.00%
Max dispatch width: 5559664 -> 5561776 (+0.04%); split: +0.08%, -0.04%
Totals from 20913 (3.30% of 633046) affected shaders:
Instrs: 6676676 -> 6633866 (-0.64%); split: -0.64%, +0.00%
Cycle count: 1498330125 -> 1498822123 (+0.03%); split: -0.06%, +0.09%
Fill count: 41010 -> 41007 (-0.01%)
Max live registers: 1799295 -> 1798292 (-0.06%); split: -0.06%, +0.00%
Max dispatch width: 12880 -> 14992 (+16.40%); split: +33.29%, -16.89%
DG2 and Tiger Lake had similar results. (DG2 shown)
Totals:
Instrs: 152730878 -> 152688139 (-0.03%); split: -0.03%, +0.00%
Cycle count: 17394835605 -> 17394179808 (-0.00%); split: -0.01%, +0.00%
Max live registers: 31862843 -> 31861840 (-0.00%); split: -0.00%, +0.00%
Max dispatch width: 5559664 -> 5561776 (+0.04%); split: +0.08%, -0.04%
Totals from 20912 (3.30% of 633046) affected shaders:
Instrs: 6563021 -> 6520282 (-0.65%); split: -0.65%, +0.00%
Cycle count: 1201999616 -> 1201343819 (-0.05%); split: -0.08%, +0.03%
Max live registers: 1798392 -> 1797389 (-0.06%); split: -0.06%, +0.00%
Max dispatch width: 12872 -> 14984 (+16.41%); split: +33.31%, -16.90%
Ice Lake
Totals:
Instrs: 151914872 -> 151868108 (-0.03%)
Cycle count: 15262958696 -> 15262665082 (-0.00%); split: -0.00%, +0.00%
Max live registers: 32194225 -> 32193192 (-0.00%); split: -0.00%, +0.00%
Max dispatch width: 5650880 -> 5650608 (-0.00%); split: +0.02%, -0.03%
Totals from 22192 (3.48% of 637223) affected shaders:
Instrs: 6419739 -> 6372975 (-0.73%)
Cycle count: 184733818 -> 184440204 (-0.16%); split: -0.36%, +0.20%
Max live registers: 1989950 -> 1988917 (-0.05%); split: -0.05%, +0.00%
Max dispatch width: 5744 -> 5472 (-4.74%); split: +23.40%, -28.13%
Skylake
Totals:
Instrs: 141027379 -> 140811741 (-0.15%)
Cycle count: 14817704293 -> 14817418611 (-0.00%); split: -0.01%, +0.01%
Max live registers: 31628796 -> 31627791 (-0.00%); split: -0.00%, +0.00%
Max dispatch width: 5535176 -> 5539880 (+0.08%); split: +0.14%, -0.06%
Totals from 22218 (3.53% of 628840) affected shaders:
Instrs: 5944856 -> 5729218 (-3.63%)
Cycle count: 182845101 -> 182559419 (-0.16%); split: -0.60%, +0.44%
Max live registers: 1974576 -> 1973571 (-0.05%); split: -0.07%, +0.02%
Max dispatch width: 16912 -> 21616 (+27.81%); split: +46.93%, -19.11%
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2023-12-01 09:53:24 -08:00
|
|
|
reg.is_scalar = true;
|
|
|
|
|
|
2024-08-16 00:11:04 -07:00
|
|
|
brw_reg comps[NIR_MAX_VEC_COMPONENTS];
|
2023-12-30 00:46:12 -08:00
|
|
|
|
2015-07-29 14:16:51 -07:00
|
|
|
switch (instr->def.bit_size) {
|
2018-07-27 13:38:39 +02:00
|
|
|
case 8:
|
|
|
|
|
for (unsigned i = 0; i < instr->def.num_components; i++)
|
2023-12-30 00:46:12 -08:00
|
|
|
comps[i] = setup_imm_b(bld, instr->value[i].i8);
|
2018-07-27 13:38:39 +02:00
|
|
|
break;
|
|
|
|
|
|
2018-04-10 10:02:29 +02:00
|
|
|
case 16:
|
|
|
|
|
for (unsigned i = 0; i < instr->def.num_components; i++)
|
2023-12-30 00:46:12 -08:00
|
|
|
comps[i] = brw_imm_w(instr->value[i].i16);
|
2018-04-10 10:02:29 +02:00
|
|
|
break;
|
|
|
|
|
|
2015-07-29 14:16:51 -07:00
|
|
|
case 32:
|
|
|
|
|
for (unsigned i = 0; i < instr->def.num_components; i++)
|
2023-12-30 00:46:12 -08:00
|
|
|
comps[i] = brw_imm_d(instr->value[i].i32);
|
2015-07-29 14:16:51 -07:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case 64:
|
2022-09-09 10:27:28 -07:00
|
|
|
if (!devinfo->has_64bit_int) {
|
2023-12-30 00:46:12 -08:00
|
|
|
reg.type = BRW_TYPE_DF;
|
|
|
|
|
for (unsigned i = 0; i < instr->def.num_components; i++)
|
|
|
|
|
comps[i] = brw_imm_df(instr->value[i].f64);
|
2017-11-02 18:32:39 -07:00
|
|
|
} else {
|
|
|
|
|
for (unsigned i = 0; i < instr->def.num_components; i++)
|
2023-12-30 00:46:12 -08:00
|
|
|
comps[i] = brw_imm_q(instr->value[i].i64);
|
2017-11-02 18:32:39 -07:00
|
|
|
}
|
2015-07-29 14:16:51 -07:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
default:
|
2025-07-23 09:17:35 +02:00
|
|
|
UNREACHABLE("Invalid bit size");
|
2015-07-29 14:16:51 -07:00
|
|
|
}
|
2015-06-25 16:22:26 -07:00
|
|
|
|
2023-12-30 00:46:12 -08:00
|
|
|
bld.VEC(reg, comps, instr->def.num_components);
|
|
|
|
|
|
2023-12-05 15:27:29 -08:00
|
|
|
ntb.ssa_values[instr->def.index] = reg;
|
2015-06-25 16:22:26 -07:00
|
|
|
}
|
|
|
|
|
|
2023-11-20 15:21:11 -08:00
|
|
|
static bool
|
2023-12-05 15:27:29 -08:00
|
|
|
get_nir_src_bindless(nir_to_brw_state &ntb, const nir_src &src)
|
2023-01-13 12:26:01 +02:00
|
|
|
{
|
2023-12-05 15:27:29 -08:00
|
|
|
return ntb.ssa_bind_infos[src.ssa->index].bindless;
|
2023-01-13 12:26:01 +02:00
|
|
|
}
|
|
|
|
|
|
2024-02-12 08:43:34 -08:00
|
|
|
/**
|
|
|
|
|
* Specifying -1 for channel indicates that no channel selection should be applied.
|
|
|
|
|
*/
|
2024-06-18 23:42:59 -07:00
|
|
|
static brw_reg
|
2024-02-12 08:43:34 -08:00
|
|
|
get_nir_src(nir_to_brw_state &ntb, const nir_src &src, int channel)
|
2014-08-15 10:32:07 -07:00
|
|
|
{
|
2023-07-12 02:37:17 -05:00
|
|
|
nir_intrinsic_instr *load_reg = nir_load_reg_for_def(src.ssa);
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg reg;
|
2023-07-12 02:37:17 -05:00
|
|
|
if (!load_reg) {
|
|
|
|
|
if (nir_src_is_undef(src)) {
|
|
|
|
|
const brw_reg_type reg_type =
|
2024-04-21 00:33:52 -07:00
|
|
|
brw_type_with_size(BRW_TYPE_D, src.ssa->bit_size);
|
2023-12-05 15:27:29 -08:00
|
|
|
reg = ntb.bld.vgrf(reg_type, src.ssa->num_components);
|
2016-07-29 01:29:09 -07:00
|
|
|
} else {
|
2023-12-05 15:27:29 -08:00
|
|
|
reg = ntb.ssa_values[src.ssa->index];
|
2016-07-29 01:29:09 -07:00
|
|
|
}
|
2014-11-12 16:24:21 -08:00
|
|
|
} else {
|
2023-07-12 02:37:17 -05:00
|
|
|
nir_intrinsic_instr *decl_reg = nir_reg_get_decl(load_reg->src[0].ssa);
|
2015-11-10 21:07:45 -08:00
|
|
|
/* We don't handle indirects on locals */
|
2023-07-12 02:37:17 -05:00
|
|
|
assert(nir_intrinsic_base(load_reg) == 0);
|
|
|
|
|
assert(load_reg->intrinsic != nir_intrinsic_load_reg_indirect);
|
2023-12-05 15:27:29 -08:00
|
|
|
reg = ntb.ssa_values[decl_reg->def.index];
|
2014-11-12 16:24:21 -08:00
|
|
|
}
|
2015-06-24 12:28:47 -07:00
|
|
|
|
2024-02-15 02:51:39 -08:00
|
|
|
/* To avoid floating-point denorm flushing problems, set the type by
|
|
|
|
|
* default to an integer type - instructions that need floating point
|
|
|
|
|
* semantics will set this to F if they need to
|
|
|
|
|
*/
|
2024-04-21 00:33:52 -07:00
|
|
|
reg.type = brw_type_with_size(BRW_TYPE_D, nir_src_bit_size(src));
|
2017-08-23 17:10:33 -07:00
|
|
|
|
2024-02-12 08:43:34 -08:00
|
|
|
if (channel >= 0) {
|
|
|
|
|
reg = offset(reg, ntb.bld, channel);
|
|
|
|
|
|
|
|
|
|
/* If the dispatch width matches the scalar allocation width, offset()
|
|
|
|
|
* won't set the stride to zero. Force that here.
|
|
|
|
|
*/
|
|
|
|
|
if (reg.is_scalar)
|
|
|
|
|
reg = component(reg, 0);
|
|
|
|
|
}
|
|
|
|
|
|
2017-08-23 17:10:33 -07:00
|
|
|
return reg;
|
2014-08-15 10:32:07 -07:00
|
|
|
}
|
|
|
|
|
|
2016-05-04 15:10:25 -07:00
|
|
|
/**
|
2025-01-03 03:47:52 -08:00
|
|
|
* Return an IMM for 32-bit constants; otherwise call get_nir_src() as normal.
|
2016-05-04 15:10:25 -07:00
|
|
|
*/
|
2024-06-18 23:42:59 -07:00
|
|
|
static brw_reg
|
2023-12-05 15:27:29 -08:00
|
|
|
get_nir_src_imm(nir_to_brw_state &ntb, const nir_src &src)
|
2016-05-04 15:10:25 -07:00
|
|
|
{
|
2025-01-03 03:47:52 -08:00
|
|
|
return nir_src_is_const(src) && nir_src_bit_size(src) == 32 ?
|
2025-01-15 13:27:05 -08:00
|
|
|
brw_reg(brw_imm_d(nir_src_as_int(src))) : get_nir_src(ntb, src, 0);
|
2016-05-04 15:10:25 -07:00
|
|
|
}
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
static brw_reg
|
brw/nir: Treat some ALU results as convergent
v2: Fix for Xe2.
v3: Fix handling of 64-bit CMP results.
v4: Scalarize 16-bit comparison temporary destination when used as a
source (as was already done for 64-bit). Suggested by Ken.
shader-db:
Lunar Lake
total instructions in shared programs: 18096500 -> 18096549 (<.01%)
instructions in affected programs: 15919 -> 15968 (0.31%)
helped: 8 / HURT: 21
total cycles in shared programs: 921841300 -> 922073090 (0.03%)
cycles in affected programs: 115946336 -> 116178126 (0.20%)
helped: 386 / HURT: 135
Meteor Lake and DG2 (Meteor Lake shown)
total instructions in shared programs: 19836053 -> 19836016 (<.01%)
instructions in affected programs: 19547 -> 19510 (-0.19%)
helped: 21 / HURT: 18
total cycles in shared programs: 906713777 -> 906588541 (-0.01%)
cycles in affected programs: 96914584 -> 96789348 (-0.13%)
helped: 335 / HURT: 134
total fills in shared programs: 6712 -> 6710 (-0.03%)
fills in affected programs: 52 -> 50 (-3.85%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Tiger Lake
total instructions in shared programs: 19641284 -> 19641278 (<.01%)
instructions in affected programs: 12358 -> 12352 (-0.05%)
helped: 10 / HURT: 19
total cycles in shared programs: 865413131 -> 865460513 (<.01%)
cycles in affected programs: 74641489 -> 74688871 (0.06%)
helped: 388 / HURT: 100
total spills in shared programs: 3899 -> 3898 (-0.03%)
spills in affected programs: 17 -> 16 (-5.88%)
helped: 1 / HURT: 0
total fills in shared programs: 3249 -> 3245 (-0.12%)
fills in affected programs: 51 -> 47 (-7.84%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20495826 -> 20496111 (<.01%)
instructions in affected programs: 53220 -> 53505 (0.54%)
helped: 28 / HURT: 16
total cycles in shared programs: 875173550 -> 875243910 (<.01%)
cycles in affected programs: 51700652 -> 51771012 (0.14%)
helped: 400 / HURT: 39
total spills in shared programs: 4546 -> 4546 (0.00%)
spills in affected programs: 288 -> 288 (0.00%)
helped: 1 / HURT: 2
total fills in shared programs: 5224 -> 5280 (1.07%)
fills in affected programs: 795 -> 851 (7.04%)
helped: 0 / HURT: 4
LOST: 1
GAINED: 1
fossil-db:
Lunar Lake
Totals:
Instrs: 141811551 -> 141807640 (-0.00%); split: -0.00%, +0.00%
Cycle count: 22183128332 -> 22181285594 (-0.01%); split: -0.06%, +0.05%
Spill count: 69890 -> 69859 (-0.04%); split: -0.09%, +0.04%
Fill count: 128877 -> 128344 (-0.41%); split: -0.42%, +0.00%
Max live registers: 48053415 -> 48051613 (-0.00%); split: -0.00%, +0.00%
Totals from 6817 (1.24% of 551443) affected shaders:
Instrs: 4300169 -> 4296258 (-0.09%); split: -0.14%, +0.05%
Cycle count: 17263755610 -> 17261912872 (-0.01%); split: -0.08%, +0.07%
Spill count: 41822 -> 41791 (-0.07%); split: -0.15%, +0.07%
Fill count: 75523 -> 74990 (-0.71%); split: -0.71%, +0.01%
Max live registers: 733647 -> 731845 (-0.25%); split: -0.29%, +0.04%
Meteor Lake and all older Intel platforms had similar results. (Meteor Lake shown)
Totals:
Instrs: 152735305 -> 152735801 (+0.00%); split: -0.00%, +0.00%
Subgroup size: 7733536 -> 7733616 (+0.00%)
Cycle count: 17398725539 -> 17400873100 (+0.01%); split: -0.00%, +0.02%
Max live registers: 31887018 -> 31885742 (-0.00%); split: -0.00%, +0.00%
Max dispatch width: 5561696 -> 5561712 (+0.00%)
Totals from 5672 (0.90% of 633314) affected shaders:
Instrs: 2817606 -> 2818102 (+0.02%); split: -0.05%, +0.07%
Subgroup size: 81128 -> 81208 (+0.10%)
Cycle count: 10021470543 -> 10023618104 (+0.02%); split: -0.01%, +0.03%
Max live registers: 306520 -> 305244 (-0.42%); split: -0.43%, +0.01%
Max dispatch width: 74136 -> 74152 (+0.02%)
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-01-30 18:53:05 -08:00
|
|
|
get_nir_def(nir_to_brw_state &ntb, const nir_def &def, bool all_sources_uniform)
|
2014-08-15 10:32:07 -07:00
|
|
|
{
|
2023-08-14 08:34:38 -05:00
|
|
|
nir_intrinsic_instr *store_reg = nir_store_reg_for_def(&def);
|
2024-02-01 15:02:37 -08:00
|
|
|
bool is_scalar = false;
|
|
|
|
|
|
|
|
|
|
if (def.parent_instr->type == nir_instr_type_intrinsic &&
|
|
|
|
|
store_reg == NULL) {
|
|
|
|
|
const nir_intrinsic_instr *instr =
|
|
|
|
|
nir_instr_as_intrinsic(def.parent_instr);
|
|
|
|
|
|
|
|
|
|
switch (instr->intrinsic) {
|
brw/nir: Treat load_btd_{global,local}_arg_addr_intel and load_btd_shader_type_intel as convergent
No shader-db changes on any Intel platform. No fossil-db changes on
Tiger Lake, Ice Lake, or Skylake.
fossil-db:
Lunar Lake
Totals:
Instrs: 141808714 -> 141808513 (-0.00%); split: -0.00%, +0.00%
Cycle count: 22177889310 -> 22181410192 (+0.02%); split: -0.00%, +0.02%
Spill count: 69892 -> 69890 (-0.00%); split: -0.01%, +0.01%
Fill count: 128313 -> 128331 (+0.01%)
Max live registers: 48052083 -> 48052742 (+0.00%); split: -0.00%, +0.00%
Totals from 549 (0.10% of 551446) affected shaders:
Instrs: 911251 -> 911050 (-0.02%); split: -0.10%, +0.07%
Cycle count: 1244153266 -> 1247674148 (+0.28%); split: -0.04%, +0.32%
Spill count: 15849 -> 15847 (-0.01%); split: -0.04%, +0.03%
Fill count: 35087 -> 35105 (+0.05%)
Max live registers: 68047 -> 68706 (+0.97%); split: -0.25%, +1.22%
Meteor Lake
Totals:
Instrs: 152744298 -> 152741241 (-0.00%); split: -0.00%, +0.00%
Cycle count: 17410258529 -> 17405949054 (-0.02%); split: -0.04%, +0.01%
Spill count: 78528 -> 78598 (+0.09%); split: -0.01%, +0.09%
Fill count: 147893 -> 147978 (+0.06%); split: -0.00%, +0.06%
Scratch Memory Size: 3962880 -> 3969024 (+0.16%)
Max live registers: 31887206 -> 31887413 (+0.00%); split: -0.00%, +0.00%
Totals from 552 (0.09% of 633315) affected shaders:
Instrs: 907279 -> 904222 (-0.34%); split: -0.48%, +0.15%
Cycle count: 1152358569 -> 1148049094 (-0.37%); split: -0.56%, +0.19%
Spill count: 15290 -> 15360 (+0.46%); split: -0.03%, +0.48%
Fill count: 35313 -> 35398 (+0.24%); split: -0.02%, +0.26%
Scratch Memory Size: 1313792 -> 1319936 (+0.47%)
Max live registers: 34218 -> 34425 (+0.60%); split: -0.47%, +1.08%
DG2
Totals:
Instrs: 152766492 -> 152763061 (-0.00%); split: -0.00%, +0.00%
Cycle count: 17406058608 -> 17406396943 (+0.00%); split: -0.02%, +0.02%
Spill count: 78626 -> 78624 (-0.00%); split: -0.01%, +0.01%
Fill count: 147956 -> 148007 (+0.03%); split: -0.01%, +0.04%
Scratch Memory Size: 3962880 -> 3969024 (+0.16%)
Max live registers: 31887158 -> 31887365 (+0.00%); split: -0.00%, +0.00%
Totals from 552 (0.09% of 633315) affected shaders:
Instrs: 908513 -> 905082 (-0.38%); split: -0.47%, +0.09%
Cycle count: 1148162185 -> 1148500520 (+0.03%); split: -0.23%, +0.26%
Spill count: 15364 -> 15362 (-0.01%); split: -0.07%, +0.06%
Fill count: 35343 -> 35394 (+0.14%); split: -0.03%, +0.17%
Scratch Memory Size: 1313792 -> 1319936 (+0.47%)
Max live registers: 34218 -> 34425 (+0.60%); split: -0.47%, +1.08%
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-07-06 11:49:44 -07:00
|
|
|
case nir_intrinsic_load_btd_global_arg_addr_intel:
|
|
|
|
|
case nir_intrinsic_load_btd_local_arg_addr_intel:
|
|
|
|
|
case nir_intrinsic_load_btd_shader_type_intel:
|
brw/nir: Treat load_*_uniform_block_intel as convergent
Between 5 and 10 shaders (depending on the platform) from Blender are
massively helped for spills and fills (e.g., from 45 spills to 0, and
180 fills to 0).
Previously this commit cause a lot of spill and fill damage to
Wolfenstein Youngblood and Red Dead Redemption 2. I believe due to
!32041 and !32097, this is no longer the case. RDR2 is helped, and
Wolfenstein Youngblood has no changes.
However, q2rtx/q2rtx-rt-pipeline is hurt:
Spill count: 126 -> 175 (+38.89%); split: -0.79%, +39.68%
Fill count: 156 -> 235 (+50.64%); split: -1.92%, +52.56%
By the end of this series this damage is fixed, and q2rtx is helped
overall by -0.79% spills and -1.92% fills.
v2: Fix for Xe2.
v3: Just keep using bld for the group(1, 0) call. Suggested by Ken.
v4: Major re-write. Pass bld and xbld to fs_emit_memory_access. The big
fix is changing the way srcs[MEMORY_LOGICAL_ADDRESS] is calculated
(around line 7180). In previous versions of the commit, the address
would be calculated using bld (which is now xbld) even if the address
source was not is_scalar. This could cause the emit_uniformize (later in
the function) to fetch garbage. This also drops the special case
handling of constant offset. Constant propagation and algebraic will
handle this.
v5: Fix a subtle bug that was ultimately caused by the removal of
offset_to_component. The MEMORY_LOGICAL_ADDRESS for
load_shared_uniform_block_intel was being calculated as SIMD16 on LNL,
but the later emit_uniformize would treat it as SIMD32. This caused GPU
hangs in Assassin's Creed Valhalla.
v6: Fix a bug in D16 to D16U32 expansion. Noticed by Ken. Add a comment
explaining bld vs xbld vs ubld in fs_nir_emit_memory_access. Suggested
by Ken.
v7: Revert some of the v6 changes related to D16 to D16U32
expansion. This code was mostly correct. xbld is correct because DATA0
needs to be generated in size of the eventual SEND instruction. Using
offset(nir_src, xbld, c) will cause offset() to correctly added
component(..., 0) if nir_src.is_scalar but xbld is not scalar_group().
v8: nir_intrinsic_load_shared_uniform_block_intel was removed. This
caused reproducible hangs in Assassin's Creed: Valhalla. There are some
other compiler issues related to this game, and we're not yet sure
exactly what the cause of any of it is.
shader-db:
Lunar Lake
total instructions in shared programs: 18058270 -> 18068886 (0.06%)
instructions in affected programs: 5196846 -> 5207462 (0.20%)
helped: 4442 / HURT: 11416
total cycles in shared programs: 921324492 -> 919819398 (-0.16%)
cycles in affected programs: 733274162 -> 731769068 (-0.21%)
helped: 11312 / HURT: 31788
total spills in shared programs: 3633 -> 3585 (-1.32%)
spills in affected programs: 48 -> 0
helped: 5 / HURT: 0
total fills in shared programs: 2277 -> 2198 (-3.47%)
fills in affected programs: 79 -> 0
helped: 5 / HURT: 0
LOST: 123
GAINED: 377
Meteor Lake, DG2, and Tiger Lake had similar results. (Meteor Lake shown)
total instructions in shared programs: 19703458 -> 19699173 (-0.02%)
instructions in affected programs: 5885251 -> 5880966 (-0.07%)
helped: 4545 / HURT: 14971
total cycles in shared programs: 903497253 -> 902054570 (-0.16%)
cycles in affected programs: 691762248 -> 690319565 (-0.21%)
helped: 16412 / HURT: 28080
total spills in shared programs: 4894 -> 4646 (-5.07%)
spills in affected programs: 248 -> 0
helped: 7 / HURT: 0
total fills in shared programs: 6638 -> 5581 (-15.92%)
fills in affected programs: 1057 -> 0
helped: 7 / HURT: 0
LOST: 427
GAINED: 978
Ice Lake and Skylake had similar results. (Ice Lake shonw)
total instructions in shared programs: 20384200 -> 20384889 (<.01%)
instructions in affected programs: 5295084 -> 5295773 (0.01%)
helped: 5309 / HURT: 12564
total cycles in shared programs: 873002832 -> 872515246 (-0.06%)
cycles in affected programs: 463413458 -> 462925872 (-0.11%)
helped: 16079 / HURT: 13339
total spills in shared programs: 4552 -> 4373 (-3.93%)
spills in affected programs: 546 -> 367 (-32.78%)
helped: 11 / HURT: 0
total fills in shared programs: 5298 -> 4657 (-12.10%)
fills in affected programs: 1798 -> 1157 (-35.65%)
helped: 10 / HURT: 0
LOST: 380
GAINED: 925
fossil-db:
All Intel platforms had similar results. (Lunar Lake shown)
Totals:
Instrs: 141528822 -> 141728392 (+0.14%); split: -0.21%, +0.35%
Subgroup size: 10968048 -> 10968144 (+0.00%)
Send messages: 6567930 -> 6567909 (-0.00%)
Cycle count: 22165780202 -> 21624534624 (-2.44%); split: -3.09%, +0.65%
Spill count: 69890 -> 66665 (-4.61%); split: -5.06%, +0.44%
Fill count: 128331 -> 120189 (-6.34%); split: -7.44%, +1.09%
Scratch Memory Size: 5829632 -> 5664768 (-2.83%); split: -2.86%, +0.04%
Max live registers: 47928290 -> 47611371 (-0.66%); split: -0.71%, +0.05%
Totals from 364369 (66.18% of 550563) affected shaders:
Instrs: 113448842 -> 113648412 (+0.18%); split: -0.26%, +0.44%
Subgroup size: 7694080 -> 7694176 (+0.00%)
Send messages: 5308287 -> 5308266 (-0.00%)
Cycle count: 21885237842 -> 21343992264 (-2.47%); split: -3.13%, +0.65%
Spill count: 65152 -> 61927 (-4.95%); split: -5.42%, +0.47%
Fill count: 122811 -> 114669 (-6.63%); split: -7.77%, +1.14%
Scratch Memory Size: 5438464 -> 5273600 (-3.03%); split: -3.07%, +0.04%
Max live registers: 34355310 -> 34038391 (-0.92%); split: -1.00%, +0.07%
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-02-28 10:52:47 -08:00
|
|
|
case nir_intrinsic_load_global_constant_uniform_block_intel:
|
2024-07-05 16:35:33 -07:00
|
|
|
case nir_intrinsic_load_inline_data_intel:
|
brw/nir: Treat load_reloc_const_intel as convergent
shader-db:
Lunar Lake, Meteor Lake, DG2, and Tiger Lake had similar results. (Lunar Lake shown)
Lunar Lake
total instructions in shared programs: 18096549 -> 18096537 (<.01%)
instructions in affected programs: 26128 -> 26116 (-0.05%)
helped: 7 / HURT: 2
total cycles in shared programs: 922073090 -> 922093922 (<.01%)
cycles in affected programs: 10574198 -> 10595030 (0.20%)
helped: 19 / HURT: 76
Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20503943 -> 20504053 (<.01%)
instructions in affected programs: 23378 -> 23488 (0.47%)
helped: 6 / HURT: 5
total cycles in shared programs: 875477036 -> 875480112 (<.01%)
cycles in affected programs: 13840528 -> 13843604 (0.02%)
helped: 22 / HURT: 55
total spills in shared programs: 4546 -> 4552 (0.13%)
spills in affected programs: 8 -> 14 (75.00%)
helped: 0 / HURT: 1
total fills in shared programs: 5280 -> 5298 (0.34%)
fills in affected programs: 24 -> 42 (75.00%)
helped: 0 / HURT: 1
One compute shader in Tomb Raider was hurt for spills and fills.
fossil-db:
Lunar Lake
Totals:
Instrs: 141808815 -> 141808714 (-0.00%); split: -0.00%, +0.00%
Cycle count: 22185066952 -> 22177889310 (-0.03%); split: -0.05%, +0.02%
Spill count: 69859 -> 69892 (+0.05%); split: -0.03%, +0.07%
Fill count: 128344 -> 128313 (-0.02%); split: -0.04%, +0.01%
Scratch Memory Size: 5833728 -> 5829632 (-0.07%)
Totals from 13384 (2.43% of 551446) affected shaders:
Instrs: 13852162 -> 13852061 (-0.00%); split: -0.00%, +0.00%
Cycle count: 7691993336 -> 7684815694 (-0.09%); split: -0.15%, +0.06%
Spill count: 53266 -> 53299 (+0.06%); split: -0.03%, +0.10%
Fill count: 96492 -> 96461 (-0.03%); split: -0.05%, +0.02%
Scratch Memory Size: 3827712 -> 3823616 (-0.11%)
Meteor Lake and DG2 had similar results. (Meteor Lake shown)
Totals:
Instrs: 152744735 -> 152744298 (-0.00%); split: -0.00%, +0.00%
Cycle count: 17400199290 -> 17410258529 (+0.06%); split: -0.01%, +0.07%
Max live registers: 31887208 -> 31887206 (-0.00%)
Totals from 12435 (1.96% of 633315) affected shaders:
Instrs: 13445310 -> 13444873 (-0.00%); split: -0.00%, +0.00%
Cycle count: 6941685096 -> 6951744335 (+0.14%); split: -0.03%, +0.18%
Max live registers: 1071302 -> 1071300 (-0.00%)
Tiger Lake and Ice Lake had similar results. (Tiger Lake shown)
Totals:
Instrs: 150644063 -> 150643944 (-0.00%); split: -0.00%, +0.00%
Cycle count: 15618718733 -> 15622092285 (+0.02%); split: -0.01%, +0.03%
Spill count: 58816 -> 58790 (-0.04%)
Fill count: 101054 -> 101065 (+0.01%)
Max live registers: 31792771 -> 31792766 (-0.00%); split: -0.00%, +0.00%
Totals from 13383 (2.12% of 632544) affected shaders:
Instrs: 12016285 -> 12016166 (-0.00%); split: -0.00%, +0.00%
Cycle count: 5239956851 -> 5243330403 (+0.06%); split: -0.02%, +0.08%
Spill count: 28977 -> 28951 (-0.09%)
Fill count: 47568 -> 47579 (+0.02%)
Max live registers: 1001554 -> 1001549 (-0.00%); split: -0.00%, +0.00%
Skylake
Totals:
Instrs: 140943195 -> 140943154 (-0.00%); split: -0.00%, +0.00%
Cycle count: 14818940190 -> 14816706154 (-0.02%); split: -0.02%, +0.00%
Max live registers: 31663173 -> 31663168 (-0.00%); split: -0.00%, +0.00%
Totals from 12625 (2.01% of 629351) affected shaders:
Instrs: 11598223 -> 11598182 (-0.00%); split: -0.00%, +0.00%
Cycle count: 4519027823 -> 4516793787 (-0.05%); split: -0.05%, +0.00%
Max live registers: 970275 -> 970270 (-0.00%); split: -0.00%, +0.00%
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-07-06 09:18:44 -07:00
|
|
|
case nir_intrinsic_load_reloc_const_intel:
|
brw/nir: Treat load_*_uniform_block_intel as convergent
Between 5 and 10 shaders (depending on the platform) from Blender are
massively helped for spills and fills (e.g., from 45 spills to 0, and
180 fills to 0).
Previously this commit cause a lot of spill and fill damage to
Wolfenstein Youngblood and Red Dead Redemption 2. I believe due to
!32041 and !32097, this is no longer the case. RDR2 is helped, and
Wolfenstein Youngblood has no changes.
However, q2rtx/q2rtx-rt-pipeline is hurt:
Spill count: 126 -> 175 (+38.89%); split: -0.79%, +39.68%
Fill count: 156 -> 235 (+50.64%); split: -1.92%, +52.56%
By the end of this series this damage is fixed, and q2rtx is helped
overall by -0.79% spills and -1.92% fills.
v2: Fix for Xe2.
v3: Just keep using bld for the group(1, 0) call. Suggested by Ken.
v4: Major re-write. Pass bld and xbld to fs_emit_memory_access. The big
fix is changing the way srcs[MEMORY_LOGICAL_ADDRESS] is calculated
(around line 7180). In previous versions of the commit, the address
would be calculated using bld (which is now xbld) even if the address
source was not is_scalar. This could cause the emit_uniformize (later in
the function) to fetch garbage. This also drops the special case
handling of constant offset. Constant propagation and algebraic will
handle this.
v5: Fix a subtle bug that was ultimately caused by the removal of
offset_to_component. The MEMORY_LOGICAL_ADDRESS for
load_shared_uniform_block_intel was being calculated as SIMD16 on LNL,
but the later emit_uniformize would treat it as SIMD32. This caused GPU
hangs in Assassin's Creed Valhalla.
v6: Fix a bug in D16 to D16U32 expansion. Noticed by Ken. Add a comment
explaining bld vs xbld vs ubld in fs_nir_emit_memory_access. Suggested
by Ken.
v7: Revert some of the v6 changes related to D16 to D16U32
expansion. This code was mostly correct. xbld is correct because DATA0
needs to be generated in size of the eventual SEND instruction. Using
offset(nir_src, xbld, c) will cause offset() to correctly added
component(..., 0) if nir_src.is_scalar but xbld is not scalar_group().
v8: nir_intrinsic_load_shared_uniform_block_intel was removed. This
caused reproducible hangs in Assassin's Creed: Valhalla. There are some
other compiler issues related to this game, and we're not yet sure
exactly what the cause of any of it is.
shader-db:
Lunar Lake
total instructions in shared programs: 18058270 -> 18068886 (0.06%)
instructions in affected programs: 5196846 -> 5207462 (0.20%)
helped: 4442 / HURT: 11416
total cycles in shared programs: 921324492 -> 919819398 (-0.16%)
cycles in affected programs: 733274162 -> 731769068 (-0.21%)
helped: 11312 / HURT: 31788
total spills in shared programs: 3633 -> 3585 (-1.32%)
spills in affected programs: 48 -> 0
helped: 5 / HURT: 0
total fills in shared programs: 2277 -> 2198 (-3.47%)
fills in affected programs: 79 -> 0
helped: 5 / HURT: 0
LOST: 123
GAINED: 377
Meteor Lake, DG2, and Tiger Lake had similar results. (Meteor Lake shown)
total instructions in shared programs: 19703458 -> 19699173 (-0.02%)
instructions in affected programs: 5885251 -> 5880966 (-0.07%)
helped: 4545 / HURT: 14971
total cycles in shared programs: 903497253 -> 902054570 (-0.16%)
cycles in affected programs: 691762248 -> 690319565 (-0.21%)
helped: 16412 / HURT: 28080
total spills in shared programs: 4894 -> 4646 (-5.07%)
spills in affected programs: 248 -> 0
helped: 7 / HURT: 0
total fills in shared programs: 6638 -> 5581 (-15.92%)
fills in affected programs: 1057 -> 0
helped: 7 / HURT: 0
LOST: 427
GAINED: 978
Ice Lake and Skylake had similar results. (Ice Lake shonw)
total instructions in shared programs: 20384200 -> 20384889 (<.01%)
instructions in affected programs: 5295084 -> 5295773 (0.01%)
helped: 5309 / HURT: 12564
total cycles in shared programs: 873002832 -> 872515246 (-0.06%)
cycles in affected programs: 463413458 -> 462925872 (-0.11%)
helped: 16079 / HURT: 13339
total spills in shared programs: 4552 -> 4373 (-3.93%)
spills in affected programs: 546 -> 367 (-32.78%)
helped: 11 / HURT: 0
total fills in shared programs: 5298 -> 4657 (-12.10%)
fills in affected programs: 1798 -> 1157 (-35.65%)
helped: 10 / HURT: 0
LOST: 380
GAINED: 925
fossil-db:
All Intel platforms had similar results. (Lunar Lake shown)
Totals:
Instrs: 141528822 -> 141728392 (+0.14%); split: -0.21%, +0.35%
Subgroup size: 10968048 -> 10968144 (+0.00%)
Send messages: 6567930 -> 6567909 (-0.00%)
Cycle count: 22165780202 -> 21624534624 (-2.44%); split: -3.09%, +0.65%
Spill count: 69890 -> 66665 (-4.61%); split: -5.06%, +0.44%
Fill count: 128331 -> 120189 (-6.34%); split: -7.44%, +1.09%
Scratch Memory Size: 5829632 -> 5664768 (-2.83%); split: -2.86%, +0.04%
Max live registers: 47928290 -> 47611371 (-0.66%); split: -0.71%, +0.05%
Totals from 364369 (66.18% of 550563) affected shaders:
Instrs: 113448842 -> 113648412 (+0.18%); split: -0.26%, +0.44%
Subgroup size: 7694080 -> 7694176 (+0.00%)
Send messages: 5308287 -> 5308266 (-0.00%)
Cycle count: 21885237842 -> 21343992264 (-2.47%); split: -3.13%, +0.65%
Spill count: 65152 -> 61927 (-4.95%); split: -5.42%, +0.47%
Fill count: 122811 -> 114669 (-6.63%); split: -7.77%, +1.14%
Scratch Memory Size: 5438464 -> 5273600 (-3.03%); split: -3.07%, +0.04%
Max live registers: 34355310 -> 34038391 (-0.92%); split: -1.00%, +0.07%
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-02-28 10:52:47 -08:00
|
|
|
case nir_intrinsic_load_ssbo_uniform_block_intel:
|
|
|
|
|
case nir_intrinsic_load_ubo_uniform_block_intel:
|
2024-01-30 18:14:02 -08:00
|
|
|
case nir_intrinsic_load_workgroup_id:
|
|
|
|
|
is_scalar = true;
|
|
|
|
|
break;
|
|
|
|
|
|
brw/nir: Treat some load_ubo as convergent
v2: Fix for Xe2.
No changes in shader-db or fossil-db on Lunar Lake, Meteor Lake, or DG2.
shader-db:
Tiger Lake, Ice Lake, and Skylake had similar results. (Tiger Lake shown)
total instructions in shared programs: 19626547 -> 19634353 (0.04%)
instructions in affected programs: 1591181 -> 1598987 (0.49%)
helped: 925 / HURT: 3595
total cycles in shared programs: 865236718 -> 866682659 (0.17%)
cycles in affected programs: 151284264 -> 152730205 (0.96%)
helped: 3430 / HURT: 5510
total sends in shared programs: 1032237 -> 1032233 (<.01%)
sends in affected programs: 20 -> 16 (-20.00%)
helped: 4 / HURT: 0
LOST: 48
GAINED: 141
fossil-db:
Tiger Lake, Ice Lake, and Skylake had similar results. (Tiger Lake shown)
Totals:
Instrs: 150662952 -> 150641175 (-0.01%); split: -0.03%, +0.02%
Subgroup size: 7768880 -> 7768888 (+0.00%)
Send messages: 7502265 -> 7502044 (-0.00%)
Cycle count: 15621785298 -> 15618640525 (-0.02%); split: -0.06%, +0.04%
Spill count: 58818 -> 58816 (-0.00%)
Fill count: 101063 -> 101054 (-0.01%)
Max live registers: 31795403 -> 31792179 (-0.01%); split: -0.01%, +0.00%
Max dispatch width: 5572160 -> 5571488 (-0.01%); split: +0.00%, -0.01%
Totals from 10278 (1.62% of 632539) affected shaders:
Instrs: 5276493 -> 5254716 (-0.41%); split: -0.89%, +0.48%
Subgroup size: 156432 -> 156440 (+0.01%)
Send messages: 279259 -> 279038 (-0.08%)
Cycle count: 6483576378 -> 6480431605 (-0.05%); split: -0.16%, +0.11%
Spill count: 27133 -> 27131 (-0.01%)
Fill count: 49384 -> 49375 (-0.02%)
Max live registers: 675781 -> 672557 (-0.48%); split: -0.49%, +0.01%
Max dispatch width: 97256 -> 96584 (-0.69%); split: +0.08%, -0.77%
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-02-14 16:22:45 -08:00
|
|
|
case nir_intrinsic_load_ubo:
|
2025-01-15 13:27:05 -08:00
|
|
|
is_scalar = get_nir_src(ntb, instr->src[1], 0).is_scalar;
|
brw/nir: Treat some load_ubo as convergent
v2: Fix for Xe2.
No changes in shader-db or fossil-db on Lunar Lake, Meteor Lake, or DG2.
shader-db:
Tiger Lake, Ice Lake, and Skylake had similar results. (Tiger Lake shown)
total instructions in shared programs: 19626547 -> 19634353 (0.04%)
instructions in affected programs: 1591181 -> 1598987 (0.49%)
helped: 925 / HURT: 3595
total cycles in shared programs: 865236718 -> 866682659 (0.17%)
cycles in affected programs: 151284264 -> 152730205 (0.96%)
helped: 3430 / HURT: 5510
total sends in shared programs: 1032237 -> 1032233 (<.01%)
sends in affected programs: 20 -> 16 (-20.00%)
helped: 4 / HURT: 0
LOST: 48
GAINED: 141
fossil-db:
Tiger Lake, Ice Lake, and Skylake had similar results. (Tiger Lake shown)
Totals:
Instrs: 150662952 -> 150641175 (-0.01%); split: -0.03%, +0.02%
Subgroup size: 7768880 -> 7768888 (+0.00%)
Send messages: 7502265 -> 7502044 (-0.00%)
Cycle count: 15621785298 -> 15618640525 (-0.02%); split: -0.06%, +0.04%
Spill count: 58818 -> 58816 (-0.00%)
Fill count: 101063 -> 101054 (-0.01%)
Max live registers: 31795403 -> 31792179 (-0.01%); split: -0.01%, +0.00%
Max dispatch width: 5572160 -> 5571488 (-0.01%); split: +0.00%, -0.01%
Totals from 10278 (1.62% of 632539) affected shaders:
Instrs: 5276493 -> 5254716 (-0.41%); split: -0.89%, +0.48%
Subgroup size: 156432 -> 156440 (+0.01%)
Send messages: 279259 -> 279038 (-0.08%)
Cycle count: 6483576378 -> 6480431605 (-0.05%); split: -0.16%, +0.11%
Spill count: 27133 -> 27131 (-0.01%)
Fill count: 49384 -> 49375 (-0.02%)
Max live registers: 675781 -> 672557 (-0.48%); split: -0.49%, +0.01%
Max dispatch width: 97256 -> 96584 (-0.69%); split: +0.08%, -0.77%
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-02-14 16:22:45 -08:00
|
|
|
break;
|
|
|
|
|
|
2024-02-01 15:02:37 -08:00
|
|
|
case nir_intrinsic_load_uniform:
|
2025-04-16 08:54:14 +03:00
|
|
|
case nir_intrinsic_load_push_constant:
|
2025-01-15 13:27:05 -08:00
|
|
|
is_scalar = get_nir_src(ntb, instr->src[0], 0).is_scalar;
|
2024-02-01 15:02:37 -08:00
|
|
|
break;
|
|
|
|
|
|
brw/nir: Treat some ballot as convergent
v2: Fix for Xe2.
v3: Add a comment explaining the use of bld instead of xbld. Suggested
by Ken. Fix a bug in handing is_scalar source. Noticed by me while
applying Ken's review feedback.
shader-db:
Lunar Lake, Meteor Lake, DG2, and Tiger Lake had similar results. (Lunar Lake shown)
total instructions in shared programs: 18228657 -> 18228689 (<.01%)
instructions in affected programs: 9333 -> 9365 (0.34%)
helped: 2 / HURT: 26
total cycles in shared programs: 932511560 -> 932542994 (<.01%)
cycles in affected programs: 2263040 -> 2294474 (1.39%)
helped: 7 / HURT: 27
Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20700370 -> 20700392 (<.01%)
instructions in affected programs: 18579 -> 18601 (0.12%)
helped: 1 / HURT: 28
total cycles in shared programs: 888385851 -> 888386325 (<.01%)
cycles in affected programs: 2571368 -> 2571842 (0.02%)
helped: 14 / HURT: 6
total spills in shared programs: 4373 -> 4371 (-0.05%)
spills in affected programs: 71 -> 69 (-2.82%)
helped: 1 / HURT: 0
total fills in shared programs: 4657 -> 4653 (-0.09%)
fills in affected programs: 196 -> 192 (-2.04%)
helped: 1 / HURT: 0
fossil-db:
Lunar Lake
Totals:
Instrs: 142887258 -> 142890605 (+0.00%); split: -0.00%, +0.00%
Cycle count: 21653599282 -> 21655049536 (+0.01%); split: -0.00%, +0.01%
Max live registers: 47942973 -> 47942837 (-0.00%)
Totals from 22209 (4.01% of 553251) affected shaders:
Instrs: 4337679 -> 4341026 (+0.08%); split: -0.00%, +0.08%
Cycle count: 261852040 -> 263302294 (+0.55%); split: -0.38%, +0.93%
Max live registers: 1299670 -> 1299534 (-0.01%)
Meteor Lake, DG2, Tiger Lake, and Skylake had similar results. (Meteor Lake shown)
Totals:
Instrs: 156599915 -> 156590882 (-0.01%); split: -0.01%, +0.00%
Cycle count: 16940072009 -> 16940902317 (+0.00%); split: -0.01%, +0.01%
Max live registers: 32610801 -> 32610488 (-0.00%)
Max dispatch width: 5730736 -> 5731744 (+0.02%); split: +0.12%, -0.11%
Totals from 35528 (5.52% of 643617) affected shaders:
Instrs: 6175409 -> 6166376 (-0.15%); split: -0.21%, +0.06%
Cycle count: 230679923 -> 231510231 (+0.36%); split: -0.46%, +0.82%
Max live registers: 1354716 -> 1354403 (-0.02%)
Max dispatch width: 167648 -> 168656 (+0.60%); split: +4.26%, -3.66%
Ice Lake
Totals:
Instrs: 155330276 -> 155318037 (-0.01%); split: -0.01%, +0.00%
Cycle count: 15019092327 -> 15019637026 (+0.00%); split: -0.00%, +0.01%
Max live registers: 32640341 -> 32637305 (-0.01%)
Max dispatch width: 5780720 -> 5780688 (-0.00%); split: +0.02%, -0.02%
Totals from 37773 (5.85% of 645641) affected shaders:
Instrs: 6643030 -> 6630791 (-0.18%); split: -0.24%, +0.05%
Cycle count: 223589025 -> 224133724 (+0.24%); split: -0.29%, +0.53%
Max live registers: 1491781 -> 1488745 (-0.20%)
Max dispatch width: 167600 -> 167568 (-0.02%); split: +0.75%, -0.77%
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-02-28 13:05:08 -08:00
|
|
|
case nir_intrinsic_ballot:
|
2024-07-06 12:11:33 -07:00
|
|
|
case nir_intrinsic_resource_intel:
|
|
|
|
|
is_scalar = !def.divergent;
|
|
|
|
|
break;
|
|
|
|
|
|
2024-02-01 15:02:37 -08:00
|
|
|
default:
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* This cannot be is_scalar if NIR thought it was divergent. */
|
|
|
|
|
assert(!(is_scalar && def.divergent));
|
brw/nir: Treat some ALU results as convergent
v2: Fix for Xe2.
v3: Fix handling of 64-bit CMP results.
v4: Scalarize 16-bit comparison temporary destination when used as a
source (as was already done for 64-bit). Suggested by Ken.
shader-db:
Lunar Lake
total instructions in shared programs: 18096500 -> 18096549 (<.01%)
instructions in affected programs: 15919 -> 15968 (0.31%)
helped: 8 / HURT: 21
total cycles in shared programs: 921841300 -> 922073090 (0.03%)
cycles in affected programs: 115946336 -> 116178126 (0.20%)
helped: 386 / HURT: 135
Meteor Lake and DG2 (Meteor Lake shown)
total instructions in shared programs: 19836053 -> 19836016 (<.01%)
instructions in affected programs: 19547 -> 19510 (-0.19%)
helped: 21 / HURT: 18
total cycles in shared programs: 906713777 -> 906588541 (-0.01%)
cycles in affected programs: 96914584 -> 96789348 (-0.13%)
helped: 335 / HURT: 134
total fills in shared programs: 6712 -> 6710 (-0.03%)
fills in affected programs: 52 -> 50 (-3.85%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Tiger Lake
total instructions in shared programs: 19641284 -> 19641278 (<.01%)
instructions in affected programs: 12358 -> 12352 (-0.05%)
helped: 10 / HURT: 19
total cycles in shared programs: 865413131 -> 865460513 (<.01%)
cycles in affected programs: 74641489 -> 74688871 (0.06%)
helped: 388 / HURT: 100
total spills in shared programs: 3899 -> 3898 (-0.03%)
spills in affected programs: 17 -> 16 (-5.88%)
helped: 1 / HURT: 0
total fills in shared programs: 3249 -> 3245 (-0.12%)
fills in affected programs: 51 -> 47 (-7.84%)
helped: 1 / HURT: 0
LOST: 1
GAINED: 1
Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20495826 -> 20496111 (<.01%)
instructions in affected programs: 53220 -> 53505 (0.54%)
helped: 28 / HURT: 16
total cycles in shared programs: 875173550 -> 875243910 (<.01%)
cycles in affected programs: 51700652 -> 51771012 (0.14%)
helped: 400 / HURT: 39
total spills in shared programs: 4546 -> 4546 (0.00%)
spills in affected programs: 288 -> 288 (0.00%)
helped: 1 / HURT: 2
total fills in shared programs: 5224 -> 5280 (1.07%)
fills in affected programs: 795 -> 851 (7.04%)
helped: 0 / HURT: 4
LOST: 1
GAINED: 1
fossil-db:
Lunar Lake
Totals:
Instrs: 141811551 -> 141807640 (-0.00%); split: -0.00%, +0.00%
Cycle count: 22183128332 -> 22181285594 (-0.01%); split: -0.06%, +0.05%
Spill count: 69890 -> 69859 (-0.04%); split: -0.09%, +0.04%
Fill count: 128877 -> 128344 (-0.41%); split: -0.42%, +0.00%
Max live registers: 48053415 -> 48051613 (-0.00%); split: -0.00%, +0.00%
Totals from 6817 (1.24% of 551443) affected shaders:
Instrs: 4300169 -> 4296258 (-0.09%); split: -0.14%, +0.05%
Cycle count: 17263755610 -> 17261912872 (-0.01%); split: -0.08%, +0.07%
Spill count: 41822 -> 41791 (-0.07%); split: -0.15%, +0.07%
Fill count: 75523 -> 74990 (-0.71%); split: -0.71%, +0.01%
Max live registers: 733647 -> 731845 (-0.25%); split: -0.29%, +0.04%
Meteor Lake and all older Intel platforms had similar results. (Meteor Lake shown)
Totals:
Instrs: 152735305 -> 152735801 (+0.00%); split: -0.00%, +0.00%
Subgroup size: 7733536 -> 7733616 (+0.00%)
Cycle count: 17398725539 -> 17400873100 (+0.01%); split: -0.00%, +0.02%
Max live registers: 31887018 -> 31885742 (-0.00%); split: -0.00%, +0.00%
Max dispatch width: 5561696 -> 5561712 (+0.00%)
Totals from 5672 (0.90% of 633314) affected shaders:
Instrs: 2817606 -> 2818102 (+0.02%); split: -0.05%, +0.07%
Subgroup size: 81128 -> 81208 (+0.10%)
Cycle count: 10021470543 -> 10023618104 (+0.02%); split: -0.01%, +0.03%
Max live registers: 306520 -> 305244 (-0.42%); split: -0.43%, +0.01%
Max dispatch width: 74136 -> 74152 (+0.02%)
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-01-30 18:53:05 -08:00
|
|
|
} else if (def.parent_instr->type == nir_instr_type_alu) {
|
|
|
|
|
is_scalar = store_reg == NULL && all_sources_uniform && !def.divergent;
|
2024-02-01 15:02:37 -08:00
|
|
|
}
|
|
|
|
|
|
2024-12-29 15:41:04 -08:00
|
|
|
const brw_builder &bld = is_scalar ? ntb.bld.scalar_group() : ntb.bld;
|
2024-02-01 15:02:37 -08:00
|
|
|
|
2023-07-12 02:37:17 -05:00
|
|
|
if (!store_reg) {
|
|
|
|
|
const brw_reg_type reg_type =
|
2024-04-21 00:33:52 -07:00
|
|
|
brw_type_with_size(def.bit_size == 8 ? BRW_TYPE_D : BRW_TYPE_F,
|
|
|
|
|
def.bit_size);
|
2023-12-05 15:27:29 -08:00
|
|
|
ntb.ssa_values[def.index] =
|
2023-08-14 08:34:38 -05:00
|
|
|
bld.vgrf(reg_type, def.num_components);
|
2024-02-26 22:50:08 -08:00
|
|
|
|
2024-02-01 15:02:37 -08:00
|
|
|
ntb.ssa_values[def.index].is_scalar = is_scalar;
|
|
|
|
|
|
2024-10-23 00:44:54 -07:00
|
|
|
bld.emit_undef_for_partial_reg(ntb.ssa_values[def.index]);
|
2024-02-26 22:50:08 -08:00
|
|
|
|
2023-12-05 15:27:29 -08:00
|
|
|
return ntb.ssa_values[def.index];
|
2015-11-10 21:07:45 -08:00
|
|
|
} else {
|
2023-07-12 02:37:17 -05:00
|
|
|
nir_intrinsic_instr *decl_reg =
|
|
|
|
|
nir_reg_get_decl(store_reg->src[1].ssa);
|
2015-11-10 21:07:45 -08:00
|
|
|
/* We don't handle indirects on locals */
|
2023-07-12 02:37:17 -05:00
|
|
|
assert(nir_intrinsic_base(store_reg) == 0);
|
|
|
|
|
assert(store_reg->intrinsic != nir_intrinsic_store_reg_indirect);
|
2024-02-01 15:02:37 -08:00
|
|
|
assert(!is_scalar);
|
2023-12-05 15:27:29 -08:00
|
|
|
return ntb.ssa_values[decl_reg->def.index];
|
2015-06-24 12:28:47 -07:00
|
|
|
}
|
2014-08-15 10:32:07 -07:00
|
|
|
}
|
|
|
|
|
|
2023-11-20 15:21:11 -08:00
|
|
|
static nir_component_mask_t
get_nir_write_mask(const nir_def &def)
{
   /* A def written through a NIR register store uses that store's write
    * mask; a plain SSA def writes every one of its components.
    */
   nir_intrinsic_instr *store_reg = nir_store_reg_for_def(&def);
   return store_reg != NULL ? nir_intrinsic_write_mask(store_reg)
                            : nir_component_mask(def.num_components);
}
|
|
|
|
|
|
2024-12-07 00:23:07 -08:00
|
|
|
static brw_inst *
emit_pixel_interpolater_send(const brw_builder &bld,
                             enum opcode opcode,
                             const brw_reg &dst,
                             const brw_reg &src,
                             const brw_reg &desc,
                             const brw_reg &flag_reg,
                             glsl_interp_mode interpolation)
{
   /* Emit a pixel interpolator message returning a pair of barycentric
    * coordinates per slot into \p dst.
    */
   struct brw_wm_prog_data *wm_prog_data =
      brw_wm_prog_data(bld.shader->prog_data);

   const bool noperspective = interpolation == INTERP_MODE_NOPERSPECTIVE;

   brw_reg srcs[INTERP_NUM_SRCS];

   /* A convergent offset must be replicated into a two-component vector
    * before being handed to the interpolator message.
    */
   if (src.is_scalar) {
      srcs[INTERP_SRC_OFFSET] = bld.vgrf(src.type, 2);
      brw_combine_with_vec(bld, srcs[INTERP_SRC_OFFSET], src, 2);
   } else {
      srcs[INTERP_SRC_OFFSET] = src;
   }

   srcs[INTERP_SRC_MSG_DESC]      = desc;
   srcs[INTERP_SRC_DYNAMIC_MODE]  = flag_reg;
   srcs[INTERP_SRC_NOPERSPECTIVE] = brw_imm_ud(noperspective);

   if (noperspective) {
      /* TGL BSpec says:
       * This field cannot be set to "Linear Interpolation"
       * unless Non-Perspective Barycentric Enable in 3DSTATE_CLIP is enabled"
       */
      wm_prog_data->uses_nonperspective_interp_modes = true;
   }

   brw_inst *inst = bld.emit(opcode, dst, srcs, INTERP_NUM_SRCS);

   /* 2 floats per slot returned */
   inst->size_written = 2 * dst.component_size(inst->exec_size);

   wm_prog_data->pulls_bary = true;

   return inst;
}
|
|
|
|
|
|
2022-08-10 17:33:56 -07:00
|
|
|
/**
|
|
|
|
|
* Return the specified component \p subreg of a per-polygon PS
|
|
|
|
|
* payload register for the polygon corresponding to each channel
|
|
|
|
|
* specified in the provided \p bld.
|
|
|
|
|
*
|
|
|
|
|
* \p reg specifies the payload register in REG_SIZE units for the
|
|
|
|
|
* first polygon dispatched to the thread. This function requires
|
|
|
|
|
* that subsequent registers on the payload contain the corresponding
|
|
|
|
|
* register for subsequent polygons, one GRF register per polygon, if
|
|
|
|
|
* multiple polygons are being processed by the same PS thread.
|
|
|
|
|
*
|
|
|
|
|
* This can be used to access the value of a "Source Depth and/or W
|
|
|
|
|
* Attribute Vertex Deltas", "Perspective Bary Planes" or
|
|
|
|
|
* "Non-Perspective Bary Planes" payload field conveniently for
|
2024-06-18 23:42:59 -07:00
|
|
|
* multiple polygons as a single brw_reg.
|
2022-08-10 17:33:56 -07:00
|
|
|
*/
|
2024-06-18 23:42:59 -07:00
|
|
|
static brw_reg
|
2024-12-29 15:41:04 -08:00
|
|
|
fetch_polygon_reg(const brw_builder &bld, unsigned reg, unsigned subreg)
|
2022-08-10 17:33:56 -07:00
|
|
|
{
|
2024-12-07 10:25:45 -08:00
|
|
|
const brw_shader *shader = bld.shader;
|
2022-08-10 17:33:56 -07:00
|
|
|
assert(shader->stage == MESA_SHADER_FRAGMENT);
|
|
|
|
|
|
|
|
|
|
const struct intel_device_info *devinfo = shader->devinfo;
|
|
|
|
|
const unsigned poly_width = shader->dispatch_width / shader->max_polygons;
|
|
|
|
|
const unsigned poly_idx = bld.group() / poly_width;
|
|
|
|
|
assert(bld.group() % poly_width == 0);
|
|
|
|
|
|
|
|
|
|
if (bld.dispatch_width() > poly_width) {
|
|
|
|
|
assert(bld.dispatch_width() <= 2 * poly_width);
|
|
|
|
|
const unsigned reg_size = reg_unit(devinfo) * REG_SIZE;
|
|
|
|
|
const unsigned vstride = reg_size / brw_type_size_bytes(BRW_TYPE_F);
|
|
|
|
|
return stride(brw_vec1_grf(reg + reg_unit(devinfo) * poly_idx, subreg),
|
|
|
|
|
vstride, poly_width, 0);
|
|
|
|
|
} else {
|
|
|
|
|
return brw_vec1_grf(reg + reg_unit(devinfo) * poly_idx, subreg);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * Interpolate per-polygon barycentrics at a specific offset relative
 * to each channel fragment coordinates, optionally using
 * perspective-correct interpolation if requested.  This is mostly
 * useful as replacement for the PI shared function that existed on
 * platforms prior to Xe2, but is expected to work on earlier
 * platforms since we can get the required polygon setup information
 * from the thread payload as far back as ICL.
 */
static void
emit_pixel_interpolater_alu_at_offset(const brw_builder &bld,
                                      const brw_reg &dst,
                                      const brw_reg &offs,
                                      glsl_interp_mode interpolation)
{
   const brw_shader *shader = bld.shader;
   assert(shader->stage == MESA_SHADER_FRAGMENT);

   const intel_device_info *devinfo = shader->devinfo;
   /* Polygon setup data is only available in the payload from ICL on. */
   assert(devinfo->ver >= 11);

   const brw_fs_thread_payload &payload = shader->fs_payload();
   const struct brw_wm_prog_data *wm_prog_data =
      brw_wm_prog_data(shader->prog_data);

   /* The corresponding plane-coefficient payload fields must have been
    * requested during shader setup for the selected interpolation mode.
    */
   if (interpolation == INTERP_MODE_NOPERSPECTIVE) {
      assert(wm_prog_data->uses_npc_bary_coefficients &&
             wm_prog_data->uses_nonperspective_interp_modes);
   } else {
      assert(interpolation == INTERP_MODE_SMOOTH);
      assert(wm_prog_data->uses_pc_bary_coefficients &&
             wm_prog_data->uses_depth_w_coefficients);
   }

   /* Account for half-pixel X/Y coordinate offset.
    * Note: offset(offs, bld, 0) is used even for component 0 so that a
    * convergent (scalar) vector source is uniformized correctly.
    */
   const brw_reg off_x = bld.vgrf(BRW_TYPE_F);
   bld.ADD(off_x, offset(offs, bld, 0), brw_imm_f(0.5));

   const brw_reg off_y = bld.vgrf(BRW_TYPE_F);
   bld.ADD(off_y, offset(offs, bld, 1), brw_imm_f(0.5));

   /* Process no more than two polygons at a time to avoid hitting
    * regioning restrictions.
    */
   const unsigned poly_width = shader->dispatch_width / shader->max_polygons;

   for (unsigned i = 0; i < DIV_ROUND_UP(shader->max_polygons, 2); i++) {
      const brw_builder ibld = bld.group(MIN2(bld.dispatch_width(), 2 * poly_width), i);

      /* Fetch needed parameters from the thread payload.  The subregister
       * locations of the setup data differ per generation (Gfx11 / Gfx12 /
       * Xe2), hence the ver checks below.
       */
      const unsigned bary_coef_reg = interpolation == INTERP_MODE_NOPERSPECTIVE ?
         payload.npc_bary_coef_reg : payload.pc_bary_coef_reg;
      const brw_reg start_x = devinfo->ver < 12 ? fetch_polygon_reg(ibld, 1, 1) :
         fetch_polygon_reg(ibld, bary_coef_reg,
                           devinfo->ver >= 20 ? 6 : 2);
      const brw_reg start_y = devinfo->ver < 12 ? fetch_polygon_reg(ibld, 1, 6) :
         fetch_polygon_reg(ibld, bary_coef_reg,
                           devinfo->ver >= 20 ? 7 : 6);

      /* Plane-equation coefficients (constant, d/dx, d/dy) for the first
       * barycentric coordinate...
       */
      const brw_reg bary1_c0 = fetch_polygon_reg(ibld, bary_coef_reg,
                                                 devinfo->ver >= 20 ? 2 : 3);
      const brw_reg bary1_cx = fetch_polygon_reg(ibld, bary_coef_reg, 1);
      const brw_reg bary1_cy = fetch_polygon_reg(ibld, bary_coef_reg, 0);

      /* ...and for the second barycentric coordinate. */
      const brw_reg bary2_c0 = fetch_polygon_reg(ibld, bary_coef_reg,
                                                 devinfo->ver >= 20 ? 5 : 7);
      const brw_reg bary2_cx = fetch_polygon_reg(ibld, bary_coef_reg,
                                                 devinfo->ver >= 20 ? 4 : 5);
      const brw_reg bary2_cy = fetch_polygon_reg(ibld, bary_coef_reg,
                                                 devinfo->ver >= 20 ? 3 : 4);

      /* Reciprocal homogeneous W plane coefficients, used to make the
       * result perspective-correct in SMOOTH mode.
       */
      const brw_reg rhw_c0 = devinfo->ver >= 20 ?
         fetch_polygon_reg(ibld, payload.depth_w_coef_reg + 1, 5) :
         fetch_polygon_reg(ibld, payload.depth_w_coef_reg, 7);
      const brw_reg rhw_cx = devinfo->ver >= 20 ?
         fetch_polygon_reg(ibld, payload.depth_w_coef_reg + 1, 4) :
         fetch_polygon_reg(ibld, payload.depth_w_coef_reg, 5);
      const brw_reg rhw_cy = devinfo->ver >= 20 ?
         fetch_polygon_reg(ibld, payload.depth_w_coef_reg + 1, 3) :
         fetch_polygon_reg(ibld, payload.depth_w_coef_reg, 4);

      /* Compute X/Y coordinate deltas relative to the origin of the polygon. */
      const brw_reg delta_x = ibld.vgrf(BRW_TYPE_F);
      ibld.ADD(delta_x, offset(shader->pixel_x, ibld, i), negate(start_x));
      ibld.ADD(delta_x, delta_x, offset(off_x, ibld, i));

      const brw_reg delta_y = ibld.vgrf(BRW_TYPE_F);
      ibld.ADD(delta_y, offset(shader->pixel_y, ibld, i), negate(start_y));
      ibld.ADD(delta_y, delta_y, offset(off_y, ibld, i));

      /* Evaluate the plane equations obtained above for the
       * barycentrics and RHW coordinate at the offset specified for
       * each channel.  Limit arithmetic to acc_width in order to
       * allow the accumulator to be used for linear interpolation.
       */
      const unsigned acc_width = 16 * reg_unit(devinfo);
      const brw_reg rhw = ibld.vgrf(BRW_TYPE_F);
      const brw_reg bary1 = ibld.vgrf(BRW_TYPE_F);
      const brw_reg bary2 = ibld.vgrf(BRW_TYPE_F);

      for (unsigned j = 0; j < DIV_ROUND_UP(ibld.dispatch_width(), acc_width); j++) {
         const brw_builder jbld = ibld.group(MIN2(ibld.dispatch_width(), acc_width), j);
         /* Each MAD below seeds the accumulator with c0 + cx*dx; the
          * paired MAC then adds cy*dy and writes the final value.  The
          * accumulator is explicitly suboffset to this group's channels.
          */
         const brw_reg acc = suboffset(brw_acc_reg(16), jbld.group() % acc_width);

         if (interpolation != INTERP_MODE_NOPERSPECTIVE) {
            jbld.MAD(acc, horiz_offset(rhw_c0, acc_width * j),
                     horiz_offset(rhw_cx, acc_width * j), offset(delta_x, jbld, j));
            jbld.MAC(offset(rhw, jbld, j),
                     horiz_offset(rhw_cy, acc_width * j), offset(delta_y, jbld, j));
         }

         jbld.MAD(acc, horiz_offset(bary1_c0, acc_width * j),
                  horiz_offset(bary1_cx, acc_width * j), offset(delta_x, jbld, j));
         jbld.MAC(offset(bary1, jbld, j),
                  horiz_offset(bary1_cy, acc_width * j), offset(delta_y, jbld, j));

         jbld.MAD(acc, horiz_offset(bary2_c0, acc_width * j),
                  horiz_offset(bary2_cx, acc_width * j), offset(delta_x, jbld, j));
         jbld.MAC(offset(bary2, jbld, j),
                  horiz_offset(bary2_cy, acc_width * j), offset(delta_y, jbld, j));
      }

      /* Scale the results dividing by the interpolated RHW coordinate
       * if the interpolation is required to be perspective-correct.
       */
      if (interpolation == INTERP_MODE_NOPERSPECTIVE) {
         ibld.MOV(offset(dst, ibld, i), bary1);
         ibld.MOV(offset(offset(dst, bld, 1), ibld, i), bary2);
      } else {
         const brw_reg w = ibld.vgrf(BRW_TYPE_F);
         ibld.emit(SHADER_OPCODE_RCP, w, rhw);
         ibld.MUL(offset(dst, ibld, i), bary1, w);
         ibld.MUL(offset(offset(dst, bld, 1), ibld, i), bary2, w);
      }
   }
}
|
|
|
|
|
|
2024-06-20 18:51:06 -07:00
|
|
|
/**
 * Interpolate per-polygon barycentrics at a specified sample index,
 * optionally using perspective-correct interpolation if requested.
 * This is mostly useful as replacement for the PI shared function
 * that existed on platforms prior to Xe2, but is expected to work on
 * earlier platforms since we can get the required polygon setup
 * information from the thread payload as far back as ICL.
 */
static void
emit_pixel_interpolater_alu_at_sample(const brw_builder &bld,
                                      const brw_reg &dst,
                                      const brw_reg &idx,
                                      glsl_interp_mode interpolation)
{
   const brw_fs_thread_payload &payload = bld.shader->fs_payload();
   const struct brw_wm_prog_data *wm_prog_data =
      brw_wm_prog_data(bld.shader->prog_data);
   /* Build the interleaved sample-offset table once, NoMask, SIMD16. */
   const brw_builder ubld = bld.exec_all().group(16, 0);
   const brw_reg sample_offs_xy = ubld.vgrf(BRW_TYPE_UD);
   assert(wm_prog_data->uses_sample_offsets);

   /* Interleave the X/Y coordinates of each sample in order to allow
    * a single indirect look-up, by using a MOV for the 16 X
    * coordinates, then another MOV for the 16 Y coordinates.
    */
   for (unsigned i = 0; i < 2; i++) {
      const brw_reg reg = retype(brw_vec16_grf(payload.sample_offsets_reg, 4 * i),
                                 BRW_TYPE_UB);
      ubld.MOV(subscript(sample_offs_xy, BRW_TYPE_UW, i), reg);
   }

   /* Use indirect addressing to fetch the X/Y offsets of the sample
    * index provided for each channel.
    */
   /* Convert the per-channel sample index into a byte offset into the
    * dword-sized entries of the table built above.
    */
   const brw_reg idx_b = bld.vgrf(BRW_TYPE_UD);
   bld.MUL(idx_b, idx, brw_imm_ud(brw_type_size_bytes(BRW_TYPE_UD)));

   const brw_reg off_xy = bld.vgrf(BRW_TYPE_UD);
   bld.emit(SHADER_OPCODE_MOV_INDIRECT, off_xy, component(sample_offs_xy, 0),
            idx_b, brw_imm_ud(16 * brw_type_size_bytes(BRW_TYPE_UD)));

   /* Convert the selected fixed-point offsets to floating-point
    * offsets.
    */
   const brw_reg offs = bld.vgrf(BRW_TYPE_F, 2);

   for (unsigned i = 0; i < 2; i++) {
      const brw_reg tmp = bld.vgrf(BRW_TYPE_F);
      bld.MOV(tmp, subscript(off_xy, BRW_TYPE_UW, i));
      /* Payload sample offsets are in 1/16-pixel fixed point; scale by
       * 1/16 and re-center around the pixel center.
       */
      bld.MUL(tmp, tmp, brw_imm_f(0.0625));
      bld.ADD(offset(offs, bld, i), tmp, brw_imm_f(-0.5));
   }

   /* Interpolate at the resulting offsets. */
   emit_pixel_interpolater_alu_at_offset(bld, dst, offs, interpolation);
}
|
|
|
|
|
|
2015-03-11 23:14:31 -07:00
|
|
|
/**
|
|
|
|
|
* Computes 1 << x, given a D/UD register containing some value x.
|
|
|
|
|
*/
|
2024-06-18 23:42:59 -07:00
|
|
|
static brw_reg
|
2024-12-29 15:41:04 -08:00
|
|
|
intexp2(const brw_builder &bld, const brw_reg &x)
|
2015-03-11 23:14:31 -07:00
|
|
|
{
|
2024-04-20 17:08:02 -07:00
|
|
|
assert(x.type == BRW_TYPE_UD || x.type == BRW_TYPE_D);
|
2015-03-11 23:14:31 -07:00
|
|
|
|
2024-04-12 17:43:22 -07:00
|
|
|
return bld.SHL(bld.MOV(retype(brw_imm_d(1), x.type)), x);
|
2015-03-11 23:14:31 -07:00
|
|
|
}
|
|
|
|
|
|
2023-11-20 13:25:36 -08:00
|
|
|
static void
|
2023-12-05 15:27:29 -08:00
|
|
|
emit_gs_end_primitive(nir_to_brw_state &ntb, const nir_src &vertex_count_nir_src)
|
2015-03-11 23:14:31 -07:00
|
|
|
{
|
2024-12-07 10:25:45 -08:00
|
|
|
brw_shader &s = ntb.s;
|
2023-12-05 17:16:34 -08:00
|
|
|
assert(s.stage == MESA_SHADER_GEOMETRY);
|
2015-03-11 23:14:31 -07:00
|
|
|
|
2023-12-05 17:16:34 -08:00
|
|
|
struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(s.prog_data);
|
2015-03-11 23:14:31 -07:00
|
|
|
|
2024-12-06 23:01:58 -08:00
|
|
|
if (s.gs.control_data_header_size_bits == 0)
|
2016-05-23 15:17:02 -07:00
|
|
|
return;
|
|
|
|
|
|
2015-03-11 23:14:31 -07:00
|
|
|
/* We can only do EndPrimitive() functionality when the control data
|
|
|
|
|
* consists of cut bits. Fortunately, the only time it isn't is when the
|
|
|
|
|
* output type is points, in which case EndPrimitive() is a no-op.
|
|
|
|
|
*/
|
|
|
|
|
if (gs_prog_data->control_data_format !=
|
2021-03-29 15:16:59 -07:00
|
|
|
GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
|
2015-03-11 23:14:31 -07:00
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Cut bits use one bit per vertex. */
|
2024-12-06 23:01:58 -08:00
|
|
|
assert(s.gs.control_data_bits_per_vertex == 1);
|
2015-03-11 23:14:31 -07:00
|
|
|
|
2025-01-15 13:27:05 -08:00
|
|
|
brw_reg vertex_count = get_nir_src(ntb, vertex_count_nir_src, 0);
|
2024-04-20 17:08:02 -07:00
|
|
|
vertex_count.type = BRW_TYPE_UD;
|
2015-03-11 23:14:31 -07:00
|
|
|
|
|
|
|
|
/* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
|
|
|
|
|
* vertex n, 0 otherwise. So all we need to do here is mark bit
|
|
|
|
|
* (vertex_count - 1) % 32 in the cut_bits register to indicate that
|
|
|
|
|
* EndPrimitive() was called after emitting vertex (vertex_count - 1);
|
|
|
|
|
* vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
|
|
|
|
|
*
|
|
|
|
|
* Note that if EndPrimitive() is called before emitting any vertices, this
|
|
|
|
|
* will cause us to set bit 31 of the control_data_bits register to 1.
|
|
|
|
|
* That's fine because:
|
|
|
|
|
*
|
|
|
|
|
* - If max_vertices < 32, then vertex number 31 (zero-based) will never be
|
|
|
|
|
* output, so the hardware will ignore cut bit 31.
|
|
|
|
|
*
|
|
|
|
|
* - If max_vertices == 32, then vertex number 31 is guaranteed to be the
|
|
|
|
|
* last vertex, so setting cut bit 31 has no effect (since the primitive
|
|
|
|
|
* is automatically ended when the GS terminates).
|
|
|
|
|
*
|
|
|
|
|
* - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
|
|
|
|
|
* control_data_bits register to 0 when the first vertex is emitted.
|
|
|
|
|
*/
|
|
|
|
|
|
2024-12-29 15:41:04 -08:00
|
|
|
const brw_builder abld = ntb.bld.annotate("end primitive");
|
2015-03-11 23:14:31 -07:00
|
|
|
|
|
|
|
|
/* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg prev_count = abld.ADD(vertex_count, brw_imm_ud(0xffffffffu));
|
|
|
|
|
brw_reg mask = intexp2(abld, prev_count);
|
2015-03-11 23:14:31 -07:00
|
|
|
/* Note: we're relying on the fact that the GEN SHL instruction only pays
|
|
|
|
|
* attention to the lower 5 bits of its second source argument, so on this
|
|
|
|
|
* architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
|
|
|
|
|
* ((vertex_count - 1) % 32).
|
|
|
|
|
*/
|
2023-12-05 17:16:34 -08:00
|
|
|
abld.OR(s.control_data_bits, s.control_data_bits, mask);
|
2015-03-11 23:14:31 -07:00
|
|
|
}
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg
|
2024-12-07 10:25:45 -08:00
|
|
|
brw_shader::gs_urb_per_slot_dword_index(const brw_reg &vertex_count)
|
2015-03-11 23:14:31 -07:00
|
|
|
{
|
|
|
|
|
/* We use a single UD register to accumulate control data bits (32 bits
|
|
|
|
|
* for each of the SIMD8 channels). So we need to write a DWord (32 bits)
|
|
|
|
|
* at a time.
|
|
|
|
|
*
|
2024-02-05 13:34:53 -08:00
|
|
|
* On platforms < Xe2:
|
|
|
|
|
* Unfortunately,the URB_WRITE_SIMD8 message uses 128-bit (OWord)
|
|
|
|
|
* offsets. We have select a 128-bit group via the Global and Per-Slot
|
|
|
|
|
* Offsets, then use the Channel Mask phase to enable/disable which DWord
|
|
|
|
|
* within that group to write. (Remember, different SIMD8 channels may
|
|
|
|
|
* have emitted different numbers of vertices, so we may need per-slot
|
|
|
|
|
* offsets.)
|
|
|
|
|
*
|
|
|
|
|
* Channel masking presents an annoying problem: we may have to replicate
|
|
|
|
|
* the data up to 4 times:
|
|
|
|
|
*
|
|
|
|
|
* Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data,
|
|
|
|
|
* Data.
|
|
|
|
|
*
|
|
|
|
|
* To avoid penalizing shaders that emit a small number of vertices, we
|
|
|
|
|
* can avoid these sometimes: if the size of the control data header is
|
|
|
|
|
* <= 128 bits, then there is only 1 OWord. All SIMD8 channels will land
|
|
|
|
|
* land in the same 128-bit group, so we can skip per-slot offsets.
|
|
|
|
|
*
|
|
|
|
|
* Similarly, if the control data header is <= 32 bits, there is only one
|
|
|
|
|
* DWord, so we can skip channel masks.
|
|
|
|
|
*/
|
2025-02-27 22:56:15 -08:00
|
|
|
const brw_builder bld = brw_builder(this);
|
2024-12-29 15:41:04 -08:00
|
|
|
const brw_builder abld = bld.annotate("urb per slot offset");
|
2024-02-05 13:34:53 -08:00
|
|
|
|
|
|
|
|
/* Figure out which DWord we're trying to write to using the formula:
|
|
|
|
|
*
|
|
|
|
|
* dword_index = (vertex_count - 1) * bits_per_vertex / 32
|
|
|
|
|
*
|
|
|
|
|
* Since bits_per_vertex is a power of two, and is known at compile
|
|
|
|
|
* time, this can be optimized to:
|
2015-03-11 23:14:31 -07:00
|
|
|
*
|
2024-02-05 13:34:53 -08:00
|
|
|
* dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
|
|
|
|
|
*/
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg prev_count = abld.ADD(vertex_count, brw_imm_ud(0xffffffffu));
|
2024-02-05 13:34:53 -08:00
|
|
|
unsigned log2_bits_per_vertex =
|
2024-12-06 23:01:58 -08:00
|
|
|
util_last_bit(gs.control_data_bits_per_vertex);
|
2024-04-12 17:43:22 -07:00
|
|
|
return abld.SHR(prev_count, brw_imm_ud(6u - log2_bits_per_vertex));
|
2024-02-05 13:34:53 -08:00
|
|
|
}
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg
|
2024-12-07 10:25:45 -08:00
|
|
|
brw_shader::gs_urb_channel_mask(const brw_reg &dword_index)
|
2024-02-05 13:34:53 -08:00
|
|
|
{
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg channel_mask;
|
2024-02-05 13:34:53 -08:00
|
|
|
|
|
|
|
|
/* Xe2+ can do URB loads with a byte offset, so we don't need to
|
|
|
|
|
* construct a channel mask.
|
|
|
|
|
*/
|
|
|
|
|
if (devinfo->ver >= 20)
|
|
|
|
|
return channel_mask;
|
|
|
|
|
|
|
|
|
|
/* Channel masking presents an annoying problem: we may have to replicate
|
2015-03-11 23:14:31 -07:00
|
|
|
* the data up to 4 times:
|
|
|
|
|
*
|
|
|
|
|
* Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data.
|
|
|
|
|
*
|
|
|
|
|
* To avoid penalizing shaders that emit a small number of vertices, we
|
|
|
|
|
* can avoid these sometimes: if the size of the control data header is
|
|
|
|
|
* <= 128 bits, then there is only 1 OWord. All SIMD8 channels will land
|
|
|
|
|
* land in the same 128-bit group, so we can skip per-slot offsets.
|
|
|
|
|
*
|
|
|
|
|
* Similarly, if the control data header is <= 32 bits, there is only one
|
|
|
|
|
* DWord, so we can skip channel masks.
|
|
|
|
|
*/
|
2024-12-06 23:01:58 -08:00
|
|
|
if (gs.control_data_header_size_bits <= 32)
|
2024-02-05 13:34:53 -08:00
|
|
|
return channel_mask;
|
|
|
|
|
|
2025-02-27 22:56:15 -08:00
|
|
|
const brw_builder bld = brw_builder(this);
|
2024-12-29 15:41:04 -08:00
|
|
|
const brw_builder ubld = bld.exec_all();
|
2024-02-05 13:34:53 -08:00
|
|
|
|
|
|
|
|
/* Set the channel masks to 1 << (dword_index % 4), so that we'll
|
|
|
|
|
* write to the appropriate DWORD within the OWORD.
|
|
|
|
|
*/
|
2024-08-22 12:21:19 +03:00
|
|
|
return intexp2(ubld, ubld.AND(dword_index, brw_imm_ud(3u)));
|
2024-02-05 13:34:53 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
2024-12-07 10:25:45 -08:00
|
|
|
brw_shader::emit_gs_control_data_bits(const brw_reg &vertex_count)
|
2024-02-05 13:34:53 -08:00
|
|
|
{
|
|
|
|
|
assert(stage == MESA_SHADER_GEOMETRY);
|
2024-12-06 23:01:58 -08:00
|
|
|
assert(gs.control_data_bits_per_vertex != 0);
|
2024-02-05 13:34:53 -08:00
|
|
|
|
|
|
|
|
const struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
|
|
|
|
|
|
2025-02-27 22:56:15 -08:00
|
|
|
const brw_builder bld = brw_builder(this);
|
2024-12-29 15:41:04 -08:00
|
|
|
const brw_builder abld = bld.annotate("emit control data bits");
|
2015-03-11 23:14:31 -07:00
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg dword_index = gs_urb_per_slot_dword_index(vertex_count);
|
|
|
|
|
brw_reg channel_mask = gs_urb_channel_mask(dword_index);
|
|
|
|
|
brw_reg per_slot_offset;
|
2015-03-11 23:14:31 -07:00
|
|
|
|
2024-02-05 13:34:53 -08:00
|
|
|
const unsigned max_control_data_header_size_bits =
|
|
|
|
|
devinfo->ver >= 20 ? 32 : 128;
|
|
|
|
|
|
2024-12-06 23:01:58 -08:00
|
|
|
if (gs.control_data_header_size_bits > max_control_data_header_size_bits) {
|
2024-02-05 13:34:53 -08:00
|
|
|
/* Convert dword_index to bytes on Xe2+ since LSC can do operate on byte
|
|
|
|
|
* offset granularity.
|
|
|
|
|
*/
|
|
|
|
|
if (devinfo->ver >= 20) {
|
2024-04-12 17:43:22 -07:00
|
|
|
per_slot_offset = abld.SHL(dword_index, brw_imm_ud(2u));
|
2024-02-05 13:34:53 -08:00
|
|
|
} else {
|
2015-03-11 23:14:31 -07:00
|
|
|
/* Set the per-slot offset to dword_index / 4, so that we'll write to
|
|
|
|
|
* the appropriate OWord within the control data header.
|
|
|
|
|
*/
|
2024-04-12 17:43:22 -07:00
|
|
|
per_slot_offset = abld.SHR(dword_index, brw_imm_ud(2u));
|
2015-03-11 23:14:31 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2022-07-12 15:32:01 -07:00
|
|
|
/* If there are channel masks, add 3 extra copies of the data. */
|
|
|
|
|
const unsigned length = 1 + 3 * unsigned(channel_mask.file != BAD_FILE);
|
2024-08-16 00:11:04 -07:00
|
|
|
assert(length <= 4);
|
|
|
|
|
brw_reg sources[4];
|
2022-07-12 15:32:01 -07:00
|
|
|
|
2024-08-16 00:11:04 -07:00
|
|
|
for (unsigned i = 0; i < length; i++)
|
2022-07-12 15:32:01 -07:00
|
|
|
sources[i] = this->control_data_bits;
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg srcs[URB_LOGICAL_NUM_SRCS];
|
2022-08-22 22:23:17 -07:00
|
|
|
srcs[URB_LOGICAL_SRC_HANDLE] = gs_payload().urb_handles;
|
2022-07-12 15:32:01 -07:00
|
|
|
srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = per_slot_offset;
|
|
|
|
|
srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = channel_mask;
|
2024-04-20 17:08:02 -07:00
|
|
|
srcs[URB_LOGICAL_SRC_DATA] = bld.vgrf(BRW_TYPE_F, length);
|
2022-07-12 15:32:01 -07:00
|
|
|
abld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0);
|
|
|
|
|
|
2025-08-22 00:30:40 -07:00
|
|
|
brw_urb_inst *urb = abld.URB_WRITE(srcs, ARRAY_SIZE(srcs));
|
|
|
|
|
urb->components = length;
|
2022-09-28 16:50:41 -07:00
|
|
|
|
2015-03-11 23:14:31 -07:00
|
|
|
/* We need to increment Global Offset by 256-bits to make room for
|
|
|
|
|
* Broadwell's extra "Vertex Count" payload at the beginning of the
|
|
|
|
|
* URB entry. Since this is an OWord message, Global Offset is counted
|
|
|
|
|
* in 128-bit units, so we must set it to 2.
|
|
|
|
|
*/
|
|
|
|
|
if (gs_prog_data->static_vertex_count == -1)
|
2025-08-22 00:30:40 -07:00
|
|
|
urb->offset = 2;
|
2015-03-11 23:14:31 -07:00
|
|
|
}
|
|
|
|
|
|
2023-11-20 13:25:36 -08:00
|
|
|
static void
|
2024-06-18 23:42:59 -07:00
|
|
|
set_gs_stream_control_data_bits(nir_to_brw_state &ntb, const brw_reg &vertex_count,
|
2023-11-20 13:25:36 -08:00
|
|
|
unsigned stream_id)
|
2015-03-11 23:14:31 -07:00
|
|
|
{
|
2024-12-07 10:25:45 -08:00
|
|
|
brw_shader &s = ntb.s;
|
2023-11-20 22:00:28 -08:00
|
|
|
|
2015-03-11 23:14:31 -07:00
|
|
|
/* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
|
|
|
|
|
|
|
|
|
|
/* Note: we are calling this *before* increasing vertex_count, so
|
|
|
|
|
* this->vertex_count == vertex_count - 1 in the formula above.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
/* Stream mode uses 2 bits per vertex */
|
2024-12-06 23:01:58 -08:00
|
|
|
assert(s.gs.control_data_bits_per_vertex == 2);
|
2015-03-11 23:14:31 -07:00
|
|
|
|
|
|
|
|
/* Must be a valid stream */
|
2023-08-06 13:01:40 +08:00
|
|
|
assert(stream_id < 4); /* MAX_VERTEX_STREAMS */
|
2015-03-11 23:14:31 -07:00
|
|
|
|
|
|
|
|
/* Control data bits are initialized to 0 so we don't have to set any
|
|
|
|
|
* bits when sending vertices to stream 0.
|
|
|
|
|
*/
|
|
|
|
|
if (stream_id == 0)
|
|
|
|
|
return;
|
|
|
|
|
|
2024-12-29 15:41:04 -08:00
|
|
|
const brw_builder abld = ntb.bld.annotate("set stream control data bits");
|
2015-03-11 23:14:31 -07:00
|
|
|
|
|
|
|
|
/* reg::sid = stream_id */
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg sid = abld.MOV(brw_imm_ud(stream_id));
|
2015-03-11 23:14:31 -07:00
|
|
|
|
|
|
|
|
/* reg:shift_count = 2 * (vertex_count - 1) */
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg shift_count = abld.SHL(vertex_count, brw_imm_ud(1u));
|
2015-03-11 23:14:31 -07:00
|
|
|
|
|
|
|
|
/* Note: we're relying on the fact that the GEN SHL instruction only pays
|
|
|
|
|
* attention to the lower 5 bits of its second source argument, so on this
|
|
|
|
|
* architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
|
|
|
|
|
* stream_id << ((2 * (vertex_count - 1)) % 32).
|
|
|
|
|
*/
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg mask = abld.SHL(sid, shift_count);
|
2023-12-05 17:16:34 -08:00
|
|
|
abld.OR(s.control_data_bits, s.control_data_bits, mask);
|
2015-03-11 23:14:31 -07:00
|
|
|
}
|
|
|
|
|
|
2023-11-20 13:25:36 -08:00
|
|
|
static void
|
2023-12-05 15:27:29 -08:00
|
|
|
emit_gs_vertex(nir_to_brw_state &ntb, const nir_src &vertex_count_nir_src,
|
2023-11-20 13:25:36 -08:00
|
|
|
unsigned stream_id)
|
2015-03-11 23:14:31 -07:00
|
|
|
{
|
2024-12-07 10:25:45 -08:00
|
|
|
brw_shader &s = ntb.s;
|
2023-11-20 21:21:54 -08:00
|
|
|
|
2023-12-05 17:16:34 -08:00
|
|
|
assert(s.stage == MESA_SHADER_GEOMETRY);
|
2015-03-11 23:14:31 -07:00
|
|
|
|
2023-12-05 17:16:34 -08:00
|
|
|
struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(s.prog_data);
|
2015-03-11 23:14:31 -07:00
|
|
|
|
2025-01-15 13:27:05 -08:00
|
|
|
brw_reg vertex_count = get_nir_src(ntb, vertex_count_nir_src, 0);
|
2024-04-20 17:08:02 -07:00
|
|
|
vertex_count.type = BRW_TYPE_UD;
|
2015-03-11 23:14:31 -07:00
|
|
|
|
|
|
|
|
/* Haswell and later hardware ignores the "Render Stream Select" bits
|
|
|
|
|
* from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
|
|
|
|
|
* and instead sends all primitives down the pipeline for rasterization.
|
|
|
|
|
* If the SOL stage is enabled, "Render Stream Select" is honored and
|
|
|
|
|
* primitives bound to non-zero streams are discarded after stream output.
|
|
|
|
|
*
|
|
|
|
|
* Since the only purpose of primives sent to non-zero streams is to
|
|
|
|
|
* be recorded by transform feedback, we can simply discard all geometry
|
|
|
|
|
* bound to these streams when transform feedback is disabled.
|
|
|
|
|
*/
|
2023-12-05 17:16:34 -08:00
|
|
|
if (stream_id > 0 && !s.nir->info.has_transform_feedback_varyings)
|
2015-03-11 23:14:31 -07:00
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
/* If we're outputting 32 control data bits or less, then we can wait
|
|
|
|
|
* until the shader is over to output them all. Otherwise we need to
|
|
|
|
|
* output them as we go. Now is the time to do it, since we're about to
|
|
|
|
|
* output the vertex_count'th vertex, so it's guaranteed that the
|
|
|
|
|
* control data bits associated with the (vertex_count - 1)th vertex are
|
|
|
|
|
* correct.
|
|
|
|
|
*/
|
2024-12-06 23:01:58 -08:00
|
|
|
if (s.gs.control_data_header_size_bits > 32) {
|
2024-12-29 15:41:04 -08:00
|
|
|
const brw_builder abld =
|
2023-12-05 15:27:29 -08:00
|
|
|
ntb.bld.annotate("emit vertex: emit control data bits");
|
2015-03-11 23:14:31 -07:00
|
|
|
|
|
|
|
|
/* Only emit control data bits if we've finished accumulating a batch
|
|
|
|
|
* of 32 bits. This is the case when:
|
|
|
|
|
*
|
|
|
|
|
* (vertex_count * bits_per_vertex) % 32 == 0
|
|
|
|
|
*
|
|
|
|
|
* (in other words, when the last 5 bits of vertex_count *
|
|
|
|
|
* bits_per_vertex are 0). Assuming bits_per_vertex == 2^n for some
|
|
|
|
|
* integer n (which is always the case, since bits_per_vertex is
|
|
|
|
|
* always 1 or 2), this is equivalent to requiring that the last 5-n
|
|
|
|
|
* bits of vertex_count are 0:
|
|
|
|
|
*
|
|
|
|
|
* vertex_count & (2^(5-n) - 1) == 0
|
|
|
|
|
*
|
|
|
|
|
* 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
|
|
|
|
|
* equivalent to:
|
|
|
|
|
*
|
|
|
|
|
* vertex_count & (32 / bits_per_vertex - 1) == 0
|
|
|
|
|
*
|
|
|
|
|
* TODO: If vertex_count is an immediate, we could do some of this math
|
|
|
|
|
* at compile time...
|
|
|
|
|
*/
|
2024-12-07 00:23:07 -08:00
|
|
|
brw_inst *inst =
|
2023-12-05 15:27:29 -08:00
|
|
|
abld.AND(ntb.bld.null_reg_d(), vertex_count,
|
2024-12-06 23:01:58 -08:00
|
|
|
brw_imm_ud(32u / s.gs.control_data_bits_per_vertex - 1u));
|
2015-03-11 23:14:31 -07:00
|
|
|
inst->conditional_mod = BRW_CONDITIONAL_Z;
|
|
|
|
|
|
|
|
|
|
abld.IF(BRW_PREDICATE_NORMAL);
|
|
|
|
|
/* If vertex_count is 0, then no control data bits have been
|
|
|
|
|
* accumulated yet, so we can skip emitting them.
|
|
|
|
|
*/
|
2023-12-05 15:27:29 -08:00
|
|
|
abld.CMP(ntb.bld.null_reg_d(), vertex_count, brw_imm_ud(0u),
|
2015-03-11 23:14:31 -07:00
|
|
|
BRW_CONDITIONAL_NEQ);
|
|
|
|
|
abld.IF(BRW_PREDICATE_NORMAL);
|
2023-12-05 17:16:34 -08:00
|
|
|
s.emit_gs_control_data_bits(vertex_count);
|
2015-03-11 23:14:31 -07:00
|
|
|
abld.emit(BRW_OPCODE_ENDIF);
|
|
|
|
|
|
|
|
|
|
/* Reset control_data_bits to 0 so we can start accumulating a new
|
|
|
|
|
* batch.
|
|
|
|
|
*
|
|
|
|
|
* Note: in the case where vertex_count == 0, this neutralizes the
|
|
|
|
|
* effect of any call to EndPrimitive() that the shader may have
|
|
|
|
|
* made before outputting its first vertex.
|
|
|
|
|
*/
|
2024-04-12 17:43:22 -07:00
|
|
|
abld.exec_all().MOV(s.control_data_bits, brw_imm_ud(0u));
|
2015-03-11 23:14:31 -07:00
|
|
|
abld.emit(BRW_OPCODE_ENDIF);
|
|
|
|
|
}
|
|
|
|
|
|
2023-12-05 17:16:34 -08:00
|
|
|
s.emit_urb_writes(vertex_count);
|
2015-03-11 23:14:31 -07:00
|
|
|
|
|
|
|
|
/* In stream mode we have to set control data bits for all vertices
|
|
|
|
|
* unless we have disabled control data bits completely (which we do
|
2023-08-21 18:11:21 -05:00
|
|
|
* do for MESA_PRIM_POINTS outputs that don't use streams).
|
2015-03-11 23:14:31 -07:00
|
|
|
*/
|
2024-12-06 23:01:58 -08:00
|
|
|
if (s.gs.control_data_header_size_bits > 0 &&
|
2015-03-11 23:14:31 -07:00
|
|
|
gs_prog_data->control_data_format ==
|
2021-03-29 15:16:59 -07:00
|
|
|
GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
|
2023-11-20 22:00:28 -08:00
|
|
|
set_gs_stream_control_data_bits(ntb, vertex_count, stream_id);
|
2015-03-11 23:14:31 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2024-08-15 23:04:23 -07:00
|
|
|
static void
|
2024-12-29 15:41:04 -08:00
|
|
|
brw_combine_with_vec(const brw_builder &bld, const brw_reg &dst,
|
2024-08-15 23:04:23 -07:00
|
|
|
const brw_reg &src, unsigned n)
|
|
|
|
|
{
|
|
|
|
|
assert(n <= NIR_MAX_VEC_COMPONENTS);
|
|
|
|
|
brw_reg comps[NIR_MAX_VEC_COMPONENTS];
|
|
|
|
|
for (unsigned i = 0; i < n; i++)
|
|
|
|
|
comps[i] = offset(src, bld, i);
|
|
|
|
|
bld.VEC(dst, comps, n);
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-20 13:25:36 -08:00
|
|
|
/* Load per-vertex GS inputs into @dst.
 *
 * Two paths exist: when both the vertex index and the offset are constant
 * and the data fits within the pushed URB region, the value is read
 * directly from the payload attribute registers ("push model").
 * Otherwise the URB is read explicitly ("pull model"), which first
 * requires resolving the input control point (ICP) URB handle for the
 * requested vertex — possibly with indirect addressing when the vertex
 * index is non-constant.
 */
static void
emit_gs_input_load(nir_to_brw_state &ntb, const brw_reg &dst,
                   const nir_src &vertex_src,
                   unsigned base_offset,
                   const nir_src &offset_src,
                   unsigned num_components,
                   unsigned first_component)
{
   const brw_builder &bld = ntb.bld;
   const struct intel_device_info *devinfo = ntb.devinfo;

   brw_shader &s = ntb.s;

   /* Only 32-bit destinations are handled here. */
   assert(brw_type_size_bytes(dst.type) == 4);
   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(s.prog_data);
   /* urb_read_length is in pairs of registers; 8 DWords per register
    * pair read — NOTE(review): presumably; confirm against URB setup.
    */
   const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8;

   /* TODO: figure out push input layout for invocations == 1 */
   if (gs_prog_data->invocations == 1 &&
       nir_src_is_const(offset_src) && nir_src_is_const(vertex_src) &&
       4 * (base_offset + nir_src_as_uint(offset_src)) < push_reg_count) {
      /* Push model: the data was already delivered in the thread payload;
       * compute its attribute-register offset and copy it out.
       */
      int imm_offset = (base_offset + nir_src_as_uint(offset_src)) * 4 +
                       nir_src_as_uint(vertex_src) * push_reg_count;

      const brw_reg attr = offset(brw_attr_reg(0, dst.type), bld,
                                  first_component + imm_offset);
      brw_combine_with_vec(bld, dst, attr, num_components);
      return;
   }

   /* Resort to the pull model.  Ensure the VUE handles are provided. */
   assert(gs_prog_data->base.include_vue_handles);

   brw_reg start = s.gs_payload().icp_handle_start;
   brw_reg icp_handle = ntb.bld.vgrf(BRW_TYPE_UD);
   const unsigned grf_size_bytes = REG_SIZE * reg_unit(devinfo);

   if (gs_prog_data->invocations == 1) {
      /* One register of ICP handles per vertex in this layout. */
      if (nir_src_is_const(vertex_src)) {
         /* The vertex index is constant; just select the proper URB handle. */
         icp_handle =
            byte_offset(start, nir_src_as_uint(vertex_src) * grf_size_bytes);
      } else {
         /* The vertex index is non-constant.  We need to use indirect
          * addressing to fetch the proper URB handle.
          *
          * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
          * indicating that channel <n> should read the handle from
          * DWord <n>.  We convert that to bytes by multiplying by 4.
          *
          * Next, we convert the vertex index to bytes by multiplying
          * by 32/64 (shifting by 5/6), and add the two together.  This is
          * the final indirect byte offset.
          */
         brw_reg sequence = bld.LOAD_SUBGROUP_INVOCATION();

         /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
         brw_reg channel_offsets = bld.SHL(sequence, brw_imm_ud(2u));
         /* Convert vertex_index to bytes (multiply by 32/64) */
         assert(util_is_power_of_two_nonzero(grf_size_bytes)); /* for ffs() */
         brw_reg vertex_offset_bytes =
            bld.SHL(retype(get_nir_src(ntb, vertex_src, 0), BRW_TYPE_UD),
                    brw_imm_ud(ffs(grf_size_bytes) - 1));
         brw_reg icp_offset_bytes =
            bld.ADD(vertex_offset_bytes, channel_offsets);

         /* Use first_icp_handle as the base offset.  There is one register
          * of URB handles per vertex, so inform the register allocator that
          * we might read up to nir->info.gs.vertices_in registers.
          */
         bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, start,
                  brw_reg(icp_offset_bytes),
                  brw_imm_ud(s.nir->info.gs.vertices_in * grf_size_bytes));
      }
   } else {
      assert(gs_prog_data->invocations > 1);

      /* Multi-invocation layout packs one DWord of handle per vertex. */
      if (nir_src_is_const(vertex_src)) {
         unsigned vertex = nir_src_as_uint(vertex_src);
         bld.MOV(icp_handle, component(start, vertex));
      } else {
         /* The vertex index is non-constant.  We need to use indirect
          * addressing to fetch the proper URB handle.
          *
          * Convert vertex_index to bytes (multiply by 4)
          */
         brw_reg icp_offset_bytes =
            bld.SHL(retype(get_nir_src(ntb, vertex_src, 0), BRW_TYPE_UD),
                    brw_imm_ud(2u));

         /* Use first_icp_handle as the base offset.  There is one DWord
          * of URB handles per vertex, so inform the register allocator that
          * we might read up to ceil(nir->info.gs.vertices_in / 8) registers.
          */
         bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, start,
                  brw_reg(icp_offset_bytes),
                  brw_imm_ud(DIV_ROUND_UP(s.nir->info.gs.vertices_in, 8) *
                             grf_size_bytes));
      }
   }

   brw_urb_inst *urb;
   brw_reg indirect_offset = get_nir_src(ntb, offset_src, 0);

   if (nir_src_is_const(offset_src)) {
      brw_reg srcs[URB_LOGICAL_NUM_SRCS];
      srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;

      /* Constant indexing - use global offset. */
      if (first_component != 0) {
         /* Read into a temporary wide enough to include the leading
          * components we will skip, then copy out the requested span.
          */
         unsigned read_components = num_components + first_component;
         brw_reg tmp = bld.vgrf(dst.type, read_components);
         urb = bld.URB_READ(tmp, srcs, ARRAY_SIZE(srcs));
         urb->size_written = read_components *
                             tmp.component_size(urb->exec_size);
         brw_combine_with_vec(bld, dst, offset(tmp, bld, first_component),
                              num_components);
      } else {
         urb = bld.URB_READ(dst, srcs, ARRAY_SIZE(srcs));
         urb->size_written = num_components *
                             dst.component_size(urb->exec_size);
      }
      urb->offset = base_offset + nir_src_as_uint(offset_src);
   } else {
      /* Indirect indexing - use per-slot offsets as well. */
      unsigned read_components = num_components + first_component;
      brw_reg tmp = bld.vgrf(dst.type, read_components);

      /* Convert oword offset to bytes on Xe2+ */
      if (devinfo->ver >= 20)
         indirect_offset = bld.SHL(indirect_offset, brw_imm_ud(4u));

      brw_reg srcs[URB_LOGICAL_NUM_SRCS];
      srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;
      srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;

      if (first_component != 0) {
         urb = bld.URB_READ(tmp, srcs, ARRAY_SIZE(srcs));
         urb->size_written = read_components *
                             tmp.component_size(urb->exec_size);
         brw_combine_with_vec(bld, dst, offset(tmp, bld, first_component),
                              num_components);
      } else {
         urb = bld.URB_READ(dst, srcs, ARRAY_SIZE(srcs));
         urb->size_written = num_components *
                             dst.component_size(urb->exec_size);
      }
      urb->offset = base_offset;
   }
}
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
/* Return the non-constant part of an I/O offset for @instr, or a null
 * register when the offset is a (necessarily zero) constant.  On Xe2+
 * the value is scaled from Owords to bytes before being returned.
 */
static brw_reg
get_indirect_offset(nir_to_brw_state &ntb, nir_intrinsic_instr *instr)
{
   nir_src *src = nir_get_io_offset_src(instr);

   if (nir_src_is_const(*src)) {
      /* Any constant offset found here must be 0: brw_nir.c's
       * add_const_offset_to_base() folds every other constant offset
       * into the "base" index.
       */
      assert(nir_src_as_uint(*src) == 0);
      return brw_reg();
   }

   const brw_reg raw = get_nir_src(ntb, *src, 0);

   /* Xe2+ wants the offset in bytes rather than Owords (16 bytes). */
   return ntb.devinfo->ver < 20 ?
          raw : ntb.bld.SHL(retype(raw, BRW_TYPE_UD), brw_imm_ud(4u));
}
|
|
|
|
|
|
2023-11-20 12:13:47 -08:00
|
|
|
static void
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_vs_intrinsic(nir_to_brw_state &ntb,
|
2023-11-20 12:13:47 -08:00
|
|
|
nir_intrinsic_instr *instr)
|
2014-08-15 10:32:07 -07:00
|
|
|
{
|
2024-12-29 15:41:04 -08:00
|
|
|
const brw_builder &bld = ntb.bld;
|
2024-12-07 10:25:45 -08:00
|
|
|
brw_shader &s = ntb.s;
|
2023-12-05 17:16:34 -08:00
|
|
|
assert(s.stage == MESA_SHADER_VERTEX);
|
2015-11-04 23:05:07 -08:00
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg dest;
|
2014-08-15 10:32:07 -07:00
|
|
|
if (nir_intrinsic_infos[instr->intrinsic].has_dest)
|
2023-11-20 21:21:54 -08:00
|
|
|
dest = get_nir_def(ntb, instr->def);
|
2014-08-15 10:32:07 -07:00
|
|
|
|
2015-11-04 23:05:07 -08:00
|
|
|
switch (instr->intrinsic) {
|
|
|
|
|
case nir_intrinsic_load_vertex_id:
|
2018-04-28 14:09:22 +02:00
|
|
|
case nir_intrinsic_load_base_vertex:
|
2025-07-23 09:17:35 +02:00
|
|
|
UNREACHABLE("should be lowered by nir_lower_system_values()");
|
2015-11-04 23:05:07 -08:00
|
|
|
|
2016-07-19 19:00:19 -07:00
|
|
|
case nir_intrinsic_load_input: {
|
2023-08-14 11:56:00 -05:00
|
|
|
assert(instr->def.bit_size == 32);
|
2024-06-18 23:42:59 -07:00
|
|
|
const brw_reg src = offset(brw_attr_reg(0, dest.type), bld,
|
2022-09-12 16:47:19 -07:00
|
|
|
nir_intrinsic_base(instr) * 4 +
|
|
|
|
|
nir_intrinsic_component(instr) +
|
|
|
|
|
nir_src_as_uint(instr->src[0]));
|
2024-08-15 23:04:23 -07:00
|
|
|
brw_combine_with_vec(bld, dest, src, instr->num_components);
|
2016-07-19 19:00:19 -07:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2018-07-18 15:45:46 -07:00
|
|
|
case nir_intrinsic_load_vertex_id_zero_base:
|
|
|
|
|
case nir_intrinsic_load_instance_id:
|
|
|
|
|
case nir_intrinsic_load_base_instance:
|
|
|
|
|
case nir_intrinsic_load_draw_id:
|
2018-01-25 19:15:40 +01:00
|
|
|
case nir_intrinsic_load_first_vertex:
|
2018-04-28 14:09:20 +02:00
|
|
|
case nir_intrinsic_load_is_indexed_draw:
|
2025-07-23 09:17:35 +02:00
|
|
|
UNREACHABLE("lowered by brw_nir_lower_vs_inputs");
|
2018-01-25 19:15:40 +01:00
|
|
|
|
2015-11-04 23:05:07 -08:00
|
|
|
default:
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_intrinsic(ntb, bld, instr);
|
2015-11-04 23:05:07 -08:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
/* Resolve the input control point (ICP) URB handle for a TCS input load
 * in SINGLE_PATCH mode, where the handles live at the start of the
 * thread payload, one DWord per vertex.
 */
static brw_reg
get_tcs_single_patch_icp_handle(nir_to_brw_state &ntb, const brw_builder &bld,
                                nir_intrinsic_instr *instr)
{
   brw_shader &s = ntb.s;
   struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(s.prog_data);

   const nir_src &vertex_src = instr->src[0];
   nir_intrinsic_instr *vertex_intrin = nir_src_as_intrinsic(vertex_src);
   const brw_reg start = s.tcs_payload().icp_handle_start;

   /* Constant vertex index: select the handle directly.  The MOV
    * resolves the <0,1,0> regioning of the payload register.
    */
   if (nir_src_is_const(vertex_src))
      return bld.MOV(component(start, nir_src_as_uint(vertex_src)));

   /* With a single instance, an array index of gl_InvocationID means
    * reading the handles from the start of the payload — skip all the
    * indirect work.
    */
   if (tcs_prog_data->instances == 1 && vertex_intrin &&
       vertex_intrin->intrinsic == nir_intrinsic_load_invocation_id)
      return start;

   /* Non-constant vertex index: fetch the proper handle with indirect
    * addressing.  Each ICP handle is a single DWord (4 bytes).
    */
   brw_reg handle = bld.vgrf(BRW_TYPE_UD);

   brw_reg byte_index =
      bld.SHL(retype(get_nir_src(ntb, vertex_src, 0), BRW_TYPE_UD),
              brw_imm_ud(2u));

   /* We might read up to 4 registers. */
   bld.emit(SHADER_OPCODE_MOV_INDIRECT, handle,
            start, byte_index,
            brw_imm_ud(4 * REG_SIZE));

   return handle;
}
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
static brw_reg
|
2024-12-29 15:41:04 -08:00
|
|
|
get_tcs_multi_patch_icp_handle(nir_to_brw_state &ntb, const brw_builder &bld,
|
2023-11-20 13:25:36 -08:00
|
|
|
nir_intrinsic_instr *instr)
|
intel/compiler: Implement TCS 8_PATCH mode and INTEL_DEBUG=tcs8
Our tessellation control shaders can be dispatched in several modes.
- SINGLE_PATCH (Gen7+) processes a single patch per thread, with each
channel corresponding to a different patch vertex. PATCHLIST_N will
launch (N / 8) threads. If N is less than 8, some channels will be
disabled, leaving some untapped hardware capabilities. Conditionals
based on gl_InvocationID are non-uniform, which means that they'll
often have to execute both paths. However, if there are fewer than
8 vertices, all invocations will happen within a single thread, so
barriers can become no-ops, which is nice. We also burn a maximum
of 4 registers for ICP handles, so we can compile without regard for
the value of N. It also works in all cases.
- DUAL_PATCH mode processes up to two patches at a time, where the first
four channels come from patch 1, and the second group of four come
from patch 2. This tries to provide better EU utilization for small
patches (N <= 4). It cannot be used in all cases.
- 8_PATCH mode processes 8 patches at a time, with a thread launched per
vertex in the patch. Each channel corresponds to the same vertex, but
in each of the 8 patches. This utilizes all channels even for small
patches. It also makes conditions on gl_InvocationID uniform, leading
to proper jumps. Barriers, unfortunately, become real. Worse, for
PATCHLIST_N, the thread payload burns N registers for ICP handles.
This can burn up to 32 registers, or 1/4 of our register file, for
URB handles. For Vulkan (and DX), we know the number of vertices at
compile time, so we can limit the amount of waste. In GL, the patch
dimension is dynamic state, so we either would have to waste all 32
(not reasonable) or guess (badly) and recompile. This is unfortunate.
Because we can only spawn 16 thread instances, we can only use this
mode for PATCHLIST_16 and smaller. The rest must use SINGLE_PATCH.
This patch implements the new 8_PATCH TCS mode, but leaves us using
SINGLE_PATCH by default. A new INTEL_DEBUG=tcs8 flag will switch to
using 8_PATCH mode for testing and benchmarking purposes. We may
want to consider using 8_PATCH mode in Vulkan in some cases.
The data I've seen shows that 8_PATCH mode can be more efficient in
some cases, but SINGLE_PATCH mode (the one we use today) is faster
in other cases. Ultimately, the TES matters much more than the TCS
for performance, so the decision may not matter much.
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2019-05-03 14:57:54 -07:00
|
|
|
{
|
2024-12-07 10:25:45 -08:00
|
|
|
brw_shader &s = ntb.s;
|
2023-12-05 17:16:34 -08:00
|
|
|
const intel_device_info *devinfo = s.devinfo;
|
2023-11-20 13:25:36 -08:00
|
|
|
|
2023-12-05 17:16:34 -08:00
|
|
|
struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) s.key;
|
intel/compiler: Implement TCS 8_PATCH mode and INTEL_DEBUG=tcs8
Our tessellation control shaders can be dispatched in several modes.
- SINGLE_PATCH (Gen7+) processes a single patch per thread, with each
channel corresponding to a different patch vertex. PATCHLIST_N will
launch (N / 8) threads. If N is less than 8, some channels will be
disabled, leaving some untapped hardware capabilities. Conditionals
based on gl_InvocationID are non-uniform, which means that they'll
often have to execute both paths. However, if there are fewer than
8 vertices, all invocations will happen within a single thread, so
barriers can become no-ops, which is nice. We also burn a maximum
of 4 registers for ICP handles, so we can compile without regard for
the value of N. It also works in all cases.
- DUAL_PATCH mode processes up to two patches at a time, where the first
four channels come from patch 1, and the second group of four come
from patch 2. This tries to provide better EU utilization for small
patches (N <= 4). It cannot be used in all cases.
- 8_PATCH mode processes 8 patches at a time, with a thread launched per
vertex in the patch. Each channel corresponds to the same vertex, but
in each of the 8 patches. This utilizes all channels even for small
patches. It also makes conditions on gl_InvocationID uniform, leading
to proper jumps. Barriers, unfortunately, become real. Worse, for
PATCHLIST_N, the thread payload burns N registers for ICP handles.
This can burn up to 32 registers, or 1/4 of our register file, for
URB handles. For Vulkan (and DX), we know the number of vertices at
compile time, so we can limit the amount of waste. In GL, the patch
dimension is dynamic state, so we either would have to waste all 32
(not reasonable) or guess (badly) and recompile. This is unfortunate.
Because we can only spawn 16 thread instances, we can only use this
mode for PATCHLIST_16 and smaller. The rest must use SINGLE_PATCH.
This patch implements the new 8_PATCH TCS mode, but leaves us using
SINGLE_PATCH by default. A new INTEL_DEBUG=tcs8 flag will switch to
using 8_PATCH mode for testing and benchmarking purposes. We may
want to consider using 8_PATCH mode in Vulkan in some cases.
The data I've seen shows that 8_PATCH mode can be more efficient in
some cases, but SINGLE_PATCH mode (the one we use today) is faster
in other cases. Ultimately, the TES matters much more than the TCS
for performance, so the decision may not matter much.
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2019-05-03 14:57:54 -07:00
|
|
|
const nir_src &vertex_src = instr->src[0];
|
2022-09-07 20:37:26 -07:00
|
|
|
const unsigned grf_size_bytes = REG_SIZE * reg_unit(devinfo);
|
intel/compiler: Implement TCS 8_PATCH mode and INTEL_DEBUG=tcs8
Our tessellation control shaders can be dispatched in several modes.
- SINGLE_PATCH (Gen7+) processes a single patch per thread, with each
channel corresponding to a different patch vertex. PATCHLIST_N will
launch (N / 8) threads. If N is less than 8, some channels will be
disabled, leaving some untapped hardware capabilities. Conditionals
based on gl_InvocationID are non-uniform, which means that they'll
often have to execute both paths. However, if there are fewer than
8 vertices, all invocations will happen within a single thread, so
barriers can become no-ops, which is nice. We also burn a maximum
of 4 registers for ICP handles, so we can compile without regard for
the value of N. It also works in all cases.
- DUAL_PATCH mode processes up to two patches at a time, where the first
four channels come from patch 1, and the second group of four come
from patch 2. This tries to provide better EU utilization for small
patches (N <= 4). It cannot be used in all cases.
- 8_PATCH mode processes 8 patches at a time, with a thread launched per
vertex in the patch. Each channel corresponds to the same vertex, but
in each of the 8 patches. This utilizes all channels even for small
patches. It also makes conditions on gl_InvocationID uniform, leading
to proper jumps. Barriers, unfortunately, become real. Worse, for
PATCHLIST_N, the thread payload burns N registers for ICP handles.
This can burn up to 32 registers, or 1/4 of our register file, for
URB handles. For Vulkan (and DX), we know the number of vertices at
compile time, so we can limit the amount of waste. In GL, the patch
dimension is dynamic state, so we either would have to waste all 32
(not reasonable) or guess (badly) and recompile. This is unfortunate.
Because we can only spawn 16 thread instances, we can only use this
mode for PATCHLIST_16 and smaller. The rest must use SINGLE_PATCH.
This patch implements the new 8_PATCH TCS mode, but leaves us using
SINGLE_PATCH by default. A new INTEL_DEBUG=tcs8 flag will switch to
using 8_PATCH mode for testing and benchmarking purposes. We may
want to consider using 8_PATCH mode in Vulkan in some cases.
The data I've seen shows that 8_PATCH mode can be more efficient in
some cases, but SINGLE_PATCH mode (the one we use today) is faster
in other cases. Ultimately, the TES matters much more than the TCS
for performance, so the decision may not matter much.
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2019-05-03 14:57:54 -07:00
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
const brw_reg start = s.tcs_payload().icp_handle_start;
|
intel/compiler: Implement TCS 8_PATCH mode and INTEL_DEBUG=tcs8
Our tessellation control shaders can be dispatched in several modes.
- SINGLE_PATCH (Gen7+) processes a single patch per thread, with each
channel corresponding to a different patch vertex. PATCHLIST_N will
launch (N / 8) threads. If N is less than 8, some channels will be
disabled, leaving some untapped hardware capabilities. Conditionals
based on gl_InvocationID are non-uniform, which means that they'll
often have to execute both paths. However, if there are fewer than
8 vertices, all invocations will happen within a single thread, so
barriers can become no-ops, which is nice. We also burn a maximum
of 4 registers for ICP handles, so we can compile without regard for
the value of N. It also works in all cases.
- DUAL_PATCH mode processes up to two patches at a time, where the first
four channels come from patch 1, and the second group of four come
from patch 2. This tries to provide better EU utilization for small
patches (N <= 4). It cannot be used in all cases.
- 8_PATCH mode processes 8 patches at a time, with a thread launched per
vertex in the patch. Each channel corresponds to the same vertex, but
in each of the 8 patches. This utilizes all channels even for small
patches. It also makes conditions on gl_InvocationID uniform, leading
to proper jumps. Barriers, unfortunately, become real. Worse, for
PATCHLIST_N, the thread payload burns N registers for ICP handles.
This can burn up to 32 registers, or 1/4 of our register file, for
URB handles. For Vulkan (and DX), we know the number of vertices at
compile time, so we can limit the amount of waste. In GL, the patch
dimension is dynamic state, so we either would have to waste all 32
(not reasonable) or guess (badly) and recompile. This is unfortunate.
Because we can only spawn 16 thread instances, we can only use this
mode for PATCHLIST_16 and smaller. The rest must use SINGLE_PATCH.
This patch implements the new 8_PATCH TCS mode, but leaves us using
SINGLE_PATCH by default. A new INTEL_DEBUG=tcs8 flag will switch to
using 8_PATCH mode for testing and benchmarking purposes. We may
want to consider using 8_PATCH mode in Vulkan in some cases.
The data I've seen shows that 8_PATCH mode can be more efficient in
some cases, but SINGLE_PATCH mode (the one we use today) is faster
in other cases. Ultimately, the TES matters much more than the TCS
for performance, so the decision may not matter much.
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2019-05-03 14:57:54 -07:00
|
|
|
|
2022-08-19 17:07:44 -07:00
|
|
|
if (nir_src_is_const(vertex_src))
|
2022-09-07 20:37:26 -07:00
|
|
|
return byte_offset(start, nir_src_as_uint(vertex_src) * grf_size_bytes);
|
intel/compiler: Implement TCS 8_PATCH mode and INTEL_DEBUG=tcs8
Our tessellation control shaders can be dispatched in several modes.
- SINGLE_PATCH (Gen7+) processes a single patch per thread, with each
channel corresponding to a different patch vertex. PATCHLIST_N will
launch (N / 8) threads. If N is less than 8, some channels will be
disabled, leaving some untapped hardware capabilities. Conditionals
based on gl_InvocationID are non-uniform, which means that they'll
often have to execute both paths. However, if there are fewer than
8 vertices, all invocations will happen within a single thread, so
barriers can become no-ops, which is nice. We also burn a maximum
of 4 registers for ICP handles, so we can compile without regard for
the value of N. It also works in all cases.
- DUAL_PATCH mode processes up to two patches at a time, where the first
four channels come from patch 1, and the second group of four come
from patch 2. This tries to provide better EU utilization for small
patches (N <= 4). It cannot be used in all cases.
- 8_PATCH mode processes 8 patches at a time, with a thread launched per
vertex in the patch. Each channel corresponds to the same vertex, but
in each of the 8 patches. This utilizes all channels even for small
patches. It also makes conditions on gl_InvocationID uniform, leading
to proper jumps. Barriers, unfortunately, become real. Worse, for
PATCHLIST_N, the thread payload burns N registers for ICP handles.
This can burn up to 32 registers, or 1/4 of our register file, for
URB handles. For Vulkan (and DX), we know the number of vertices at
compile time, so we can limit the amount of waste. In GL, the patch
dimension is dynamic state, so we either would have to waste all 32
(not reasonable) or guess (badly) and recompile. This is unfortunate.
Because we can only spawn 16 thread instances, we can only use this
mode for PATCHLIST_16 and smaller. The rest must use SINGLE_PATCH.
This patch implements the new 8_PATCH TCS mode, but leaves us using
SINGLE_PATCH by default. A new INTEL_DEBUG=tcs8 flag will switch to
using 8_PATCH mode for testing and benchmarking purposes. We may
want to consider using 8_PATCH mode in Vulkan in some cases.
The data I've seen shows that 8_PATCH mode can be more efficient in
some cases, but SINGLE_PATCH mode (the one we use today) is faster
in other cases. Ultimately, the TES matters much more than the TCS
for performance, so the decision may not matter much.
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2019-05-03 14:57:54 -07:00
|
|
|
|
|
|
|
|
/* The vertex index is non-constant. We need to use indirect
|
|
|
|
|
* addressing to fetch the proper URB handle.
|
|
|
|
|
*
|
2022-09-07 20:37:26 -07:00
|
|
|
* First, we start with the sequence indicating that channel <n>
|
|
|
|
|
* should read the handle from DWord <n>. We convert that to bytes
|
|
|
|
|
* by multiplying by 4.
|
intel/compiler: Implement TCS 8_PATCH mode and INTEL_DEBUG=tcs8
Our tessellation control shaders can be dispatched in several modes.
- SINGLE_PATCH (Gen7+) processes a single patch per thread, with each
channel corresponding to a different patch vertex. PATCHLIST_N will
launch (N / 8) threads. If N is less than 8, some channels will be
disabled, leaving some untapped hardware capabilities. Conditionals
based on gl_InvocationID are non-uniform, which means that they'll
often have to execute both paths. However, if there are fewer than
8 vertices, all invocations will happen within a single thread, so
barriers can become no-ops, which is nice. We also burn a maximum
of 4 registers for ICP handles, so we can compile without regard for
the value of N. It also works in all cases.
- DUAL_PATCH mode processes up to two patches at a time, where the first
four channels come from patch 1, and the second group of four come
from patch 2. This tries to provide better EU utilization for small
patches (N <= 4). It cannot be used in all cases.
- 8_PATCH mode processes 8 patches at a time, with a thread launched per
vertex in the patch. Each channel corresponds to the same vertex, but
in each of the 8 patches. This utilizes all channels even for small
patches. It also makes conditions on gl_InvocationID uniform, leading
to proper jumps. Barriers, unfortunately, become real. Worse, for
PATCHLIST_N, the thread payload burns N registers for ICP handles.
This can burn up to 32 registers, or 1/4 of our register file, for
URB handles. For Vulkan (and DX), we know the number of vertices at
compile time, so we can limit the amount of waste. In GL, the patch
dimension is dynamic state, so we either would have to waste all 32
(not reasonable) or guess (badly) and recompile. This is unfortunate.
Because we can only spawn 16 thread instances, we can only use this
mode for PATCHLIST_16 and smaller. The rest must use SINGLE_PATCH.
This patch implements the new 8_PATCH TCS mode, but leaves us using
SINGLE_PATCH by default. A new INTEL_DEBUG=tcs8 flag will switch to
using 8_PATCH mode for testing and benchmarking purposes. We may
want to consider using 8_PATCH mode in Vulkan in some cases.
The data I've seen shows that 8_PATCH mode can be more efficient in
some cases, but SINGLE_PATCH mode (the one we use today) is faster
in other cases. Ultimately, the TES matters much more than the TCS
for performance, so the decision may not matter much.
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2019-05-03 14:57:54 -07:00
|
|
|
*
|
|
|
|
|
* Next, we convert the vertex index to bytes by multiplying
|
2022-09-07 20:37:26 -07:00
|
|
|
* by the GRF size (by shifting), and add the two together. This is
|
intel/compiler: Implement TCS 8_PATCH mode and INTEL_DEBUG=tcs8
Our tessellation control shaders can be dispatched in several modes.
- SINGLE_PATCH (Gen7+) processes a single patch per thread, with each
channel corresponding to a different patch vertex. PATCHLIST_N will
launch (N / 8) threads. If N is less than 8, some channels will be
disabled, leaving some untapped hardware capabilities. Conditionals
based on gl_InvocationID are non-uniform, which means that they'll
often have to execute both paths. However, if there are fewer than
8 vertices, all invocations will happen within a single thread, so
barriers can become no-ops, which is nice. We also burn a maximum
of 4 registers for ICP handles, so we can compile without regard for
the value of N. It also works in all cases.
- DUAL_PATCH mode processes up to two patches at a time, where the first
four channels come from patch 1, and the second group of four come
from patch 2. This tries to provide better EU utilization for small
patches (N <= 4). It cannot be used in all cases.
- 8_PATCH mode processes 8 patches at a time, with a thread launched per
vertex in the patch. Each channel corresponds to the same vertex, but
in each of the 8 patches. This utilizes all channels even for small
patches. It also makes conditions on gl_InvocationID uniform, leading
to proper jumps. Barriers, unfortunately, become real. Worse, for
PATCHLIST_N, the thread payload burns N registers for ICP handles.
This can burn up to 32 registers, or 1/4 of our register file, for
URB handles. For Vulkan (and DX), we know the number of vertices at
compile time, so we can limit the amount of waste. In GL, the patch
dimension is dynamic state, so we either would have to waste all 32
(not reasonable) or guess (badly) and recompile. This is unfortunate.
Because we can only spawn 16 thread instances, we can only use this
mode for PATCHLIST_16 and smaller. The rest must use SINGLE_PATCH.
This patch implements the new 8_PATCH TCS mode, but leaves us using
SINGLE_PATCH by default. A new INTEL_DEBUG=tcs8 flag will switch to
using 8_PATCH mode for testing and benchmarking purposes. We may
want to consider using 8_PATCH mode in Vulkan in some cases.
The data I've seen shows that 8_PATCH mode can be more efficient in
some cases, but SINGLE_PATCH mode (the one we use today) is faster
in other cases. Ultimately, the TES matters much more than the TCS
for performance, so the decision may not matter much.
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2019-05-03 14:57:54 -07:00
|
|
|
* the final indirect byte offset.
|
|
|
|
|
*/
|
intel/brw: Use CSE for LOAD_SUBGROUP_INVOCATION
Instead of emitting a single one at the top, and making reference to it,
emit the virtual instruction as needed and let CSE do its job.
Since load_subgroup_invocation now can appear not at the start of the
shader, use UNDEF in all cases to ensure that the liveness of the
destination doesn't extend to the first partial write done here (it was
being used only for SIMD > 8 before).
Note this option was considered in the past
6132992cdb858268af0e985727d80e4140be389c but at the time dismissed. The
difference now is that the lowering of the virtual instruction happens
earlier than the scheduling.
The motivation for this change is to allow passes other than the NIR
conversion to use this value. The alternative of storing a `brw_reg` in
the shader (instead of NIR state) gets complicated by passes like
compact_vgrfs, that move VGRFs around (and update the instructions).
This and maybe other passes would have to care about the brw_reg.
Fossil-db numbers, TGL
```
*** Shaders only in 'after' results are ignored:
steam-native/shadow_of_the_tomb_raider/c683ea5067ee157d/fs.32/0, steam-native/shadow_of_the_tomb_raider/f4df450c3cef40b4/fs.32/0, steam-native/shadow_of_the_tomb_raider/94b708fb8e3d9597/fs.32/0, steam-native/shadow_of_the_tomb_raider/19d44c328edabd30/fs.32/0, steam-native/shadow_of_the_tomb_raider/8a7dcbd5a74a19bf/fs.32/0, and 366 more
from 4 apps: steam-dxvk/alan_wake, steam-dxvk/batman_arkham_city_goty, steam-dxvk/batman_arkham_origins, steam-native/shadow_of_the_tomb_raider
*** Shaders only in 'before' results are ignored:
steam-dxvk/octopath_traveler/aaa3d10acb726906/fs.32/0, steam-dxvk/batman_arkham_origins/e6872ae23569c35f/fs.32/0, steam-dxvk/octopath_traveler/fd33a99fa5c271a8/fs.32/0, steam-dxvk/octopath_traveler/9a077cdc16f24520/fs.32/0, steam-dxvk/batman_arkham_city_goty/fac7b438ad52f622/fs.32/0, and 12 more
from 4 apps: steam-dxvk/batman_arkham_city_goty, steam-dxvk/batman_arkham_origins, steam-dxvk/octopath_traveler, steam-native/shadow_of_the_tomb_raider
Totals:
Instrs: 149752381 -> 149751337 (-0.00%); split: -0.00%, +0.00%
Cycle count: 11553609349 -> 11549970294 (-0.03%); split: -0.06%, +0.03%
Spill count: 42763 -> 42764 (+0.00%); split: -0.01%, +0.01%
Fill count: 75650 -> 75651 (+0.00%); split: -0.00%, +0.01%
Max live registers: 31725096 -> 31671792 (-0.17%)
Max dispatch width: 5546008 -> 5551672 (+0.10%); split: +0.11%, -0.00%
Totals from 52574 (8.34% of 630441) affected shaders:
Instrs: 9535159 -> 9534115 (-0.01%); split: -0.03%, +0.02%
Cycle count: 1006627109 -> 1002988054 (-0.36%); split: -0.65%, +0.29%
Spill count: 11588 -> 11589 (+0.01%); split: -0.03%, +0.03%
Fill count: 21057 -> 21058 (+0.00%); split: -0.01%, +0.02%
Max live registers: 1992493 -> 1939189 (-2.68%)
Max dispatch width: 559696 -> 565360 (+1.01%); split: +1.06%, -0.05%
```
and DG2
```
*** Shaders only in 'after' results are ignored:
steam-native/shadow_of_the_tomb_raider/1f95a9d3db21df85/fs.32/0, steam-native/shadow_of_the_tomb_raider/56b87c4a46613a2a/fs.32/0, steam-native/shadow_of_the_tomb_raider/a74b4137f85dbbd3/fs.32/0, steam-native/shadow_of_the_tomb_raider/e07e38d3f48e8402/fs.32/0, steam-native/shadow_of_the_tomb_raider/206336789c48996c/fs.32/0, and 268 more
from 4 apps: steam-dxvk/alan_wake, steam-dxvk/batman_arkham_city_goty, steam-dxvk/batman_arkham_origins, steam-native/shadow_of_the_tomb_raider
*** Shaders only in 'before' results are ignored:
steam-native/shadow_of_the_tomb_raider/0420d7c3a2ea99ec/fs.32/0, steam-native/shadow_of_the_tomb_raider/2ff39f8bf7d24abb/fs.32/0, steam-native/shadow_of_the_tomb_raider/92d7be2824bd9659/fs.32/0, steam-native/shadow_of_the_tomb_raider/f09ca6d2ecf18015/fs.32/0, steam-native/shadow_of_the_tomb_raider/490f8ffd59e52949/fs.32/0, and 205 more
from 3 apps: steam-dxvk/batman_arkham_city_goty, steam-dxvk/batman_arkham_origins, steam-native/shadow_of_the_tomb_raider
Totals:
Instrs: 151597619 -> 151599914 (+0.00%); split: -0.00%, +0.00%
Subgroup size: 7699776 -> 7699784 (+0.00%)
Cycle count: 12738501989 -> 12739841170 (+0.01%); split: -0.01%, +0.02%
Spill count: 61283 -> 61274 (-0.01%)
Fill count: 119886 -> 119849 (-0.03%)
Max live registers: 31810432 -> 31758920 (-0.16%)
Max dispatch width: 5540128 -> 5541136 (+0.02%); split: +0.08%, -0.06%
Totals from 49286 (7.81% of 631231) affected shaders:
Instrs: 8607753 -> 8610048 (+0.03%); split: -0.01%, +0.04%
Subgroup size: 857752 -> 857760 (+0.00%)
Cycle count: 305939495 -> 307278676 (+0.44%); split: -0.28%, +0.72%
Spill count: 6339 -> 6330 (-0.14%)
Fill count: 12571 -> 12534 (-0.29%)
Max live registers: 1788346 -> 1736834 (-2.88%)
Max dispatch width: 510920 -> 511928 (+0.20%); split: +0.85%, -0.66%
```
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30489>
2024-07-31 22:46:20 -07:00
|
|
|
brw_reg sequence = bld.LOAD_SUBGROUP_INVOCATION();
|
intel/compiler: Implement TCS 8_PATCH mode and INTEL_DEBUG=tcs8
Our tessellation control shaders can be dispatched in several modes.
- SINGLE_PATCH (Gen7+) processes a single patch per thread, with each
channel corresponding to a different patch vertex. PATCHLIST_N will
launch (N / 8) threads. If N is less than 8, some channels will be
disabled, leaving some untapped hardware capabilities. Conditionals
based on gl_InvocationID are non-uniform, which means that they'll
often have to execute both paths. However, if there are fewer than
8 vertices, all invocations will happen within a single thread, so
barriers can become no-ops, which is nice. We also burn a maximum
of 4 registers for ICP handles, so we can compile without regard for
the value of N. It also works in all cases.
- DUAL_PATCH mode processes up to two patches at a time, where the first
four channels come from patch 1, and the second group of four come
from patch 2. This tries to provide better EU utilization for small
patches (N <= 4). It cannot be used in all cases.
- 8_PATCH mode processes 8 patches at a time, with a thread launched per
vertex in the patch. Each channel corresponds to the same vertex, but
in each of the 8 patches. This utilizes all channels even for small
patches. It also makes conditions on gl_InvocationID uniform, leading
to proper jumps. Barriers, unfortunately, become real. Worse, for
PATCHLIST_N, the thread payload burns N registers for ICP handles.
This can burn up to 32 registers, or 1/4 of our register file, for
URB handles. For Vulkan (and DX), we know the number of vertices at
compile time, so we can limit the amount of waste. In GL, the patch
dimension is dynamic state, so we either would have to waste all 32
(not reasonable) or guess (badly) and recompile. This is unfortunate.
Because we can only spawn 16 thread instances, we can only use this
mode for PATCHLIST_16 and smaller. The rest must use SINGLE_PATCH.
This patch implements the new 8_PATCH TCS mode, but leaves us using
SINGLE_PATCH by default. A new INTEL_DEBUG=tcs8 flag will switch to
using 8_PATCH mode for testing and benchmarking purposes. We may
want to consider using 8_PATCH mode in Vulkan in some cases.
The data I've seen shows that 8_PATCH mode can be more efficient in
some cases, but SINGLE_PATCH mode (the one we use today) is faster
in other cases. Ultimately, the TES matters much more than the TCS
for performance, so the decision may not matter much.
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2019-05-03 14:57:54 -07:00
|
|
|
|
2022-09-07 20:37:26 -07:00
|
|
|
/* Offsets will be 0, 4, 8, ... */
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg channel_offsets = bld.SHL(sequence, brw_imm_ud(2u));
|
intel/compiler: Implement TCS 8_PATCH mode and INTEL_DEBUG=tcs8
Our tessellation control shaders can be dispatched in several modes.
- SINGLE_PATCH (Gen7+) processes a single patch per thread, with each
channel corresponding to a different patch vertex. PATCHLIST_N will
launch (N / 8) threads. If N is less than 8, some channels will be
disabled, leaving some untapped hardware capabilities. Conditionals
based on gl_InvocationID are non-uniform, which means that they'll
often have to execute both paths. However, if there are fewer than
8 vertices, all invocations will happen within a single thread, so
barriers can become no-ops, which is nice. We also burn a maximum
of 4 registers for ICP handles, so we can compile without regard for
the value of N. It also works in all cases.
- DUAL_PATCH mode processes up to two patches at a time, where the first
four channels come from patch 1, and the second group of four come
from patch 2. This tries to provide better EU utilization for small
patches (N <= 4). It cannot be used in all cases.
- 8_PATCH mode processes 8 patches at a time, with a thread launched per
vertex in the patch. Each channel corresponds to the same vertex, but
in each of the 8 patches. This utilizes all channels even for small
patches. It also makes conditions on gl_InvocationID uniform, leading
to proper jumps. Barriers, unfortunately, become real. Worse, for
PATCHLIST_N, the thread payload burns N registers for ICP handles.
This can burn up to 32 registers, or 1/4 of our register file, for
URB handles. For Vulkan (and DX), we know the number of vertices at
compile time, so we can limit the amount of waste. In GL, the patch
dimension is dynamic state, so we either would have to waste all 32
(not reasonable) or guess (badly) and recompile. This is unfortunate.
Because we can only spawn 16 thread instances, we can only use this
mode for PATCHLIST_16 and smaller. The rest must use SINGLE_PATCH.
This patch implements the new 8_PATCH TCS mode, but leaves us using
SINGLE_PATCH by default. A new INTEL_DEBUG=tcs8 flag will switch to
using 8_PATCH mode for testing and benchmarking purposes. We may
want to consider using 8_PATCH mode in Vulkan in some cases.
The data I've seen shows that 8_PATCH mode can be more efficient in
some cases, but SINGLE_PATCH mode (the one we use today) is faster
in other cases. Ultimately, the TES matters much more than the TCS
for performance, so the decision may not matter much.
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2019-05-03 14:57:54 -07:00
|
|
|
/* Convert vertex_index to bytes (multiply by 32) */
|
2022-09-07 20:37:26 -07:00
|
|
|
assert(util_is_power_of_two_nonzero(grf_size_bytes)); /* for ffs() */
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg vertex_offset_bytes =
|
2025-01-15 13:27:05 -08:00
|
|
|
bld.SHL(retype(get_nir_src(ntb, vertex_src, 0), BRW_TYPE_UD),
|
2024-04-12 17:43:22 -07:00
|
|
|
brw_imm_ud(ffs(grf_size_bytes) - 1));
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg icp_offset_bytes =
|
2024-04-12 17:43:22 -07:00
|
|
|
bld.ADD(vertex_offset_bytes, channel_offsets);
|
intel/compiler: Implement TCS 8_PATCH mode and INTEL_DEBUG=tcs8
Our tessellation control shaders can be dispatched in several modes.
- SINGLE_PATCH (Gen7+) processes a single patch per thread, with each
channel corresponding to a different patch vertex. PATCHLIST_N will
launch (N / 8) threads. If N is less than 8, some channels will be
disabled, leaving some untapped hardware capabilities. Conditionals
based on gl_InvocationID are non-uniform, which means that they'll
often have to execute both paths. However, if there are fewer than
8 vertices, all invocations will happen within a single thread, so
barriers can become no-ops, which is nice. We also burn a maximum
of 4 registers for ICP handles, so we can compile without regard for
the value of N. It also works in all cases.
- DUAL_PATCH mode processes up to two patches at a time, where the first
four channels come from patch 1, and the second group of four come
from patch 2. This tries to provide better EU utilization for small
patches (N <= 4). It cannot be used in all cases.
- 8_PATCH mode processes 8 patches at a time, with a thread launched per
vertex in the patch. Each channel corresponds to the same vertex, but
in each of the 8 patches. This utilizes all channels even for small
patches. It also makes conditions on gl_InvocationID uniform, leading
to proper jumps. Barriers, unfortunately, become real. Worse, for
PATCHLIST_N, the thread payload burns N registers for ICP handles.
This can burn up to 32 registers, or 1/4 of our register file, for
URB handles. For Vulkan (and DX), we know the number of vertices at
compile time, so we can limit the amount of waste. In GL, the patch
dimension is dynamic state, so we either would have to waste all 32
(not reasonable) or guess (badly) and recompile. This is unfortunate.
Because we can only spawn 16 thread instances, we can only use this
mode for PATCHLIST_16 and smaller. The rest must use SINGLE_PATCH.
This patch implements the new 8_PATCH TCS mode, but leaves us using
SINGLE_PATCH by default. A new INTEL_DEBUG=tcs8 flag will switch to
using 8_PATCH mode for testing and benchmarking purposes. We may
want to consider using 8_PATCH mode in Vulkan in some cases.
The data I've seen shows that 8_PATCH mode can be more efficient in
some cases, but SINGLE_PATCH mode (the one we use today) is faster
in other cases. Ultimately, the TES matters much more than the TCS
for performance, so the decision may not matter much.
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2019-05-03 14:57:54 -07:00
|
|
|
|
2022-08-19 17:07:44 -07:00
|
|
|
/* Use start of ICP handles as the base offset. There is one register
|
intel/compiler: Implement TCS 8_PATCH mode and INTEL_DEBUG=tcs8
Our tessellation control shaders can be dispatched in several modes.
- SINGLE_PATCH (Gen7+) processes a single patch per thread, with each
channel corresponding to a different patch vertex. PATCHLIST_N will
launch (N / 8) threads. If N is less than 8, some channels will be
disabled, leaving some untapped hardware capabilities. Conditionals
based on gl_InvocationID are non-uniform, which means that they'll
often have to execute both paths. However, if there are fewer than
8 vertices, all invocations will happen within a single thread, so
barriers can become no-ops, which is nice. We also burn a maximum
of 4 registers for ICP handles, so we can compile without regard for
the value of N. It also works in all cases.
- DUAL_PATCH mode processes up to two patches at a time, where the first
four channels come from patch 1, and the second group of four come
from patch 2. This tries to provide better EU utilization for small
patches (N <= 4). It cannot be used in all cases.
- 8_PATCH mode processes 8 patches at a time, with a thread launched per
vertex in the patch. Each channel corresponds to the same vertex, but
in each of the 8 patches. This utilizes all channels even for small
patches. It also makes conditions on gl_InvocationID uniform, leading
to proper jumps. Barriers, unfortunately, become real. Worse, for
PATCHLIST_N, the thread payload burns N registers for ICP handles.
This can burn up to 32 registers, or 1/4 of our register file, for
URB handles. For Vulkan (and DX), we know the number of vertices at
compile time, so we can limit the amount of waste. In GL, the patch
dimension is dynamic state, so we either would have to waste all 32
(not reasonable) or guess (badly) and recompile. This is unfortunate.
Because we can only spawn 16 thread instances, we can only use this
mode for PATCHLIST_16 and smaller. The rest must use SINGLE_PATCH.
This patch implements the new 8_PATCH TCS mode, but leaves us using
SINGLE_PATCH by default. A new INTEL_DEBUG=tcs8 flag will switch to
using 8_PATCH mode for testing and benchmarking purposes. We may
want to consider using 8_PATCH mode in Vulkan in some cases.
The data I've seen shows that 8_PATCH mode can be more efficient in
some cases, but SINGLE_PATCH mode (the one we use today) is faster
in other cases. Ultimately, the TES matters much more than the TCS
for performance, so the decision may not matter much.
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2019-05-03 14:57:54 -07:00
|
|
|
* of URB handles per vertex, so inform the register allocator that
|
|
|
|
|
* we might read up to nir->info.gs.vertices_in registers.
|
|
|
|
|
*/
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg icp_handle = bld.vgrf(BRW_TYPE_UD);
|
2022-08-19 17:07:44 -07:00
|
|
|
bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, start,
|
2023-04-08 21:34:35 +03:00
|
|
|
icp_offset_bytes,
|
2022-09-07 20:37:26 -07:00
|
|
|
brw_imm_ud(brw_tcs_prog_key_input_vertices(tcs_key) *
|
|
|
|
|
grf_size_bytes));
|
intel/compiler: Implement TCS 8_PATCH mode and INTEL_DEBUG=tcs8
Our tessellation control shaders can be dispatched in several modes.
- SINGLE_PATCH (Gen7+) processes a single patch per thread, with each
channel corresponding to a different patch vertex. PATCHLIST_N will
launch (N / 8) threads. If N is less than 8, some channels will be
disabled, leaving some untapped hardware capabilities. Conditionals
based on gl_InvocationID are non-uniform, which means that they'll
often have to execute both paths. However, if there are fewer than
8 vertices, all invocations will happen within a single thread, so
barriers can become no-ops, which is nice. We also burn a maximum
of 4 registers for ICP handles, so we can compile without regard for
the value of N. It also works in all cases.
- DUAL_PATCH mode processes up to two patches at a time, where the first
four channels come from patch 1, and the second group of four come
from patch 2. This tries to provide better EU utilization for small
patches (N <= 4). It cannot be used in all cases.
- 8_PATCH mode processes 8 patches at a time, with a thread launched per
vertex in the patch. Each channel corresponds to the same vertex, but
in each of the 8 patches. This utilizes all channels even for small
patches. It also makes conditions on gl_InvocationID uniform, leading
to proper jumps. Barriers, unfortunately, become real. Worse, for
PATCHLIST_N, the thread payload burns N registers for ICP handles.
This can burn up to 32 registers, or 1/4 of our register file, for
URB handles. For Vulkan (and DX), we know the number of vertices at
compile time, so we can limit the amount of waste. In GL, the patch
dimension is dynamic state, so we either would have to waste all 32
(not reasonable) or guess (badly) and recompile. This is unfortunate.
Because we can only spawn 16 thread instances, we can only use this
mode for PATCHLIST_16 and smaller. The rest must use SINGLE_PATCH.
This patch implements the new 8_PATCH TCS mode, but leaves us using
SINGLE_PATCH by default. A new INTEL_DEBUG=tcs8 flag will switch to
using 8_PATCH mode for testing and benchmarking purposes. We may
want to consider using 8_PATCH mode in Vulkan in some cases.
The data I've seen shows that 8_PATCH mode can be more efficient in
some cases, but SINGLE_PATCH mode (the one we use today) is faster
in other cases. Ultimately, the TES matters much more than the TCS
for performance, so the decision may not matter much.
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2019-05-03 14:57:54 -07:00
|
|
|
|
|
|
|
|
return icp_handle;
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-17 17:17:25 -08:00
|
|
|
static void
|
2024-12-29 15:41:04 -08:00
|
|
|
setup_barrier_message_payload_gfx125(const brw_builder &bld,
|
2024-06-18 23:42:59 -07:00
|
|
|
const brw_reg &msg_payload)
|
2023-11-17 17:17:25 -08:00
|
|
|
{
|
2025-04-03 01:14:03 -07:00
|
|
|
const brw_builder ubld = bld.uniform();
|
2023-01-04 12:56:47 -08:00
|
|
|
const struct intel_device_info *devinfo = bld.shader->devinfo;
|
|
|
|
|
assert(devinfo->verx10 >= 125);
|
2023-11-17 17:17:25 -08:00
|
|
|
|
|
|
|
|
/* From BSpec: 54006, mov r0.2[31:24] into m0.2[31:24] and m0.2[23:16] */
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg m0_10ub = horiz_offset(retype(msg_payload, BRW_TYPE_UB), 10);
|
|
|
|
|
brw_reg r0_11ub =
|
2024-04-20 17:08:02 -07:00
|
|
|
stride(suboffset(retype(brw_vec1_grf(0, 0), BRW_TYPE_UB), 11),
|
2023-11-17 17:17:25 -08:00
|
|
|
0, 1, 0);
|
2023-01-04 12:56:47 -08:00
|
|
|
ubld.group(2, 0).MOV(m0_10ub, r0_11ub);
|
|
|
|
|
|
|
|
|
|
if (devinfo->ver >= 20) {
|
|
|
|
|
/* Use an active threads barrier. */
|
2024-06-18 23:42:59 -07:00
|
|
|
const brw_reg m0_2ud = component(retype(msg_payload, BRW_TYPE_UD), 2);
|
2023-01-04 12:56:47 -08:00
|
|
|
ubld.OR(m0_2ud, m0_2ud, brw_imm_ud(1u << 8));
|
|
|
|
|
}
|
2023-11-17 17:17:25 -08:00
|
|
|
}
|
|
|
|
|
|
2023-11-20 13:25:36 -08:00
|
|
|
/**
 * Emit a workgroup execution barrier for compute-like stages.
 *
 * Builds a gateway "barrier" message payload appropriate for the target
 * hardware generation and emits SHADER_OPCODE_BARRIER with it.
 */
static void
emit_barrier(nir_to_brw_state &ntb)
{
   const intel_device_info *devinfo = ntb.devinfo;
   const brw_builder &bld = ntb.bld;
   const brw_builder ubld = bld.exec_all();
   /* One full physical register worth of channels for the header MOV. */
   const brw_builder hbld = ubld.group(8 * reg_unit(devinfo), 0);
   brw_shader &s = ntb.s;

   /* We are getting the barrier ID from the compute shader header */
   assert(mesa_shader_stage_uses_workgroup(s.stage));

   /* Zero-initialize the payload */
   brw_reg payload = hbld.MOV(brw_imm_ud(0u));

   if (devinfo->verx10 >= 125) {
      setup_barrier_message_payload_gfx125(bld, payload);
   } else {
      assert(mesa_shader_stage_is_compute(s.stage));

      /* Gfx9 keeps an extra barrier-ID bit at bit 31; later pre-12.5
       * generations use only bits 30:24.
       */
      brw_reg barrier_id_mask =
         brw_imm_ud(devinfo->ver == 9 ? 0x8f000000u : 0x7f000000u);

      /* Copy the barrier id from r0.2 to the message payload reg.2 */
      brw_reg r0_2 = brw_reg(retype(brw_vec1_grf(0, 2), BRW_TYPE_UD));
      ubld.group(1, 0).AND(component(payload, 2), r0_2, barrier_id_mask);
   }

   /* Emit a gateway "barrier" message using the payload we set up, followed
    * by a wait instruction.
    */
   ubld.emit(SHADER_OPCODE_BARRIER, reg_undef, payload);
}
|
|
|
|
|
|
2023-11-20 13:25:36 -08:00
|
|
|
/**
 * Emit a barrier between tessellation control shader instances.
 *
 * Packs the barrier ID, instance count, and enable bit into the message
 * header in the generation-specific layout, then emits
 * SHADER_OPCODE_BARRIER.  Only needed when a patch is processed by more
 * than one TCS instance (the caller checks instances != 1).
 */
static void
emit_tcs_barrier(nir_to_brw_state &ntb)
{
   const intel_device_info *devinfo = ntb.devinfo;
   const brw_builder &bld = ntb.bld;
   brw_shader &s = ntb.s;

   assert(s.stage == MESA_SHADER_TESS_CTRL);
   struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(s.prog_data);

   brw_reg m0 = bld.vgrf(BRW_TYPE_UD);
   /* All the barrier control bits live in DWord 2 of the header. */
   brw_reg m0_2 = component(m0, 2);

   /* Uniform (scalar) builder for the header bit manipulation. */
   const brw_builder chanbld = bld.uniform();

   /* Zero the message header */
   bld.exec_all().MOV(m0, brw_imm_ud(0u));

   if (devinfo->verx10 >= 125) {
      setup_barrier_message_payload_gfx125(bld, m0);
   } else if (devinfo->ver >= 11) {
      /* Gfx11+: barrier ID already sits in r0.2 bits 30:24. */
      chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_TYPE_UD),
                  brw_imm_ud(INTEL_MASK(30, 24)));

      /* Set the Barrier Count and the enable bit */
      chanbld.OR(m0_2, m0_2,
                 brw_imm_ud(tcs_prog_data->instances << 8 | (1 << 15)));
   } else {
      /* Copy "Barrier ID" from r0.2, bits 16:13 */
      chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_TYPE_UD),
                  brw_imm_ud(INTEL_MASK(16, 13)));

      /* Shift it up to bits 27:24. */
      chanbld.SHL(m0_2, m0_2, brw_imm_ud(11));

      /* Set the Barrier Count and the enable bit */
      chanbld.OR(m0_2, m0_2,
                 brw_imm_ud(tcs_prog_data->instances << 9 | (1 << 15)));
   }

   bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0);
}
|
|
|
|
|
|
2023-11-20 12:13:47 -08:00
|
|
|
/**
 * Translate a NIR intrinsic in a tessellation control shader into BRW IR.
 *
 * Handles the TCS-specific intrinsics (primitive/invocation ID, barriers,
 * per-vertex input loads, output loads/stores via URB messages); anything
 * else is forwarded to the generic brw_from_nir_emit_intrinsic().
 */
static void
brw_from_nir_emit_tcs_intrinsic(nir_to_brw_state &ntb,
                                nir_intrinsic_instr *instr)
{
   const intel_device_info *devinfo = ntb.devinfo;
   const brw_builder &bld = ntb.bld;
   brw_shader &s = ntb.s;

   assert(s.stage == MESA_SHADER_TESS_CTRL);
   struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(s.prog_data);
   struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base;

   brw_reg dst;
   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
      dst = get_nir_def(ntb, instr->def);

   switch (instr->intrinsic) {
   case nir_intrinsic_load_primitive_id:
      bld.MOV(dst, s.tcs_payload().primitive_id);
      break;
   case nir_intrinsic_load_invocation_id:
      bld.MOV(retype(dst, s.invocation_id.type), s.invocation_id);
      break;

   case nir_intrinsic_barrier:
      /* Memory and execution scopes are handled independently: the
       * generic path emits any required memory fence, and a workgroup
       * execution scope additionally requires a real TCS barrier when
       * the patch is split across multiple instances.
       */
      if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE)
         brw_from_nir_emit_intrinsic(ntb, bld, instr);
      if (nir_intrinsic_execution_scope(instr) == SCOPE_WORKGROUP) {
         if (tcs_prog_data->instances != 1)
            emit_tcs_barrier(ntb);
      }
      break;

   case nir_intrinsic_load_input:
      UNREACHABLE("nir_lower_io should never give us these.");
      break;

   case nir_intrinsic_load_per_vertex_input: {
      assert(instr->def.bit_size == 32);
      brw_reg indirect_offset = get_indirect_offset(ntb, instr);
      unsigned imm_offset = nir_intrinsic_base(instr);
      brw_urb_inst *urb;

      const bool multi_patch =
         vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH;

      /* The input control point (ICP) handle layout in the thread payload
       * differs between single-patch and multi-patch dispatch.
       */
      brw_reg icp_handle = multi_patch ?
         get_tcs_multi_patch_icp_handle(ntb, bld, instr) :
         get_tcs_single_patch_icp_handle(ntb, bld, instr);

      /* We can only read two double components with each URB read, so
       * we send two read messages in that case, each one loading up to
       * two double components.
       */
      unsigned num_components = instr->num_components;
      unsigned first_component = nir_intrinsic_component(instr);

      brw_reg srcs[URB_LOGICAL_NUM_SRCS];
      srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;

      if (indirect_offset.file == BAD_FILE) {
         /* Constant indexing - use global offset. */
         if (first_component != 0) {
            /* Read into a temporary and shift off the unwanted leading
             * components when the load doesn't start at component 0.
             */
            unsigned read_components = num_components + first_component;
            brw_reg tmp = bld.vgrf(dst.type, read_components);
            urb = bld.URB_READ(tmp, srcs, ARRAY_SIZE(srcs));
            brw_combine_with_vec(bld, dst, offset(tmp, bld, first_component),
                                 num_components);
         } else {
            urb = bld.URB_READ(dst, srcs, ARRAY_SIZE(srcs));
         }
         urb->offset = imm_offset;
      } else {
         /* Indirect indexing - use per-slot offsets as well. */
         srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;

         if (first_component != 0) {
            unsigned read_components = num_components + first_component;
            brw_reg tmp = bld.vgrf(dst.type, read_components);
            urb = bld.URB_READ(tmp, srcs, ARRAY_SIZE(srcs));
            brw_combine_with_vec(bld, dst, offset(tmp, bld, first_component),
                                 num_components);
         } else {
            urb = bld.URB_READ(dst, srcs, ARRAY_SIZE(srcs));
         }
         urb->offset = imm_offset;
      }
      urb->size_written = (num_components + first_component) *
                          urb->dst.component_size(urb->exec_size);

      /* Copy the temporary to the destination to deal with writemasking.
       *
       * Also attempt to deal with gl_PointSize being in the .w component.
       */
      if (urb->offset == 0 && indirect_offset.file == BAD_FILE) {
         assert(brw_type_size_bytes(dst.type) == 4);
         urb->dst = bld.vgrf(dst.type, 4);
         urb->size_written = 4 * REG_SIZE * reg_unit(devinfo);
         bld.MOV(dst, offset(urb->dst, bld, 3));
      }
      break;
   }

   case nir_intrinsic_load_output:
   case nir_intrinsic_load_per_vertex_output: {
      assert(instr->def.bit_size == 32);
      brw_reg indirect_offset = get_indirect_offset(ntb, instr);
      unsigned imm_offset = nir_intrinsic_base(instr);
      unsigned first_component = nir_intrinsic_component(instr);

      brw_urb_inst *urb;
      if (indirect_offset.file == BAD_FILE) {
         /* This MOV replicates the output handle to all enabled channels
          * in SINGLE_PATCH mode.
          */
         brw_reg patch_handle = bld.MOV(s.tcs_payload().patch_urb_output);

         {
            brw_reg srcs[URB_LOGICAL_NUM_SRCS];
            srcs[URB_LOGICAL_SRC_HANDLE] = patch_handle;

            if (first_component != 0) {
               unsigned read_components =
                  instr->num_components + first_component;
               brw_reg tmp = bld.vgrf(dst.type, read_components);
               urb = bld.URB_READ(tmp, srcs, ARRAY_SIZE(srcs));
               urb->size_written = read_components * REG_SIZE * reg_unit(devinfo);
               brw_combine_with_vec(bld, dst, offset(tmp, bld, first_component),
                                    instr->num_components);
            } else {
               urb = bld.URB_READ(dst, srcs, ARRAY_SIZE(srcs));
               urb->size_written = instr->num_components * REG_SIZE * reg_unit(devinfo);
            }
            urb->offset = imm_offset;
         }
      } else {
         /* Indirect indexing - use per-slot offsets as well. */
         brw_reg srcs[URB_LOGICAL_NUM_SRCS];
         srcs[URB_LOGICAL_SRC_HANDLE] = s.tcs_payload().patch_urb_output;
         srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;

         if (first_component != 0) {
            unsigned read_components =
               instr->num_components + first_component;
            brw_reg tmp = bld.vgrf(dst.type, read_components);
            urb = bld.URB_READ(tmp, srcs, ARRAY_SIZE(srcs));
            urb->size_written = read_components * REG_SIZE * reg_unit(devinfo);
            brw_combine_with_vec(bld, dst, offset(tmp, bld, first_component),
                                 instr->num_components);
         } else {
            urb = bld.URB_READ(dst, srcs, ARRAY_SIZE(srcs));
            urb->size_written = instr->num_components * REG_SIZE * reg_unit(devinfo);
         }
         urb->offset = imm_offset;
      }
      break;
   }

   case nir_intrinsic_store_output:
   case nir_intrinsic_store_per_vertex_output: {
      assert(nir_src_bit_size(instr->src[0]) == 32);
      brw_reg value = get_nir_src(ntb, instr->src[0], -1);
      brw_reg indirect_offset = get_indirect_offset(ntb, instr);
      unsigned imm_offset = nir_intrinsic_base(instr);
      unsigned mask = nir_intrinsic_write_mask(instr);

      if (mask == 0)
         break;

      unsigned num_components = util_last_bit(mask);
      unsigned first_component = nir_intrinsic_component(instr);
      assert((first_component + num_components) <= 4);

      /* Shift the writemask into absolute slot-component positions. */
      mask = mask << first_component;

      const bool has_urb_lsc = devinfo->ver >= 20;

      brw_reg mask_reg;
      if (mask != WRITEMASK_XYZW)
         mask_reg = brw_imm_ud(mask);

      brw_reg sources[4];

      /* With the LSC-based URB messages (Xe2+) the data payload is packed
       * densely; older messages keep the data at its component position.
       */
      unsigned m = has_urb_lsc ? 0 : first_component;
      for (unsigned i = 0; i < num_components; i++) {
         int c = i + first_component;
         if (mask & (1 << c)) {
            sources[m++] = offset(value, bld, i);
         } else if (devinfo->ver < 20) {
            /* Leave a hole for masked-out components pre-Xe2. */
            m++;
         }
      }

      assert(has_urb_lsc || m == (first_component + num_components));

      brw_reg srcs[URB_LOGICAL_NUM_SRCS];
      srcs[URB_LOGICAL_SRC_HANDLE] = s.tcs_payload().patch_urb_output;
      srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
      srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = mask_reg;
      srcs[URB_LOGICAL_SRC_DATA] = bld.vgrf(BRW_TYPE_F, m);
      bld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, m, 0);

      brw_urb_inst *urb = bld.URB_WRITE(srcs, ARRAY_SIZE(srcs));
      urb->offset = imm_offset;
      urb->components = m;
      break;
   }

   case nir_intrinsic_load_tess_config_intel:
      bld.MOV(retype(dst, BRW_TYPE_UD),
              brw_uniform_reg(tcs_prog_data->tess_config_param, BRW_TYPE_UD));
      break;

   default:
      brw_from_nir_emit_intrinsic(ntb, bld, instr);
      break;
   }
}
|
|
|
|
|
|
2023-11-20 12:13:47 -08:00
|
|
|
/**
 * Translate a NIR intrinsic in a tessellation evaluation shader into BRW IR.
 *
 * Handles TES-specific intrinsics (primitive ID, tess coords, and input
 * loads — pushed from the thread payload when the offset is small and
 * constant, otherwise pulled via URB read messages); anything else is
 * forwarded to the generic brw_from_nir_emit_intrinsic().
 */
static void
brw_from_nir_emit_tes_intrinsic(nir_to_brw_state &ntb,
                                nir_intrinsic_instr *instr)
{
   const intel_device_info *devinfo = ntb.devinfo;
   const brw_builder &bld = ntb.bld;
   brw_shader &s = ntb.s;

   assert(s.stage == MESA_SHADER_TESS_EVAL);
   struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(s.prog_data);

   brw_reg dest;
   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
      dest = get_nir_def(ntb, instr->def);

   switch (instr->intrinsic) {
   case nir_intrinsic_load_primitive_id:
      bld.MOV(dest, s.tes_payload().primitive_id);
      break;

   case nir_intrinsic_load_tess_coord:
      /* gl_TessCoord: three payload registers, one per component. */
      for (unsigned i = 0; i < 3; i++)
         bld.MOV(offset(dest, bld, i), s.tes_payload().coords[i]);
      break;

   case nir_intrinsic_load_input:
   case nir_intrinsic_load_per_vertex_input: {
      assert(instr->def.bit_size == 32);
      brw_reg indirect_offset = get_indirect_offset(ntb, instr);
      unsigned imm_offset = nir_intrinsic_base(instr);
      unsigned first_component = nir_intrinsic_component(instr);

      brw_urb_inst *urb;
      if (indirect_offset.file == BAD_FILE) {
         /* Arbitrarily only push up to 32 vec4 slots worth of data,
          * which is 16 registers (since each holds 2 vec4 slots).
          */
         const unsigned max_push_slots = 32;
         if (imm_offset < max_push_slots) {
            /* Push model: the input is already in the thread payload. */
            const brw_reg src = horiz_offset(brw_attr_reg(0, dest.type),
                                             4 * imm_offset + first_component);
            brw_reg comps[NIR_MAX_VEC_COMPONENTS];
            for (unsigned i = 0; i < instr->num_components; i++) {
               comps[i] = component(src, i);
            }
            bld.VEC(dest, comps, instr->num_components);

            /* Grow the URB read length to cover this slot (2 slots per
             * register, hence imm_offset / 2).
             */
            tes_prog_data->base.urb_read_length =
               MAX2(tes_prog_data->base.urb_read_length,
                    (imm_offset / 2) + 1);
         } else {
            /* Pull model: read from the URB using the patch handle. */
            /* Replicate the patch handle to all enabled channels */
            brw_reg srcs[URB_LOGICAL_NUM_SRCS];
            srcs[URB_LOGICAL_SRC_HANDLE] = s.tes_payload().patch_urb_input;

            if (first_component != 0) {
               unsigned read_components =
                  instr->num_components + first_component;
               brw_reg tmp = bld.vgrf(dest.type, read_components);
               urb = bld.URB_READ(tmp, srcs, ARRAY_SIZE(srcs));
               urb->size_written = read_components * REG_SIZE * reg_unit(devinfo);
               brw_combine_with_vec(bld, dest, offset(tmp, bld, first_component),
                                    instr->num_components);
            } else {
               urb = bld.URB_READ(dest, srcs, ARRAY_SIZE(srcs));
               urb->size_written = instr->num_components * REG_SIZE * reg_unit(devinfo);
            }
            urb->offset = imm_offset;
         }
      } else {
         /* Indirect indexing - use per-slot offsets as well. */

         /* We can only read two double components with each URB read, so
          * we send two read messages in that case, each one loading up to
          * two double components.
          */
         unsigned num_components = instr->num_components;

         brw_reg srcs[URB_LOGICAL_NUM_SRCS];
         srcs[URB_LOGICAL_SRC_HANDLE] = s.tes_payload().patch_urb_input;
         srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;

         if (first_component != 0) {
            unsigned read_components =
               num_components + first_component;
            brw_reg tmp = bld.vgrf(dest.type, read_components);
            urb = bld.URB_READ(tmp, srcs, ARRAY_SIZE(srcs));
            brw_combine_with_vec(bld, dest, offset(tmp, bld, first_component),
                                 num_components);
         } else {
            urb = bld.URB_READ(dest, srcs, ARRAY_SIZE(srcs));
         }
         urb->offset = imm_offset;
         urb->size_written = (num_components + first_component) *
                             urb->dst.component_size(urb->exec_size);
      }
      break;
   }

   case nir_intrinsic_load_tess_config_intel:
      bld.MOV(retype(dest, BRW_TYPE_UD),
              brw_uniform_reg(tes_prog_data->tess_config_param, BRW_TYPE_UD));
      break;

   default:
      brw_from_nir_emit_intrinsic(ntb, bld, instr);
      break;
   }
}
|
|
|
|
|
|
2023-11-20 12:13:47 -08:00
|
|
|
static void
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_gs_intrinsic(nir_to_brw_state &ntb,
|
2023-11-20 12:13:47 -08:00
|
|
|
nir_intrinsic_instr *instr)
|
2015-11-04 23:05:07 -08:00
|
|
|
{
|
2024-12-29 15:41:04 -08:00
|
|
|
const brw_builder &bld = ntb.bld;
|
2024-12-07 10:25:45 -08:00
|
|
|
brw_shader &s = ntb.s;
|
2023-11-20 12:13:47 -08:00
|
|
|
|
2023-12-05 17:16:34 -08:00
|
|
|
assert(s.stage == MESA_SHADER_GEOMETRY);
|
2015-11-04 23:05:07 -08:00
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg dest;
|
2015-11-04 23:05:07 -08:00
|
|
|
if (nir_intrinsic_infos[instr->intrinsic].has_dest)
|
2023-11-20 21:21:54 -08:00
|
|
|
dest = get_nir_def(ntb, instr->def);
|
2015-11-04 23:05:07 -08:00
|
|
|
|
|
|
|
|
switch (instr->intrinsic) {
|
|
|
|
|
case nir_intrinsic_load_primitive_id:
|
2023-12-05 17:16:34 -08:00
|
|
|
assert(s.stage == MESA_SHADER_GEOMETRY);
|
|
|
|
|
assert(brw_gs_prog_data(s.prog_data)->include_primitive_id);
|
2024-04-20 17:08:02 -07:00
|
|
|
bld.MOV(retype(dest, BRW_TYPE_UD), s.gs_payload().primitive_id);
|
2015-11-04 23:05:07 -08:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_intrinsic_load_input:
|
2025-07-23 09:17:35 +02:00
|
|
|
UNREACHABLE("load_input intrinsics are invalid for the GS stage");
|
2015-11-04 23:05:07 -08:00
|
|
|
|
|
|
|
|
case nir_intrinsic_load_per_vertex_input:
|
2023-11-20 21:21:54 -08:00
|
|
|
emit_gs_input_load(ntb, dest, instr->src[0], nir_intrinsic_base(instr),
|
2016-05-19 15:58:51 +10:00
|
|
|
instr->src[1], instr->num_components,
|
|
|
|
|
nir_intrinsic_component(instr));
|
2015-11-04 23:05:07 -08:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_intrinsic_emit_vertex_with_counter:
|
2023-11-20 21:21:54 -08:00
|
|
|
emit_gs_vertex(ntb, instr->src[0], nir_intrinsic_stream_id(instr));
|
2024-01-09 01:30:19 -08:00
|
|
|
|
|
|
|
|
/* After an EmitVertex() call, the values of all outputs are undefined.
|
|
|
|
|
* If this is not in control flow, recreate a fresh set of output
|
|
|
|
|
* registers to keep their live ranges separate.
|
|
|
|
|
*/
|
|
|
|
|
if (instr->instr.block->cf_node.parent->type == nir_cf_node_function)
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_setup_outputs(ntb);
|
2015-11-04 23:05:07 -08:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_intrinsic_end_primitive_with_counter:
|
2023-11-20 21:21:54 -08:00
|
|
|
emit_gs_end_primitive(ntb, instr->src[0]);
|
2015-11-04 23:05:07 -08:00
|
|
|
break;
|
|
|
|
|
|
2020-06-08 12:16:13 +02:00
|
|
|
case nir_intrinsic_set_vertex_and_primitive_count:
|
2025-01-15 13:27:05 -08:00
|
|
|
bld.MOV(s.final_gs_vertex_count, get_nir_src(ntb, instr->src[0], 0));
|
2015-11-04 23:05:07 -08:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_intrinsic_load_invocation_id: {
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg val = ntb.system_values[SYSTEM_VALUE_INVOCATION_ID];
|
2015-11-04 23:05:07 -08:00
|
|
|
assert(val.file != BAD_FILE);
|
|
|
|
|
dest.type = val.type;
|
|
|
|
|
bld.MOV(dest, val);
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
default:
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_intrinsic(ntb, bld, instr);
|
2015-11-04 23:05:07 -08:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2016-07-21 20:25:28 -07:00
|
|
|
/**
|
|
|
|
|
* Fetch the current render target layer index.
|
|
|
|
|
*/
|
2024-06-18 23:42:59 -07:00
|
|
|
static brw_reg
|
2024-12-29 15:41:04 -08:00
|
|
|
fetch_render_target_array_index(const brw_builder &bld)
|
2016-07-21 20:25:28 -07:00
|
|
|
{
|
2024-12-07 10:25:45 -08:00
|
|
|
const brw_shader *v = bld.shader;
|
2022-06-22 17:05:44 -07:00
|
|
|
|
2023-12-01 21:50:47 -08:00
|
|
|
if (bld.shader->devinfo->ver >= 20) {
|
|
|
|
|
/* Gfx20+ has separate Render Target Array indices for each pair
|
|
|
|
|
* of subspans in order to support multiple polygons, so we need
|
|
|
|
|
* to use a <1;8,0> region in order to select the correct word
|
|
|
|
|
* for each channel.
|
|
|
|
|
*/
|
2024-06-18 23:42:59 -07:00
|
|
|
const brw_reg idx = bld.vgrf(BRW_TYPE_UD);
|
2023-12-01 21:50:47 -08:00
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < DIV_ROUND_UP(bld.dispatch_width(), 16); i++) {
|
2024-12-29 15:41:04 -08:00
|
|
|
const brw_builder hbld = bld.group(16, i);
|
2023-12-01 21:50:47 -08:00
|
|
|
const struct brw_reg reg = retype(brw_vec1_grf(2 * i + 1, 1),
|
2024-04-20 17:08:02 -07:00
|
|
|
BRW_TYPE_UW);
|
2023-12-01 21:50:47 -08:00
|
|
|
hbld.AND(offset(idx, hbld, i), stride(reg, 1, 8, 0),
|
|
|
|
|
brw_imm_uw(0x7ff));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return idx;
|
|
|
|
|
} else if (bld.shader->devinfo->ver >= 12 && v->max_polygons == 2) {
|
2022-06-22 17:05:44 -07:00
|
|
|
/* According to the BSpec "PS Thread Payload for Normal
|
|
|
|
|
* Dispatch", the render target array index is stored as bits
|
|
|
|
|
* 26:16 of either the R1.1 or R1.6 poly info dwords, for the
|
|
|
|
|
* first and second polygons respectively in multipolygon PS
|
|
|
|
|
* dispatch mode.
|
|
|
|
|
*/
|
|
|
|
|
assert(bld.dispatch_width() == 16);
|
2024-06-18 23:42:59 -07:00
|
|
|
const brw_reg idx = bld.vgrf(BRW_TYPE_UD);
|
2022-06-22 17:05:44 -07:00
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < v->max_polygons; i++) {
|
2024-12-29 15:41:04 -08:00
|
|
|
const brw_builder hbld = bld.group(8, i);
|
2024-08-20 11:48:54 -07:00
|
|
|
const struct brw_reg g1 = brw_uw1_reg(FIXED_GRF, 1, 3 + 10 * i);
|
2022-06-22 17:05:44 -07:00
|
|
|
hbld.AND(offset(idx, hbld, i), g1, brw_imm_uw(0x7ff));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return idx;
|
|
|
|
|
} else if (bld.shader->devinfo->ver >= 12) {
|
2020-04-30 17:38:33 +00:00
|
|
|
/* The render target array index is provided in the thread payload as
|
|
|
|
|
* bits 26:16 of r1.1.
|
|
|
|
|
*/
|
2024-06-18 23:42:59 -07:00
|
|
|
const brw_reg idx = bld.vgrf(BRW_TYPE_UD);
|
2024-08-20 11:48:54 -07:00
|
|
|
bld.AND(idx, brw_uw1_reg(FIXED_GRF, 1, 3),
|
2020-04-30 17:38:33 +00:00
|
|
|
brw_imm_uw(0x7ff));
|
|
|
|
|
return idx;
|
2024-02-15 02:51:39 -08:00
|
|
|
} else {
|
2016-07-21 20:25:28 -07:00
|
|
|
/* The render target array index is provided in the thread payload as
|
|
|
|
|
* bits 26:16 of r0.0.
|
|
|
|
|
*/
|
2024-06-18 23:42:59 -07:00
|
|
|
const brw_reg idx = bld.vgrf(BRW_TYPE_UD);
|
2024-08-20 11:48:54 -07:00
|
|
|
bld.AND(idx, brw_uw1_reg(FIXED_GRF, 0, 1),
|
2016-07-21 20:25:28 -07:00
|
|
|
brw_imm_uw(0x7ff));
|
|
|
|
|
return idx;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
/* Fetch the viewport index from the thread payload.  Like the render
 * target array index, its location varies with the hardware generation
 * and multipolygon dispatch mode.
 */
static brw_reg
fetch_viewport_index(const brw_builder &bld)
{
   const brw_shader *shader = bld.shader;
   const intel_device_info *devinfo = shader->devinfo;

   if (devinfo->ver >= 20) {
      /* Gfx20+ has separate viewport indices for each pair
       * of subspans in order to support multiple polygons, so we need
       * to use a <1;8,0> region in order to select the correct word
       * for each channel.
       */
      const brw_reg idx = bld.vgrf(BRW_TYPE_UD);

      for (unsigned i = 0; i < DIV_ROUND_UP(bld.dispatch_width(), 16); i++) {
         const brw_builder hbld = bld.group(16, i);
         const struct brw_reg reg = retype(xe2_vec1_grf(i, 9),
                                           BRW_TYPE_UW);
         hbld.AND(offset(idx, hbld, i), stride(reg, 1, 8, 0),
                  brw_imm_uw(0xf000));
      }

      /* The masked field sits in the top nibble; shift it down. */
      bld.SHR(idx, idx, brw_imm_ud(12));
      return idx;
   } else if (devinfo->ver >= 12 && shader->max_polygons == 2) {
      /* According to the BSpec "PS Thread Payload for Normal
       * Dispatch", the viewport index is stored as bits
       * 30:27 of either the R1.1 or R1.6 poly info dwords, for the
       * first and second polygons respectively in multipolygon PS
       * dispatch mode.
       */
      assert(bld.dispatch_width() == 16);
      const brw_reg idx = bld.vgrf(BRW_TYPE_UD);
      brw_reg vp_idx_per_poly_dw[2] = {
         brw_ud1_reg(FIXED_GRF, 1, 1), /* R1.1 bits 30:27 */
         brw_ud1_reg(FIXED_GRF, 1, 6), /* R1.6 bits 30:27 */
      };

      for (unsigned i = 0; i < shader->max_polygons; i++) {
         const brw_builder hbld = bld.group(8, i);
         hbld.SHR(offset(idx, hbld, i), vp_idx_per_poly_dw[i], brw_imm_ud(27));
      }

      return bld.AND(idx, brw_imm_ud(0xf));
   } else if (devinfo->ver >= 12) {
      /* The viewport index is provided in the thread payload as
       * bits 30:27 of r1.1.
       */
      const brw_reg idx = bld.vgrf(BRW_TYPE_UD);
      bld.SHR(idx,
              bld.AND(brw_uw1_reg(FIXED_GRF, 1, 3),
                      brw_imm_uw(0x7800)),
              brw_imm_ud(11));
      return idx;
   } else {
      /* The viewport index is provided in the thread payload as
       * bits 30:27 of r0.0.
       */
      const brw_reg idx = bld.vgrf(BRW_TYPE_UD);
      bld.SHR(idx,
              bld.AND(brw_uw1_reg(FIXED_GRF, 0, 1),
                      brw_imm_uw(0x7800)),
              brw_imm_ud(11));
      return idx;
   }
}
|
|
|
|
|
|
2016-08-18 22:12:37 -07:00
|
|
|
/**
|
|
|
|
|
* Actual coherent framebuffer read implemented using the native render target
|
|
|
|
|
* read message. Requires SKL+.
|
|
|
|
|
*/
|
2024-12-07 00:23:07 -08:00
|
|
|
static brw_inst *
|
2024-12-29 15:41:04 -08:00
|
|
|
emit_coherent_fb_read(const brw_builder &bld, const brw_reg &dst, unsigned target)
|
2016-08-18 22:12:37 -07:00
|
|
|
{
|
2025-01-16 15:36:36 -08:00
|
|
|
brw_inst *inst =
|
|
|
|
|
bld.emit(FS_OPCODE_FB_READ_LOGICAL, dst, brw_imm_ud(target));
|
2016-09-07 13:38:20 -07:00
|
|
|
inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
|
2016-08-18 22:12:37 -07:00
|
|
|
|
|
|
|
|
return inst;
|
|
|
|
|
}
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
/* Return regs[0] if it is already allocated; otherwise allocate a fresh
 * float temporary of the given size and record it in all n slots of regs.
 */
static brw_reg
alloc_temporary(const brw_builder &bld, unsigned size, brw_reg *regs, unsigned n)
{
   if (n && regs[0].file != BAD_FILE)
      return regs[0];

   const brw_reg tmp = bld.vgrf(BRW_TYPE_F, size);

   /* Alias every requested slot to the same temporary. */
   for (unsigned i = 0; i < n; i++)
      regs[i] = tmp;

   return tmp;
}
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
/* Allocate (or return the already-allocated) register backing a fragment
 * shader output for the given packed location descriptor.
 */
static brw_reg
alloc_frag_output(nir_to_brw_state &ntb, unsigned location)
{
   brw_shader &s = ntb.s;

   assert(s.stage == MESA_SHADER_FRAGMENT);
   const brw_wm_prog_key *const key =
      reinterpret_cast<const brw_wm_prog_key *>(s.key);

   /* The location encodes both the varying slot and the dual-source index. */
   const unsigned slot = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_LOCATION);
   const unsigned idx = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_INDEX);

   if (idx > 0 || (key->force_dual_color_blend && slot == FRAG_RESULT_DATA1))
      return alloc_temporary(ntb.bld, 4, &s.dual_src_output, 1);

   if (slot == FRAG_RESULT_COLOR)
      return alloc_temporary(ntb.bld, 4, s.outputs,
                             MAX2(key->nr_color_regions, 1));

   if (slot == FRAG_RESULT_DEPTH)
      return alloc_temporary(ntb.bld, 1, &s.frag_depth, 1);

   if (slot == FRAG_RESULT_STENCIL)
      return alloc_temporary(ntb.bld, 1, &s.frag_stencil, 1);

   if (slot == FRAG_RESULT_SAMPLE_MASK)
      return alloc_temporary(ntb.bld, 1, &s.sample_mask, 1);

   if (slot >= FRAG_RESULT_DATA0 &&
       slot < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS)
      return alloc_temporary(ntb.bld, 4,
                             &s.outputs[slot - FRAG_RESULT_DATA0], 1);

   UNREACHABLE("Invalid location");
}
|
|
|
|
|
|
2023-11-20 13:25:36 -08:00
|
|
|
static void
|
2024-06-18 23:42:59 -07:00
|
|
|
emit_is_helper_invocation(nir_to_brw_state &ntb, brw_reg result)
|
2023-11-17 17:17:25 -08:00
|
|
|
{
|
2024-12-29 15:41:04 -08:00
|
|
|
const brw_builder &bld = ntb.bld;
|
2023-11-20 13:25:36 -08:00
|
|
|
|
2023-11-17 17:17:25 -08:00
|
|
|
/* Unlike the regular gl_HelperInvocation, that is defined at dispatch,
|
|
|
|
|
* the helperInvocationEXT() (aka SpvOpIsHelperInvocationEXT) takes into
|
|
|
|
|
* consideration demoted invocations.
|
|
|
|
|
*/
|
2024-04-20 17:08:02 -07:00
|
|
|
result.type = BRW_TYPE_UD;
|
2023-11-17 17:17:25 -08:00
|
|
|
|
|
|
|
|
bld.MOV(result, brw_imm_ud(0));
|
|
|
|
|
|
|
|
|
|
/* See brw_sample_mask_reg() for why we split SIMD32 into SIMD16 here. */
|
|
|
|
|
unsigned width = bld.dispatch_width();
|
|
|
|
|
for (unsigned i = 0; i < DIV_ROUND_UP(width, 16); i++) {
|
2024-12-29 15:41:04 -08:00
|
|
|
const brw_builder b = bld.group(MIN2(width, 16), i);
|
2023-11-17 17:17:25 -08:00
|
|
|
|
2024-12-07 00:23:07 -08:00
|
|
|
brw_inst *mov = b.MOV(offset(result, b, i), brw_imm_ud(~0));
|
2023-11-17 17:17:25 -08:00
|
|
|
|
2025-04-02 16:12:45 -07:00
|
|
|
/* The before() ensures that any code emitted to get the predicate happens
|
2023-11-17 17:17:25 -08:00
|
|
|
* before the mov right above. This is not an issue elsewhere because
|
|
|
|
|
* lowering code already set up the builder this way.
|
|
|
|
|
*/
|
2025-04-02 16:12:45 -07:00
|
|
|
brw_emit_predicate_on_sample_mask(b.before(mov), mov);
|
2023-11-17 17:17:25 -08:00
|
|
|
mov->predicate_inverse = true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
/* Produce a boolean (~0/0) front-facing value from the payload bits,
 * whose location and polarity vary by hardware generation.
 */
static brw_reg
emit_frontfacing_interpolation(nir_to_brw_state &ntb)
{
   const intel_device_info *devinfo = ntb.devinfo;
   const brw_builder &bld = ntb.bld;
   brw_shader &s = ntb.s;

   brw_reg front_facing = bld.vgrf(BRW_TYPE_D);

   if (devinfo->ver >= 20) {
      /* Gfx20+ has separate back-facing bits for each pair of
       * subspans in order to support multiple polygons, so we need to
       * use a <1;8,0> region in order to select the correct word for
       * each channel.
       */
      const brw_reg back_bits = bld.vgrf(BRW_TYPE_UW);

      for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
         const brw_builder hbld = bld.group(16, i);
         const struct brw_reg gi_uw = retype(xe2_vec1_grf(i, 9),
                                             BRW_TYPE_UW);
         hbld.AND(offset(back_bits, hbld, i), gi_uw, brw_imm_uw(0x800));
      }

      /* Back-facing bit clear means front facing. */
      bld.CMP(front_facing, back_bits, brw_imm_uw(0), BRW_CONDITIONAL_Z);
   } else if (devinfo->ver >= 12 && s.max_polygons == 2) {
      /* According to the BSpec "PS Thread Payload for Normal
       * Dispatch", the front/back facing interpolation bit is stored
       * as bit 15 of either the R1.1 or R1.6 poly info field, for the
       * first and second polygons respectively in multipolygon PS
       * dispatch mode.
       */
      assert(s.dispatch_width == 16);
      brw_reg sign_fill = bld.vgrf(BRW_TYPE_W);

      for (unsigned i = 0; i < s.max_polygons; i++) {
         const brw_builder hbld = bld.group(8, i);
         const struct brw_reg poly_info = retype(brw_vec1_grf(1, 1 + 5 * i),
                                                 BRW_TYPE_W);
         hbld.ASR(offset(sign_fill, hbld, i), poly_info, brw_imm_d(15));
      }

      bld.NOT(front_facing, sign_fill);
   } else if (devinfo->ver >= 12) {
      brw_reg g1 = brw_reg(retype(brw_vec1_grf(1, 1), BRW_TYPE_W));

      brw_reg sign_fill = bld.vgrf(BRW_TYPE_W);
      bld.ASR(sign_fill, g1, brw_imm_d(15));
      bld.NOT(front_facing, sign_fill);
   } else {
      /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
       * a boolean result from this (~0/true or 0/false).
       *
       * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
       * this task in only one instruction:
       *    - a negation source modifier will flip the bit; and
       *    - a W -> D type conversion will sign extend the bit into the high
       *      word of the destination.
       *
       * An ASR 15 fills the low word of the destination.
       */
      brw_reg g0 = brw_reg(retype(brw_vec1_grf(0, 0), BRW_TYPE_W));

      bld.ASR(front_facing, negate(g0), brw_imm_d(15));
   }

   return front_facing;
}
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
/* Compute gl_SamplePosition as a pair of floats in [0, 1]. */
static brw_reg
emit_samplepos_setup(nir_to_brw_state &ntb)
{
   const brw_builder &bld = ntb.bld;
   brw_shader &s = ntb.s;

   assert(s.stage == MESA_SHADER_FRAGMENT);
   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(s.prog_data);

   const brw_builder abld = bld.annotate("compute sample position");
   brw_reg pos = abld.vgrf(BRW_TYPE_F, 2);

   if (wm_prog_data->persample_dispatch == INTEL_NEVER) {
      /* From ARB_sample_shading specification:
       * "When rendering to a non-multisample buffer, or if multisample
       *  rasterization is disabled, gl_SamplePosition will always be
       *  (0.5, 0.5).
       */
      bld.MOV(offset(pos, bld, 0), brw_imm_f(0.5f));
      bld.MOV(offset(pos, bld, 1), brw_imm_f(0.5f));
      return pos;
   }

   /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
    * mode will be enabled.
    *
    * From the Ivy Bridge PRM, volume 2 part 1, page 344:
    * R31.1:0         Position Offset X/Y for Slot[3:0]
    * R31.3:2         Position Offset X/Y for Slot[7:4]
    * .....
    *
    * The X, Y sample positions come in as bytes in  thread payload. So, read
    * the positions using vstride=16, width=8, hstride=2.
    */
   const brw_reg sample_pos_reg =
      brw_fetch_payload_reg(abld, s.fs_payload().sample_pos_reg, BRW_TYPE_W);

   for (unsigned axis = 0; axis < 2; axis++) {
      /* Widen the byte offset to a dword... */
      brw_reg as_int = bld.vgrf(BRW_TYPE_D);
      abld.MOV(as_int, subscript(sample_pos_reg, BRW_TYPE_B, axis));
      /* Convert int_sample_pos to floating point */
      brw_reg as_float = bld.vgrf(BRW_TYPE_F);
      abld.MOV(as_float, as_int);
      /* Scale to the range [0, 1] */
      abld.MUL(offset(pos, abld, axis), as_float, brw_imm_f(1 / 16.0f));
   }

   if (wm_prog_data->persample_dispatch == INTEL_SOMETIMES) {
      /* Fall back to (0.5, 0.5) when per-sample dispatch is dynamically off. */
      brw_check_dynamic_msaa_flag(abld, wm_prog_data,
                                  INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH);
      for (unsigned axis = 0; axis < 2; axis++) {
         set_predicate(BRW_PREDICATE_NORMAL,
                       bld.SEL(offset(pos, abld, axis),
                               offset(pos, abld, axis),
                               brw_imm_f(0.5f)));
      }
   }

   return pos;
}
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
/* Compute gl_SampleID from the packed 4-bit sample IDs in the payload. */
static brw_reg
emit_sampleid_setup(nir_to_brw_state &ntb)
{
   const intel_device_info *devinfo = ntb.devinfo;
   const brw_builder &bld = ntb.bld;
   brw_shader &s = ntb.s;

   assert(s.stage == MESA_SHADER_FRAGMENT);
   ASSERTED brw_wm_prog_key *key = (brw_wm_prog_key*) s.key;
   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(s.prog_data);

   const brw_builder abld = bld.annotate("compute sample id");
   brw_reg sample_id = abld.vgrf(BRW_TYPE_UD);

   assert(key->multisample_fbo != INTEL_NEVER);

   /* Sample ID comes in as 4-bit numbers in g1.0:
    *
    *    15:12 Slot 3 SampleID (only used in SIMD16)
    *     11:8 Slot 2 SampleID (only used in SIMD16)
    *      7:4 Slot 1 SampleID
    *      3:0 Slot 0 SampleID
    *
    * Each slot corresponds to four channels, so we want to replicate each
    * half-byte value to 4 channels in a row:
    *
    *    dst+0:    .7    .6    .5    .4    .3    .2    .1    .0
    *             7:4   7:4   7:4   7:4   3:0   3:0   3:0   3:0
    *
    *    dst+1:    .7    .6    .5    .4    .3    .2    .1    .0  (if SIMD16)
    *           15:12 15:12 15:12 15:12  11:8  11:8  11:8  11:8
    *
    * First, we read g1.0 with a <1,8,0>UB region, causing the first 8
    * channels to read the first byte (7:0), and the second group of 8
    * channels to read the second byte (15:8).  Then, we shift right by
    * a vector immediate of <4, 4, 4, 4, 0, 0, 0, 0>, moving the slot 1 / 3
    * values into place.  Finally, we AND with 0xf to keep the low nibble.
    *
    *    shr(16) tmp<1>W g1.0<1,8,0>B 0x44440000:V
    *    and(16) dst<1>D tmp<8,8,1>W  0xf:W
    *
    * TODO: These payload bits exist on Gfx7 too, but they appear to always
    *       be zero, so this code fails to work.  We should find out why.
    */
   const brw_reg shifted = abld.vgrf(BRW_TYPE_UW);

   for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
      const brw_builder hbld = abld.group(MIN2(16, s.dispatch_width), i);
      /* According to the "PS Thread Payload for Normal Dispatch"
       * pages on the BSpec, the sample ids are stored in R0.8/R1.8
       * on gfx20+ and in R1.0/R2.0 on gfx8+.
       */
      const struct brw_reg id_reg = devinfo->ver >= 20 ? xe2_vec1_grf(i, 8) :
                                    brw_vec1_grf(i + 1, 0);
      hbld.SHR(offset(shifted, hbld, i),
               stride(retype(id_reg, BRW_TYPE_UB), 1, 8, 0),
               brw_imm_v(0x44440000));
   }

   abld.AND(sample_id, shifted, brw_imm_w(0xf));

   if (key->multisample_fbo == INTEL_SOMETIMES) {
      /* When the FBO is dynamically single-sampled, force sample ID 0. */
      brw_check_dynamic_msaa_flag(abld, wm_prog_data,
                                  INTEL_MSAA_FLAG_MULTISAMPLE_FBO);
      set_predicate(BRW_PREDICATE_NORMAL,
                    abld.SEL(sample_id, sample_id, brw_imm_ud(0)));
   }

   return sample_id;
}
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
/* Compute gl_SampleMaskIn from the payload coverage mask, narrowing it to
 * the current sample when per-sample dispatch is in effect.
 */
static brw_reg
emit_samplemaskin_setup(nir_to_brw_state &ntb)
{
   const brw_builder &bld = ntb.bld;
   brw_shader &s = ntb.s;

   assert(s.stage == MESA_SHADER_FRAGMENT);
   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(s.prog_data);

   /* The HW doesn't provide us with expected values. */
   assert(wm_prog_data->coarse_pixel_dispatch != INTEL_ALWAYS);

   brw_reg coverage_mask =
      brw_fetch_payload_reg(bld, s.fs_payload().sample_mask_in_reg, BRW_TYPE_UD);

   if (wm_prog_data->persample_dispatch == INTEL_NEVER)
      return coverage_mask;

   /* gl_SampleMaskIn[] comes from two sources: the input coverage mask,
    * and a mask representing which sample is being processed by the
    * current shader invocation.
    *
    * From the OES_sample_variables specification:
    * "When per-sample shading is active due to the use of a fragment input
    *  qualified by "sample" or due to the use of the gl_SampleID or
    *  gl_SamplePosition variables, only the bit for the current sample is
    *  set in gl_SampleMaskIn."
    */
   const brw_builder abld = bld.annotate("compute gl_SampleMaskIn");

   if (ntb.system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
      ntb.system_values[SYSTEM_VALUE_SAMPLE_ID] = emit_sampleid_setup(ntb);

   brw_reg current_sample_bit =
      abld.SHL(brw_imm_ud(1), ntb.system_values[SYSTEM_VALUE_SAMPLE_ID]);
   brw_reg mask = abld.AND(current_sample_bit, coverage_mask);

   if (wm_prog_data->persample_dispatch == INTEL_ALWAYS)
      return mask;

   /* Per-sample dispatch is dynamic: select between the narrowed mask and
    * the full coverage mask at runtime.
    */
   brw_check_dynamic_msaa_flag(abld, wm_prog_data,
                               INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH);
   set_predicate(BRW_PREDICATE_NORMAL, abld.SEL(mask, mask, coverage_mask));

   return mask;
}
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
/* Compute the SPIR-V fragment shading rate enum from the payload's actual
 * coarse pixel shading size.
 */
static brw_reg
emit_shading_rate_setup(nir_to_brw_state &ntb)
{
   const intel_device_info *devinfo = ntb.devinfo;
   const brw_builder &bld = ntb.bld;

   assert(devinfo->ver >= 11);

   struct brw_wm_prog_data *wm_prog_data =
      brw_wm_prog_data(bld.shader->prog_data);

   /* Coarse pixel shading size fields overlap with other fields of not in
    * coarse pixel dispatch mode, so report 0 when that's not the case.
    */
   if (wm_prog_data->coarse_pixel_dispatch == INTEL_NEVER)
      return brw_imm_ud(0);

   const brw_builder abld = bld.annotate("compute fragment shading rate");

   /* The shading rates provided in the shader are the actual 2D shading
    * rate while the SPIR-V built-in is the enum value that has the shading
    * rate encoded as a bitfield.  Fortunately, the bitfield value is just
    * the shading rate divided by two and shifted.
    */

   /* r1.0 - 0:7 ActualCoarsePixelShadingSize.X */
   brw_reg actual_x = brw_reg(retype(brw_vec1_grf(1, 0), BRW_TYPE_UB));
   /* r1.0 - 15:8 ActualCoarsePixelShadingSize.Y */
   brw_reg actual_y = byte_offset(actual_x, 1);

   brw_reg rate_y = abld.SHR(actual_y, brw_imm_ud(1));
   brw_reg rate_x = abld.SHR(actual_x, brw_imm_ud(1));

   brw_reg rate = abld.OR(abld.SHL(rate_x, brw_imm_ud(2)), rate_y);

   if (wm_prog_data->coarse_pixel_dispatch == INTEL_ALWAYS)
      return rate;

   /* Coarse dispatch is dynamic: report 0 when it is off at runtime. */
   brw_check_dynamic_msaa_flag(abld, wm_prog_data,
                               INTEL_MSAA_FLAG_COARSE_RT_WRITES);
   set_predicate(BRW_PREDICATE_NORMAL, abld.SEL(rate, rate, brw_imm_ud(0)));

   return rate;
}
|
|
|
|
|
|
2024-07-12 23:36:49 -07:00
|
|
|
/* Input data is organized with first the per-primitive values, followed
|
|
|
|
|
* by per-vertex values. The per-vertex will have interpolation information
|
|
|
|
|
* associated, so use 4 components for each value.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
/* The register location here is relative to the start of the URB
|
|
|
|
|
* data. It will get adjusted to be a real location before
|
|
|
|
|
* generate_code() time.
|
|
|
|
|
*/
|
|
|
|
|
static brw_reg
|
2024-12-29 15:41:04 -08:00
|
|
|
brw_interp_reg(const brw_builder &bld, unsigned location,
|
2024-07-12 23:36:49 -07:00
|
|
|
unsigned channel, unsigned comp)
|
|
|
|
|
{
|
2024-12-07 10:25:45 -08:00
|
|
|
brw_shader &s = *bld.shader;
|
2024-07-12 23:36:49 -07:00
|
|
|
assert(s.stage == MESA_SHADER_FRAGMENT);
|
2025-03-10 23:18:30 +02:00
|
|
|
assert((BITFIELD64_BIT(location) & ~s.nir->info.per_primitive_inputs) ||
|
|
|
|
|
location == VARYING_SLOT_PRIMITIVE_ID);
|
2024-07-12 23:36:49 -07:00
|
|
|
|
|
|
|
|
const struct brw_wm_prog_data *prog_data = brw_wm_prog_data(s.prog_data);
|
|
|
|
|
|
|
|
|
|
assert(prog_data->urb_setup[location] >= 0);
|
|
|
|
|
unsigned nr = prog_data->urb_setup[location];
|
|
|
|
|
channel += prog_data->urb_setup_channel[location];
|
|
|
|
|
|
|
|
|
|
const unsigned per_vertex_start = prog_data->num_per_primitive_inputs;
|
|
|
|
|
const unsigned regnr = per_vertex_start + (nr * 4) + channel;
|
|
|
|
|
|
|
|
|
|
if (s.max_polygons > 1) {
|
|
|
|
|
/* In multipolygon dispatch each plane parameter is a
|
|
|
|
|
* dispatch_width-wide SIMD vector (see comment in
|
|
|
|
|
* assign_urb_setup()), so we need to use offset() instead of
|
|
|
|
|
* component() to select the specified parameter.
|
|
|
|
|
*/
|
|
|
|
|
const brw_reg tmp = bld.vgrf(BRW_TYPE_UD);
|
|
|
|
|
bld.MOV(tmp, offset(brw_attr_reg(regnr, BRW_TYPE_UD),
|
|
|
|
|
s.dispatch_width, comp));
|
|
|
|
|
return retype(tmp, BRW_TYPE_F);
|
|
|
|
|
} else {
|
|
|
|
|
return component(brw_attr_reg(regnr, BRW_TYPE_F), comp);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* The register location here is relative to the start of the URB
|
|
|
|
|
* data. It will get adjusted to be a real location before
|
|
|
|
|
* generate_code() time.
|
|
|
|
|
*/
|
|
|
|
|
static brw_reg
|
2024-12-29 15:41:04 -08:00
|
|
|
brw_per_primitive_reg(const brw_builder &bld, int location, unsigned comp)
|
2024-07-12 23:36:49 -07:00
|
|
|
{
|
2024-12-07 10:25:45 -08:00
|
|
|
brw_shader &s = *bld.shader;
|
2024-07-12 23:36:49 -07:00
|
|
|
assert(s.stage == MESA_SHADER_FRAGMENT);
|
2025-03-18 20:56:02 +02:00
|
|
|
assert((BITFIELD64_BIT(location) & s.nir->info.per_primitive_inputs) ||
|
|
|
|
|
location == VARYING_SLOT_PRIMITIVE_ID);
|
2024-07-12 23:36:49 -07:00
|
|
|
|
|
|
|
|
const struct brw_wm_prog_data *prog_data = brw_wm_prog_data(s.prog_data);
|
|
|
|
|
|
2025-03-10 23:18:30 +02:00
|
|
|
comp += (s.fs.per_primitive_offsets[location] % 16) / 4;
|
2024-07-12 23:36:49 -07:00
|
|
|
|
2025-03-10 23:18:30 +02:00
|
|
|
const unsigned regnr = s.fs.per_primitive_offsets[location] / 16 + comp / 4;
|
2024-07-12 23:36:49 -07:00
|
|
|
|
2025-03-10 23:18:30 +02:00
|
|
|
assert(s.fs.per_primitive_offsets[location] >= 0);
|
2024-07-12 23:36:49 -07:00
|
|
|
assert(regnr < prog_data->num_per_primitive_inputs);
|
|
|
|
|
|
2025-03-10 23:18:30 +02:00
|
|
|
brw_reg loc_reg = brw_attr_reg(regnr, BRW_TYPE_UD);
|
|
|
|
|
|
2024-07-12 23:36:49 -07:00
|
|
|
if (s.max_polygons > 1) {
|
|
|
|
|
/* In multipolygon dispatch each primitive constant is a
|
|
|
|
|
* dispatch_width-wide SIMD vector (see comment in
|
|
|
|
|
* assign_urb_setup()), so we need to use offset() instead of
|
|
|
|
|
* component() to select the specified parameter.
|
|
|
|
|
*/
|
|
|
|
|
const brw_reg tmp = bld.vgrf(BRW_TYPE_UD);
|
2025-03-10 23:18:30 +02:00
|
|
|
bld.MOV(tmp, offset(loc_reg, s.dispatch_width, comp % 4));
|
2024-07-12 23:36:49 -07:00
|
|
|
return retype(tmp, BRW_TYPE_F);
|
|
|
|
|
} else {
|
2025-03-10 23:18:30 +02:00
|
|
|
return component(loc_reg, comp % 4);
|
2024-07-12 23:36:49 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-20 12:13:47 -08:00
|
|
|
static void
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_fs_intrinsic(nir_to_brw_state &ntb,
|
2023-11-20 12:13:47 -08:00
|
|
|
nir_intrinsic_instr *instr)
|
2015-11-04 23:05:07 -08:00
|
|
|
{
|
2023-12-05 15:27:29 -08:00
|
|
|
const intel_device_info *devinfo = ntb.devinfo;
|
2024-12-29 15:41:04 -08:00
|
|
|
const brw_builder &bld = ntb.bld;
|
2024-12-07 10:25:45 -08:00
|
|
|
brw_shader &s = ntb.s;
|
2023-11-20 12:13:47 -08:00
|
|
|
|
2023-12-05 17:16:34 -08:00
|
|
|
assert(s.stage == MESA_SHADER_FRAGMENT);
|
2015-11-04 23:05:07 -08:00
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg dest;
|
2015-11-04 23:05:07 -08:00
|
|
|
if (nir_intrinsic_infos[instr->intrinsic].has_dest)
|
2023-11-20 21:21:54 -08:00
|
|
|
dest = get_nir_def(ntb, instr->def);
|
2014-12-04 12:27:29 -08:00
|
|
|
|
2014-08-15 10:32:07 -07:00
|
|
|
switch (instr->intrinsic) {
|
2015-11-04 23:05:07 -08:00
|
|
|
case nir_intrinsic_load_front_face:
|
2024-04-20 17:30:23 -07:00
|
|
|
bld.MOV(retype(dest, BRW_TYPE_D), emit_frontfacing_interpolation(ntb));
|
2015-11-04 23:05:07 -08:00
|
|
|
break;
|
|
|
|
|
|
2021-12-02 14:16:02 -06:00
|
|
|
case nir_intrinsic_load_sample_pos:
|
|
|
|
|
case nir_intrinsic_load_sample_pos_or_center: {
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg sample_pos = ntb.system_values[SYSTEM_VALUE_SAMPLE_POS];
|
2015-11-04 23:05:07 -08:00
|
|
|
assert(sample_pos.file != BAD_FILE);
|
|
|
|
|
dest.type = sample_pos.type;
|
|
|
|
|
bld.MOV(dest, sample_pos);
|
|
|
|
|
bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1));
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2016-11-15 15:18:32 -08:00
|
|
|
case nir_intrinsic_load_layer_id:
|
2024-04-20 17:08:02 -07:00
|
|
|
dest.type = BRW_TYPE_UD;
|
2016-11-15 15:18:32 -08:00
|
|
|
bld.MOV(dest, fetch_render_target_array_index(bld));
|
|
|
|
|
break;
|
|
|
|
|
|
2022-03-15 17:04:04 -07:00
|
|
|
case nir_intrinsic_is_helper_invocation:
|
2023-11-20 22:00:28 -08:00
|
|
|
emit_is_helper_invocation(ntb, dest);
|
2019-06-07 23:06:27 -07:00
|
|
|
break;
|
|
|
|
|
|
2015-11-13 17:51:12 -08:00
|
|
|
case nir_intrinsic_load_helper_invocation:
|
2015-11-04 23:05:07 -08:00
|
|
|
case nir_intrinsic_load_sample_mask_in:
|
2020-10-29 15:19:30 +02:00
|
|
|
case nir_intrinsic_load_sample_id:
|
|
|
|
|
case nir_intrinsic_load_frag_shading_rate: {
|
2015-11-04 23:05:07 -08:00
|
|
|
gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg val = ntb.system_values[sv];
|
2015-11-04 23:05:07 -08:00
|
|
|
assert(val.file != BAD_FILE);
|
|
|
|
|
dest.type = val.type;
|
|
|
|
|
bld.MOV(dest, val);
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2016-07-21 21:25:46 -07:00
|
|
|
case nir_intrinsic_store_output: {
|
2024-02-12 08:43:34 -08:00
|
|
|
const brw_reg src = get_nir_src(ntb, instr->src[0], -1);
|
2018-10-20 09:55:28 -05:00
|
|
|
const unsigned store_offset = nir_src_as_uint(instr->src[1]);
|
2016-07-21 21:26:20 -07:00
|
|
|
const unsigned location = nir_intrinsic_base(instr) +
|
2018-10-20 09:55:28 -05:00
|
|
|
SET_FIELD(store_offset, BRW_NIR_FRAG_OUTPUT_LOCATION);
|
2024-08-15 23:04:23 -07:00
|
|
|
const brw_reg new_dest =
|
|
|
|
|
offset(retype(alloc_frag_output(ntb, location), src.type),
|
|
|
|
|
bld, nir_intrinsic_component(instr));
|
2016-07-21 21:25:46 -07:00
|
|
|
|
2024-08-15 23:04:23 -07:00
|
|
|
brw_combine_with_vec(bld, new_dest, src, instr->num_components);
|
2016-07-21 21:25:46 -07:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2016-07-21 21:57:00 -07:00
|
|
|
case nir_intrinsic_load_output: {
|
|
|
|
|
const unsigned l = GET_FIELD(nir_intrinsic_base(instr),
|
|
|
|
|
BRW_NIR_FRAG_OUTPUT_LOCATION);
|
|
|
|
|
assert(l >= FRAG_RESULT_DATA0);
|
2018-10-20 09:55:28 -05:00
|
|
|
const unsigned load_offset = nir_src_as_uint(instr->src[0]);
|
|
|
|
|
const unsigned target = l - FRAG_RESULT_DATA0 + load_offset;
|
2024-06-18 23:42:59 -07:00
|
|
|
const brw_reg tmp = bld.vgrf(dest.type, 4);
|
2016-07-21 21:57:00 -07:00
|
|
|
|
2025-09-02 13:39:29 +03:00
|
|
|
assert(reinterpret_cast<const brw_wm_prog_key *>(s.key)->coherent_fb_fetch);
|
|
|
|
|
emit_coherent_fb_read(bld, tmp, target);
|
2016-07-21 21:57:00 -07:00
|
|
|
|
2024-08-15 23:04:23 -07:00
|
|
|
brw_combine_with_vec(bld, dest,
|
|
|
|
|
offset(tmp, bld, nir_intrinsic_component(instr)),
|
|
|
|
|
instr->num_components);
|
2016-07-21 21:57:00 -07:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2019-06-07 23:06:27 -07:00
|
|
|
case nir_intrinsic_demote:
|
2020-05-08 09:08:55 -07:00
|
|
|
case nir_intrinsic_terminate:
|
2019-07-18 13:39:49 +02:00
|
|
|
case nir_intrinsic_demote_if:
|
2020-05-08 09:08:55 -07:00
|
|
|
case nir_intrinsic_terminate_if: {
|
2020-01-04 15:48:07 -08:00
|
|
|
/* We track our discarded pixels in f0.1/f1.0. By predicating on it, we
|
|
|
|
|
* can update just the flag bits that aren't yet discarded. If there's
|
|
|
|
|
* no condition, we emit a CMP of g0 != g0, so all currently executing
|
2014-08-19 15:22:43 -07:00
|
|
|
* channels will get turned off.
|
2014-08-15 10:32:07 -07:00
|
|
|
*/
|
2024-12-07 00:23:07 -08:00
|
|
|
brw_inst *cmp = NULL;
|
2019-07-18 13:39:49 +02:00
|
|
|
if (instr->intrinsic == nir_intrinsic_demote_if ||
|
2020-05-08 09:08:55 -07:00
|
|
|
instr->intrinsic == nir_intrinsic_terminate_if) {
|
intel/fs: Improve discard_if code generation
Previously we would blindly emit an sequence like:
mov(1) f0.1<1>UW g1.14<0,1,0>UW
...
cmp.l.f0(16) g7<1>F g5<8,8,1>F 0x41700000F /* 15F */
(+f0.1) cmp.z.f0.1(16) null<1>D g7<8,8,1>D 0D
The first move sets the flags based on the initial execution mask.
Later discard sequences contain a predicated compare that can only
remove more SIMD channels. Often times the only user of the result from
the first compare is the second compare. Instead, generate a sequence
like
mov(1) f0.1<1>UW g1.14<0,1,0>UW
...
cmp.l.f0(16) g7<1>F g5<8,8,1>F 0x41700000F /* 15F */
(+f0.1) cmp.ge.f0.1(8) null<1>F g5<8,8,1>F 0x41700000F /* 15F */
If the results stored in g7 and f0.0 are not used, the comparison will
be eliminated. This removes an instruction and potentially reduces
register pressure.
v2: Major re-write of the commit message (including fixing the assembly
code). Suggested by Matt.
All Gen8+ platforms had similar results. (Ice Lake shown)
total instructions in shared programs: 17224434 -> 17198659 (-0.15%)
instructions in affected programs: 2908125 -> 2882350 (-0.89%)
helped: 18891
HURT: 5
helped stats (abs) min: 1 max: 12 x̄: 1.38 x̃: 1
helped stats (rel) min: 0.03% max: 25.00% x̄: 1.76% x̃: 1.02%
HURT stats (abs) min: 9 max: 105 x̄: 51.40 x̃: 35
HURT stats (rel) min: 0.43% max: 4.92% x̄: 2.34% x̃: 1.56%
95% mean confidence interval for instructions value: -1.39 -1.34
95% mean confidence interval for instructions %-change: -1.79% -1.73%
Instructions are helped.
total cycles in shared programs: 361468458 -> 361170679 (-0.08%)
cycles in affected programs: 38470116 -> 38172337 (-0.77%)
helped: 16202
HURT: 1456
helped stats (abs) min: 1 max: 4473 x̄: 26.24 x̃: 18
helped stats (rel) min: <.01% max: 28.44% x̄: 2.90% x̃: 2.18%
HURT stats (abs) min: 1 max: 5982 x̄: 87.51 x̃: 28
HURT stats (rel) min: <.01% max: 51.29% x̄: 5.48% x̃: 1.64%
95% mean confidence interval for cycles value: -18.24 -15.49
95% mean confidence interval for cycles %-change: -2.26% -2.14%
Cycles are helped.
total spills in shared programs: 12147 -> 12176 (0.24%)
spills in affected programs: 175 -> 204 (16.57%)
helped: 8
HURT: 5
total fills in shared programs: 25262 -> 25292 (0.12%)
fills in affected programs: 269 -> 299 (11.15%)
helped: 8
HURT: 5
Haswell
total instructions in shared programs: 13530316 -> 13502647 (-0.20%)
instructions in affected programs: 2507824 -> 2480155 (-1.10%)
helped: 18859
HURT: 10
helped stats (abs) min: 1 max: 12 x̄: 1.48 x̃: 1
helped stats (rel) min: 0.03% max: 27.78% x̄: 2.38% x̃: 1.41%
HURT stats (abs) min: 5 max: 39 x̄: 25.70 x̃: 31
HURT stats (rel) min: 0.22% max: 1.66% x̄: 1.09% x̃: 1.31%
95% mean confidence interval for instructions value: -1.49 -1.44
95% mean confidence interval for instructions %-change: -2.42% -2.34%
Instructions are helped.
total cycles in shared programs: 377865412 -> 377639034 (-0.06%)
cycles in affected programs: 40169572 -> 39943194 (-0.56%)
helped: 15550
HURT: 1938
helped stats (abs) min: 1 max: 2482 x̄: 25.67 x̃: 18
helped stats (rel) min: <.01% max: 37.77% x̄: 3.00% x̃: 2.25%
HURT stats (abs) min: 1 max: 4862 x̄: 89.17 x̃: 35
HURT stats (rel) min: <.01% max: 67.67% x̄: 6.16% x̃: 2.75%
95% mean confidence interval for cycles value: -14.42 -11.47
95% mean confidence interval for cycles %-change: -2.05% -1.91%
Cycles are helped.
total spills in shared programs: 26769 -> 26814 (0.17%)
spills in affected programs: 826 -> 871 (5.45%)
helped: 9
HURT: 10
total fills in shared programs: 38383 -> 38425 (0.11%)
fills in affected programs: 834 -> 876 (5.04%)
helped: 9
HURT: 10
LOST: 5
GAINED: 10
Ivy Bridge
total instructions in shared programs: 12079250 -> 12044139 (-0.29%)
instructions in affected programs: 2409680 -> 2374569 (-1.46%)
helped: 16135
HURT: 0
helped stats (abs) min: 1 max: 23 x̄: 2.18 x̃: 2
helped stats (rel) min: 0.07% max: 37.50% x̄: 2.72% x̃: 1.68%
95% mean confidence interval for instructions value: -2.21 -2.14
95% mean confidence interval for instructions %-change: -2.76% -2.67%
Instructions are helped.
total cycles in shared programs: 180116747 -> 179900405 (-0.12%)
cycles in affected programs: 25439823 -> 25223481 (-0.85%)
helped: 13817
HURT: 1499
helped stats (abs) min: 1 max: 1886 x̄: 26.40 x̃: 18
helped stats (rel) min: <.01% max: 38.84% x̄: 2.57% x̃: 1.97%
HURT stats (abs) min: 1 max: 3684 x̄: 98.99 x̃: 52
HURT stats (rel) min: <.01% max: 97.01% x̄: 6.37% x̃: 3.42%
95% mean confidence interval for cycles value: -15.68 -12.57
95% mean confidence interval for cycles %-change: -1.77% -1.63%
Cycles are helped.
LOST: 8
GAINED: 10
Sandy Bridge
total instructions in shared programs: 10878990 -> 10863659 (-0.14%)
instructions in affected programs: 1806702 -> 1791371 (-0.85%)
helped: 13023
HURT: 0
helped stats (abs) min: 1 max: 5 x̄: 1.18 x̃: 1
helped stats (rel) min: 0.07% max: 13.79% x̄: 1.65% x̃: 1.10%
95% mean confidence interval for instructions value: -1.18 -1.17
95% mean confidence interval for instructions %-change: -1.68% -1.62%
Instructions are helped.
total cycles in shared programs: 154082878 -> 153862810 (-0.14%)
cycles in affected programs: 20199374 -> 19979306 (-1.09%)
helped: 12048
HURT: 510
helped stats (abs) min: 1 max: 323 x̄: 20.57 x̃: 18
helped stats (rel) min: 0.03% max: 17.78% x̄: 2.05% x̃: 1.52%
HURT stats (abs) min: 1 max: 448 x̄: 54.39 x̃: 16
HURT stats (rel) min: 0.02% max: 37.98% x̄: 4.13% x̃: 1.17%
95% mean confidence interval for cycles value: -17.97 -17.08
95% mean confidence interval for cycles %-change: -1.84% -1.75%
Cycles are helped.
LOST: 1
GAINED: 0
Iron Lake
total instructions in shared programs: 8155075 -> 8142729 (-0.15%)
instructions in affected programs: 949495 -> 937149 (-1.30%)
helped: 5810
HURT: 0
helped stats (abs) min: 1 max: 8 x̄: 2.12 x̃: 2
helped stats (rel) min: 0.10% max: 16.67% x̄: 2.53% x̃: 1.85%
95% mean confidence interval for instructions value: -2.14 -2.11
95% mean confidence interval for instructions %-change: -2.59% -2.48%
Instructions are helped.
total cycles in shared programs: 188584610 -> 188549632 (-0.02%)
cycles in affected programs: 17274446 -> 17239468 (-0.20%)
helped: 3881
HURT: 90
helped stats (abs) min: 2 max: 168 x̄: 9.08 x̃: 6
helped stats (rel) min: <.01% max: 23.53% x̄: 0.83% x̃: 0.30%
HURT stats (abs) min: 2 max: 10 x̄: 2.80 x̃: 2
HURT stats (rel) min: <.01% max: 0.60% x̄: 0.10% x̃: 0.07%
95% mean confidence interval for cycles value: -9.35 -8.27
95% mean confidence interval for cycles %-change: -0.85% -0.77%
Cycles are helped.
GM45
total instructions in shared programs: 5019308 -> 5013119 (-0.12%)
instructions in affected programs: 489028 -> 482839 (-1.27%)
helped: 2912
HURT: 0
helped stats (abs) min: 1 max: 8 x̄: 2.13 x̃: 2
helped stats (rel) min: 0.10% max: 16.67% x̄: 2.46% x̃: 1.81%
95% mean confidence interval for instructions value: -2.14 -2.11
95% mean confidence interval for instructions %-change: -2.54% -2.39%
Instructions are helped.
total cycles in shared programs: 129002592 -> 128977804 (-0.02%)
cycles in affected programs: 12669152 -> 12644364 (-0.20%)
helped: 2759
HURT: 37
helped stats (abs) min: 2 max: 168 x̄: 9.03 x̃: 4
helped stats (rel) min: <.01% max: 21.43% x̄: 0.75% x̃: 0.31%
HURT stats (abs) min: 2 max: 10 x̄: 3.62 x̃: 4
HURT stats (rel) min: <.01% max: 0.41% x̄: 0.10% x̃: 0.04%
95% mean confidence interval for cycles value: -9.53 -8.20
95% mean confidence interval for cycles %-change: -0.79% -0.70%
Cycles are helped.
Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
2019-05-20 17:25:01 -07:00
|
|
|
nir_alu_instr *alu = nir_src_as_alu_instr(instr->src[0]);
|
|
|
|
|
|
|
|
|
|
if (alu != NULL &&
|
2024-02-15 02:51:39 -08:00
|
|
|
alu->op != nir_op_bcsel) {
|
intel/fs: Improve discard_if code generation
Previously we would blindly emit an sequence like:
mov(1) f0.1<1>UW g1.14<0,1,0>UW
...
cmp.l.f0(16) g7<1>F g5<8,8,1>F 0x41700000F /* 15F */
(+f0.1) cmp.z.f0.1(16) null<1>D g7<8,8,1>D 0D
The first move sets the flags based on the initial execution mask.
Later discard sequences contain a predicated compare that can only
remove more SIMD channels. Often times the only user of the result from
the first compare is the second compare. Instead, generate a sequence
like
mov(1) f0.1<1>UW g1.14<0,1,0>UW
...
cmp.l.f0(16) g7<1>F g5<8,8,1>F 0x41700000F /* 15F */
(+f0.1) cmp.ge.f0.1(8) null<1>F g5<8,8,1>F 0x41700000F /* 15F */
If the results stored in g7 and f0.0 are not used, the comparison will
be eliminated. This removes an instruction and potentially reduces
register pressure.
v2: Major re-write of the commit message (including fixing the assembly
code). Suggested by Matt.
All Gen8+ platforms had similar results. (Ice Lake shown)
total instructions in shared programs: 17224434 -> 17198659 (-0.15%)
instructions in affected programs: 2908125 -> 2882350 (-0.89%)
helped: 18891
HURT: 5
helped stats (abs) min: 1 max: 12 x̄: 1.38 x̃: 1
helped stats (rel) min: 0.03% max: 25.00% x̄: 1.76% x̃: 1.02%
HURT stats (abs) min: 9 max: 105 x̄: 51.40 x̃: 35
HURT stats (rel) min: 0.43% max: 4.92% x̄: 2.34% x̃: 1.56%
95% mean confidence interval for instructions value: -1.39 -1.34
95% mean confidence interval for instructions %-change: -1.79% -1.73%
Instructions are helped.
total cycles in shared programs: 361468458 -> 361170679 (-0.08%)
cycles in affected programs: 38470116 -> 38172337 (-0.77%)
helped: 16202
HURT: 1456
helped stats (abs) min: 1 max: 4473 x̄: 26.24 x̃: 18
helped stats (rel) min: <.01% max: 28.44% x̄: 2.90% x̃: 2.18%
HURT stats (abs) min: 1 max: 5982 x̄: 87.51 x̃: 28
HURT stats (rel) min: <.01% max: 51.29% x̄: 5.48% x̃: 1.64%
95% mean confidence interval for cycles value: -18.24 -15.49
95% mean confidence interval for cycles %-change: -2.26% -2.14%
Cycles are helped.
total spills in shared programs: 12147 -> 12176 (0.24%)
spills in affected programs: 175 -> 204 (16.57%)
helped: 8
HURT: 5
total fills in shared programs: 25262 -> 25292 (0.12%)
fills in affected programs: 269 -> 299 (11.15%)
helped: 8
HURT: 5
Haswell
total instructions in shared programs: 13530316 -> 13502647 (-0.20%)
instructions in affected programs: 2507824 -> 2480155 (-1.10%)
helped: 18859
HURT: 10
helped stats (abs) min: 1 max: 12 x̄: 1.48 x̃: 1
helped stats (rel) min: 0.03% max: 27.78% x̄: 2.38% x̃: 1.41%
HURT stats (abs) min: 5 max: 39 x̄: 25.70 x̃: 31
HURT stats (rel) min: 0.22% max: 1.66% x̄: 1.09% x̃: 1.31%
95% mean confidence interval for instructions value: -1.49 -1.44
95% mean confidence interval for instructions %-change: -2.42% -2.34%
Instructions are helped.
total cycles in shared programs: 377865412 -> 377639034 (-0.06%)
cycles in affected programs: 40169572 -> 39943194 (-0.56%)
helped: 15550
HURT: 1938
helped stats (abs) min: 1 max: 2482 x̄: 25.67 x̃: 18
helped stats (rel) min: <.01% max: 37.77% x̄: 3.00% x̃: 2.25%
HURT stats (abs) min: 1 max: 4862 x̄: 89.17 x̃: 35
HURT stats (rel) min: <.01% max: 67.67% x̄: 6.16% x̃: 2.75%
95% mean confidence interval for cycles value: -14.42 -11.47
95% mean confidence interval for cycles %-change: -2.05% -1.91%
Cycles are helped.
total spills in shared programs: 26769 -> 26814 (0.17%)
spills in affected programs: 826 -> 871 (5.45%)
helped: 9
HURT: 10
total fills in shared programs: 38383 -> 38425 (0.11%)
fills in affected programs: 834 -> 876 (5.04%)
helped: 9
HURT: 10
LOST: 5
GAINED: 10
Ivy Bridge
total instructions in shared programs: 12079250 -> 12044139 (-0.29%)
instructions in affected programs: 2409680 -> 2374569 (-1.46%)
helped: 16135
HURT: 0
helped stats (abs) min: 1 max: 23 x̄: 2.18 x̃: 2
helped stats (rel) min: 0.07% max: 37.50% x̄: 2.72% x̃: 1.68%
95% mean confidence interval for instructions value: -2.21 -2.14
95% mean confidence interval for instructions %-change: -2.76% -2.67%
Instructions are helped.
total cycles in shared programs: 180116747 -> 179900405 (-0.12%)
cycles in affected programs: 25439823 -> 25223481 (-0.85%)
helped: 13817
HURT: 1499
helped stats (abs) min: 1 max: 1886 x̄: 26.40 x̃: 18
helped stats (rel) min: <.01% max: 38.84% x̄: 2.57% x̃: 1.97%
HURT stats (abs) min: 1 max: 3684 x̄: 98.99 x̃: 52
HURT stats (rel) min: <.01% max: 97.01% x̄: 6.37% x̃: 3.42%
95% mean confidence interval for cycles value: -15.68 -12.57
95% mean confidence interval for cycles %-change: -1.77% -1.63%
Cycles are helped.
LOST: 8
GAINED: 10
Sandy Bridge
total instructions in shared programs: 10878990 -> 10863659 (-0.14%)
instructions in affected programs: 1806702 -> 1791371 (-0.85%)
helped: 13023
HURT: 0
helped stats (abs) min: 1 max: 5 x̄: 1.18 x̃: 1
helped stats (rel) min: 0.07% max: 13.79% x̄: 1.65% x̃: 1.10%
95% mean confidence interval for instructions value: -1.18 -1.17
95% mean confidence interval for instructions %-change: -1.68% -1.62%
Instructions are helped.
total cycles in shared programs: 154082878 -> 153862810 (-0.14%)
cycles in affected programs: 20199374 -> 19979306 (-1.09%)
helped: 12048
HURT: 510
helped stats (abs) min: 1 max: 323 x̄: 20.57 x̃: 18
helped stats (rel) min: 0.03% max: 17.78% x̄: 2.05% x̃: 1.52%
HURT stats (abs) min: 1 max: 448 x̄: 54.39 x̃: 16
HURT stats (rel) min: 0.02% max: 37.98% x̄: 4.13% x̃: 1.17%
95% mean confidence interval for cycles value: -17.97 -17.08
95% mean confidence interval for cycles %-change: -1.84% -1.75%
Cycles are helped.
LOST: 1
GAINED: 0
Iron Lake
total instructions in shared programs: 8155075 -> 8142729 (-0.15%)
instructions in affected programs: 949495 -> 937149 (-1.30%)
helped: 5810
HURT: 0
helped stats (abs) min: 1 max: 8 x̄: 2.12 x̃: 2
helped stats (rel) min: 0.10% max: 16.67% x̄: 2.53% x̃: 1.85%
95% mean confidence interval for instructions value: -2.14 -2.11
95% mean confidence interval for instructions %-change: -2.59% -2.48%
Instructions are helped.
total cycles in shared programs: 188584610 -> 188549632 (-0.02%)
cycles in affected programs: 17274446 -> 17239468 (-0.20%)
helped: 3881
HURT: 90
helped stats (abs) min: 2 max: 168 x̄: 9.08 x̃: 6
helped stats (rel) min: <.01% max: 23.53% x̄: 0.83% x̃: 0.30%
HURT stats (abs) min: 2 max: 10 x̄: 2.80 x̃: 2
HURT stats (rel) min: <.01% max: 0.60% x̄: 0.10% x̃: 0.07%
95% mean confidence interval for cycles value: -9.35 -8.27
95% mean confidence interval for cycles %-change: -0.85% -0.77%
Cycles are helped.
GM45
total instructions in shared programs: 5019308 -> 5013119 (-0.12%)
instructions in affected programs: 489028 -> 482839 (-1.27%)
helped: 2912
HURT: 0
helped stats (abs) min: 1 max: 8 x̄: 2.13 x̃: 2
helped stats (rel) min: 0.10% max: 16.67% x̄: 2.46% x̃: 1.81%
95% mean confidence interval for instructions value: -2.14 -2.11
95% mean confidence interval for instructions %-change: -2.54% -2.39%
Instructions are helped.
total cycles in shared programs: 129002592 -> 128977804 (-0.02%)
cycles in affected programs: 12669152 -> 12644364 (-0.20%)
helped: 2759
HURT: 37
helped stats (abs) min: 2 max: 168 x̄: 9.03 x̃: 4
helped stats (rel) min: <.01% max: 21.43% x̄: 0.75% x̃: 0.31%
HURT stats (abs) min: 2 max: 10 x̄: 3.62 x̃: 4
HURT stats (rel) min: <.01% max: 0.41% x̄: 0.10% x̃: 0.04%
95% mean confidence interval for cycles value: -9.53 -8.20
95% mean confidence interval for cycles %-change: -0.79% -0.70%
Cycles are helped.
Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
2019-05-20 17:25:01 -07:00
|
|
|
/* Re-emit the instruction that generated the Boolean value, but
|
|
|
|
|
* do not store it. Since this instruction will be conditional,
|
|
|
|
|
* other instructions that want to use the real Boolean value may
|
|
|
|
|
* get garbage. This was a problem for piglit's fs-discard-exit-2
|
|
|
|
|
* test.
|
|
|
|
|
*
|
|
|
|
|
* Ideally we'd detect that the instruction cannot have a
|
|
|
|
|
* conditional modifier before emitting the instructions. Alas,
|
|
|
|
|
* that is nigh impossible. Instead, we're going to assume the
|
|
|
|
|
* instruction (or last instruction) generated can have a
|
|
|
|
|
* conditional modifier. If it cannot, fallback to the old-style
|
|
|
|
|
* compare, and hope dead code elimination will clean up the
|
|
|
|
|
* extra instructions generated.
|
|
|
|
|
*/
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_alu(ntb, alu, false);
|
intel/fs: Improve discard_if code generation
Previously we would blindly emit an sequence like:
mov(1) f0.1<1>UW g1.14<0,1,0>UW
...
cmp.l.f0(16) g7<1>F g5<8,8,1>F 0x41700000F /* 15F */
(+f0.1) cmp.z.f0.1(16) null<1>D g7<8,8,1>D 0D
The first move sets the flags based on the initial execution mask.
Later discard sequences contain a predicated compare that can only
remove more SIMD channels. Often times the only user of the result from
the first compare is the second compare. Instead, generate a sequence
like
mov(1) f0.1<1>UW g1.14<0,1,0>UW
...
cmp.l.f0(16) g7<1>F g5<8,8,1>F 0x41700000F /* 15F */
(+f0.1) cmp.ge.f0.1(8) null<1>F g5<8,8,1>F 0x41700000F /* 15F */
If the results stored in g7 and f0.0 are not used, the comparison will
be eliminated. This removes an instruction and potentially reduces
register pressure.
v2: Major re-write of the commit message (including fixing the assembly
code). Suggested by Matt.
All Gen8+ platforms had similar results. (Ice Lake shown)
total instructions in shared programs: 17224434 -> 17198659 (-0.15%)
instructions in affected programs: 2908125 -> 2882350 (-0.89%)
helped: 18891
HURT: 5
helped stats (abs) min: 1 max: 12 x̄: 1.38 x̃: 1
helped stats (rel) min: 0.03% max: 25.00% x̄: 1.76% x̃: 1.02%
HURT stats (abs) min: 9 max: 105 x̄: 51.40 x̃: 35
HURT stats (rel) min: 0.43% max: 4.92% x̄: 2.34% x̃: 1.56%
95% mean confidence interval for instructions value: -1.39 -1.34
95% mean confidence interval for instructions %-change: -1.79% -1.73%
Instructions are helped.
total cycles in shared programs: 361468458 -> 361170679 (-0.08%)
cycles in affected programs: 38470116 -> 38172337 (-0.77%)
helped: 16202
HURT: 1456
helped stats (abs) min: 1 max: 4473 x̄: 26.24 x̃: 18
helped stats (rel) min: <.01% max: 28.44% x̄: 2.90% x̃: 2.18%
HURT stats (abs) min: 1 max: 5982 x̄: 87.51 x̃: 28
HURT stats (rel) min: <.01% max: 51.29% x̄: 5.48% x̃: 1.64%
95% mean confidence interval for cycles value: -18.24 -15.49
95% mean confidence interval for cycles %-change: -2.26% -2.14%
Cycles are helped.
total spills in shared programs: 12147 -> 12176 (0.24%)
spills in affected programs: 175 -> 204 (16.57%)
helped: 8
HURT: 5
total fills in shared programs: 25262 -> 25292 (0.12%)
fills in affected programs: 269 -> 299 (11.15%)
helped: 8
HURT: 5
Haswell
total instructions in shared programs: 13530316 -> 13502647 (-0.20%)
instructions in affected programs: 2507824 -> 2480155 (-1.10%)
helped: 18859
HURT: 10
helped stats (abs) min: 1 max: 12 x̄: 1.48 x̃: 1
helped stats (rel) min: 0.03% max: 27.78% x̄: 2.38% x̃: 1.41%
HURT stats (abs) min: 5 max: 39 x̄: 25.70 x̃: 31
HURT stats (rel) min: 0.22% max: 1.66% x̄: 1.09% x̃: 1.31%
95% mean confidence interval for instructions value: -1.49 -1.44
95% mean confidence interval for instructions %-change: -2.42% -2.34%
Instructions are helped.
total cycles in shared programs: 377865412 -> 377639034 (-0.06%)
cycles in affected programs: 40169572 -> 39943194 (-0.56%)
helped: 15550
HURT: 1938
helped stats (abs) min: 1 max: 2482 x̄: 25.67 x̃: 18
helped stats (rel) min: <.01% max: 37.77% x̄: 3.00% x̃: 2.25%
HURT stats (abs) min: 1 max: 4862 x̄: 89.17 x̃: 35
HURT stats (rel) min: <.01% max: 67.67% x̄: 6.16% x̃: 2.75%
95% mean confidence interval for cycles value: -14.42 -11.47
95% mean confidence interval for cycles %-change: -2.05% -1.91%
Cycles are helped.
total spills in shared programs: 26769 -> 26814 (0.17%)
spills in affected programs: 826 -> 871 (5.45%)
helped: 9
HURT: 10
total fills in shared programs: 38383 -> 38425 (0.11%)
fills in affected programs: 834 -> 876 (5.04%)
helped: 9
HURT: 10
LOST: 5
GAINED: 10
Ivy Bridge
total instructions in shared programs: 12079250 -> 12044139 (-0.29%)
instructions in affected programs: 2409680 -> 2374569 (-1.46%)
helped: 16135
HURT: 0
helped stats (abs) min: 1 max: 23 x̄: 2.18 x̃: 2
helped stats (rel) min: 0.07% max: 37.50% x̄: 2.72% x̃: 1.68%
95% mean confidence interval for instructions value: -2.21 -2.14
95% mean confidence interval for instructions %-change: -2.76% -2.67%
Instructions are helped.
total cycles in shared programs: 180116747 -> 179900405 (-0.12%)
cycles in affected programs: 25439823 -> 25223481 (-0.85%)
helped: 13817
HURT: 1499
helped stats (abs) min: 1 max: 1886 x̄: 26.40 x̃: 18
helped stats (rel) min: <.01% max: 38.84% x̄: 2.57% x̃: 1.97%
HURT stats (abs) min: 1 max: 3684 x̄: 98.99 x̃: 52
HURT stats (rel) min: <.01% max: 97.01% x̄: 6.37% x̃: 3.42%
95% mean confidence interval for cycles value: -15.68 -12.57
95% mean confidence interval for cycles %-change: -1.77% -1.63%
Cycles are helped.
LOST: 8
GAINED: 10
Sandy Bridge
total instructions in shared programs: 10878990 -> 10863659 (-0.14%)
instructions in affected programs: 1806702 -> 1791371 (-0.85%)
helped: 13023
HURT: 0
helped stats (abs) min: 1 max: 5 x̄: 1.18 x̃: 1
helped stats (rel) min: 0.07% max: 13.79% x̄: 1.65% x̃: 1.10%
95% mean confidence interval for instructions value: -1.18 -1.17
95% mean confidence interval for instructions %-change: -1.68% -1.62%
Instructions are helped.
total cycles in shared programs: 154082878 -> 153862810 (-0.14%)
cycles in affected programs: 20199374 -> 19979306 (-1.09%)
helped: 12048
HURT: 510
helped stats (abs) min: 1 max: 323 x̄: 20.57 x̃: 18
helped stats (rel) min: 0.03% max: 17.78% x̄: 2.05% x̃: 1.52%
HURT stats (abs) min: 1 max: 448 x̄: 54.39 x̃: 16
HURT stats (rel) min: 0.02% max: 37.98% x̄: 4.13% x̃: 1.17%
95% mean confidence interval for cycles value: -17.97 -17.08
95% mean confidence interval for cycles %-change: -1.84% -1.75%
Cycles are helped.
LOST: 1
GAINED: 0
Iron Lake
total instructions in shared programs: 8155075 -> 8142729 (-0.15%)
instructions in affected programs: 949495 -> 937149 (-1.30%)
helped: 5810
HURT: 0
helped stats (abs) min: 1 max: 8 x̄: 2.12 x̃: 2
helped stats (rel) min: 0.10% max: 16.67% x̄: 2.53% x̃: 1.85%
95% mean confidence interval for instructions value: -2.14 -2.11
95% mean confidence interval for instructions %-change: -2.59% -2.48%
Instructions are helped.
total cycles in shared programs: 188584610 -> 188549632 (-0.02%)
cycles in affected programs: 17274446 -> 17239468 (-0.20%)
helped: 3881
HURT: 90
helped stats (abs) min: 2 max: 168 x̄: 9.08 x̃: 6
helped stats (rel) min: <.01% max: 23.53% x̄: 0.83% x̃: 0.30%
HURT stats (abs) min: 2 max: 10 x̄: 2.80 x̃: 2
HURT stats (rel) min: <.01% max: 0.60% x̄: 0.10% x̃: 0.07%
95% mean confidence interval for cycles value: -9.35 -8.27
95% mean confidence interval for cycles %-change: -0.85% -0.77%
Cycles are helped.
GM45
total instructions in shared programs: 5019308 -> 5013119 (-0.12%)
instructions in affected programs: 489028 -> 482839 (-1.27%)
helped: 2912
HURT: 0
helped stats (abs) min: 1 max: 8 x̄: 2.13 x̃: 2
helped stats (rel) min: 0.10% max: 16.67% x̄: 2.46% x̃: 1.81%
95% mean confidence interval for instructions value: -2.14 -2.11
95% mean confidence interval for instructions %-change: -2.54% -2.39%
Instructions are helped.
total cycles in shared programs: 129002592 -> 128977804 (-0.02%)
cycles in affected programs: 12669152 -> 12644364 (-0.20%)
helped: 2759
HURT: 37
helped stats (abs) min: 2 max: 168 x̄: 9.03 x̃: 4
helped stats (rel) min: <.01% max: 21.43% x̄: 0.75% x̃: 0.31%
HURT stats (abs) min: 2 max: 10 x̄: 3.62 x̃: 4
HURT stats (rel) min: <.01% max: 0.41% x̄: 0.10% x̃: 0.04%
95% mean confidence interval for cycles value: -9.53 -8.20
95% mean confidence interval for cycles %-change: -0.79% -0.70%
Cycles are helped.
Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
2019-05-20 17:25:01 -07:00
|
|
|
|
2024-12-07 00:23:07 -08:00
|
|
|
cmp = (brw_inst *) s.instructions.get_tail();
|
intel/fs: Improve discard_if code generation
Previously we would blindly emit an sequence like:
mov(1) f0.1<1>UW g1.14<0,1,0>UW
...
cmp.l.f0(16) g7<1>F g5<8,8,1>F 0x41700000F /* 15F */
(+f0.1) cmp.z.f0.1(16) null<1>D g7<8,8,1>D 0D
The first move sets the flags based on the initial execution mask.
Later discard sequences contain a predicated compare that can only
remove more SIMD channels. Often times the only user of the result from
the first compare is the second compare. Instead, generate a sequence
like
mov(1) f0.1<1>UW g1.14<0,1,0>UW
...
cmp.l.f0(16) g7<1>F g5<8,8,1>F 0x41700000F /* 15F */
(+f0.1) cmp.ge.f0.1(8) null<1>F g5<8,8,1>F 0x41700000F /* 15F */
If the results stored in g7 and f0.0 are not used, the comparison will
be eliminated. This removes an instruction and potentially reduces
register pressure.
v2: Major re-write of the commit message (including fixing the assembly
code). Suggested by Matt.
All Gen8+ platforms had similar results. (Ice Lake shown)
total instructions in shared programs: 17224434 -> 17198659 (-0.15%)
instructions in affected programs: 2908125 -> 2882350 (-0.89%)
helped: 18891
HURT: 5
helped stats (abs) min: 1 max: 12 x̄: 1.38 x̃: 1
helped stats (rel) min: 0.03% max: 25.00% x̄: 1.76% x̃: 1.02%
HURT stats (abs) min: 9 max: 105 x̄: 51.40 x̃: 35
HURT stats (rel) min: 0.43% max: 4.92% x̄: 2.34% x̃: 1.56%
95% mean confidence interval for instructions value: -1.39 -1.34
95% mean confidence interval for instructions %-change: -1.79% -1.73%
Instructions are helped.
total cycles in shared programs: 361468458 -> 361170679 (-0.08%)
cycles in affected programs: 38470116 -> 38172337 (-0.77%)
helped: 16202
HURT: 1456
helped stats (abs) min: 1 max: 4473 x̄: 26.24 x̃: 18
helped stats (rel) min: <.01% max: 28.44% x̄: 2.90% x̃: 2.18%
HURT stats (abs) min: 1 max: 5982 x̄: 87.51 x̃: 28
HURT stats (rel) min: <.01% max: 51.29% x̄: 5.48% x̃: 1.64%
95% mean confidence interval for cycles value: -18.24 -15.49
95% mean confidence interval for cycles %-change: -2.26% -2.14%
Cycles are helped.
total spills in shared programs: 12147 -> 12176 (0.24%)
spills in affected programs: 175 -> 204 (16.57%)
helped: 8
HURT: 5
total fills in shared programs: 25262 -> 25292 (0.12%)
fills in affected programs: 269 -> 299 (11.15%)
helped: 8
HURT: 5
Haswell
total instructions in shared programs: 13530316 -> 13502647 (-0.20%)
instructions in affected programs: 2507824 -> 2480155 (-1.10%)
helped: 18859
HURT: 10
helped stats (abs) min: 1 max: 12 x̄: 1.48 x̃: 1
helped stats (rel) min: 0.03% max: 27.78% x̄: 2.38% x̃: 1.41%
HURT stats (abs) min: 5 max: 39 x̄: 25.70 x̃: 31
HURT stats (rel) min: 0.22% max: 1.66% x̄: 1.09% x̃: 1.31%
95% mean confidence interval for instructions value: -1.49 -1.44
95% mean confidence interval for instructions %-change: -2.42% -2.34%
Instructions are helped.
total cycles in shared programs: 377865412 -> 377639034 (-0.06%)
cycles in affected programs: 40169572 -> 39943194 (-0.56%)
helped: 15550
HURT: 1938
helped stats (abs) min: 1 max: 2482 x̄: 25.67 x̃: 18
helped stats (rel) min: <.01% max: 37.77% x̄: 3.00% x̃: 2.25%
HURT stats (abs) min: 1 max: 4862 x̄: 89.17 x̃: 35
HURT stats (rel) min: <.01% max: 67.67% x̄: 6.16% x̃: 2.75%
95% mean confidence interval for cycles value: -14.42 -11.47
95% mean confidence interval for cycles %-change: -2.05% -1.91%
Cycles are helped.
total spills in shared programs: 26769 -> 26814 (0.17%)
spills in affected programs: 826 -> 871 (5.45%)
helped: 9
HURT: 10
total fills in shared programs: 38383 -> 38425 (0.11%)
fills in affected programs: 834 -> 876 (5.04%)
helped: 9
HURT: 10
LOST: 5
GAINED: 10
Ivy Bridge
total instructions in shared programs: 12079250 -> 12044139 (-0.29%)
instructions in affected programs: 2409680 -> 2374569 (-1.46%)
helped: 16135
HURT: 0
helped stats (abs) min: 1 max: 23 x̄: 2.18 x̃: 2
helped stats (rel) min: 0.07% max: 37.50% x̄: 2.72% x̃: 1.68%
95% mean confidence interval for instructions value: -2.21 -2.14
95% mean confidence interval for instructions %-change: -2.76% -2.67%
Instructions are helped.
total cycles in shared programs: 180116747 -> 179900405 (-0.12%)
cycles in affected programs: 25439823 -> 25223481 (-0.85%)
helped: 13817
HURT: 1499
helped stats (abs) min: 1 max: 1886 x̄: 26.40 x̃: 18
helped stats (rel) min: <.01% max: 38.84% x̄: 2.57% x̃: 1.97%
HURT stats (abs) min: 1 max: 3684 x̄: 98.99 x̃: 52
HURT stats (rel) min: <.01% max: 97.01% x̄: 6.37% x̃: 3.42%
95% mean confidence interval for cycles value: -15.68 -12.57
95% mean confidence interval for cycles %-change: -1.77% -1.63%
Cycles are helped.
LOST: 8
GAINED: 10
Sandy Bridge
total instructions in shared programs: 10878990 -> 10863659 (-0.14%)
instructions in affected programs: 1806702 -> 1791371 (-0.85%)
helped: 13023
HURT: 0
helped stats (abs) min: 1 max: 5 x̄: 1.18 x̃: 1
helped stats (rel) min: 0.07% max: 13.79% x̄: 1.65% x̃: 1.10%
95% mean confidence interval for instructions value: -1.18 -1.17
95% mean confidence interval for instructions %-change: -1.68% -1.62%
Instructions are helped.
total cycles in shared programs: 154082878 -> 153862810 (-0.14%)
cycles in affected programs: 20199374 -> 19979306 (-1.09%)
helped: 12048
HURT: 510
helped stats (abs) min: 1 max: 323 x̄: 20.57 x̃: 18
helped stats (rel) min: 0.03% max: 17.78% x̄: 2.05% x̃: 1.52%
HURT stats (abs) min: 1 max: 448 x̄: 54.39 x̃: 16
HURT stats (rel) min: 0.02% max: 37.98% x̄: 4.13% x̃: 1.17%
95% mean confidence interval for cycles value: -17.97 -17.08
95% mean confidence interval for cycles %-change: -1.84% -1.75%
Cycles are helped.
LOST: 1
GAINED: 0
Iron Lake
total instructions in shared programs: 8155075 -> 8142729 (-0.15%)
instructions in affected programs: 949495 -> 937149 (-1.30%)
helped: 5810
HURT: 0
helped stats (abs) min: 1 max: 8 x̄: 2.12 x̃: 2
helped stats (rel) min: 0.10% max: 16.67% x̄: 2.53% x̃: 1.85%
95% mean confidence interval for instructions value: -2.14 -2.11
95% mean confidence interval for instructions %-change: -2.59% -2.48%
Instructions are helped.
total cycles in shared programs: 188584610 -> 188549632 (-0.02%)
cycles in affected programs: 17274446 -> 17239468 (-0.20%)
helped: 3881
HURT: 90
helped stats (abs) min: 2 max: 168 x̄: 9.08 x̃: 6
helped stats (rel) min: <.01% max: 23.53% x̄: 0.83% x̃: 0.30%
HURT stats (abs) min: 2 max: 10 x̄: 2.80 x̃: 2
HURT stats (rel) min: <.01% max: 0.60% x̄: 0.10% x̃: 0.07%
95% mean confidence interval for cycles value: -9.35 -8.27
95% mean confidence interval for cycles %-change: -0.85% -0.77%
Cycles are helped.
GM45
total instructions in shared programs: 5019308 -> 5013119 (-0.12%)
instructions in affected programs: 489028 -> 482839 (-1.27%)
helped: 2912
HURT: 0
helped stats (abs) min: 1 max: 8 x̄: 2.13 x̃: 2
helped stats (rel) min: 0.10% max: 16.67% x̄: 2.46% x̃: 1.81%
95% mean confidence interval for instructions value: -2.14 -2.11
95% mean confidence interval for instructions %-change: -2.54% -2.39%
Instructions are helped.
total cycles in shared programs: 129002592 -> 128977804 (-0.02%)
cycles in affected programs: 12669152 -> 12644364 (-0.20%)
helped: 2759
HURT: 37
helped stats (abs) min: 2 max: 168 x̄: 9.03 x̃: 4
helped stats (rel) min: <.01% max: 21.43% x̄: 0.75% x̃: 0.31%
HURT stats (abs) min: 2 max: 10 x̄: 3.62 x̃: 4
HURT stats (rel) min: <.01% max: 0.41% x̄: 0.10% x̃: 0.04%
95% mean confidence interval for cycles value: -9.53 -8.20
95% mean confidence interval for cycles %-change: -0.79% -0.70%
Cycles are helped.
Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
2019-05-20 17:25:01 -07:00
|
|
|
if (cmp->conditional_mod == BRW_CONDITIONAL_NONE) {
|
|
|
|
|
if (cmp->can_do_cmod())
|
|
|
|
|
cmp->conditional_mod = BRW_CONDITIONAL_Z;
|
|
|
|
|
else
|
|
|
|
|
cmp = NULL;
|
|
|
|
|
} else {
|
|
|
|
|
/* The old sequence that would have been generated is,
|
|
|
|
|
* basically, bool_result == false. This is equivalent to
|
|
|
|
|
* !bool_result, so negate the old modifier.
|
2024-09-05 13:41:19 +03:00
|
|
|
*
|
|
|
|
|
* Unfortunately, we can't do this to most float comparisons
|
|
|
|
|
* because of NaN, so we'll have to fallback to the old-style
|
|
|
|
|
* compare.
|
|
|
|
|
*
|
|
|
|
|
* For example, this code (after negation):
|
|
|
|
|
* (+f1.0) cmp.ge.f1.0(8) null<1>F g30<8,8,1>F 0x0F
|
|
|
|
|
* will provide different results from this:
|
|
|
|
|
* cmp.l.f0.0(8) g31<1>F g30<1,1,0>F 0x0F
|
|
|
|
|
* (+f1.0) cmp.z.f1.0(8) null<1>D g31<8,8,1>D 0D
|
|
|
|
|
* because both (NaN >= 0) == false and (NaN < 0) == false.
|
|
|
|
|
*
|
|
|
|
|
* It will still work for == and != though, because
|
|
|
|
|
* (NaN == x) == false and (NaN != x) == true.
|
intel/fs: Improve discard_if code generation
Previously we would blindly emit an sequence like:
mov(1) f0.1<1>UW g1.14<0,1,0>UW
...
cmp.l.f0(16) g7<1>F g5<8,8,1>F 0x41700000F /* 15F */
(+f0.1) cmp.z.f0.1(16) null<1>D g7<8,8,1>D 0D
The first move sets the flags based on the initial execution mask.
Later discard sequences contain a predicated compare that can only
remove more SIMD channels. Often times the only user of the result from
the first compare is the second compare. Instead, generate a sequence
like
mov(1) f0.1<1>UW g1.14<0,1,0>UW
...
cmp.l.f0(16) g7<1>F g5<8,8,1>F 0x41700000F /* 15F */
(+f0.1) cmp.ge.f0.1(8) null<1>F g5<8,8,1>F 0x41700000F /* 15F */
If the results stored in g7 and f0.0 are not used, the comparison will
be eliminated. This removes an instruction and potentially reduces
register pressure.
v2: Major re-write of the commit message (including fixing the assembly
code). Suggested by Matt.
All Gen8+ platforms had similar results. (Ice Lake shown)
total instructions in shared programs: 17224434 -> 17198659 (-0.15%)
instructions in affected programs: 2908125 -> 2882350 (-0.89%)
helped: 18891
HURT: 5
helped stats (abs) min: 1 max: 12 x̄: 1.38 x̃: 1
helped stats (rel) min: 0.03% max: 25.00% x̄: 1.76% x̃: 1.02%
HURT stats (abs) min: 9 max: 105 x̄: 51.40 x̃: 35
HURT stats (rel) min: 0.43% max: 4.92% x̄: 2.34% x̃: 1.56%
95% mean confidence interval for instructions value: -1.39 -1.34
95% mean confidence interval for instructions %-change: -1.79% -1.73%
Instructions are helped.
total cycles in shared programs: 361468458 -> 361170679 (-0.08%)
cycles in affected programs: 38470116 -> 38172337 (-0.77%)
helped: 16202
HURT: 1456
helped stats (abs) min: 1 max: 4473 x̄: 26.24 x̃: 18
helped stats (rel) min: <.01% max: 28.44% x̄: 2.90% x̃: 2.18%
HURT stats (abs) min: 1 max: 5982 x̄: 87.51 x̃: 28
HURT stats (rel) min: <.01% max: 51.29% x̄: 5.48% x̃: 1.64%
95% mean confidence interval for cycles value: -18.24 -15.49
95% mean confidence interval for cycles %-change: -2.26% -2.14%
Cycles are helped.
total spills in shared programs: 12147 -> 12176 (0.24%)
spills in affected programs: 175 -> 204 (16.57%)
helped: 8
HURT: 5
total fills in shared programs: 25262 -> 25292 (0.12%)
fills in affected programs: 269 -> 299 (11.15%)
helped: 8
HURT: 5
Haswell
total instructions in shared programs: 13530316 -> 13502647 (-0.20%)
instructions in affected programs: 2507824 -> 2480155 (-1.10%)
helped: 18859
HURT: 10
helped stats (abs) min: 1 max: 12 x̄: 1.48 x̃: 1
helped stats (rel) min: 0.03% max: 27.78% x̄: 2.38% x̃: 1.41%
HURT stats (abs) min: 5 max: 39 x̄: 25.70 x̃: 31
HURT stats (rel) min: 0.22% max: 1.66% x̄: 1.09% x̃: 1.31%
95% mean confidence interval for instructions value: -1.49 -1.44
95% mean confidence interval for instructions %-change: -2.42% -2.34%
Instructions are helped.
total cycles in shared programs: 377865412 -> 377639034 (-0.06%)
cycles in affected programs: 40169572 -> 39943194 (-0.56%)
helped: 15550
HURT: 1938
helped stats (abs) min: 1 max: 2482 x̄: 25.67 x̃: 18
helped stats (rel) min: <.01% max: 37.77% x̄: 3.00% x̃: 2.25%
HURT stats (abs) min: 1 max: 4862 x̄: 89.17 x̃: 35
HURT stats (rel) min: <.01% max: 67.67% x̄: 6.16% x̃: 2.75%
95% mean confidence interval for cycles value: -14.42 -11.47
95% mean confidence interval for cycles %-change: -2.05% -1.91%
Cycles are helped.
total spills in shared programs: 26769 -> 26814 (0.17%)
spills in affected programs: 826 -> 871 (5.45%)
helped: 9
HURT: 10
total fills in shared programs: 38383 -> 38425 (0.11%)
fills in affected programs: 834 -> 876 (5.04%)
helped: 9
HURT: 10
LOST: 5
GAINED: 10
Ivy Bridge
total instructions in shared programs: 12079250 -> 12044139 (-0.29%)
instructions in affected programs: 2409680 -> 2374569 (-1.46%)
helped: 16135
HURT: 0
helped stats (abs) min: 1 max: 23 x̄: 2.18 x̃: 2
helped stats (rel) min: 0.07% max: 37.50% x̄: 2.72% x̃: 1.68%
95% mean confidence interval for instructions value: -2.21 -2.14
95% mean confidence interval for instructions %-change: -2.76% -2.67%
Instructions are helped.
total cycles in shared programs: 180116747 -> 179900405 (-0.12%)
cycles in affected programs: 25439823 -> 25223481 (-0.85%)
helped: 13817
HURT: 1499
helped stats (abs) min: 1 max: 1886 x̄: 26.40 x̃: 18
helped stats (rel) min: <.01% max: 38.84% x̄: 2.57% x̃: 1.97%
HURT stats (abs) min: 1 max: 3684 x̄: 98.99 x̃: 52
HURT stats (rel) min: <.01% max: 97.01% x̄: 6.37% x̃: 3.42%
95% mean confidence interval for cycles value: -15.68 -12.57
95% mean confidence interval for cycles %-change: -1.77% -1.63%
Cycles are helped.
LOST: 8
GAINED: 10
Sandy Bridge
total instructions in shared programs: 10878990 -> 10863659 (-0.14%)
instructions in affected programs: 1806702 -> 1791371 (-0.85%)
helped: 13023
HURT: 0
helped stats (abs) min: 1 max: 5 x̄: 1.18 x̃: 1
helped stats (rel) min: 0.07% max: 13.79% x̄: 1.65% x̃: 1.10%
95% mean confidence interval for instructions value: -1.18 -1.17
95% mean confidence interval for instructions %-change: -1.68% -1.62%
Instructions are helped.
total cycles in shared programs: 154082878 -> 153862810 (-0.14%)
cycles in affected programs: 20199374 -> 19979306 (-1.09%)
helped: 12048
HURT: 510
helped stats (abs) min: 1 max: 323 x̄: 20.57 x̃: 18
helped stats (rel) min: 0.03% max: 17.78% x̄: 2.05% x̃: 1.52%
HURT stats (abs) min: 1 max: 448 x̄: 54.39 x̃: 16
HURT stats (rel) min: 0.02% max: 37.98% x̄: 4.13% x̃: 1.17%
95% mean confidence interval for cycles value: -17.97 -17.08
95% mean confidence interval for cycles %-change: -1.84% -1.75%
Cycles are helped.
LOST: 1
GAINED: 0
Iron Lake
total instructions in shared programs: 8155075 -> 8142729 (-0.15%)
instructions in affected programs: 949495 -> 937149 (-1.30%)
helped: 5810
HURT: 0
helped stats (abs) min: 1 max: 8 x̄: 2.12 x̃: 2
helped stats (rel) min: 0.10% max: 16.67% x̄: 2.53% x̃: 1.85%
95% mean confidence interval for instructions value: -2.14 -2.11
95% mean confidence interval for instructions %-change: -2.59% -2.48%
Instructions are helped.
total cycles in shared programs: 188584610 -> 188549632 (-0.02%)
cycles in affected programs: 17274446 -> 17239468 (-0.20%)
helped: 3881
HURT: 90
helped stats (abs) min: 2 max: 168 x̄: 9.08 x̃: 6
helped stats (rel) min: <.01% max: 23.53% x̄: 0.83% x̃: 0.30%
HURT stats (abs) min: 2 max: 10 x̄: 2.80 x̃: 2
HURT stats (rel) min: <.01% max: 0.60% x̄: 0.10% x̃: 0.07%
95% mean confidence interval for cycles value: -9.35 -8.27
95% mean confidence interval for cycles %-change: -0.85% -0.77%
Cycles are helped.
GM45
total instructions in shared programs: 5019308 -> 5013119 (-0.12%)
instructions in affected programs: 489028 -> 482839 (-1.27%)
helped: 2912
HURT: 0
helped stats (abs) min: 1 max: 8 x̄: 2.13 x̃: 2
helped stats (rel) min: 0.10% max: 16.67% x̄: 2.46% x̃: 1.81%
95% mean confidence interval for instructions value: -2.14 -2.11
95% mean confidence interval for instructions %-change: -2.54% -2.39%
Instructions are helped.
total cycles in shared programs: 129002592 -> 128977804 (-0.02%)
cycles in affected programs: 12669152 -> 12644364 (-0.20%)
helped: 2759
HURT: 37
helped stats (abs) min: 2 max: 168 x̄: 9.03 x̃: 4
helped stats (rel) min: <.01% max: 21.43% x̄: 0.75% x̃: 0.31%
HURT stats (abs) min: 2 max: 10 x̄: 3.62 x̃: 4
HURT stats (rel) min: <.01% max: 0.41% x̄: 0.10% x̃: 0.04%
95% mean confidence interval for cycles value: -9.53 -8.20
95% mean confidence interval for cycles %-change: -0.79% -0.70%
Cycles are helped.
Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
2019-05-20 17:25:01 -07:00
|
|
|
*/
|
2024-09-05 13:41:19 +03:00
|
|
|
if (brw_type_is_float(cmp->src[0].type) &&
|
|
|
|
|
cmp->conditional_mod != BRW_CONDITIONAL_EQ &&
|
|
|
|
|
cmp->conditional_mod != BRW_CONDITIONAL_NEQ) {
|
|
|
|
|
cmp = NULL;
|
|
|
|
|
} else {
|
|
|
|
|
cmp->conditional_mod = brw_negate_cmod(cmp->conditional_mod);
|
|
|
|
|
}
|
intel/fs: Improve discard_if code generation
Previously we would blindly emit an sequence like:
mov(1) f0.1<1>UW g1.14<0,1,0>UW
...
cmp.l.f0(16) g7<1>F g5<8,8,1>F 0x41700000F /* 15F */
(+f0.1) cmp.z.f0.1(16) null<1>D g7<8,8,1>D 0D
The first move sets the flags based on the initial execution mask.
Later discard sequences contain a predicated compare that can only
remove more SIMD channels. Often times the only user of the result from
the first compare is the second compare. Instead, generate a sequence
like
mov(1) f0.1<1>UW g1.14<0,1,0>UW
...
cmp.l.f0(16) g7<1>F g5<8,8,1>F 0x41700000F /* 15F */
(+f0.1) cmp.ge.f0.1(8) null<1>F g5<8,8,1>F 0x41700000F /* 15F */
If the results stored in g7 and f0.0 are not used, the comparison will
be eliminated. This removes an instruction and potentially reduces
register pressure.
v2: Major re-write of the commit message (including fixing the assembly
code). Suggested by Matt.
All Gen8+ platforms had similar results. (Ice Lake shown)
total instructions in shared programs: 17224434 -> 17198659 (-0.15%)
instructions in affected programs: 2908125 -> 2882350 (-0.89%)
helped: 18891
HURT: 5
helped stats (abs) min: 1 max: 12 x̄: 1.38 x̃: 1
helped stats (rel) min: 0.03% max: 25.00% x̄: 1.76% x̃: 1.02%
HURT stats (abs) min: 9 max: 105 x̄: 51.40 x̃: 35
HURT stats (rel) min: 0.43% max: 4.92% x̄: 2.34% x̃: 1.56%
95% mean confidence interval for instructions value: -1.39 -1.34
95% mean confidence interval for instructions %-change: -1.79% -1.73%
Instructions are helped.
total cycles in shared programs: 361468458 -> 361170679 (-0.08%)
cycles in affected programs: 38470116 -> 38172337 (-0.77%)
helped: 16202
HURT: 1456
helped stats (abs) min: 1 max: 4473 x̄: 26.24 x̃: 18
helped stats (rel) min: <.01% max: 28.44% x̄: 2.90% x̃: 2.18%
HURT stats (abs) min: 1 max: 5982 x̄: 87.51 x̃: 28
HURT stats (rel) min: <.01% max: 51.29% x̄: 5.48% x̃: 1.64%
95% mean confidence interval for cycles value: -18.24 -15.49
95% mean confidence interval for cycles %-change: -2.26% -2.14%
Cycles are helped.
total spills in shared programs: 12147 -> 12176 (0.24%)
spills in affected programs: 175 -> 204 (16.57%)
helped: 8
HURT: 5
total fills in shared programs: 25262 -> 25292 (0.12%)
fills in affected programs: 269 -> 299 (11.15%)
helped: 8
HURT: 5
Haswell
total instructions in shared programs: 13530316 -> 13502647 (-0.20%)
instructions in affected programs: 2507824 -> 2480155 (-1.10%)
helped: 18859
HURT: 10
helped stats (abs) min: 1 max: 12 x̄: 1.48 x̃: 1
helped stats (rel) min: 0.03% max: 27.78% x̄: 2.38% x̃: 1.41%
HURT stats (abs) min: 5 max: 39 x̄: 25.70 x̃: 31
HURT stats (rel) min: 0.22% max: 1.66% x̄: 1.09% x̃: 1.31%
95% mean confidence interval for instructions value: -1.49 -1.44
95% mean confidence interval for instructions %-change: -2.42% -2.34%
Instructions are helped.
total cycles in shared programs: 377865412 -> 377639034 (-0.06%)
cycles in affected programs: 40169572 -> 39943194 (-0.56%)
helped: 15550
HURT: 1938
helped stats (abs) min: 1 max: 2482 x̄: 25.67 x̃: 18
helped stats (rel) min: <.01% max: 37.77% x̄: 3.00% x̃: 2.25%
HURT stats (abs) min: 1 max: 4862 x̄: 89.17 x̃: 35
HURT stats (rel) min: <.01% max: 67.67% x̄: 6.16% x̃: 2.75%
95% mean confidence interval for cycles value: -14.42 -11.47
95% mean confidence interval for cycles %-change: -2.05% -1.91%
Cycles are helped.
total spills in shared programs: 26769 -> 26814 (0.17%)
spills in affected programs: 826 -> 871 (5.45%)
helped: 9
HURT: 10
total fills in shared programs: 38383 -> 38425 (0.11%)
fills in affected programs: 834 -> 876 (5.04%)
helped: 9
HURT: 10
LOST: 5
GAINED: 10
Ivy Bridge
total instructions in shared programs: 12079250 -> 12044139 (-0.29%)
instructions in affected programs: 2409680 -> 2374569 (-1.46%)
helped: 16135
HURT: 0
helped stats (abs) min: 1 max: 23 x̄: 2.18 x̃: 2
helped stats (rel) min: 0.07% max: 37.50% x̄: 2.72% x̃: 1.68%
95% mean confidence interval for instructions value: -2.21 -2.14
95% mean confidence interval for instructions %-change: -2.76% -2.67%
Instructions are helped.
total cycles in shared programs: 180116747 -> 179900405 (-0.12%)
cycles in affected programs: 25439823 -> 25223481 (-0.85%)
helped: 13817
HURT: 1499
helped stats (abs) min: 1 max: 1886 x̄: 26.40 x̃: 18
helped stats (rel) min: <.01% max: 38.84% x̄: 2.57% x̃: 1.97%
HURT stats (abs) min: 1 max: 3684 x̄: 98.99 x̃: 52
HURT stats (rel) min: <.01% max: 97.01% x̄: 6.37% x̃: 3.42%
95% mean confidence interval for cycles value: -15.68 -12.57
95% mean confidence interval for cycles %-change: -1.77% -1.63%
Cycles are helped.
LOST: 8
GAINED: 10
Sandy Bridge
total instructions in shared programs: 10878990 -> 10863659 (-0.14%)
instructions in affected programs: 1806702 -> 1791371 (-0.85%)
helped: 13023
HURT: 0
helped stats (abs) min: 1 max: 5 x̄: 1.18 x̃: 1
helped stats (rel) min: 0.07% max: 13.79% x̄: 1.65% x̃: 1.10%
95% mean confidence interval for instructions value: -1.18 -1.17
95% mean confidence interval for instructions %-change: -1.68% -1.62%
Instructions are helped.
total cycles in shared programs: 154082878 -> 153862810 (-0.14%)
cycles in affected programs: 20199374 -> 19979306 (-1.09%)
helped: 12048
HURT: 510
helped stats (abs) min: 1 max: 323 x̄: 20.57 x̃: 18
helped stats (rel) min: 0.03% max: 17.78% x̄: 2.05% x̃: 1.52%
HURT stats (abs) min: 1 max: 448 x̄: 54.39 x̃: 16
HURT stats (rel) min: 0.02% max: 37.98% x̄: 4.13% x̃: 1.17%
95% mean confidence interval for cycles value: -17.97 -17.08
95% mean confidence interval for cycles %-change: -1.84% -1.75%
Cycles are helped.
LOST: 1
GAINED: 0
Iron Lake
total instructions in shared programs: 8155075 -> 8142729 (-0.15%)
instructions in affected programs: 949495 -> 937149 (-1.30%)
helped: 5810
HURT: 0
helped stats (abs) min: 1 max: 8 x̄: 2.12 x̃: 2
helped stats (rel) min: 0.10% max: 16.67% x̄: 2.53% x̃: 1.85%
95% mean confidence interval for instructions value: -2.14 -2.11
95% mean confidence interval for instructions %-change: -2.59% -2.48%
Instructions are helped.
total cycles in shared programs: 188584610 -> 188549632 (-0.02%)
cycles in affected programs: 17274446 -> 17239468 (-0.20%)
helped: 3881
HURT: 90
helped stats (abs) min: 2 max: 168 x̄: 9.08 x̃: 6
helped stats (rel) min: <.01% max: 23.53% x̄: 0.83% x̃: 0.30%
HURT stats (abs) min: 2 max: 10 x̄: 2.80 x̃: 2
HURT stats (rel) min: <.01% max: 0.60% x̄: 0.10% x̃: 0.07%
95% mean confidence interval for cycles value: -9.35 -8.27
95% mean confidence interval for cycles %-change: -0.85% -0.77%
Cycles are helped.
GM45
total instructions in shared programs: 5019308 -> 5013119 (-0.12%)
instructions in affected programs: 489028 -> 482839 (-1.27%)
helped: 2912
HURT: 0
helped stats (abs) min: 1 max: 8 x̄: 2.13 x̃: 2
helped stats (rel) min: 0.10% max: 16.67% x̄: 2.46% x̃: 1.81%
95% mean confidence interval for instructions value: -2.14 -2.11
95% mean confidence interval for instructions %-change: -2.54% -2.39%
Instructions are helped.
total cycles in shared programs: 129002592 -> 128977804 (-0.02%)
cycles in affected programs: 12669152 -> 12644364 (-0.20%)
helped: 2759
HURT: 37
helped stats (abs) min: 2 max: 168 x̄: 9.03 x̃: 4
helped stats (rel) min: <.01% max: 21.43% x̄: 0.75% x̃: 0.31%
HURT stats (abs) min: 2 max: 10 x̄: 3.62 x̃: 4
HURT stats (rel) min: <.01% max: 0.41% x̄: 0.10% x̃: 0.04%
95% mean confidence interval for cycles value: -9.53 -8.20
95% mean confidence interval for cycles %-change: -0.79% -0.70%
Cycles are helped.
Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
2019-05-20 17:25:01 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (cmp == NULL) {
|
2025-01-15 13:27:05 -08:00
|
|
|
cmp = bld.CMP(bld.null_reg_f(), get_nir_src(ntb, instr->src[0], 0),
|
intel/fs: Improve discard_if code generation
Previously we would blindly emit an sequence like:
mov(1) f0.1<1>UW g1.14<0,1,0>UW
...
cmp.l.f0(16) g7<1>F g5<8,8,1>F 0x41700000F /* 15F */
(+f0.1) cmp.z.f0.1(16) null<1>D g7<8,8,1>D 0D
The first move sets the flags based on the initial execution mask.
Later discard sequences contain a predicated compare that can only
remove more SIMD channels. Often times the only user of the result from
the first compare is the second compare. Instead, generate a sequence
like
mov(1) f0.1<1>UW g1.14<0,1,0>UW
...
cmp.l.f0(16) g7<1>F g5<8,8,1>F 0x41700000F /* 15F */
(+f0.1) cmp.ge.f0.1(8) null<1>F g5<8,8,1>F 0x41700000F /* 15F */
If the results stored in g7 and f0.0 are not used, the comparison will
be eliminated. This removes an instruction and potentially reduces
register pressure.
v2: Major re-write of the commit message (including fixing the assembly
code). Suggested by Matt.
All Gen8+ platforms had similar results. (Ice Lake shown)
total instructions in shared programs: 17224434 -> 17198659 (-0.15%)
instructions in affected programs: 2908125 -> 2882350 (-0.89%)
helped: 18891
HURT: 5
helped stats (abs) min: 1 max: 12 x̄: 1.38 x̃: 1
helped stats (rel) min: 0.03% max: 25.00% x̄: 1.76% x̃: 1.02%
HURT stats (abs) min: 9 max: 105 x̄: 51.40 x̃: 35
HURT stats (rel) min: 0.43% max: 4.92% x̄: 2.34% x̃: 1.56%
95% mean confidence interval for instructions value: -1.39 -1.34
95% mean confidence interval for instructions %-change: -1.79% -1.73%
Instructions are helped.
total cycles in shared programs: 361468458 -> 361170679 (-0.08%)
cycles in affected programs: 38470116 -> 38172337 (-0.77%)
helped: 16202
HURT: 1456
helped stats (abs) min: 1 max: 4473 x̄: 26.24 x̃: 18
helped stats (rel) min: <.01% max: 28.44% x̄: 2.90% x̃: 2.18%
HURT stats (abs) min: 1 max: 5982 x̄: 87.51 x̃: 28
HURT stats (rel) min: <.01% max: 51.29% x̄: 5.48% x̃: 1.64%
95% mean confidence interval for cycles value: -18.24 -15.49
95% mean confidence interval for cycles %-change: -2.26% -2.14%
Cycles are helped.
total spills in shared programs: 12147 -> 12176 (0.24%)
spills in affected programs: 175 -> 204 (16.57%)
helped: 8
HURT: 5
total fills in shared programs: 25262 -> 25292 (0.12%)
fills in affected programs: 269 -> 299 (11.15%)
helped: 8
HURT: 5
Haswell
total instructions in shared programs: 13530316 -> 13502647 (-0.20%)
instructions in affected programs: 2507824 -> 2480155 (-1.10%)
helped: 18859
HURT: 10
helped stats (abs) min: 1 max: 12 x̄: 1.48 x̃: 1
helped stats (rel) min: 0.03% max: 27.78% x̄: 2.38% x̃: 1.41%
HURT stats (abs) min: 5 max: 39 x̄: 25.70 x̃: 31
HURT stats (rel) min: 0.22% max: 1.66% x̄: 1.09% x̃: 1.31%
95% mean confidence interval for instructions value: -1.49 -1.44
95% mean confidence interval for instructions %-change: -2.42% -2.34%
Instructions are helped.
total cycles in shared programs: 377865412 -> 377639034 (-0.06%)
cycles in affected programs: 40169572 -> 39943194 (-0.56%)
helped: 15550
HURT: 1938
helped stats (abs) min: 1 max: 2482 x̄: 25.67 x̃: 18
helped stats (rel) min: <.01% max: 37.77% x̄: 3.00% x̃: 2.25%
HURT stats (abs) min: 1 max: 4862 x̄: 89.17 x̃: 35
HURT stats (rel) min: <.01% max: 67.67% x̄: 6.16% x̃: 2.75%
95% mean confidence interval for cycles value: -14.42 -11.47
95% mean confidence interval for cycles %-change: -2.05% -1.91%
Cycles are helped.
total spills in shared programs: 26769 -> 26814 (0.17%)
spills in affected programs: 826 -> 871 (5.45%)
helped: 9
HURT: 10
total fills in shared programs: 38383 -> 38425 (0.11%)
fills in affected programs: 834 -> 876 (5.04%)
helped: 9
HURT: 10
LOST: 5
GAINED: 10
Ivy Bridge
total instructions in shared programs: 12079250 -> 12044139 (-0.29%)
instructions in affected programs: 2409680 -> 2374569 (-1.46%)
helped: 16135
HURT: 0
helped stats (abs) min: 1 max: 23 x̄: 2.18 x̃: 2
helped stats (rel) min: 0.07% max: 37.50% x̄: 2.72% x̃: 1.68%
95% mean confidence interval for instructions value: -2.21 -2.14
95% mean confidence interval for instructions %-change: -2.76% -2.67%
Instructions are helped.
total cycles in shared programs: 180116747 -> 179900405 (-0.12%)
cycles in affected programs: 25439823 -> 25223481 (-0.85%)
helped: 13817
HURT: 1499
helped stats (abs) min: 1 max: 1886 x̄: 26.40 x̃: 18
helped stats (rel) min: <.01% max: 38.84% x̄: 2.57% x̃: 1.97%
HURT stats (abs) min: 1 max: 3684 x̄: 98.99 x̃: 52
HURT stats (rel) min: <.01% max: 97.01% x̄: 6.37% x̃: 3.42%
95% mean confidence interval for cycles value: -15.68 -12.57
95% mean confidence interval for cycles %-change: -1.77% -1.63%
Cycles are helped.
LOST: 8
GAINED: 10
Sandy Bridge
total instructions in shared programs: 10878990 -> 10863659 (-0.14%)
instructions in affected programs: 1806702 -> 1791371 (-0.85%)
helped: 13023
HURT: 0
helped stats (abs) min: 1 max: 5 x̄: 1.18 x̃: 1
helped stats (rel) min: 0.07% max: 13.79% x̄: 1.65% x̃: 1.10%
95% mean confidence interval for instructions value: -1.18 -1.17
95% mean confidence interval for instructions %-change: -1.68% -1.62%
Instructions are helped.
total cycles in shared programs: 154082878 -> 153862810 (-0.14%)
cycles in affected programs: 20199374 -> 19979306 (-1.09%)
helped: 12048
HURT: 510
helped stats (abs) min: 1 max: 323 x̄: 20.57 x̃: 18
helped stats (rel) min: 0.03% max: 17.78% x̄: 2.05% x̃: 1.52%
HURT stats (abs) min: 1 max: 448 x̄: 54.39 x̃: 16
HURT stats (rel) min: 0.02% max: 37.98% x̄: 4.13% x̃: 1.17%
95% mean confidence interval for cycles value: -17.97 -17.08
95% mean confidence interval for cycles %-change: -1.84% -1.75%
Cycles are helped.
LOST: 1
GAINED: 0
Iron Lake
total instructions in shared programs: 8155075 -> 8142729 (-0.15%)
instructions in affected programs: 949495 -> 937149 (-1.30%)
helped: 5810
HURT: 0
helped stats (abs) min: 1 max: 8 x̄: 2.12 x̃: 2
helped stats (rel) min: 0.10% max: 16.67% x̄: 2.53% x̃: 1.85%
95% mean confidence interval for instructions value: -2.14 -2.11
95% mean confidence interval for instructions %-change: -2.59% -2.48%
Instructions are helped.
total cycles in shared programs: 188584610 -> 188549632 (-0.02%)
cycles in affected programs: 17274446 -> 17239468 (-0.20%)
helped: 3881
HURT: 90
helped stats (abs) min: 2 max: 168 x̄: 9.08 x̃: 6
helped stats (rel) min: <.01% max: 23.53% x̄: 0.83% x̃: 0.30%
HURT stats (abs) min: 2 max: 10 x̄: 2.80 x̃: 2
HURT stats (rel) min: <.01% max: 0.60% x̄: 0.10% x̃: 0.07%
95% mean confidence interval for cycles value: -9.35 -8.27
95% mean confidence interval for cycles %-change: -0.85% -0.77%
Cycles are helped.
GM45
total instructions in shared programs: 5019308 -> 5013119 (-0.12%)
instructions in affected programs: 489028 -> 482839 (-1.27%)
helped: 2912
HURT: 0
helped stats (abs) min: 1 max: 8 x̄: 2.13 x̃: 2
helped stats (rel) min: 0.10% max: 16.67% x̄: 2.46% x̃: 1.81%
95% mean confidence interval for instructions value: -2.14 -2.11
95% mean confidence interval for instructions %-change: -2.54% -2.39%
Instructions are helped.
total cycles in shared programs: 129002592 -> 128977804 (-0.02%)
cycles in affected programs: 12669152 -> 12644364 (-0.20%)
helped: 2759
HURT: 37
helped stats (abs) min: 2 max: 168 x̄: 9.03 x̃: 4
helped stats (rel) min: <.01% max: 21.43% x̄: 0.75% x̃: 0.31%
HURT stats (abs) min: 2 max: 10 x̄: 3.62 x̃: 4
HURT stats (rel) min: <.01% max: 0.41% x̄: 0.10% x̃: 0.04%
95% mean confidence interval for cycles value: -9.53 -8.20
95% mean confidence interval for cycles %-change: -0.79% -0.70%
Cycles are helped.
Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
2019-05-20 17:25:01 -07:00
|
|
|
brw_imm_d(0), BRW_CONDITIONAL_Z);
|
|
|
|
|
}
|
2014-08-19 15:22:43 -07:00
|
|
|
} else {
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg some_reg = brw_reg(retype(brw_vec8_grf(0, 0), BRW_TYPE_UW));
|
2015-06-03 21:01:32 +03:00
|
|
|
cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ);
|
2014-08-19 15:22:43 -07:00
|
|
|
}
|
intel/fs: Improve discard_if code generation
Previously we would blindly emit an sequence like:
mov(1) f0.1<1>UW g1.14<0,1,0>UW
...
cmp.l.f0(16) g7<1>F g5<8,8,1>F 0x41700000F /* 15F */
(+f0.1) cmp.z.f0.1(16) null<1>D g7<8,8,1>D 0D
The first move sets the flags based on the initial execution mask.
Later discard sequences contain a predicated compare that can only
remove more SIMD channels. Often times the only user of the result from
the first compare is the second compare. Instead, generate a sequence
like
mov(1) f0.1<1>UW g1.14<0,1,0>UW
...
cmp.l.f0(16) g7<1>F g5<8,8,1>F 0x41700000F /* 15F */
(+f0.1) cmp.ge.f0.1(8) null<1>F g5<8,8,1>F 0x41700000F /* 15F */
If the results stored in g7 and f0.0 are not used, the comparison will
be eliminated. This removes an instruction and potentially reduces
register pressure.
v2: Major re-write of the commit message (including fixing the assembly
code). Suggested by Matt.
All Gen8+ platforms had similar results. (Ice Lake shown)
total instructions in shared programs: 17224434 -> 17198659 (-0.15%)
instructions in affected programs: 2908125 -> 2882350 (-0.89%)
helped: 18891
HURT: 5
helped stats (abs) min: 1 max: 12 x̄: 1.38 x̃: 1
helped stats (rel) min: 0.03% max: 25.00% x̄: 1.76% x̃: 1.02%
HURT stats (abs) min: 9 max: 105 x̄: 51.40 x̃: 35
HURT stats (rel) min: 0.43% max: 4.92% x̄: 2.34% x̃: 1.56%
95% mean confidence interval for instructions value: -1.39 -1.34
95% mean confidence interval for instructions %-change: -1.79% -1.73%
Instructions are helped.
total cycles in shared programs: 361468458 -> 361170679 (-0.08%)
cycles in affected programs: 38470116 -> 38172337 (-0.77%)
helped: 16202
HURT: 1456
helped stats (abs) min: 1 max: 4473 x̄: 26.24 x̃: 18
helped stats (rel) min: <.01% max: 28.44% x̄: 2.90% x̃: 2.18%
HURT stats (abs) min: 1 max: 5982 x̄: 87.51 x̃: 28
HURT stats (rel) min: <.01% max: 51.29% x̄: 5.48% x̃: 1.64%
95% mean confidence interval for cycles value: -18.24 -15.49
95% mean confidence interval for cycles %-change: -2.26% -2.14%
Cycles are helped.
total spills in shared programs: 12147 -> 12176 (0.24%)
spills in affected programs: 175 -> 204 (16.57%)
helped: 8
HURT: 5
total fills in shared programs: 25262 -> 25292 (0.12%)
fills in affected programs: 269 -> 299 (11.15%)
helped: 8
HURT: 5
Haswell
total instructions in shared programs: 13530316 -> 13502647 (-0.20%)
instructions in affected programs: 2507824 -> 2480155 (-1.10%)
helped: 18859
HURT: 10
helped stats (abs) min: 1 max: 12 x̄: 1.48 x̃: 1
helped stats (rel) min: 0.03% max: 27.78% x̄: 2.38% x̃: 1.41%
HURT stats (abs) min: 5 max: 39 x̄: 25.70 x̃: 31
HURT stats (rel) min: 0.22% max: 1.66% x̄: 1.09% x̃: 1.31%
95% mean confidence interval for instructions value: -1.49 -1.44
95% mean confidence interval for instructions %-change: -2.42% -2.34%
Instructions are helped.
total cycles in shared programs: 377865412 -> 377639034 (-0.06%)
cycles in affected programs: 40169572 -> 39943194 (-0.56%)
helped: 15550
HURT: 1938
helped stats (abs) min: 1 max: 2482 x̄: 25.67 x̃: 18
helped stats (rel) min: <.01% max: 37.77% x̄: 3.00% x̃: 2.25%
HURT stats (abs) min: 1 max: 4862 x̄: 89.17 x̃: 35
HURT stats (rel) min: <.01% max: 67.67% x̄: 6.16% x̃: 2.75%
95% mean confidence interval for cycles value: -14.42 -11.47
95% mean confidence interval for cycles %-change: -2.05% -1.91%
Cycles are helped.
total spills in shared programs: 26769 -> 26814 (0.17%)
spills in affected programs: 826 -> 871 (5.45%)
helped: 9
HURT: 10
total fills in shared programs: 38383 -> 38425 (0.11%)
fills in affected programs: 834 -> 876 (5.04%)
helped: 9
HURT: 10
LOST: 5
GAINED: 10
Ivy Bridge
total instructions in shared programs: 12079250 -> 12044139 (-0.29%)
instructions in affected programs: 2409680 -> 2374569 (-1.46%)
helped: 16135
HURT: 0
helped stats (abs) min: 1 max: 23 x̄: 2.18 x̃: 2
helped stats (rel) min: 0.07% max: 37.50% x̄: 2.72% x̃: 1.68%
95% mean confidence interval for instructions value: -2.21 -2.14
95% mean confidence interval for instructions %-change: -2.76% -2.67%
Instructions are helped.
total cycles in shared programs: 180116747 -> 179900405 (-0.12%)
cycles in affected programs: 25439823 -> 25223481 (-0.85%)
helped: 13817
HURT: 1499
helped stats (abs) min: 1 max: 1886 x̄: 26.40 x̃: 18
helped stats (rel) min: <.01% max: 38.84% x̄: 2.57% x̃: 1.97%
HURT stats (abs) min: 1 max: 3684 x̄: 98.99 x̃: 52
HURT stats (rel) min: <.01% max: 97.01% x̄: 6.37% x̃: 3.42%
95% mean confidence interval for cycles value: -15.68 -12.57
95% mean confidence interval for cycles %-change: -1.77% -1.63%
Cycles are helped.
LOST: 8
GAINED: 10
Sandy Bridge
total instructions in shared programs: 10878990 -> 10863659 (-0.14%)
instructions in affected programs: 1806702 -> 1791371 (-0.85%)
helped: 13023
HURT: 0
helped stats (abs) min: 1 max: 5 x̄: 1.18 x̃: 1
helped stats (rel) min: 0.07% max: 13.79% x̄: 1.65% x̃: 1.10%
95% mean confidence interval for instructions value: -1.18 -1.17
95% mean confidence interval for instructions %-change: -1.68% -1.62%
Instructions are helped.
total cycles in shared programs: 154082878 -> 153862810 (-0.14%)
cycles in affected programs: 20199374 -> 19979306 (-1.09%)
helped: 12048
HURT: 510
helped stats (abs) min: 1 max: 323 x̄: 20.57 x̃: 18
helped stats (rel) min: 0.03% max: 17.78% x̄: 2.05% x̃: 1.52%
HURT stats (abs) min: 1 max: 448 x̄: 54.39 x̃: 16
HURT stats (rel) min: 0.02% max: 37.98% x̄: 4.13% x̃: 1.17%
95% mean confidence interval for cycles value: -17.97 -17.08
95% mean confidence interval for cycles %-change: -1.84% -1.75%
Cycles are helped.
LOST: 1
GAINED: 0
Iron Lake
total instructions in shared programs: 8155075 -> 8142729 (-0.15%)
instructions in affected programs: 949495 -> 937149 (-1.30%)
helped: 5810
HURT: 0
helped stats (abs) min: 1 max: 8 x̄: 2.12 x̃: 2
helped stats (rel) min: 0.10% max: 16.67% x̄: 2.53% x̃: 1.85%
95% mean confidence interval for instructions value: -2.14 -2.11
95% mean confidence interval for instructions %-change: -2.59% -2.48%
Instructions are helped.
total cycles in shared programs: 188584610 -> 188549632 (-0.02%)
cycles in affected programs: 17274446 -> 17239468 (-0.20%)
helped: 3881
HURT: 90
helped stats (abs) min: 2 max: 168 x̄: 9.08 x̃: 6
helped stats (rel) min: <.01% max: 23.53% x̄: 0.83% x̃: 0.30%
HURT stats (abs) min: 2 max: 10 x̄: 2.80 x̃: 2
HURT stats (rel) min: <.01% max: 0.60% x̄: 0.10% x̃: 0.07%
95% mean confidence interval for cycles value: -9.35 -8.27
95% mean confidence interval for cycles %-change: -0.85% -0.77%
Cycles are helped.
GM45
total instructions in shared programs: 5019308 -> 5013119 (-0.12%)
instructions in affected programs: 489028 -> 482839 (-1.27%)
helped: 2912
HURT: 0
helped stats (abs) min: 1 max: 8 x̄: 2.13 x̃: 2
helped stats (rel) min: 0.10% max: 16.67% x̄: 2.46% x̃: 1.81%
95% mean confidence interval for instructions value: -2.14 -2.11
95% mean confidence interval for instructions %-change: -2.54% -2.39%
Instructions are helped.
total cycles in shared programs: 129002592 -> 128977804 (-0.02%)
cycles in affected programs: 12669152 -> 12644364 (-0.20%)
helped: 2759
HURT: 37
helped stats (abs) min: 2 max: 168 x̄: 9.03 x̃: 4
helped stats (rel) min: <.01% max: 21.43% x̄: 0.75% x̃: 0.31%
HURT stats (abs) min: 2 max: 10 x̄: 3.62 x̃: 4
HURT stats (rel) min: <.01% max: 0.41% x̄: 0.10% x̃: 0.04%
95% mean confidence interval for cycles value: -9.53 -8.20
95% mean confidence interval for cycles %-change: -0.79% -0.70%
Cycles are helped.
Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
2019-05-20 17:25:01 -07:00
|
|
|
|
2014-08-15 10:32:07 -07:00
|
|
|
cmp->predicate = BRW_PREDICATE_NORMAL;
|
2023-11-20 12:13:47 -08:00
|
|
|
cmp->flag_subreg = sample_mask_flag_subreg(s);
|
2014-08-15 10:32:07 -07:00
|
|
|
|
2024-12-07 00:23:07 -08:00
|
|
|
brw_inst *jump = bld.emit(BRW_OPCODE_HALT);
|
2023-11-20 12:13:47 -08:00
|
|
|
jump->flag_subreg = sample_mask_flag_subreg(s);
|
2020-05-08 09:08:55 -07:00
|
|
|
jump->predicate_inverse = true;
|
|
|
|
|
|
|
|
|
|
if (instr->intrinsic == nir_intrinsic_terminate ||
|
|
|
|
|
instr->intrinsic == nir_intrinsic_terminate_if) {
|
|
|
|
|
jump->predicate = BRW_PREDICATE_NORMAL;
|
|
|
|
|
} else {
|
|
|
|
|
/* Only jump when the whole quad is demoted. For historical
|
|
|
|
|
* reasons this is also used for discard.
|
|
|
|
|
*/
|
2022-07-22 17:11:52 -07:00
|
|
|
jump->predicate = (devinfo->ver >= 20 ? XE2_PREDICATE_ANY :
|
|
|
|
|
BRW_PREDICATE_ALIGN1_ANY4H);
|
2020-05-08 09:08:55 -07:00
|
|
|
}
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2024-07-06 04:24:31 -04:00
|
|
|
case nir_intrinsic_load_input:
|
|
|
|
|
case nir_intrinsic_load_per_primitive_input: {
|
2021-05-18 10:17:43 -07:00
|
|
|
/* In Fragment Shaders load_input is used either for flat inputs or
|
|
|
|
|
* per-primitive inputs.
|
|
|
|
|
*/
|
2023-08-14 11:56:00 -05:00
|
|
|
assert(instr->def.bit_size == 32);
|
2016-07-12 03:57:25 -07:00
|
|
|
unsigned base = nir_intrinsic_base(instr);
|
2018-05-18 15:13:25 -07:00
|
|
|
unsigned comp = nir_intrinsic_component(instr);
|
2016-07-12 03:57:25 -07:00
|
|
|
unsigned num_components = instr->num_components;
|
2015-05-05 20:52:58 +03:00
|
|
|
|
2021-05-18 10:17:43 -07:00
|
|
|
/* TODO(mesh): Multiview. Verify and handle these special cases for Mesh. */
|
|
|
|
|
|
2023-09-13 11:03:59 -07:00
|
|
|
if (base == VARYING_SLOT_LAYER) {
|
|
|
|
|
dest.type = BRW_TYPE_UD;
|
|
|
|
|
bld.MOV(dest, fetch_render_target_array_index(bld));
|
|
|
|
|
break;
|
|
|
|
|
} else if (base == VARYING_SLOT_VIEWPORT) {
|
|
|
|
|
dest.type = BRW_TYPE_UD;
|
|
|
|
|
bld.MOV(dest, fetch_viewport_index(bld));
|
|
|
|
|
break;
|
|
|
|
|
}
|
2015-05-05 20:52:58 +03:00
|
|
|
|
2025-03-10 23:18:30 +02:00
|
|
|
if (instr->intrinsic == nir_intrinsic_load_per_primitive_input) {
|
2021-05-18 10:17:43 -07:00
|
|
|
assert(base != VARYING_SLOT_PRIMITIVE_INDICES);
|
|
|
|
|
for (unsigned int i = 0; i < num_components; i++) {
|
|
|
|
|
bld.MOV(offset(dest, bld, i),
|
2024-07-12 23:36:49 -07:00
|
|
|
retype(brw_per_primitive_reg(bld, base, comp + i), dest.type));
|
2021-05-18 10:17:43 -07:00
|
|
|
}
|
|
|
|
|
} else {
|
2023-12-01 16:23:11 -08:00
|
|
|
/* Gfx20+ packs the plane parameters of a single logical
|
|
|
|
|
* input in a vec3 format instead of the previously used vec4
|
|
|
|
|
* format.
|
|
|
|
|
*/
|
|
|
|
|
const unsigned k = devinfo->ver >= 20 ? 0 : 3;
|
2021-05-18 10:17:43 -07:00
|
|
|
for (unsigned int i = 0; i < num_components; i++) {
|
|
|
|
|
bld.MOV(offset(dest, bld, i),
|
2024-07-12 23:36:49 -07:00
|
|
|
retype(brw_interp_reg(bld, base, comp + i, k), dest.type));
|
2021-05-18 10:17:43 -07:00
|
|
|
}
|
2016-07-12 03:57:25 -07:00
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2025-03-11 15:29:15 -07:00
|
|
|
case nir_intrinsic_load_input_vertex: {
|
|
|
|
|
unsigned base = nir_intrinsic_base(instr);
|
|
|
|
|
unsigned comp = nir_intrinsic_component(instr);
|
|
|
|
|
unsigned vtx = nir_src_as_uint(instr->src[0]);
|
|
|
|
|
unsigned num_components = instr->num_components;
|
|
|
|
|
|
|
|
|
|
for (unsigned int i = 0; i < num_components; i++) {
|
|
|
|
|
bld.MOV(offset(dest, bld, i),
|
|
|
|
|
retype(brw_interp_reg(bld, base, comp + i, vtx), dest.type));
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2025-05-19 17:05:15 +03:00
|
|
|
case nir_intrinsic_store_per_primitive_payload_intel: {
|
|
|
|
|
const brw_builder ubld = bld.exec_all().group(1, 0);
|
|
|
|
|
brw_reg src = get_nir_src(ntb, instr->src[0], -1);
|
|
|
|
|
src = retype(bld.emit_uniformize(src), BRW_TYPE_UD);
|
|
|
|
|
|
|
|
|
|
ubld.MOV(retype(
|
|
|
|
|
brw_per_primitive_reg(bld,
|
|
|
|
|
nir_intrinsic_base(instr),
|
|
|
|
|
nir_intrinsic_component(instr)),
|
|
|
|
|
BRW_TYPE_UD),
|
|
|
|
|
component(src, 0));
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2019-04-11 14:55:40 -05:00
|
|
|
case nir_intrinsic_load_fs_input_interp_deltas: {
|
2023-12-05 17:16:34 -08:00
|
|
|
assert(s.stage == MESA_SHADER_FRAGMENT);
|
2019-04-11 14:55:40 -05:00
|
|
|
assert(nir_src_as_uint(instr->src[0]) == 0);
|
2022-06-22 16:19:05 -07:00
|
|
|
const unsigned base = nir_intrinsic_base(instr);
|
|
|
|
|
const unsigned comp = nir_intrinsic_component(instr);
|
2024-04-20 17:08:02 -07:00
|
|
|
dest.type = BRW_TYPE_F;
|
2023-12-01 16:23:11 -08:00
|
|
|
|
|
|
|
|
/* Gfx20+ packs the plane parameters of a single logical
|
|
|
|
|
* input in a vec3 format instead of the previously used vec4
|
|
|
|
|
* format.
|
|
|
|
|
*/
|
|
|
|
|
if (devinfo->ver >= 20) {
|
2024-07-12 23:36:49 -07:00
|
|
|
bld.MOV(offset(dest, bld, 0), brw_interp_reg(bld, base, comp, 0));
|
|
|
|
|
bld.MOV(offset(dest, bld, 1), brw_interp_reg(bld, base, comp, 2));
|
|
|
|
|
bld.MOV(offset(dest, bld, 2), brw_interp_reg(bld, base, comp, 1));
|
2023-12-01 16:23:11 -08:00
|
|
|
} else {
|
2024-07-12 23:36:49 -07:00
|
|
|
bld.MOV(offset(dest, bld, 0), brw_interp_reg(bld, base, comp, 3));
|
|
|
|
|
bld.MOV(offset(dest, bld, 1), brw_interp_reg(bld, base, comp, 1));
|
|
|
|
|
bld.MOV(offset(dest, bld, 2), brw_interp_reg(bld, base, comp, 0));
|
2023-12-01 16:23:11 -08:00
|
|
|
}
|
|
|
|
|
|
2019-04-11 14:55:40 -05:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2016-07-12 03:57:25 -07:00
|
|
|
case nir_intrinsic_load_barycentric_pixel:
|
|
|
|
|
case nir_intrinsic_load_barycentric_centroid:
|
2019-04-11 14:12:58 -05:00
|
|
|
case nir_intrinsic_load_barycentric_sample: {
|
|
|
|
|
/* Use the delta_xy values computed from the payload */
|
2024-11-18 11:33:35 +02:00
|
|
|
enum intel_barycentric_mode bary = brw_barycentric_mode(
|
2024-04-18 09:54:11 +03:00
|
|
|
reinterpret_cast<const brw_wm_prog_key *>(s.key), instr);
|
2024-06-18 23:42:59 -07:00
|
|
|
const brw_reg srcs[] = { offset(s.delta_xy[bary], bld, 0),
|
2023-12-05 17:16:34 -08:00
|
|
|
offset(s.delta_xy[bary], bld, 1) };
|
2020-01-03 17:08:51 -08:00
|
|
|
bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
|
2016-07-12 03:57:25 -07:00
|
|
|
break;
|
2019-04-11 14:12:58 -05:00
|
|
|
}
|
2016-07-12 03:57:25 -07:00
|
|
|
|
|
|
|
|
case nir_intrinsic_load_barycentric_at_sample: {
|
|
|
|
|
const glsl_interp_mode interpolation =
|
|
|
|
|
(enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
|
2015-07-20 17:38:15 +03:00
|
|
|
|
2024-06-20 18:51:06 -07:00
|
|
|
if (devinfo->ver >= 20) {
|
|
|
|
|
emit_pixel_interpolater_alu_at_sample(
|
2025-01-15 13:27:05 -08:00
|
|
|
bld, dest, retype(get_nir_src(ntb, instr->src[0], 0),
|
2024-06-20 18:51:06 -07:00
|
|
|
BRW_TYPE_UD),
|
|
|
|
|
interpolation);
|
|
|
|
|
|
2016-07-12 03:57:25 -07:00
|
|
|
} else {
|
2025-01-15 13:27:05 -08:00
|
|
|
const brw_reg sample_src = retype(get_nir_src(ntb, instr->src[0], 0),
|
2024-12-12 16:41:51 -08:00
|
|
|
BRW_TYPE_UD);
|
|
|
|
|
const brw_reg sample_id = bld.emit_uniformize(sample_src);
|
|
|
|
|
const brw_reg msg_data = component(bld.group(8, 0).vgrf(BRW_TYPE_UD), 0);
|
|
|
|
|
|
2025-04-03 01:14:03 -07:00
|
|
|
bld.uniform().SHL(msg_data, sample_id, brw_imm_ud(4u));
|
2023-08-24 01:23:00 +03:00
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg flag_reg;
|
2024-06-20 18:51:06 -07:00
|
|
|
struct brw_wm_prog_key *wm_prog_key = (struct brw_wm_prog_key *) s.key;
|
2024-11-18 10:58:46 +02:00
|
|
|
if (wm_prog_key->multisample_fbo == INTEL_SOMETIMES) {
|
2024-06-20 18:51:06 -07:00
|
|
|
struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(s.prog_data);
|
2023-11-21 10:38:19 +02:00
|
|
|
|
2024-12-06 21:46:48 -08:00
|
|
|
brw_check_dynamic_msaa_flag(bld.exec_all().group(8, 0),
|
|
|
|
|
wm_prog_data,
|
|
|
|
|
INTEL_MSAA_FLAG_MULTISAMPLE_FBO);
|
2024-06-20 18:51:06 -07:00
|
|
|
flag_reg = brw_flag_reg(0, 0);
|
|
|
|
|
}
|
2023-11-21 10:38:19 +02:00
|
|
|
|
2024-06-20 18:51:06 -07:00
|
|
|
emit_pixel_interpolater_send(bld,
|
|
|
|
|
FS_OPCODE_INTERPOLATE_AT_SAMPLE,
|
|
|
|
|
dest,
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg(), /* src */
|
2024-06-20 18:51:06 -07:00
|
|
|
msg_data,
|
|
|
|
|
flag_reg,
|
|
|
|
|
interpolation);
|
|
|
|
|
}
|
2016-07-12 03:57:25 -07:00
|
|
|
break;
|
|
|
|
|
}
|
2015-07-27 16:26:52 +03:00
|
|
|
|
2016-07-12 03:57:25 -07:00
|
|
|
case nir_intrinsic_load_barycentric_at_offset: {
|
|
|
|
|
const glsl_interp_mode interpolation =
|
|
|
|
|
(enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
|
2015-07-27 16:26:52 +03:00
|
|
|
|
2022-08-10 17:33:56 -07:00
|
|
|
if (devinfo->ver >= 20) {
|
|
|
|
|
emit_pixel_interpolater_alu_at_offset(
|
|
|
|
|
bld, dest,
|
2025-01-13 19:00:25 -08:00
|
|
|
retype(get_nir_src(ntb, instr->src[0], -1), BRW_TYPE_F),
|
2022-08-10 17:33:56 -07:00
|
|
|
interpolation);
|
2015-07-27 16:26:52 +03:00
|
|
|
|
2022-08-10 17:33:56 -07:00
|
|
|
} else if (nir_const_value *const_offset = nir_src_as_const_value(instr->src[0])) {
|
2018-10-20 09:55:28 -05:00
|
|
|
assert(nir_src_bit_size(instr->src[0]) == 32);
|
2020-08-04 19:01:13 -07:00
|
|
|
unsigned off_x = const_offset[0].u32 & 0xf;
|
|
|
|
|
unsigned off_y = const_offset[1].u32 & 0xf;
|
2015-11-04 23:05:07 -08:00
|
|
|
|
2016-07-12 03:57:25 -07:00
|
|
|
emit_pixel_interpolater_send(bld,
|
|
|
|
|
FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
|
2020-01-03 16:12:23 -08:00
|
|
|
dest,
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg(), /* src */
|
2016-07-12 03:57:25 -07:00
|
|
|
brw_imm_ud(off_x | (off_y << 4)),
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg(), /* flag_reg */
|
2016-07-12 03:57:25 -07:00
|
|
|
interpolation);
|
|
|
|
|
} else {
|
2024-02-12 08:43:34 -08:00
|
|
|
brw_reg src = retype(get_nir_src(ntb, instr->src[0], -1), BRW_TYPE_D);
|
2016-07-12 03:57:25 -07:00
|
|
|
const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
|
|
|
|
|
emit_pixel_interpolater_send(bld,
|
|
|
|
|
opcode,
|
2020-01-03 16:12:23 -08:00
|
|
|
dest,
|
2016-07-12 03:57:25 -07:00
|
|
|
src,
|
|
|
|
|
brw_imm_ud(0u),
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg(), /* flag_reg */
|
2016-07-12 03:57:25 -07:00
|
|
|
interpolation);
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2024-01-08 22:06:55 -08:00
|
|
|
case nir_intrinsic_load_frag_coord: {
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg comps[4] = { s.pixel_x, s.pixel_y, s.pixel_z, s.wpos_w };
|
2024-01-08 22:06:55 -08:00
|
|
|
bld.VEC(dest, comps, 4);
|
2019-07-18 09:59:44 -05:00
|
|
|
break;
|
2024-01-08 22:06:55 -08:00
|
|
|
}
|
2015-11-04 23:05:07 -08:00
|
|
|
|
2019-07-18 09:59:44 -05:00
|
|
|
case nir_intrinsic_load_interpolated_input: {
|
2016-07-12 03:57:25 -07:00
|
|
|
assert(instr->src[0].ssa &&
|
|
|
|
|
instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic);
|
2025-08-12 21:45:08 +02:00
|
|
|
nir_intrinsic_instr *bary_intrinsic = nir_def_as_intrinsic(instr->src[0].ssa);
|
2016-07-12 03:57:25 -07:00
|
|
|
nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic;
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg dst_xy;
|
2016-07-12 03:57:25 -07:00
|
|
|
|
|
|
|
|
if (bary_intrin == nir_intrinsic_load_barycentric_at_offset ||
|
|
|
|
|
bary_intrin == nir_intrinsic_load_barycentric_at_sample) {
|
2020-01-03 17:08:51 -08:00
|
|
|
/* Use the result of the PI message. */
|
2025-01-13 19:00:25 -08:00
|
|
|
dst_xy = retype(get_nir_src(ntb, instr->src[0], -1), BRW_TYPE_F);
|
2016-07-12 03:57:25 -07:00
|
|
|
} else {
|
|
|
|
|
/* Use the delta_xy values computed from the payload */
|
2024-11-18 11:33:35 +02:00
|
|
|
enum intel_barycentric_mode bary = brw_barycentric_mode(
|
2024-04-18 09:54:11 +03:00
|
|
|
reinterpret_cast<const brw_wm_prog_key *>(s.key), bary_intrinsic);
|
2023-12-05 17:16:34 -08:00
|
|
|
dst_xy = s.delta_xy[bary];
|
2015-11-04 23:05:07 -08:00
|
|
|
}
|
|
|
|
|
|
2016-07-12 03:57:25 -07:00
|
|
|
for (unsigned int i = 0; i < instr->num_components; i++) {
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg interp =
|
2024-07-12 23:36:49 -07:00
|
|
|
brw_interp_reg(bld, nir_intrinsic_base(instr),
|
|
|
|
|
nir_intrinsic_component(instr) + i, 0);
|
2024-04-20 17:08:02 -07:00
|
|
|
interp.type = BRW_TYPE_F;
|
|
|
|
|
dest.type = BRW_TYPE_F;
|
2015-11-04 23:05:07 -08:00
|
|
|
|
2024-04-11 01:10:51 -07:00
|
|
|
bld.PLN(offset(dest, bld, i), interp, dst_xy);
|
2015-11-04 23:05:07 -08:00
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
}
|
2016-07-12 03:57:25 -07:00
|
|
|
|
2024-05-12 14:39:14 +03:00
|
|
|
case nir_intrinsic_load_fs_msaa_intel:
|
|
|
|
|
bld.MOV(retype(dest, BRW_TYPE_UD),
|
|
|
|
|
brw_dynamic_msaa_flags(brw_wm_prog_data(s.prog_data)));
|
|
|
|
|
break;
|
|
|
|
|
|
2025-04-29 12:50:42 +03:00
|
|
|
case nir_intrinsic_load_max_polygon_intel:
|
|
|
|
|
bld.MOV(retype(dest, BRW_TYPE_UD), brw_imm_ud(s.max_polygons));
|
|
|
|
|
break;
|
|
|
|
|
|
2025-05-19 17:05:15 +03:00
|
|
|
case nir_intrinsic_load_per_primitive_remap_intel:
|
|
|
|
|
bld.MOV(retype(dest, BRW_TYPE_UD),
|
|
|
|
|
brw_dynamic_per_primitive_remap(brw_wm_prog_data(s.prog_data)));
|
|
|
|
|
break;
|
|
|
|
|
|
2025-04-29 12:50:42 +03:00
|
|
|
case nir_intrinsic_read_attribute_payload_intel: {
|
2025-05-19 17:05:15 +03:00
|
|
|
const brw_reg offset = retype(
|
|
|
|
|
bld.emit_uniformize(get_nir_src(ntb, instr->src[0], 0)),
|
|
|
|
|
BRW_TYPE_UD);
|
2025-04-29 12:50:42 +03:00
|
|
|
bld.emit(FS_OPCODE_READ_ATTRIBUTE_PAYLOAD, retype(dest, BRW_TYPE_UD), offset);
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2015-11-04 23:05:07 -08:00
|
|
|
default:
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_intrinsic(ntb, bld, instr);
|
2015-11-04 23:05:07 -08:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2024-09-17 09:20:11 +02:00
|
|
|
static bool
|
|
|
|
|
can_use_instruction_offset(enum lsc_addr_surface_type binding_type, int32_t offset)
|
|
|
|
|
{
|
|
|
|
|
const unsigned max_bits = brw_max_immediate_offset_bits(binding_type);
|
|
|
|
|
return offset >= u_intN_min(max_bits) && offset <= u_intN_max(max_bits);
|
|
|
|
|
}
|
|
|
|
|
|
2025-08-21 16:20:49 -07:00
|
|
|
static brw_reg
|
|
|
|
|
memory_address(nir_to_brw_state &ntb,
|
|
|
|
|
const brw_builder &bld,
|
|
|
|
|
nir_intrinsic_instr *instr,
|
|
|
|
|
enum lsc_addr_surface_type binding_type,
|
|
|
|
|
int32_t *address_offset)
|
2024-09-17 09:20:11 +02:00
|
|
|
{
|
|
|
|
|
const intel_device_info *devinfo = ntb.devinfo;
|
|
|
|
|
const nir_src *nir_src_offset = nir_get_io_offset_src(instr);
|
|
|
|
|
const brw_reg src_offset = get_nir_src_imm(ntb, *nir_src_offset);
|
|
|
|
|
const brw_builder ubld = src_offset.is_scalar ? bld.scalar_group() : bld;
|
2025-08-21 16:20:49 -07:00
|
|
|
brw_reg address;
|
2024-09-17 09:20:11 +02:00
|
|
|
|
|
|
|
|
if (devinfo->ver < 20 ||
|
|
|
|
|
(!nir_intrinsic_has_base(instr) && !nir_src_is_const(*nir_src_offset))) {
|
2025-08-21 16:20:49 -07:00
|
|
|
address =
|
2024-09-17 09:20:11 +02:00
|
|
|
nir_intrinsic_has_base(instr) ?
|
|
|
|
|
ubld.ADD(src_offset,
|
|
|
|
|
brw_imm_int(src_offset.type, nir_intrinsic_base(instr))) :
|
|
|
|
|
src_offset;
|
2025-08-21 16:20:49 -07:00
|
|
|
*address_offset = 0;
|
2024-09-17 09:20:11 +02:00
|
|
|
} else if (!nir_intrinsic_has_base(instr) && nir_src_is_const(*nir_src_offset)) {
|
|
|
|
|
const int32_t offset = nir_src_as_int(*nir_src_offset);
|
|
|
|
|
if (can_use_instruction_offset(binding_type, offset)) {
|
2025-08-21 16:20:49 -07:00
|
|
|
address = brw_imm_ud(0);
|
|
|
|
|
*address_offset = offset;
|
2024-09-17 09:20:11 +02:00
|
|
|
} else {
|
2025-08-21 16:20:49 -07:00
|
|
|
address = src_offset;
|
|
|
|
|
*address_offset = 0;
|
2024-09-17 09:20:11 +02:00
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
assert(nir_intrinsic_has_base(instr));
|
|
|
|
|
const int32_t offset = nir_intrinsic_base(instr);
|
|
|
|
|
assert(can_use_instruction_offset(binding_type, offset));
|
2025-08-21 16:20:49 -07:00
|
|
|
address = src_offset;
|
|
|
|
|
*address_offset = offset;
|
2024-09-17 09:20:11 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* If nir_src is_scalar, the MEMORY_LOGICAL_ADDRESS will be allocated at
|
|
|
|
|
* scalar_group() size and will have every component the same value. This
|
|
|
|
|
* is the definition of is_scalar. Much more importantly, setting is_scalar
|
|
|
|
|
* properly also ensures that emit_uniformize (below) will handle the value
|
|
|
|
|
* as scalar_group() size instead of full dispatch width.
|
|
|
|
|
*/
|
2025-08-21 16:20:49 -07:00
|
|
|
address.is_scalar = src_offset.is_scalar;
|
|
|
|
|
|
|
|
|
|
return address;
|
2024-09-17 09:20:11 +02:00
|
|
|
}
|
|
|
|
|
|
2024-07-12 16:36:39 -07:00
|
|
|
static unsigned
|
2024-12-07 10:25:45 -08:00
|
|
|
brw_workgroup_size(brw_shader &s)
|
2024-07-12 16:36:39 -07:00
|
|
|
{
|
2025-08-05 16:50:43 +08:00
|
|
|
assert(mesa_shader_stage_uses_workgroup(s.stage));
|
2024-07-12 16:36:39 -07:00
|
|
|
assert(!s.nir->info.workgroup_size_variable);
|
|
|
|
|
const struct brw_cs_prog_data *cs = brw_cs_prog_data(s.prog_data);
|
|
|
|
|
return cs->local_size[0] * cs->local_size[1] * cs->local_size[2];
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-20 12:13:47 -08:00
|
|
|
static void
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_cs_intrinsic(nir_to_brw_state &ntb,
|
2023-11-20 12:13:47 -08:00
|
|
|
nir_intrinsic_instr *instr)
|
2015-11-04 23:05:07 -08:00
|
|
|
{
|
2023-12-05 15:27:29 -08:00
|
|
|
const intel_device_info *devinfo = ntb.devinfo;
|
2024-12-29 15:41:04 -08:00
|
|
|
const brw_builder &bld = ntb.bld;
|
2024-12-07 10:25:45 -08:00
|
|
|
brw_shader &s = ntb.s;
|
2023-11-20 12:13:47 -08:00
|
|
|
|
2025-08-05 16:50:43 +08:00
|
|
|
assert(mesa_shader_stage_uses_workgroup(s.stage));
|
2023-12-05 17:16:34 -08:00
|
|
|
struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(s.prog_data);
|
2015-11-04 23:05:07 -08:00
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg dest;
|
2015-11-04 23:05:07 -08:00
|
|
|
if (nir_intrinsic_infos[instr->intrinsic].has_dest)
|
2023-11-20 21:21:54 -08:00
|
|
|
dest = get_nir_def(ntb, instr->def);
|
2015-11-04 23:05:07 -08:00
|
|
|
|
2024-12-29 15:41:04 -08:00
|
|
|
const brw_builder xbld = dest.is_scalar ? bld.scalar_group() : bld;
|
2024-07-05 16:35:33 -07:00
|
|
|
|
2015-11-04 23:05:07 -08:00
|
|
|
switch (instr->intrinsic) {
|
2023-07-28 15:08:00 -04:00
|
|
|
case nir_intrinsic_barrier:
|
2023-05-30 12:05:30 -07:00
|
|
|
if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE)
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_intrinsic(ntb, bld, instr);
|
2023-05-30 12:05:30 -07:00
|
|
|
if (nir_intrinsic_execution_scope(instr) == SCOPE_WORKGROUP) {
|
2023-03-02 14:26:53 -08:00
|
|
|
/* The whole workgroup fits in a single HW thread, so all the
|
|
|
|
|
* invocations are already executed lock-step. Instead of an actual
|
|
|
|
|
* barrier just emit a scheduling fence, that will generate no code.
|
|
|
|
|
*/
|
2023-12-05 17:16:34 -08:00
|
|
|
if (!s.nir->info.workgroup_size_variable &&
|
2024-07-12 16:36:39 -07:00
|
|
|
brw_workgroup_size(s) <= s.dispatch_width) {
|
2025-04-03 01:14:03 -07:00
|
|
|
bld.uniform().emit(FS_OPCODE_SCHEDULING_FENCE);
|
2023-03-02 14:26:53 -08:00
|
|
|
break;
|
|
|
|
|
}
|
2020-01-14 12:03:22 -08:00
|
|
|
|
2023-11-20 22:00:28 -08:00
|
|
|
emit_barrier(ntb);
|
2023-03-02 14:26:53 -08:00
|
|
|
cs_prog_data->uses_barrier = true;
|
|
|
|
|
}
|
2015-11-04 23:05:07 -08:00
|
|
|
break;
|
|
|
|
|
|
2024-09-30 08:45:21 +03:00
|
|
|
case nir_intrinsic_load_inline_data_intel: {
|
2024-12-06 22:13:36 -08:00
|
|
|
const brw_cs_thread_payload &payload = s.cs_payload();
|
2024-09-30 08:45:21 +03:00
|
|
|
unsigned inline_stride = brw_type_size_bytes(dest.type);
|
2024-07-05 16:35:33 -07:00
|
|
|
for (unsigned c = 0; c < instr->def.num_components; c++) {
|
|
|
|
|
xbld.MOV(offset(dest, xbld, c),
|
|
|
|
|
retype(
|
|
|
|
|
byte_offset(payload.inline_parameter,
|
|
|
|
|
nir_intrinsic_base(instr) +
|
|
|
|
|
c * inline_stride),
|
|
|
|
|
dest.type));
|
|
|
|
|
}
|
2024-09-30 08:45:21 +03:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2017-08-24 11:40:31 -07:00
|
|
|
case nir_intrinsic_load_subgroup_id:
|
2023-12-05 17:16:34 -08:00
|
|
|
s.cs_payload().load_subgroup_id(bld, dest);
|
2017-09-29 17:57:32 -07:00
|
|
|
break;
|
|
|
|
|
|
2023-11-27 16:31:25 -08:00
|
|
|
case nir_intrinsic_load_local_invocation_id:
|
|
|
|
|
/* This is only used for hardware generated local IDs. */
|
|
|
|
|
assert(cs_prog_data->generate_local_id);
|
|
|
|
|
|
2024-04-20 17:08:02 -07:00
|
|
|
dest.type = BRW_TYPE_UD;
|
2023-11-27 16:31:25 -08:00
|
|
|
|
2022-11-29 13:54:55 -06:00
|
|
|
for (unsigned i = 0; i < 3; i++)
|
2023-11-27 16:31:25 -08:00
|
|
|
bld.MOV(offset(dest, bld, i), s.cs_payload().local_invocation_id[i]);
|
2022-11-29 13:54:55 -06:00
|
|
|
break;
|
|
|
|
|
|
2024-03-21 00:15:48 +01:00
|
|
|
case nir_intrinsic_load_workgroup_id: {
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg val = ntb.system_values[SYSTEM_VALUE_WORKGROUP_ID];
|
2024-12-29 15:41:04 -08:00
|
|
|
const brw_builder ubld = bld.scalar_group();
|
2024-01-30 18:14:02 -08:00
|
|
|
|
2015-11-04 23:05:07 -08:00
|
|
|
assert(val.file != BAD_FILE);
|
2024-01-30 18:14:02 -08:00
|
|
|
assert(val.is_scalar);
|
|
|
|
|
|
2015-11-04 23:05:07 -08:00
|
|
|
dest.type = val.type;
|
|
|
|
|
for (unsigned i = 0; i < 3; i++)
|
2024-01-30 18:14:02 -08:00
|
|
|
ubld.MOV(offset(dest, ubld, i), offset(val, ubld, i));
|
2015-11-04 23:05:07 -08:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-04 12:04:15 -07:00
|
|
|
case nir_intrinsic_load_num_workgroups: {
|
2023-08-14 11:56:00 -05:00
|
|
|
assert(instr->def.bit_size == 32);
|
2015-11-04 23:05:07 -08:00
|
|
|
|
|
|
|
|
cs_prog_data->uses_num_work_groups = true;
|
|
|
|
|
|
2024-08-21 11:35:31 -07:00
|
|
|
brw_reg srcs[MEMORY_LOGICAL_NUM_SRCS];
|
|
|
|
|
srcs[MEMORY_LOGICAL_BINDING] = brw_imm_ud(0);
|
|
|
|
|
srcs[MEMORY_LOGICAL_ADDRESS] = brw_imm_ud(0);
|
|
|
|
|
|
2025-08-21 16:20:49 -07:00
|
|
|
brw_mem_inst *mem =
|
2024-08-21 11:35:31 -07:00
|
|
|
bld.emit(SHADER_OPCODE_MEMORY_LOAD_LOGICAL,
|
2025-08-21 16:20:49 -07:00
|
|
|
dest, srcs, MEMORY_LOGICAL_NUM_SRCS)->as_mem();
|
|
|
|
|
mem->size_written = 3 * s.dispatch_width * 4;
|
|
|
|
|
mem->lsc_op = LSC_OP_LOAD;
|
|
|
|
|
mem->mode = MEMORY_MODE_UNTYPED;
|
|
|
|
|
mem->binding_type = LSC_ADDR_SURFTYPE_BTI;
|
|
|
|
|
mem->data_size = LSC_DATA_SIZE_D32;
|
|
|
|
|
mem->coord_components = 1;
|
|
|
|
|
mem->components = 3;
|
|
|
|
|
mem->alignment = 4;
|
2015-11-04 23:05:07 -08:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2021-05-27 14:44:54 -07:00
|
|
|
case nir_intrinsic_load_workgroup_size: {
|
2022-08-15 00:18:59 -07:00
|
|
|
/* Should have been lowered by brw_nir_lower_cs_intrinsics() or
|
2024-10-15 05:08:31 -07:00
|
|
|
* iris_setup_uniforms() for the variable group size case.
|
2022-08-15 00:18:59 -07:00
|
|
|
*/
|
2025-07-23 09:17:35 +02:00
|
|
|
UNREACHABLE("Should have been lowered");
|
2018-11-12 06:29:51 -08:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2023-10-09 13:54:38 -07:00
|
|
|
case nir_intrinsic_dpas_intel: {
|
|
|
|
|
const unsigned sdepth = nir_intrinsic_systolic_depth(instr);
|
|
|
|
|
const unsigned rcount = nir_intrinsic_repeat_count(instr);
|
|
|
|
|
|
|
|
|
|
const brw_reg_type dest_type =
|
2025-03-14 13:29:59 -07:00
|
|
|
brw_type_for_base_type(nir_intrinsic_dest_base_type(instr));
|
2023-10-09 13:54:38 -07:00
|
|
|
const brw_reg_type src_type =
|
2025-03-14 13:29:59 -07:00
|
|
|
brw_type_for_base_type(nir_intrinsic_src_base_type(instr));
|
2023-10-09 13:54:38 -07:00
|
|
|
|
2025-04-03 11:05:28 -07:00
|
|
|
brw_reg src[3] = {};
|
|
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(src); i++) {
|
|
|
|
|
nir_src nsrc = instr->src[i];
|
2023-10-09 13:54:38 -07:00
|
|
|
|
2025-04-03 11:05:28 -07:00
|
|
|
if (!nir_src_is_const(nsrc)) {
|
|
|
|
|
src[i] = get_nir_src(ntb, nsrc, 0);
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* A single constant value can be used to fill the entire
|
|
|
|
|
* cooperative matrix. In this case get_nir_src() would give a
|
|
|
|
|
* uniform value (with stride 0), but DPAS can't use regioning,
|
|
|
|
|
* it needs the full data available in the register.
|
|
|
|
|
*
|
|
|
|
|
* So when a source is a constant, allocate the space necessary
|
|
|
|
|
* and fill it with the constant value. Except for
|
|
|
|
|
*
|
|
|
|
|
* When Src0 is specified as null, it is treated as an
|
|
|
|
|
* immediate value of +0.
|
|
|
|
|
*
|
|
|
|
|
* documented in ACM PRM, Vol 2a, "Dot Product Accumulate Systolic".
|
|
|
|
|
*/
|
|
|
|
|
const unsigned num_components = nir_src_num_components(nsrc);
|
|
|
|
|
const unsigned bit_size = nir_src_bit_size(nsrc);
|
|
|
|
|
const nir_const_value *nval = nir_src_as_const_value(instr->src[0]);
|
|
|
|
|
|
|
|
|
|
assert(bit_size <= 32);
|
|
|
|
|
for (unsigned j = 1; j < num_components; j++)
|
|
|
|
|
assert(nval[0].u32 == nval[j].u32);
|
|
|
|
|
uint32_t val = nval[0].u32;
|
|
|
|
|
|
|
|
|
|
if (i == 0 && val == 0) {
|
|
|
|
|
src[i] = brw_null_reg();
|
|
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
unsigned size = bit_size * num_components;
|
|
|
|
|
unsigned count = size / 32;
|
|
|
|
|
assert(size % 32 == 0);
|
|
|
|
|
|
|
|
|
|
src[i] = bld.vgrf(BRW_TYPE_UD, count);
|
|
|
|
|
for (unsigned j = 0; j < count; j++)
|
|
|
|
|
bld.exec_all().MOV(offset(src[i], bld, j), brw_imm_ud(val));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const unsigned dpas_exec_size = devinfo->ver >= 20 ? 16 : 8;
|
|
|
|
|
brw_builder bldn = bld.exec_all().group(dpas_exec_size, 0);
|
2023-10-09 13:54:38 -07:00
|
|
|
|
2025-04-03 11:05:28 -07:00
|
|
|
/* DPAS uses a different source order: Accumulator, B, A. */
|
|
|
|
|
bldn.DPAS(retype(dest, dest_type),
|
|
|
|
|
retype(src[0], dest_type),
|
|
|
|
|
retype(src[2], src_type),
|
|
|
|
|
retype(src[1], src_type),
|
2023-10-09 13:54:38 -07:00
|
|
|
sdepth,
|
|
|
|
|
rcount)
|
|
|
|
|
->saturate = nir_intrinsic_saturate(instr);
|
|
|
|
|
|
2023-09-22 16:17:18 -07:00
|
|
|
cs_prog_data->uses_systolic = true;
|
2023-10-09 13:54:38 -07:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2025-06-09 16:05:19 -04:00
|
|
|
case nir_intrinsic_convert_cmat_intel: {
|
|
|
|
|
struct glsl_cmat_description dst_cmat_desc =
|
|
|
|
|
nir_intrinsic_dst_cmat_desc(instr);
|
|
|
|
|
struct glsl_cmat_description src_cmat_desc =
|
|
|
|
|
nir_intrinsic_src_cmat_desc(instr);
|
|
|
|
|
|
|
|
|
|
brw_reg_type dst_type =
|
|
|
|
|
brw_type_for_base_type((enum glsl_base_type)dst_cmat_desc.element_type);
|
|
|
|
|
brw_reg_type src_type =
|
|
|
|
|
brw_type_for_base_type((enum glsl_base_type)src_cmat_desc.element_type);
|
|
|
|
|
|
|
|
|
|
const unsigned dst_element_bits =
|
|
|
|
|
brw_type_size_bits(dst_type);
|
|
|
|
|
const unsigned src_element_bits =
|
|
|
|
|
brw_type_size_bits(src_type);
|
|
|
|
|
|
|
|
|
|
const unsigned element_bits = 32;
|
|
|
|
|
const unsigned src_packing_factor = element_bits / src_element_bits;
|
|
|
|
|
const unsigned src_components = nir_src_num_components(instr->src[0]);
|
|
|
|
|
const unsigned elems = src_components * src_packing_factor;
|
|
|
|
|
|
|
|
|
|
brw_builder bldn = bld.exec_all();
|
2025-07-16 16:06:02 -07:00
|
|
|
brw_reg src = retype(get_nir_src(ntb, instr->src[0], 0), src_type);
|
2025-06-09 16:05:19 -04:00
|
|
|
const brw_reg dst = retype(dest, dst_type);
|
|
|
|
|
|
|
|
|
|
assert(dst_cmat_desc.use == src_cmat_desc.use);
|
|
|
|
|
|
2025-07-16 16:06:02 -07:00
|
|
|
const bool needs_intermediate =
|
|
|
|
|
(src.type == BRW_TYPE_BF && dst.type != BRW_TYPE_F) ||
|
|
|
|
|
(dst.type == BRW_TYPE_BF && src.type != BRW_TYPE_F);
|
|
|
|
|
|
2025-06-09 16:05:19 -04:00
|
|
|
switch (src_cmat_desc.use) {
|
|
|
|
|
case GLSL_CMAT_USE_B:
|
|
|
|
|
assert(dst_element_bits == src_element_bits);
|
|
|
|
|
FALLTHROUGH;
|
|
|
|
|
|
|
|
|
|
case GLSL_CMAT_USE_A:
|
|
|
|
|
case GLSL_CMAT_USE_ACCUMULATOR: {
|
|
|
|
|
const unsigned width = bldn.dispatch_width();
|
2025-07-16 16:06:02 -07:00
|
|
|
|
|
|
|
|
if (needs_intermediate) {
|
|
|
|
|
brw_reg tmp = bldn.vgrf(BRW_TYPE_F, elems);
|
|
|
|
|
for (unsigned c = 0; c < elems; c++) {
|
|
|
|
|
bldn.MOV(suboffset(tmp, c * width),
|
|
|
|
|
suboffset(src, c * width));
|
|
|
|
|
}
|
|
|
|
|
src = tmp;
|
|
|
|
|
}
|
|
|
|
|
|
2025-06-09 16:05:19 -04:00
|
|
|
for (unsigned c = 0; c < elems; c++) {
|
|
|
|
|
bldn.MOV(suboffset(dst, c * width),
|
|
|
|
|
suboffset(src, c * width));
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
default:
|
2025-07-23 09:17:35 +02:00
|
|
|
UNREACHABLE("not reached");
|
2025-06-09 16:05:19 -04:00
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2015-11-04 23:05:07 -08:00
|
|
|
default:
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_intrinsic(ntb, bld, instr);
|
2015-11-04 23:05:07 -08:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-14 17:30:31 +03:00
|
|
|
/**
 * Emit an LSC memory fence for ray-tracing shaders.
 *
 * Builds a raw SEND to the UGM (untyped global memory) shared function with
 * an LSC fence descriptor for the given \p scope and \p flush_type, followed
 * by a scheduling fence on the fence's writeback register so later
 * instructions cannot be scheduled ahead of the fence completing.
 */
static void
emit_rt_lsc_fence(const brw_builder &bld,
                  enum lsc_fence_scope scope,
                  enum lsc_flush_type flush_type)
{
   const intel_device_info *devinfo = bld.shader->devinfo;

   /* Fences are emitted once per thread, not per channel: force NoMask
    * execution on a fixed SIMD8 group.
    */
   const brw_builder ubld = bld.exec_all().group(8, 0);
   brw_reg tmp = ubld.vgrf(BRW_TYPE_UD);

   brw_send_inst *send = ubld.SEND();
   send->dst = tmp;
   send->src[SEND_SRC_DESC] = brw_imm_ud(0);
   send->src[SEND_SRC_EX_DESC] = brw_imm_ud(0);
   /* The fence payload is just the g0 header; there is no second payload. */
   send->src[SEND_SRC_PAYLOAD1] = brw_vec8_grf(0, 0);
   send->src[SEND_SRC_PAYLOAD2] = brw_reg();

   send->sfid = BRW_SFID_UGM;
   /* Request a commit write (last arg true) so "tmp" is written when the
    * fence completes.
    */
   send->desc = lsc_fence_msg_desc(devinfo, scope, flush_type, true);
   send->mlen = reg_unit(devinfo); /* g0 header */
   send->ex_mlen = 0;
   /* Temp write for scheduling */
   send->size_written = REG_SIZE * reg_unit(devinfo);
   send->has_side_effects = true;

   /* Make the rest of the shader depend on the fence writeback. */
   ubld.emit(FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), tmp);
}
|
|
|
|
|
|
|
|
|
|
|
2023-11-20 12:13:47 -08:00
|
|
|
static void
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_bs_intrinsic(nir_to_brw_state &ntb,
|
2023-11-20 12:13:47 -08:00
|
|
|
nir_intrinsic_instr *instr)
|
2020-10-21 14:46:50 -05:00
|
|
|
{
|
2024-12-29 15:41:04 -08:00
|
|
|
const brw_builder &bld = ntb.bld;
|
2024-12-07 10:25:45 -08:00
|
|
|
brw_shader &s = ntb.s;
|
2023-11-20 12:13:47 -08:00
|
|
|
|
2023-12-05 17:16:34 -08:00
|
|
|
assert(brw_shader_stage_is_bindless(s.stage));
|
2024-12-06 22:13:36 -08:00
|
|
|
const brw_bs_thread_payload &payload = s.bs_payload();
|
2020-10-21 14:46:50 -05:00
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg dest;
|
2020-10-21 14:46:50 -05:00
|
|
|
if (nir_intrinsic_infos[instr->intrinsic].has_dest)
|
2023-11-20 21:21:54 -08:00
|
|
|
dest = get_nir_def(ntb, instr->def);
|
2020-10-21 14:46:50 -05:00
|
|
|
|
2024-12-29 15:41:04 -08:00
|
|
|
const brw_builder xbld = dest.is_scalar ? bld.scalar_group() : bld;
|
brw/nir: Treat load_btd_{global,local}_arg_addr_intel and load_btd_shader_type_intel as convergent
No shader-db changes on any Intel platform. No fossil-db changes on
Tiger Lake, Ice Lake, or Skylake.
fossil-db:
Lunar Lake
Totals:
Instrs: 141808714 -> 141808513 (-0.00%); split: -0.00%, +0.00%
Cycle count: 22177889310 -> 22181410192 (+0.02%); split: -0.00%, +0.02%
Spill count: 69892 -> 69890 (-0.00%); split: -0.01%, +0.01%
Fill count: 128313 -> 128331 (+0.01%)
Max live registers: 48052083 -> 48052742 (+0.00%); split: -0.00%, +0.00%
Totals from 549 (0.10% of 551446) affected shaders:
Instrs: 911251 -> 911050 (-0.02%); split: -0.10%, +0.07%
Cycle count: 1244153266 -> 1247674148 (+0.28%); split: -0.04%, +0.32%
Spill count: 15849 -> 15847 (-0.01%); split: -0.04%, +0.03%
Fill count: 35087 -> 35105 (+0.05%)
Max live registers: 68047 -> 68706 (+0.97%); split: -0.25%, +1.22%
Meteor Lake
Totals:
Instrs: 152744298 -> 152741241 (-0.00%); split: -0.00%, +0.00%
Cycle count: 17410258529 -> 17405949054 (-0.02%); split: -0.04%, +0.01%
Spill count: 78528 -> 78598 (+0.09%); split: -0.01%, +0.09%
Fill count: 147893 -> 147978 (+0.06%); split: -0.00%, +0.06%
Scratch Memory Size: 3962880 -> 3969024 (+0.16%)
Max live registers: 31887206 -> 31887413 (+0.00%); split: -0.00%, +0.00%
Totals from 552 (0.09% of 633315) affected shaders:
Instrs: 907279 -> 904222 (-0.34%); split: -0.48%, +0.15%
Cycle count: 1152358569 -> 1148049094 (-0.37%); split: -0.56%, +0.19%
Spill count: 15290 -> 15360 (+0.46%); split: -0.03%, +0.48%
Fill count: 35313 -> 35398 (+0.24%); split: -0.02%, +0.26%
Scratch Memory Size: 1313792 -> 1319936 (+0.47%)
Max live registers: 34218 -> 34425 (+0.60%); split: -0.47%, +1.08%
DG2
Totals:
Instrs: 152766492 -> 152763061 (-0.00%); split: -0.00%, +0.00%
Cycle count: 17406058608 -> 17406396943 (+0.00%); split: -0.02%, +0.02%
Spill count: 78626 -> 78624 (-0.00%); split: -0.01%, +0.01%
Fill count: 147956 -> 148007 (+0.03%); split: -0.01%, +0.04%
Scratch Memory Size: 3962880 -> 3969024 (+0.16%)
Max live registers: 31887158 -> 31887365 (+0.00%); split: -0.00%, +0.00%
Totals from 552 (0.09% of 633315) affected shaders:
Instrs: 908513 -> 905082 (-0.38%); split: -0.47%, +0.09%
Cycle count: 1148162185 -> 1148500520 (+0.03%); split: -0.23%, +0.26%
Spill count: 15364 -> 15362 (-0.01%); split: -0.07%, +0.06%
Fill count: 35343 -> 35394 (+0.14%); split: -0.03%, +0.17%
Scratch Memory Size: 1313792 -> 1319936 (+0.47%)
Max live registers: 34218 -> 34425 (+0.60%); split: -0.47%, +1.08%
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-07-06 11:49:44 -07:00
|
|
|
|
2020-10-21 14:46:50 -05:00
|
|
|
switch (instr->intrinsic) {
|
|
|
|
|
case nir_intrinsic_load_btd_global_arg_addr_intel:
|
brw/nir: Treat load_btd_{global,local}_arg_addr_intel and load_btd_shader_type_intel as convergent
No shader-db changes on any Intel platform. No fossil-db changes on
Tiger Lake, Ice Lake, or Skylake.
fossil-db:
Lunar Lake
Totals:
Instrs: 141808714 -> 141808513 (-0.00%); split: -0.00%, +0.00%
Cycle count: 22177889310 -> 22181410192 (+0.02%); split: -0.00%, +0.02%
Spill count: 69892 -> 69890 (-0.00%); split: -0.01%, +0.01%
Fill count: 128313 -> 128331 (+0.01%)
Max live registers: 48052083 -> 48052742 (+0.00%); split: -0.00%, +0.00%
Totals from 549 (0.10% of 551446) affected shaders:
Instrs: 911251 -> 911050 (-0.02%); split: -0.10%, +0.07%
Cycle count: 1244153266 -> 1247674148 (+0.28%); split: -0.04%, +0.32%
Spill count: 15849 -> 15847 (-0.01%); split: -0.04%, +0.03%
Fill count: 35087 -> 35105 (+0.05%)
Max live registers: 68047 -> 68706 (+0.97%); split: -0.25%, +1.22%
Meteor Lake
Totals:
Instrs: 152744298 -> 152741241 (-0.00%); split: -0.00%, +0.00%
Cycle count: 17410258529 -> 17405949054 (-0.02%); split: -0.04%, +0.01%
Spill count: 78528 -> 78598 (+0.09%); split: -0.01%, +0.09%
Fill count: 147893 -> 147978 (+0.06%); split: -0.00%, +0.06%
Scratch Memory Size: 3962880 -> 3969024 (+0.16%)
Max live registers: 31887206 -> 31887413 (+0.00%); split: -0.00%, +0.00%
Totals from 552 (0.09% of 633315) affected shaders:
Instrs: 907279 -> 904222 (-0.34%); split: -0.48%, +0.15%
Cycle count: 1152358569 -> 1148049094 (-0.37%); split: -0.56%, +0.19%
Spill count: 15290 -> 15360 (+0.46%); split: -0.03%, +0.48%
Fill count: 35313 -> 35398 (+0.24%); split: -0.02%, +0.26%
Scratch Memory Size: 1313792 -> 1319936 (+0.47%)
Max live registers: 34218 -> 34425 (+0.60%); split: -0.47%, +1.08%
DG2
Totals:
Instrs: 152766492 -> 152763061 (-0.00%); split: -0.00%, +0.00%
Cycle count: 17406058608 -> 17406396943 (+0.00%); split: -0.02%, +0.02%
Spill count: 78626 -> 78624 (-0.00%); split: -0.01%, +0.01%
Fill count: 147956 -> 148007 (+0.03%); split: -0.01%, +0.04%
Scratch Memory Size: 3962880 -> 3969024 (+0.16%)
Max live registers: 31887158 -> 31887365 (+0.00%); split: -0.00%, +0.00%
Totals from 552 (0.09% of 633315) affected shaders:
Instrs: 908513 -> 905082 (-0.38%); split: -0.47%, +0.09%
Cycle count: 1148162185 -> 1148500520 (+0.03%); split: -0.23%, +0.26%
Spill count: 15364 -> 15362 (-0.01%); split: -0.07%, +0.06%
Fill count: 35343 -> 35394 (+0.14%); split: -0.03%, +0.17%
Scratch Memory Size: 1313792 -> 1319936 (+0.47%)
Max live registers: 34218 -> 34425 (+0.60%); split: -0.47%, +1.08%
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-07-06 11:49:44 -07:00
|
|
|
xbld.MOV(dest, retype(payload.global_arg_ptr, dest.type));
|
2020-10-21 14:46:50 -05:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_intrinsic_load_btd_local_arg_addr_intel:
|
brw/nir: Treat load_btd_{global,local}_arg_addr_intel and load_btd_shader_type_intel as convergent
No shader-db changes on any Intel platform. No fossil-db changes on
Tiger Lake, Ice Lake, or Skylake.
fossil-db:
Lunar Lake
Totals:
Instrs: 141808714 -> 141808513 (-0.00%); split: -0.00%, +0.00%
Cycle count: 22177889310 -> 22181410192 (+0.02%); split: -0.00%, +0.02%
Spill count: 69892 -> 69890 (-0.00%); split: -0.01%, +0.01%
Fill count: 128313 -> 128331 (+0.01%)
Max live registers: 48052083 -> 48052742 (+0.00%); split: -0.00%, +0.00%
Totals from 549 (0.10% of 551446) affected shaders:
Instrs: 911251 -> 911050 (-0.02%); split: -0.10%, +0.07%
Cycle count: 1244153266 -> 1247674148 (+0.28%); split: -0.04%, +0.32%
Spill count: 15849 -> 15847 (-0.01%); split: -0.04%, +0.03%
Fill count: 35087 -> 35105 (+0.05%)
Max live registers: 68047 -> 68706 (+0.97%); split: -0.25%, +1.22%
Meteor Lake
Totals:
Instrs: 152744298 -> 152741241 (-0.00%); split: -0.00%, +0.00%
Cycle count: 17410258529 -> 17405949054 (-0.02%); split: -0.04%, +0.01%
Spill count: 78528 -> 78598 (+0.09%); split: -0.01%, +0.09%
Fill count: 147893 -> 147978 (+0.06%); split: -0.00%, +0.06%
Scratch Memory Size: 3962880 -> 3969024 (+0.16%)
Max live registers: 31887206 -> 31887413 (+0.00%); split: -0.00%, +0.00%
Totals from 552 (0.09% of 633315) affected shaders:
Instrs: 907279 -> 904222 (-0.34%); split: -0.48%, +0.15%
Cycle count: 1152358569 -> 1148049094 (-0.37%); split: -0.56%, +0.19%
Spill count: 15290 -> 15360 (+0.46%); split: -0.03%, +0.48%
Fill count: 35313 -> 35398 (+0.24%); split: -0.02%, +0.26%
Scratch Memory Size: 1313792 -> 1319936 (+0.47%)
Max live registers: 34218 -> 34425 (+0.60%); split: -0.47%, +1.08%
DG2
Totals:
Instrs: 152766492 -> 152763061 (-0.00%); split: -0.00%, +0.00%
Cycle count: 17406058608 -> 17406396943 (+0.00%); split: -0.02%, +0.02%
Spill count: 78626 -> 78624 (-0.00%); split: -0.01%, +0.01%
Fill count: 147956 -> 148007 (+0.03%); split: -0.01%, +0.04%
Scratch Memory Size: 3962880 -> 3969024 (+0.16%)
Max live registers: 31887158 -> 31887365 (+0.00%); split: -0.00%, +0.00%
Totals from 552 (0.09% of 633315) affected shaders:
Instrs: 908513 -> 905082 (-0.38%); split: -0.47%, +0.09%
Cycle count: 1148162185 -> 1148500520 (+0.03%); split: -0.23%, +0.26%
Spill count: 15364 -> 15362 (-0.01%); split: -0.07%, +0.06%
Fill count: 35343 -> 35394 (+0.14%); split: -0.03%, +0.17%
Scratch Memory Size: 1313792 -> 1319936 (+0.47%)
Max live registers: 34218 -> 34425 (+0.60%); split: -0.47%, +1.08%
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-07-06 11:49:44 -07:00
|
|
|
xbld.MOV(dest, retype(payload.local_arg_ptr, dest.type));
|
2020-10-21 14:46:50 -05:00
|
|
|
break;
|
|
|
|
|
|
2022-08-25 17:00:15 -07:00
|
|
|
case nir_intrinsic_load_btd_shader_type_intel:
|
brw/nir: Treat load_btd_{global,local}_arg_addr_intel and load_btd_shader_type_intel as convergent
No shader-db changes on any Intel platform. No fossil-db changes on
Tiger Lake, Ice Lake, or Skylake.
fossil-db:
Lunar Lake
Totals:
Instrs: 141808714 -> 141808513 (-0.00%); split: -0.00%, +0.00%
Cycle count: 22177889310 -> 22181410192 (+0.02%); split: -0.00%, +0.02%
Spill count: 69892 -> 69890 (-0.00%); split: -0.01%, +0.01%
Fill count: 128313 -> 128331 (+0.01%)
Max live registers: 48052083 -> 48052742 (+0.00%); split: -0.00%, +0.00%
Totals from 549 (0.10% of 551446) affected shaders:
Instrs: 911251 -> 911050 (-0.02%); split: -0.10%, +0.07%
Cycle count: 1244153266 -> 1247674148 (+0.28%); split: -0.04%, +0.32%
Spill count: 15849 -> 15847 (-0.01%); split: -0.04%, +0.03%
Fill count: 35087 -> 35105 (+0.05%)
Max live registers: 68047 -> 68706 (+0.97%); split: -0.25%, +1.22%
Meteor Lake
Totals:
Instrs: 152744298 -> 152741241 (-0.00%); split: -0.00%, +0.00%
Cycle count: 17410258529 -> 17405949054 (-0.02%); split: -0.04%, +0.01%
Spill count: 78528 -> 78598 (+0.09%); split: -0.01%, +0.09%
Fill count: 147893 -> 147978 (+0.06%); split: -0.00%, +0.06%
Scratch Memory Size: 3962880 -> 3969024 (+0.16%)
Max live registers: 31887206 -> 31887413 (+0.00%); split: -0.00%, +0.00%
Totals from 552 (0.09% of 633315) affected shaders:
Instrs: 907279 -> 904222 (-0.34%); split: -0.48%, +0.15%
Cycle count: 1152358569 -> 1148049094 (-0.37%); split: -0.56%, +0.19%
Spill count: 15290 -> 15360 (+0.46%); split: -0.03%, +0.48%
Fill count: 35313 -> 35398 (+0.24%); split: -0.02%, +0.26%
Scratch Memory Size: 1313792 -> 1319936 (+0.47%)
Max live registers: 34218 -> 34425 (+0.60%); split: -0.47%, +1.08%
DG2
Totals:
Instrs: 152766492 -> 152763061 (-0.00%); split: -0.00%, +0.00%
Cycle count: 17406058608 -> 17406396943 (+0.00%); split: -0.02%, +0.02%
Spill count: 78626 -> 78624 (-0.00%); split: -0.01%, +0.01%
Fill count: 147956 -> 148007 (+0.03%); split: -0.01%, +0.04%
Scratch Memory Size: 3962880 -> 3969024 (+0.16%)
Max live registers: 31887158 -> 31887365 (+0.00%); split: -0.00%, +0.00%
Totals from 552 (0.09% of 633315) affected shaders:
Instrs: 908513 -> 905082 (-0.38%); split: -0.47%, +0.09%
Cycle count: 1148162185 -> 1148500520 (+0.03%); split: -0.23%, +0.26%
Spill count: 15364 -> 15362 (-0.01%); split: -0.07%, +0.06%
Fill count: 35343 -> 35394 (+0.14%); split: -0.03%, +0.17%
Scratch Memory Size: 1313792 -> 1319936 (+0.47%)
Max live registers: 34218 -> 34425 (+0.60%); split: -0.47%, +1.08%
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-07-06 11:49:44 -07:00
|
|
|
payload.load_shader_type(xbld, dest);
|
2021-10-06 12:15:59 +03:00
|
|
|
break;
|
|
|
|
|
|
2020-10-21 14:46:50 -05:00
|
|
|
default:
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_intrinsic(ntb, bld, instr);
|
2020-10-21 14:46:50 -05:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2024-07-15 15:09:12 -07:00
|
|
|
/**
 * Map a NIR reduction ALU op onto the corresponding BRW reduce operation.
 * Integer and float variants of the same reduction share one BRW op.
 */
static brw_reduce_op
brw_reduce_op_for_nir_reduction_op(nir_op op)
{
   switch (op) {
   case nir_op_iadd:
   case nir_op_fadd:
      return BRW_REDUCE_OP_ADD;
   case nir_op_imul:
   case nir_op_fmul:
      return BRW_REDUCE_OP_MUL;
   case nir_op_imin:
   case nir_op_umin:
   case nir_op_fmin:
      return BRW_REDUCE_OP_MIN;
   case nir_op_imax:
   case nir_op_umax:
   case nir_op_fmax:
      return BRW_REDUCE_OP_MAX;
   case nir_op_iand:
      return BRW_REDUCE_OP_AND;
   case nir_op_ior:
      return BRW_REDUCE_OP_OR;
   case nir_op_ixor:
      return BRW_REDUCE_OP_XOR;
   default:
      UNREACHABLE("Invalid reduction operation");
   }
}
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
/**
 * Fetch the image index (src[0]) of an image intrinsic as a uniformized,
 * unsigned register of the source's own bit size.
 */
static brw_reg
get_nir_image_intrinsic_image(nir_to_brw_state &ntb, const brw_builder &bld,
                              nir_intrinsic_instr *instr)
{
   const brw_reg idx = get_nir_src_imm(ntb, instr->src[0]);

   /* Reinterpret as unsigned, keeping the source's bit size. */
   const enum brw_reg_type ud_type =
      brw_type_with_size(BRW_TYPE_UD, brw_type_size_bits(idx.type));

   return bld.emit_uniformize(retype(idx, ud_type));
}
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
/**
 * Fetch the buffer (SSBO/UBO) index operand of a buffer intrinsic as a
 * uniformized unsigned register.
 *
 * If \p no_mask_handle is non-NULL, it is set to true when the index is
 * already uniform (scalar or immediate) and therefore needs no NoMask
 * handling by the caller.
 */
static brw_reg
get_nir_buffer_intrinsic_index(nir_to_brw_state &ntb, const brw_builder &bld,
                               nir_intrinsic_instr *instr, bool *no_mask_handle = NULL)
{
   /* SSBO stores are weird in that their index is in src[1] */
   bool is_store;
   switch (instr->intrinsic) {
   case nir_intrinsic_store_ssbo:
   case nir_intrinsic_store_ssbo_intel:
   case nir_intrinsic_store_ssbo_block_intel:
      is_store = true;
      break;
   default:
      is_store = false;
      break;
   }

   const brw_reg index =
      get_nir_src_imm(ntb, is_store ? instr->src[1] : instr->src[0]);

   if (no_mask_handle)
      *no_mask_handle = index.is_scalar || index.file == IMM;

   /* Reinterpret as unsigned, keeping the source's bit size. */
   const enum brw_reg_type ud_type =
      brw_type_with_size(BRW_TYPE_UD, brw_type_size_bits(index.type));

   return bld.emit_uniformize(retype(index, ud_type));
}
|
|
|
|
|
|
2019-02-28 08:15:30 -06:00
|
|
|
/**
|
|
|
|
|
* The offsets we get from NIR act as if each SIMD channel has it's own blob
|
|
|
|
|
* of contiguous space. However, if we actually place each SIMD channel in
|
|
|
|
|
* it's own space, we end up with terrible cache performance because each SIMD
|
|
|
|
|
* channel accesses a different cache line even when they're all accessing the
|
|
|
|
|
* same byte offset. To deal with this problem, we swizzle the address using
|
|
|
|
|
* a simple algorithm which ensures that any time a SIMD message reads or
|
|
|
|
|
* writes the same address, it's all in the same cache line. We have to keep
|
|
|
|
|
* the bottom two bits fixed so that we can read/write up to a dword at a time
|
|
|
|
|
* and the individual element is contiguous. We do this by splitting the
|
|
|
|
|
* address as follows:
|
|
|
|
|
*
|
|
|
|
|
* 31 4-6 2 0
|
|
|
|
|
* +-------------------------------+------------+----------+
|
|
|
|
|
* | Hi address bits | chan index | addr low |
|
|
|
|
|
* +-------------------------------+------------+----------+
|
|
|
|
|
*
|
|
|
|
|
* In other words, the bottom two address bits stay, and the top 30 get
|
|
|
|
|
* shifted up so that we can stick the SIMD channel index in the middle. This
|
|
|
|
|
* way, we can access 8, 16, or 32-bit elements and, when accessing a 32-bit
|
|
|
|
|
* at the same logical offset, the scratch read/write instruction acts on
|
|
|
|
|
* continuous elements and we get good cache locality.
|
|
|
|
|
*/
|
2024-06-18 23:42:59 -07:00
|
|
|
static brw_reg
|
2023-12-05 15:27:29 -08:00
|
|
|
swizzle_nir_scratch_addr(nir_to_brw_state &ntb,
|
2024-12-29 16:06:27 -08:00
|
|
|
const brw_builder &bld,
|
2024-02-21 03:13:14 -08:00
|
|
|
const nir_src &nir_addr_src,
|
2023-11-20 13:25:36 -08:00
|
|
|
bool in_dwords)
|
2019-02-28 08:15:30 -06:00
|
|
|
{
|
2024-12-07 10:25:45 -08:00
|
|
|
brw_shader &s = ntb.s;
|
2023-11-20 13:25:36 -08:00
|
|
|
|
intel/brw: Use CSE for LOAD_SUBGROUP_INVOCATION
Instead of emitting a single one at the top, and making reference to it,
emit the virtual instruction as needed and let CSE do its job.
Since load_subgroup_invocation now can appear not at the start of the
shader, use UNDEF in all cases to ensure that the liveness of the
destination doesn't extend to the first partial write done here (it was
being used only for SIMD > 8 before).
Note this option was considered in the past
6132992cdb858268af0e985727d80e4140be389c but at the time dismissed. The
difference now is that the lowering of the virtual instruction happens
earlier than the scheduling.
The motivation for this change is to allow passes other than the NIR
conversion to use this value. The alternative of storing a `brw_reg` in
the shader (instead of NIR state) gets complicated by passes like
compact_vgrfs, that move VGRFs around (and update the instructions).
This and maybe other passes would have to care about the brw_reg.
Fossil-db numbers, TGL
```
*** Shaders only in 'after' results are ignored:
steam-native/shadow_of_the_tomb_raider/c683ea5067ee157d/fs.32/0, steam-native/shadow_of_the_tomb_raider/f4df450c3cef40b4/fs.32/0, steam-native/shadow_of_the_tomb_raider/94b708fb8e3d9597/fs.32/0, steam-native/shadow_of_the_tomb_raider/19d44c328edabd30/fs.32/0, steam-native/shadow_of_the_tomb_raider/8a7dcbd5a74a19bf/fs.32/0, and 366 more
from 4 apps: steam-dxvk/alan_wake, steam-dxvk/batman_arkham_city_goty, steam-dxvk/batman_arkham_origins, steam-native/shadow_of_the_tomb_raider
*** Shaders only in 'before' results are ignored:
steam-dxvk/octopath_traveler/aaa3d10acb726906/fs.32/0, steam-dxvk/batman_arkham_origins/e6872ae23569c35f/fs.32/0, steam-dxvk/octopath_traveler/fd33a99fa5c271a8/fs.32/0, steam-dxvk/octopath_traveler/9a077cdc16f24520/fs.32/0, steam-dxvk/batman_arkham_city_goty/fac7b438ad52f622/fs.32/0, and 12 more
from 4 apps: steam-dxvk/batman_arkham_city_goty, steam-dxvk/batman_arkham_origins, steam-dxvk/octopath_traveler, steam-native/shadow_of_the_tomb_raider
Totals:
Instrs: 149752381 -> 149751337 (-0.00%); split: -0.00%, +0.00%
Cycle count: 11553609349 -> 11549970294 (-0.03%); split: -0.06%, +0.03%
Spill count: 42763 -> 42764 (+0.00%); split: -0.01%, +0.01%
Fill count: 75650 -> 75651 (+0.00%); split: -0.00%, +0.01%
Max live registers: 31725096 -> 31671792 (-0.17%)
Max dispatch width: 5546008 -> 5551672 (+0.10%); split: +0.11%, -0.00%
Totals from 52574 (8.34% of 630441) affected shaders:
Instrs: 9535159 -> 9534115 (-0.01%); split: -0.03%, +0.02%
Cycle count: 1006627109 -> 1002988054 (-0.36%); split: -0.65%, +0.29%
Spill count: 11588 -> 11589 (+0.01%); split: -0.03%, +0.03%
Fill count: 21057 -> 21058 (+0.00%); split: -0.01%, +0.02%
Max live registers: 1992493 -> 1939189 (-2.68%)
Max dispatch width: 559696 -> 565360 (+1.01%); split: +1.06%, -0.05%
```
and DG2
```
*** Shaders only in 'after' results are ignored:
steam-native/shadow_of_the_tomb_raider/1f95a9d3db21df85/fs.32/0, steam-native/shadow_of_the_tomb_raider/56b87c4a46613a2a/fs.32/0, steam-native/shadow_of_the_tomb_raider/a74b4137f85dbbd3/fs.32/0, steam-native/shadow_of_the_tomb_raider/e07e38d3f48e8402/fs.32/0, steam-native/shadow_of_the_tomb_raider/206336789c48996c/fs.32/0, and 268 more
from 4 apps: steam-dxvk/alan_wake, steam-dxvk/batman_arkham_city_goty, steam-dxvk/batman_arkham_origins, steam-native/shadow_of_the_tomb_raider
*** Shaders only in 'before' results are ignored:
steam-native/shadow_of_the_tomb_raider/0420d7c3a2ea99ec/fs.32/0, steam-native/shadow_of_the_tomb_raider/2ff39f8bf7d24abb/fs.32/0, steam-native/shadow_of_the_tomb_raider/92d7be2824bd9659/fs.32/0, steam-native/shadow_of_the_tomb_raider/f09ca6d2ecf18015/fs.32/0, steam-native/shadow_of_the_tomb_raider/490f8ffd59e52949/fs.32/0, and 205 more
from 3 apps: steam-dxvk/batman_arkham_city_goty, steam-dxvk/batman_arkham_origins, steam-native/shadow_of_the_tomb_raider
Totals:
Instrs: 151597619 -> 151599914 (+0.00%); split: -0.00%, +0.00%
Subgroup size: 7699776 -> 7699784 (+0.00%)
Cycle count: 12738501989 -> 12739841170 (+0.01%); split: -0.01%, +0.02%
Spill count: 61283 -> 61274 (-0.01%)
Fill count: 119886 -> 119849 (-0.03%)
Max live registers: 31810432 -> 31758920 (-0.16%)
Max dispatch width: 5540128 -> 5541136 (+0.02%); split: +0.08%, -0.06%
Totals from 49286 (7.81% of 631231) affected shaders:
Instrs: 8607753 -> 8610048 (+0.03%); split: -0.01%, +0.04%
Subgroup size: 857752 -> 857760 (+0.00%)
Cycle count: 305939495 -> 307278676 (+0.44%); split: -0.28%, +0.72%
Spill count: 6339 -> 6330 (-0.14%)
Fill count: 12571 -> 12534 (-0.29%)
Max live registers: 1788346 -> 1736834 (-2.88%)
Max dispatch width: 510920 -> 511928 (+0.20%); split: +0.85%, -0.66%
```
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30489>
2024-07-31 22:46:20 -07:00
|
|
|
const brw_reg chan_index = bld.LOAD_SUBGROUP_INVOCATION();
|
2023-12-05 17:16:34 -08:00
|
|
|
const unsigned chan_index_bits = ffs(s.dispatch_width) - 1;
|
2019-02-28 08:15:30 -06:00
|
|
|
|
2024-02-21 03:13:14 -08:00
|
|
|
if (nir_src_is_const(nir_addr_src)) {
|
|
|
|
|
unsigned nir_addr = nir_src_as_uint(nir_addr_src);
|
|
|
|
|
if (in_dwords) {
|
|
|
|
|
/* In this case, we know the address is aligned to a DWORD and we want
|
|
|
|
|
* the final address in DWORDs.
|
|
|
|
|
*/
|
|
|
|
|
return bld.OR(chan_index,
|
|
|
|
|
brw_imm_ud(nir_addr << (chan_index_bits - 2)));
|
|
|
|
|
} else {
|
|
|
|
|
/* This case is substantially more annoying because we have to pay
|
|
|
|
|
* attention to those pesky two bottom bits.
|
|
|
|
|
*/
|
|
|
|
|
unsigned addr_hi = (nir_addr & ~0x3u) << chan_index_bits;
|
|
|
|
|
unsigned addr_lo = (nir_addr & 0x3u);
|
|
|
|
|
|
|
|
|
|
return bld.OR(bld.SHL(chan_index, brw_imm_ud(2)),
|
|
|
|
|
brw_imm_ud(addr_lo | addr_hi));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
const brw_reg nir_addr =
|
2025-01-15 13:27:05 -08:00
|
|
|
retype(get_nir_src(ntb, nir_addr_src, 0), BRW_TYPE_UD);
|
2024-02-21 03:13:14 -08:00
|
|
|
|
2019-02-28 08:15:30 -06:00
|
|
|
if (in_dwords) {
|
|
|
|
|
/* In this case, we know the address is aligned to a DWORD and we want
|
|
|
|
|
* the final address in DWORDs.
|
|
|
|
|
*/
|
2024-02-21 03:13:14 -08:00
|
|
|
return bld.OR(bld.SHL(nir_addr, brw_imm_ud(chan_index_bits - 2)),
|
2024-04-12 17:43:22 -07:00
|
|
|
chan_index);
|
2019-02-28 08:15:30 -06:00
|
|
|
} else {
|
|
|
|
|
/* This case substantially more annoying because we have to pay
|
|
|
|
|
* attention to those pesky two bottom bits.
|
|
|
|
|
*/
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg chan_addr = bld.SHL(chan_index, brw_imm_ud(2));
|
|
|
|
|
brw_reg addr_bits =
|
2024-02-21 03:13:14 -08:00
|
|
|
bld.OR(bld.AND(nir_addr, brw_imm_ud(0x3u)),
|
|
|
|
|
bld.SHL(bld.AND(nir_addr, brw_imm_ud(~0x3u)),
|
2024-04-12 17:43:22 -07:00
|
|
|
brw_imm_ud(chan_index_bits)));
|
|
|
|
|
return bld.OR(addr_bits, chan_addr);
|
|
|
|
|
}
|
2019-02-28 08:15:30 -06:00
|
|
|
}
|
|
|
|
|
|
2020-10-05 14:48:44 -07:00
|
|
|
static unsigned
|
2024-09-09 16:27:29 -07:00
|
|
|
choose_block_size_dwords(const intel_device_info *devinfo, unsigned dwords)
|
2020-10-05 14:48:44 -07:00
|
|
|
{
|
2024-09-09 16:27:29 -07:00
|
|
|
const unsigned min_block = 8;
|
|
|
|
|
const unsigned max_block = devinfo->has_lsc ? 64 : 32;
|
|
|
|
|
|
|
|
|
|
const unsigned block = 1 << util_logbase2(dwords);
|
|
|
|
|
|
|
|
|
|
return CLAMP(block, min_block, max_block);
|
2020-10-05 14:48:44 -07:00
|
|
|
}
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
/* Add the immediate `v` to a 64-bit address.
 *
 * On platforms with native 64-bit integer support this is a single QWORD
 * ADD.  Otherwise the addition is emulated on the two 32-bit halves: the
 * low-dword ADD sets the overflow flag, and a predicated ADD folds the
 * carry into the high dword.  The instruction order here matters — the
 * predicated high ADD consumes the flag written by the low ADD.
 */
static brw_reg
increment_a64_address(const brw_builder &_bld, brw_reg address, uint32_t v, bool use_no_mask)
{
   /* Optionally force a full-width, unmasked computation of the address. */
   const brw_builder bld = use_no_mask ? _bld.exec_all().group(8, 0) : _bld;

   if (bld.shader->devinfo->has_64bit_int) {
      return bld.ADD(address, brw_imm_int(address.type, v));
   } else {
      brw_reg dst = bld.vgrf(BRW_TYPE_UQ);
      brw_reg dst_low = subscript(dst, BRW_TYPE_UD, 0);
      brw_reg dst_high = subscript(dst, BRW_TYPE_UD, 1);
      brw_reg src_low = subscript(address, BRW_TYPE_UD, 0);
      brw_reg src_high = subscript(address, BRW_TYPE_UD, 1);

      /* Add low and if that overflows, add carry to high. */
      bld.ADD(dst_low, src_low, brw_imm_ud(v))->conditional_mod = BRW_CONDITIONAL_O;
      bld.ADD(dst_high, src_high, brw_imm_ud(0x1))->predicate = BRW_PREDICATE_NORMAL;
      /* NOTE(review): only the low-dword view of the UQ result is returned
       * here (the high dword lives in the same VGRF) — presumably callers
       * on !has_64bit_int platforms recover the full address via
       * subscript(); confirm against the call sites.
       */
      return dst_low;
   }
}
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
/* Emit a single memory-fence (or interlock) SEND to the shared function
 * `sfid` with message descriptor `desc`.
 *
 * When `commit_enable` is set the message writes back a register once the
 * fence has committed; that register is returned so the caller can stall
 * on it.  Otherwise the destination is null and the returned reg is the
 * null UD register.
 */
static brw_reg
emit_fence(const brw_builder &bld, enum opcode opcode,
           uint8_t sfid, uint32_t desc,
           bool commit_enable)
{
   const struct intel_device_info *devinfo = bld.shader->devinfo;

   /* Only these two opcodes are fences as far as this helper is concerned. */
   assert(opcode == SHADER_OPCODE_INTERLOCK ||
          opcode == SHADER_OPCODE_MEMORY_FENCE);

   /* Allocate a writeback destination only if the fence commits. */
   brw_reg dst = commit_enable ? bld.vgrf(BRW_TYPE_UD) : bld.null_reg_ud();
   brw_send_inst *fence = bld.emit(opcode, dst, brw_vec8_grf(0, 0),
                                   brw_imm_ud(commit_enable))->as_send();
   fence->sfid = sfid;
   fence->desc = desc;
   /* The commit writeback occupies one (platform-sized) register. */
   fence->size_written = commit_enable ? REG_SIZE * reg_unit(devinfo) : 0;

   return dst;
}
|
|
|
|
|
|
2022-04-05 07:59:51 +03:00
|
|
|
static uint32_t
|
|
|
|
|
lsc_fence_descriptor_for_intrinsic(const struct intel_device_info *devinfo,
|
|
|
|
|
nir_intrinsic_instr *instr)
|
|
|
|
|
{
|
|
|
|
|
assert(devinfo->has_lsc);
|
|
|
|
|
|
|
|
|
|
enum lsc_fence_scope scope = LSC_FENCE_LOCAL;
|
|
|
|
|
enum lsc_flush_type flush_type = LSC_FLUSH_TYPE_NONE;
|
|
|
|
|
|
|
|
|
|
if (nir_intrinsic_has_memory_scope(instr)) {
|
|
|
|
|
switch (nir_intrinsic_memory_scope(instr)) {
|
2023-05-30 12:05:30 -07:00
|
|
|
case SCOPE_DEVICE:
|
|
|
|
|
case SCOPE_QUEUE_FAMILY:
|
2022-04-05 07:59:51 +03:00
|
|
|
scope = LSC_FENCE_TILE;
|
|
|
|
|
flush_type = LSC_FLUSH_TYPE_EVICT;
|
|
|
|
|
break;
|
2023-05-30 12:05:30 -07:00
|
|
|
case SCOPE_WORKGROUP:
|
2022-04-05 07:59:51 +03:00
|
|
|
scope = LSC_FENCE_THREADGROUP;
|
|
|
|
|
break;
|
2023-05-30 12:05:30 -07:00
|
|
|
case SCOPE_SHADER_CALL:
|
|
|
|
|
case SCOPE_INVOCATION:
|
|
|
|
|
case SCOPE_SUBGROUP:
|
|
|
|
|
case SCOPE_NONE:
|
2022-04-05 07:59:51 +03:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
/* No scope defined. */
|
|
|
|
|
scope = LSC_FENCE_TILE;
|
|
|
|
|
flush_type = LSC_FLUSH_TYPE_EVICT;
|
|
|
|
|
}
|
|
|
|
|
return lsc_fence_msg_desc(devinfo, scope, flush_type, true);
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-17 17:17:25 -08:00
|
|
|
/**
 * Create a MOV to read the timestamp register.
 *
 * Returns a fresh UD VGRF holding a copy of the architectural timestamp
 * register (BRW_ARF_TIMESTAMP).
 */
static brw_reg
get_timestamp(const brw_builder &bld)
{
   brw_shader &s = *bld.shader;

   /* Source: the timestamp ARF, viewed as a vec4 of UD. */
   brw_reg ts = brw_reg(retype(brw_vec4_reg(ARF,
                                            BRW_ARF_TIMESTAMP, 0), BRW_TYPE_UD));

   brw_reg dst = retype(brw_allocate_vgrf_units(s, 1), BRW_TYPE_UD);

   /* We want to read the 3 fields we care about even if it's not enabled in
    * the dispatch.  Hence the SIMD4 exec_all() group for the MOV.
    */
   bld.group(4, 0).exec_all().MOV(dst, ts);

   return dst;
}
|
|
|
|
|
|
|
|
|
|
static unsigned
|
|
|
|
|
component_from_intrinsic(nir_intrinsic_instr *instr)
|
|
|
|
|
{
|
|
|
|
|
if (nir_intrinsic_has_component(instr))
|
|
|
|
|
return nir_intrinsic_component(instr);
|
|
|
|
|
else
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void
|
2024-12-29 15:41:04 -08:00
|
|
|
adjust_handle_and_offset(const brw_builder &bld,
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg &urb_handle,
|
2023-11-17 17:17:25 -08:00
|
|
|
unsigned &urb_global_offset)
|
|
|
|
|
{
|
|
|
|
|
/* Make sure that URB global offset is below 2048 (2^11), because
|
|
|
|
|
* that's the maximum possible value encoded in Message Descriptor.
|
|
|
|
|
*/
|
|
|
|
|
unsigned adjustment = (urb_global_offset >> 11) << 11;
|
|
|
|
|
|
|
|
|
|
if (adjustment) {
|
2024-12-29 15:41:04 -08:00
|
|
|
brw_builder ubld8 = bld.group(8, 0).exec_all();
|
2023-11-17 17:17:25 -08:00
|
|
|
/* Allocate new register to not overwrite the shared URB handle. */
|
2024-04-12 17:43:22 -07:00
|
|
|
urb_handle = ubld8.ADD(urb_handle, brw_imm_ud(adjustment));
|
2023-11-17 17:17:25 -08:00
|
|
|
urb_global_offset -= adjustment;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Emit a constant-offset URB write, one SIMD8 message per quarter of the
 * dispatch.
 *
 * The payload is built as up to 8 dwords: `dst_comp_offset` undef slots to
 * align the data within the vec4, followed by `comps` source components.
 * `mask` is the (already shifted) channel-enable mask and `urb_global_offset`
 * must already be < 2048 (see adjust_handle_and_offset()).
 */
static void
emit_urb_direct_vec4_write(const brw_builder &bld,
                           unsigned urb_global_offset,
                           const brw_reg &src,
                           brw_reg urb_handle,
                           unsigned dst_comp_offset,
                           unsigned comps,
                           unsigned mask)
{
   for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
      brw_builder bld8 = bld.group(8, q);

      brw_reg payload_srcs[8];
      unsigned length = 0;

      /* Pad with undef so the real data lands at the right vec4 component;
       * the channel mask keeps the padding from being written.
       */
      for (unsigned i = 0; i < dst_comp_offset; i++)
         payload_srcs[length++] = reg_undef;

      for (unsigned c = 0; c < comps; c++)
         payload_srcs[length++] = quarter(offset(src, bld, c), q);

      brw_reg srcs[URB_LOGICAL_NUM_SRCS];
      srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
      srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(mask);
      srcs[URB_LOGICAL_SRC_DATA] =
         retype(brw_allocate_vgrf_units(*bld.shader, length), BRW_TYPE_F);
      bld8.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0);

      brw_urb_inst *urb = bld8.URB_WRITE(srcs, ARRAY_SIZE(srcs));
      urb->offset = urb_global_offset;
      urb->components = length;
      /* 11-bit field in the message descriptor. */
      assert(urb->offset < 2048);
   }
}
|
|
|
|
|
|
|
|
|
|
/* Emit URB writes for a store intrinsic whose IO offset is constant
 * (pre-Xe2 path).  Computes the vec4-aligned global offset and channel
 * mask, then defers to emit_urb_direct_vec4_write().
 */
static void
emit_urb_direct_writes(const brw_builder &bld, nir_intrinsic_instr *instr,
                       const brw_reg &src, brw_reg urb_handle)
{
   assert(nir_src_bit_size(instr->src[0]) == 32);

   nir_src *offset_nir_src = nir_get_io_offset_src(instr);
   assert(nir_src_is_const(*offset_nir_src));

   const unsigned comps = nir_src_num_components(instr->src[0]);
   assert(comps <= 4);

   /* Total constant offset in dwords: base + IO offset + COMPONENT. */
   const unsigned offset_in_dwords = nir_intrinsic_base(instr) +
                                     nir_src_as_uint(*offset_nir_src) +
                                     component_from_intrinsic(instr);

   /* URB writes are vec4 aligned but the intrinsic offsets are in dwords.
    * We can write up to 8 dwords, so single vec4 write is enough.
    */
   const unsigned comp_shift = offset_in_dwords % 4;
   const unsigned mask = nir_intrinsic_write_mask(instr) << comp_shift;

   unsigned urb_global_offset = offset_in_dwords / 4;
   /* Fold any offset >= 2048 into the handle (descriptor field is 11 bits). */
   adjust_handle_and_offset(bld, urb_handle, urb_global_offset);

   emit_urb_direct_vec4_write(bld, urb_global_offset, src, urb_handle,
                              comp_shift, comps, mask);
}
|
|
|
|
|
|
|
|
|
|
/* Xe2+ constant-offset URB write.  The offset (in bytes) is folded directly
 * into the handle, and one message is emitted per `write_size`-wide group
 * of the dispatch.
 */
static void
emit_urb_direct_vec4_write_xe2(const brw_builder &bld,
                               unsigned offset_in_bytes,
                               const brw_reg &src,
                               brw_reg urb_handle,
                               unsigned comps,
                               unsigned mask)
{
   const struct intel_device_info *devinfo = bld.shader->devinfo;
   const unsigned runit = reg_unit(devinfo);
   /* Message execution width: 8 channels per register unit. */
   const unsigned write_size = 8 * runit;

   if (offset_in_bytes > 0) {
      /* Xe2 has no separate global-offset field here; bake the byte offset
       * into the handle (fresh register, the shared handle is preserved).
       */
      brw_builder bldall = bld.group(write_size, 0).exec_all();
      urb_handle = bldall.ADD(urb_handle, brw_imm_ud(offset_in_bytes));
   }

   for (unsigned q = 0; q < bld.dispatch_width() / write_size; q++) {
      brw_builder hbld = bld.group(write_size, q);

      assert(comps <= 4);
      brw_reg payload_srcs[4];

      for (unsigned c = 0; c < comps; c++)
         payload_srcs[c] = horiz_offset(offset(src, bld, c), write_size * q);

      brw_reg srcs[URB_LOGICAL_NUM_SRCS];
      srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
      srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(mask);
      srcs[URB_LOGICAL_SRC_DATA] =
         retype(brw_allocate_vgrf_units(*bld.shader, comps * runit), BRW_TYPE_F);
      hbld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, comps, 0);

      brw_urb_inst *urb = hbld.URB_WRITE(srcs, ARRAY_SIZE(srcs));
      urb->components = comps;
   }
}
|
|
|
|
|
|
|
|
|
|
/* Xe2+ counterpart of emit_urb_direct_writes(): constant-offset store,
 * converted to a byte offset and handed to emit_urb_direct_vec4_write_xe2().
 */
static void
emit_urb_direct_writes_xe2(const brw_builder &bld, nir_intrinsic_instr *instr,
                           const brw_reg &src, brw_reg urb_handle)
{
   assert(nir_src_bit_size(instr->src[0]) == 32);

   nir_src *offset_nir_src = nir_get_io_offset_src(instr);
   assert(nir_src_is_const(*offset_nir_src));

   const unsigned comps = nir_src_num_components(instr->src[0]);
   assert(comps <= 4);

   /* Total constant offset in dwords: base + IO offset + COMPONENT. */
   const unsigned offset_in_dwords = nir_intrinsic_base(instr) +
                                     nir_src_as_uint(*offset_nir_src) +
                                     component_from_intrinsic(instr);

   const unsigned mask = nir_intrinsic_write_mask(instr);

   /* Xe2 addressing is in bytes, hence the * 4. */
   emit_urb_direct_vec4_write_xe2(bld, offset_in_dwords * 4, src,
                                  urb_handle, comps, mask);
}
|
|
|
|
|
|
|
|
|
|
/* Emit a per-slot-offset URB write for an indirect store where all lanes
 * share the same dword-within-vec4 alignment (`dst_comp_offset`).  One
 * SIMD8 message is emitted per quarter of the dispatch.
 */
static void
emit_urb_indirect_vec4_write(const brw_builder &bld,
                             const brw_reg &offset_src,
                             unsigned base,
                             const brw_reg &src,
                             brw_reg urb_handle,
                             unsigned dst_comp_offset,
                             unsigned comps,
                             unsigned mask)
{
   for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
      brw_builder bld8 = bld.group(8, q);

      /* offset is always positive, so signedness doesn't matter */
      assert(offset_src.type == BRW_TYPE_D || offset_src.type == BRW_TYPE_UD);
      /* Per-lane vec4 slot: (offset + base) dwords, divided by 4. */
      brw_reg qtr = bld8.MOV(quarter(retype(offset_src, BRW_TYPE_UD), q));
      brw_reg off = bld8.SHR(bld8.ADD(qtr, brw_imm_ud(base)), brw_imm_ud(2));

      brw_reg payload_srcs[8];
      unsigned length = 0;

      /* Undef padding aligns the data inside the vec4; the channel mask
       * keeps it from being written.
       */
      for (unsigned i = 0; i < dst_comp_offset; i++)
         payload_srcs[length++] = reg_undef;

      for (unsigned c = 0; c < comps; c++)
         payload_srcs[length++] = quarter(offset(src, bld, c), q);

      brw_reg srcs[URB_LOGICAL_NUM_SRCS];
      srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
      srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = off;
      srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(mask);
      srcs[URB_LOGICAL_SRC_DATA] =
         retype(brw_allocate_vgrf_units(*bld.shader, length), BRW_TYPE_F);
      bld8.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0);

      brw_urb_inst *urb = bld8.URB_WRITE(srcs, ARRAY_SIZE(srcs));
      urb->components = length;
   }
}
|
|
|
|
|
|
|
|
|
|
/* Emit an indirect URB store when the dword-within-vec4 alignment of the
 * destination is statically known to be `mod`.  Shifts the write mask by
 * that amount and defers to emit_urb_indirect_vec4_write().
 */
static void
emit_urb_indirect_writes_mod(const brw_builder &bld, nir_intrinsic_instr *instr,
                             const brw_reg &src, const brw_reg &offset_src,
                             brw_reg urb_handle, unsigned mod)
{
   assert(nir_src_bit_size(instr->src[0]) == 32);

   const unsigned comps = nir_src_num_components(instr->src[0]);
   assert(comps <= 4);

   /* Constant part of the dword offset: base + COMPONENT. */
   const unsigned base_in_dwords = nir_intrinsic_base(instr) +
                                   component_from_intrinsic(instr);

   const unsigned comp_shift = mod;
   const unsigned mask = nir_intrinsic_write_mask(instr) << comp_shift;

   emit_urb_indirect_vec4_write(bld, offset_src, base_in_dwords, src,
                                urb_handle, comp_shift, comps, mask);
}
|
|
|
|
|
|
|
|
|
|
/* Xe2+ indirect URB store.  The constant part of the offset is folded into
 * the handle, and the per-lane dword offset is scaled to bytes and added to
 * the handle to form a per-lane address.
 */
static void
emit_urb_indirect_writes_xe2(const brw_builder &bld, nir_intrinsic_instr *instr,
                             const brw_reg &src, const brw_reg &offset_src,
                             brw_reg urb_handle)
{
   assert(nir_src_bit_size(instr->src[0]) == 32);

   const struct intel_device_info *devinfo = bld.shader->devinfo;
   const unsigned runit = reg_unit(devinfo);
   /* Message execution width: 8 channels per register unit. */
   const unsigned write_size = 8 * runit;

   const unsigned comps = nir_src_num_components(instr->src[0]);
   assert(comps <= 4);

   /* Constant part of the dword offset: base + COMPONENT. */
   const unsigned base_in_dwords = nir_intrinsic_base(instr) +
                                   component_from_intrinsic(instr);

   if (base_in_dwords > 0) {
      /* Bake the constant byte offset into the handle (fresh register). */
      brw_builder bldall = bld.group(write_size, 0).exec_all();
      urb_handle = bldall.ADD(urb_handle, brw_imm_ud(base_in_dwords * 4));
   }

   const unsigned mask = nir_intrinsic_write_mask(instr);

   for (unsigned q = 0; q < bld.dispatch_width() / write_size; q++) {
      brw_builder wbld = bld.group(write_size, q);

      brw_reg payload_srcs[4];

      for (unsigned c = 0; c < comps; c++)
         payload_srcs[c] = horiz_offset(offset(src, bld, c), write_size * q);

      /* Per-lane address = handle + (dword offset << 2). */
      brw_reg addr =
         wbld.ADD(wbld.SHL(retype(horiz_offset(offset_src, write_size * q),
                                  BRW_TYPE_UD),
                           brw_imm_ud(2)), urb_handle);

      brw_reg srcs[URB_LOGICAL_NUM_SRCS];
      srcs[URB_LOGICAL_SRC_HANDLE] = addr;
      srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(mask);
      srcs[URB_LOGICAL_SRC_DATA] =
         retype(brw_allocate_vgrf_units(*bld.shader, comps * runit), BRW_TYPE_F);
      wbld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, comps, 0);

      brw_urb_inst *urb = wbld.URB_WRITE(srcs, ARRAY_SIZE(srcs));
      urb->components = comps;
   }
}
|
|
|
|
|
|
|
|
|
|
/* Fully general pre-Xe2 indirect URB store: the dword-within-vec4 alignment
 * varies per lane, so each enabled component gets its own per-slot-offset
 * write with a per-lane computed channel mask.
 */
static void
emit_urb_indirect_writes(const brw_builder &bld, nir_intrinsic_instr *instr,
                         const brw_reg &src, const brw_reg &offset_src,
                         brw_reg urb_handle)
{
   assert(nir_src_bit_size(instr->src[0]) == 32);

   const unsigned comps = nir_src_num_components(instr->src[0]);
   assert(comps <= 4);

   /* Constant part of the dword offset: base + COMPONENT. */
   const unsigned base_in_dwords = nir_intrinsic_base(instr) +
                                   component_from_intrinsic(instr);

   /* Use URB write message that allows different offsets per-slot. The
    * offset is in units of vec4s (128 bits), so we use a write for each
    * component, replicating it in the sources and applying the appropriate
    * mask based on the dword offset.
    */

   for (unsigned c = 0; c < comps; c++) {
      /* Skip components masked off by the intrinsic. */
      if (((1 << c) & nir_intrinsic_write_mask(instr)) == 0)
         continue;

      brw_reg src_comp = offset(src, bld, c);

      for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
         brw_builder bld8 = bld.group(8, q);

         /* offset is always positive, so signedness doesn't matter */
         assert(offset_src.type == BRW_TYPE_D ||
                offset_src.type == BRW_TYPE_UD);

         /* Per-lane dword offset of this component. */
         brw_reg off =
            bld8.ADD(quarter(retype(offset_src, BRW_TYPE_UD), q),
                     brw_imm_ud(c + base_in_dwords));
         /* Per-lane channel mask: 1 << (offset % 4). */
         brw_reg m = bld8.AND(off, brw_imm_ud(0x3));
         brw_reg mask = bld8.SHL(bld8.MOV(brw_imm_ud(1)), m);
         /* Per-lane vec4 slot: offset / 4. */
         brw_reg final_offset = bld8.SHR(off, brw_imm_ud(2));

         brw_reg payload_srcs[4];
         unsigned length = 0;

         /* Replicate the component into all four payload slots; the
          * per-lane mask selects the right one.
          */
         for (unsigned j = 0; j < 4; j++)
            payload_srcs[length++] = quarter(src_comp, q);

         brw_reg srcs[URB_LOGICAL_NUM_SRCS];
         srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
         srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = final_offset;
         srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = mask;
         srcs[URB_LOGICAL_SRC_DATA] =
            retype(brw_allocate_vgrf_units(*bld.shader, length), BRW_TYPE_F);
         bld8.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0);

         brw_urb_inst *urb = bld8.URB_WRITE(srcs, ARRAY_SIZE(srcs));
         urb->components = length;
      }
   }
}
|
|
|
|
|
|
|
|
|
|
/* Pre-Xe2 constant-offset URB read.  A single uniform (exec_all SIMD8)
 * URB_READ pulls `comp_offset + comps` registers, and each requested
 * component is then broadcast (stride 0) into the per-lane destination.
 */
static void
emit_urb_direct_reads(const brw_builder &bld, nir_intrinsic_instr *instr,
                      const brw_reg &dest, brw_reg urb_handle)
{
   assert(instr->def.bit_size == 32);

   unsigned comps = instr->def.num_components;
   if (comps == 0)
      return;

   nir_src *offset_nir_src = nir_get_io_offset_src(instr);
   assert(nir_src_is_const(*offset_nir_src));

   /* Total constant offset in dwords: base + IO offset + COMPONENT. */
   const unsigned offset_in_dwords = nir_intrinsic_base(instr) +
                                     nir_src_as_uint(*offset_nir_src) +
                                     component_from_intrinsic(instr);

   unsigned urb_global_offset = offset_in_dwords / 4;
   /* Fold any offset >= 2048 into the handle (descriptor field is 11 bits). */
   adjust_handle_and_offset(bld, urb_handle, urb_global_offset);

   /* Dword position inside the first vec4, and registers to read. */
   const unsigned comp_offset = offset_in_dwords % 4;
   const unsigned num_regs = comp_offset + comps;

   brw_builder ubld8 = bld.group(8, 0).exec_all();
   brw_reg data = ubld8.vgrf(BRW_TYPE_UD, num_regs);
   brw_reg srcs[URB_LOGICAL_NUM_SRCS];
   srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;

   brw_urb_inst *urb = ubld8.URB_READ(data, srcs, ARRAY_SIZE(srcs));
   urb->offset = urb_global_offset;
   assert(urb->offset < 2048);
   urb->size_written = num_regs * REG_SIZE;

   /* Broadcast each read component to all lanes of the destination. */
   for (unsigned c = 0; c < comps; c++) {
      brw_reg dest_comp = offset(dest, bld, c);
      brw_reg data_comp = horiz_stride(offset(data, ubld8, comp_offset + c), 0);
      bld.MOV(retype(dest_comp, BRW_TYPE_UD), data_comp);
   }
}
|
|
|
|
|
|
|
|
|
|
/* Xe2+ constant-offset URB read.  The byte offset is folded into the
 * handle, a uniform SIMD16 URB_READ pulls the data, and each component is
 * broadcast (stride 0) into the per-lane destination.
 */
static void
emit_urb_direct_reads_xe2(const brw_builder &bld, nir_intrinsic_instr *instr,
                          const brw_reg &dest, brw_reg urb_handle)
{
   assert(instr->def.bit_size == 32);

   unsigned comps = instr->def.num_components;
   if (comps == 0)
      return;

   nir_src *offset_nir_src = nir_get_io_offset_src(instr);
   assert(nir_src_is_const(*offset_nir_src));

   brw_builder ubld16 = bld.group(16, 0).exec_all();

   /* Total constant offset in dwords: base + IO offset + COMPONENT. */
   const unsigned offset_in_dwords = nir_intrinsic_base(instr) +
                                     nir_src_as_uint(*offset_nir_src) +
                                     component_from_intrinsic(instr);

   /* Xe2 addresses in bytes; bake the offset into the handle. */
   if (offset_in_dwords > 0)
      urb_handle = ubld16.ADD(urb_handle, brw_imm_ud(offset_in_dwords * 4));

   brw_reg data = ubld16.vgrf(BRW_TYPE_UD, comps);
   brw_reg srcs[URB_LOGICAL_NUM_SRCS];
   srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;

   brw_inst *inst = ubld16.URB_READ(data, srcs, ARRAY_SIZE(srcs));
   /* SIMD16 UD payload: two registers per component. */
   inst->size_written = 2 * comps * REG_SIZE;

   /* Broadcast each read component to all lanes of the destination. */
   for (unsigned c = 0; c < comps; c++) {
      brw_reg dest_comp = offset(dest, bld, c);
      brw_reg data_comp = horiz_stride(offset(data, ubld16, c), 0);
      bld.MOV(retype(dest_comp, BRW_TYPE_UD), data_comp);
   }
}
|
|
|
|
|
|
|
|
|
|
static void
emit_urb_indirect_reads(const brw_builder &bld, nir_intrinsic_instr *instr,
                        const brw_reg &dest, const brw_reg &offset_src, brw_reg urb_handle)
{
   /* Pre-Xe2 path for URB reads whose offset is not known at compile time.
    * Each URB read returns a whole 4-dword-wide vec4 slot per channel; a
    * MOV_INDIRECT then plucks out the single dword each channel wants.
    * Works one component and one SIMD8 quarter at a time.
    */
   assert(instr->def.bit_size == 32);

   unsigned comps = instr->def.num_components;
   if (comps == 0)
      return;

   /* Build a per-lane byte sequence {0, 4, 8, ... 28}: lane index (from the
    * 0x76543210 vector immediate) shifted left by 2 (dword -> byte).
    * Used below to address each lane's own slice of the returned data.
    */
   brw_reg seq_ud;
   {
      brw_builder ubld8 = bld.group(8, 0).exec_all();
      seq_ud = ubld8.vgrf(BRW_TYPE_UD, 1);
      brw_reg seq_uw = ubld8.vgrf(BRW_TYPE_UW, 1);
      ubld8.MOV(seq_uw, brw_reg(brw_imm_v(0x76543210)));
      ubld8.MOV(seq_ud, seq_uw);
      seq_ud = ubld8.SHL(seq_ud, brw_imm_ud(2));
   }

   const unsigned base_in_dwords = nir_intrinsic_base(instr) +
                                   component_from_intrinsic(instr);

   for (unsigned c = 0; c < comps; c++) {
      for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
         brw_builder bld8 = bld.group(8, q);

         /* offset is always positive, so signedness doesn't matter */
         assert(offset_src.type == BRW_TYPE_D ||
                offset_src.type == BRW_TYPE_UD);
         /* Per-channel dword offset of the value being read. */
         brw_reg off =
            bld8.ADD(bld8.MOV(quarter(retype(offset_src, BRW_TYPE_UD), q)),
                     brw_imm_ud(base_in_dwords + c));

         STATIC_ASSERT(IS_POT(REG_SIZE) && REG_SIZE > 1);

         /* Indirect byte address into the 4-GRF read result:
          * (dword-within-slot) * REG_SIZE + per-lane byte sequence.
          */
         brw_reg comp;
         comp = bld8.AND(off, brw_imm_ud(0x3));
         comp = bld8.SHL(comp, brw_imm_ud(ffs(REG_SIZE) - 1));
         comp = bld8.ADD(comp, seq_ud);

         /* Convert the dword offset into a vec4 slot offset for the read. */
         off = bld8.SHR(off, brw_imm_ud(2));

         brw_reg srcs[URB_LOGICAL_NUM_SRCS];
         srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
         srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = off;

         brw_reg data = bld8.vgrf(BRW_TYPE_UD, 4);

         /* Reads a full vec4 slot (4 GRFs of data) for this quarter. */
         brw_urb_inst *urb = bld8.URB_READ(data, srcs, ARRAY_SIZE(srcs));
         urb->size_written = 4 * REG_SIZE;

         /* Select each lane's dword from the 4-GRF result. */
         brw_reg dest_comp = offset(dest, bld, c);
         bld8.emit(SHADER_OPCODE_MOV_INDIRECT,
                   retype(quarter(dest_comp, q), BRW_TYPE_UD),
                   data,
                   comp,
                   brw_imm_ud(4 * REG_SIZE));
      }
   }
}
|
|
|
|
|
|
|
|
|
|
static void
emit_urb_indirect_reads_xe2(const brw_builder &bld, nir_intrinsic_instr *instr,
                            const brw_reg &dest, const brw_reg &offset_src,
                            brw_reg urb_handle)
{
   /* Xe2+ path for URB reads with a runtime offset.  The per-channel dword
    * offset is scaled to bytes and added to the URB handle itself, so each
    * SIMD16 group issues one URB read with per-channel addresses.
    */
   assert(instr->def.bit_size == 32);

   unsigned comps = instr->def.num_components;
   if (comps == 0)
      return;

   brw_builder ubld16 = bld.group(16, 0).exec_all();

   /* Compile-time part of the offset: intrinsic base + first component. */
   const unsigned offset_in_dwords = nir_intrinsic_base(instr) +
                                     component_from_intrinsic(instr);

   /* Fold the constant byte offset into the handle up front. */
   if (offset_in_dwords > 0)
      urb_handle = ubld16.ADD(urb_handle, brw_imm_ud(offset_in_dwords * 4));

   brw_reg data = ubld16.vgrf(BRW_TYPE_UD, comps);

   /* Process the dispatch in SIMD16 groups. */
   for (unsigned q = 0; q < bld.dispatch_width() / 16; q++) {
      brw_builder wbld = bld.group(16, q);

      /* Per-channel dword offset scaled to bytes (<< 2). */
      brw_reg addr = wbld.SHL(retype(horiz_offset(offset_src, 16 * q),
                                     BRW_TYPE_UD),
                              brw_imm_ud(2));

      brw_reg srcs[URB_LOGICAL_NUM_SRCS];
      /* Per-channel address = runtime byte offset + handle. */
      srcs[URB_LOGICAL_SRC_HANDLE] = wbld.ADD(addr, urb_handle);

      brw_inst *inst = wbld.URB_READ(data, srcs, ARRAY_SIZE(srcs));
      /* SIMD16 dword payload: 2 * REG_SIZE bytes per component. */
      inst->size_written = 2 * comps * REG_SIZE;

      /* Copy each returned component into this group's destination slice. */
      for (unsigned c = 0; c < comps; c++) {
         brw_reg dest_comp = horiz_offset(offset(dest, bld, c), 16 * q);
         brw_reg data_comp = offset(data, wbld, c);
         wbld.MOV(retype(dest_comp, BRW_TYPE_UD), data_comp);
      }
   }
}
|
|
|
|
|
|
2023-11-20 13:25:36 -08:00
|
|
|
static void
emit_task_mesh_store(nir_to_brw_state &ntb,
                     const brw_builder &bld, nir_intrinsic_instr *instr,
                     const brw_reg &urb_handle)
{
   /* Lower a task/mesh output (or task payload) store into URB writes,
    * choosing the cheapest strategy available:
    *   - constant offset  -> direct writes (Xe2 or legacy variant),
    *   - runtime offset   -> Xe2 indirect writes, or on older hardware
    *     either the "mod" fast path (when (offset + base) % 4 is provably
    *     constant, allowing a single URB write) or the general fallback.
    */
   brw_reg src = get_nir_src(ntb, instr->src[0], -1);
   nir_src *offset_nir_src = nir_get_io_offset_src(instr);

   if (nir_src_is_const(*offset_nir_src)) {
      if (bld.shader->devinfo->ver >= 20)
         emit_urb_direct_writes_xe2(bld, instr, src, urb_handle);
      else
         emit_urb_direct_writes(bld, instr, src, urb_handle);
   } else {
      if (bld.shader->devinfo->ver >= 20) {
         emit_urb_indirect_writes_xe2(bld, instr, src,
                                      get_nir_src(ntb, *offset_nir_src, 0),
                                      urb_handle);
         return;
      }
      bool use_mod = false;
      unsigned mod;

      /* Try to calculate the value of (offset + base) % 4. If we can do
       * this, then we can do indirect writes using only 1 URB write.
       */
      use_mod = nir_mod_analysis(nir_get_scalar(offset_nir_src->ssa, 0), nir_type_uint, 4, &mod);
      if (use_mod) {
         /* Fold the constant parts into the residue, keeping it in [0, 4). */
         mod += nir_intrinsic_base(instr) + component_from_intrinsic(instr);
         mod %= 4;
      }

      if (use_mod) {
         emit_urb_indirect_writes_mod(bld, instr, src,
                                      get_nir_src(ntb, *offset_nir_src, 0),
                                      urb_handle, mod);
      } else {
         emit_urb_indirect_writes(bld, instr, src,
                                  get_nir_src(ntb, *offset_nir_src, 0),
                                  urb_handle);
      }
   }
}
|
|
|
|
|
|
2023-11-20 13:25:36 -08:00
|
|
|
static void
emit_task_mesh_load(nir_to_brw_state &ntb,
                    const brw_builder &bld, nir_intrinsic_instr *instr,
                    const brw_reg &urb_handle)
{
   /* Lower a task/mesh input (or task payload) load into URB reads,
    * dispatching on whether the IO offset is a compile-time constant and on
    * the hardware generation (Xe2+ has a distinct, cheaper path).
    */
   brw_reg dest = get_nir_def(ntb, instr->def);
   nir_src *offset_nir_src = nir_get_io_offset_src(instr);

   /* TODO(mesh): for per_vertex and per_primitive, if we could keep around
    * the non-array-index offset, we could use to decide if we can perform
    * a single large aligned read instead one per component.
    */

   if (nir_src_is_const(*offset_nir_src)) {
      if (bld.shader->devinfo->ver >= 20)
         emit_urb_direct_reads_xe2(bld, instr, dest, urb_handle);
      else
         emit_urb_direct_reads(bld, instr, dest, urb_handle);
   } else {
      if (bld.shader->devinfo->ver >= 20)
         emit_urb_indirect_reads_xe2(bld, instr, dest,
                                     get_nir_src(ntb, *offset_nir_src, 0),
                                     urb_handle);
      else
         emit_urb_indirect_reads(bld, instr, dest,
                                 get_nir_src(ntb, *offset_nir_src, 0),
                                 urb_handle);
   }
}
|
|
|
|
|
|
2023-11-20 12:13:47 -08:00
|
|
|
static void
brw_from_nir_emit_task_mesh_intrinsic(nir_to_brw_state &ntb, const brw_builder &bld,
                                      nir_intrinsic_instr *instr)
{
   /* Handle the intrinsics common to both the TASK and MESH stages; anything
    * not handled here falls through to the compute-shader emitter, since
    * task/mesh threads are dispatched much like compute threads.
    */
   brw_shader &s = ntb.s;

   assert(s.stage == MESA_SHADER_MESH || s.stage == MESA_SHADER_TASK);
   const brw_task_mesh_thread_payload &payload = s.task_mesh_payload();

   brw_reg dest;
   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
      dest = get_nir_def(ntb, instr->def);

   switch (instr->intrinsic) {
   case nir_intrinsic_load_draw_id:
      /* Draw ID arrives as extended parameter 0 of the thread payload. */
      dest = retype(dest, BRW_TYPE_UD);
      bld.MOV(dest, payload.extended_parameter_0);
      break;

   case nir_intrinsic_load_local_invocation_id:
      UNREACHABLE("local invocation id should have been lowered earlier");
      break;

   case nir_intrinsic_load_local_invocation_index:
      dest = retype(dest, BRW_TYPE_UD);
      bld.MOV(dest, payload.local_index);
      break;

   case nir_intrinsic_load_num_workgroups:
      /* Workgroup counts are packed as 16-bit halves in g0 (see offsets). */
      dest = retype(dest, BRW_TYPE_UD);
      bld.MOV(offset(dest, bld, 0), brw_uw1_grf(0, 13)); /* g0.6 >> 16 */
      bld.MOV(offset(dest, bld, 1), brw_uw1_grf(0, 8));  /* g0.4 & 0xffff */
      bld.MOV(offset(dest, bld, 2), brw_uw1_grf(0, 9));  /* g0.4 >> 16 */
      break;

   case nir_intrinsic_load_workgroup_index:
      /* Linear workgroup index lives in g0.1. */
      dest = retype(dest, BRW_TYPE_UD);
      bld.MOV(dest, retype(brw_vec1_grf(0, 1), BRW_TYPE_UD));
      break;

   default:
      /* Everything else behaves as in compute shaders. */
      brw_from_nir_emit_cs_intrinsic(ntb, instr);
      break;
   }
}
|
|
|
|
|
|
2023-11-20 12:13:47 -08:00
|
|
|
static void
brw_from_nir_emit_task_intrinsic(nir_to_brw_state &ntb,
                                 nir_intrinsic_instr *instr)
{
   /* TASK-stage intrinsic dispatch: output and task-payload accesses go
    * through the URB (via the stage's output handle); everything else is
    * shared with the common task/mesh emitter.
    */
   const brw_builder &bld = ntb.bld;
   brw_shader &s = ntb.s;

   assert(s.stage == MESA_SHADER_TASK);
   const brw_task_mesh_thread_payload &payload = s.task_mesh_payload();

   switch (instr->intrinsic) {
   case nir_intrinsic_store_output:
   case nir_intrinsic_store_task_payload:
      emit_task_mesh_store(ntb, bld, instr, payload.urb_output);
      break;

   case nir_intrinsic_load_output:
   case nir_intrinsic_load_task_payload:
      /* In the task stage, the payload is read back through the same URB
       * output handle it is written with.
       */
      emit_task_mesh_load(ntb, bld, instr, payload.urb_output);
      break;

   default:
      brw_from_nir_emit_task_mesh_intrinsic(ntb, bld, instr);
      break;
   }
}
|
|
|
|
|
|
2023-11-20 12:13:47 -08:00
|
|
|
static void
brw_from_nir_emit_mesh_intrinsic(nir_to_brw_state &ntb,
                                 nir_intrinsic_instr *instr)
{
   /* MESH-stage intrinsic dispatch: per-vertex/per-primitive/plain outputs
    * use the stage's URB output handle, while the task payload (written by
    * the preceding task stage) is read through a separate input handle.
    */
   const brw_builder &bld = ntb.bld;
   brw_shader &s = ntb.s;

   assert(s.stage == MESA_SHADER_MESH);
   const brw_task_mesh_thread_payload &payload = s.task_mesh_payload();

   switch (instr->intrinsic) {
   case nir_intrinsic_store_per_primitive_output:
   case nir_intrinsic_store_per_vertex_output:
   case nir_intrinsic_store_output:
      emit_task_mesh_store(ntb, bld, instr, payload.urb_output);
      break;

   case nir_intrinsic_load_per_vertex_output:
   case nir_intrinsic_load_per_primitive_output:
   case nir_intrinsic_load_output:
      emit_task_mesh_load(ntb, bld, instr, payload.urb_output);
      break;

   case nir_intrinsic_load_task_payload:
      /* Incoming task->mesh data uses the task URB input handle. */
      emit_task_mesh_load(ntb, bld, instr, payload.task_urb_input);
      break;

   default:
      brw_from_nir_emit_task_mesh_intrinsic(ntb, bld, instr);
      break;
   }
}
|
|
|
|
|
|
2023-11-20 12:13:47 -08:00
|
|
|
static void
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb,
|
2024-12-29 15:41:04 -08:00
|
|
|
const brw_builder &bld, nir_intrinsic_instr *instr)
|
2015-11-04 23:05:07 -08:00
|
|
|
{
|
2023-12-05 15:27:29 -08:00
|
|
|
const intel_device_info *devinfo = ntb.devinfo;
|
2024-12-07 10:25:45 -08:00
|
|
|
brw_shader &s = ntb.s;
|
2023-11-20 12:13:47 -08:00
|
|
|
|
2023-05-18 14:14:04 -05:00
|
|
|
/* We handle this as a special case */
|
|
|
|
|
if (instr->intrinsic == nir_intrinsic_decl_reg) {
|
|
|
|
|
assert(nir_intrinsic_num_array_elems(instr) == 0);
|
|
|
|
|
unsigned bit_size = nir_intrinsic_bit_size(instr);
|
|
|
|
|
unsigned num_components = nir_intrinsic_num_components(instr);
|
|
|
|
|
const brw_reg_type reg_type =
|
2024-04-21 00:33:52 -07:00
|
|
|
brw_type_with_size(bit_size == 8 ? BRW_TYPE_D : BRW_TYPE_F,
|
|
|
|
|
bit_size);
|
2023-05-18 14:14:04 -05:00
|
|
|
|
|
|
|
|
/* Re-use the destination's slot in the table for the register */
|
2023-12-05 15:27:29 -08:00
|
|
|
ntb.ssa_values[instr->def.index] =
|
2023-05-18 14:14:04 -05:00
|
|
|
bld.vgrf(reg_type, num_components);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg dest;
|
2015-11-04 23:05:07 -08:00
|
|
|
if (nir_intrinsic_infos[instr->intrinsic].has_dest)
|
2023-11-20 21:21:54 -08:00
|
|
|
dest = get_nir_def(ntb, instr->def);
|
2015-11-04 23:05:07 -08:00
|
|
|
|
2024-12-29 15:41:04 -08:00
|
|
|
const brw_builder xbld = dest.is_scalar ? bld.scalar_group() : bld;
|
2024-02-01 15:02:37 -08:00
|
|
|
|
2015-11-04 23:05:07 -08:00
|
|
|
switch (instr->intrinsic) {
|
2024-07-06 12:11:33 -07:00
|
|
|
case nir_intrinsic_resource_intel: {
|
2023-12-05 15:27:29 -08:00
|
|
|
ntb.ssa_bind_infos[instr->def.index].valid = true;
|
|
|
|
|
ntb.ssa_bind_infos[instr->def.index].bindless =
|
2023-01-13 12:26:01 +02:00
|
|
|
(nir_intrinsic_resource_access_intel(instr) &
|
|
|
|
|
nir_resource_intel_bindless) != 0;
|
2023-12-05 15:27:29 -08:00
|
|
|
ntb.ssa_bind_infos[instr->def.index].block =
|
2023-01-13 12:26:01 +02:00
|
|
|
nir_intrinsic_resource_block_intel(instr);
|
2023-12-05 15:27:29 -08:00
|
|
|
ntb.ssa_bind_infos[instr->def.index].set =
|
2023-01-13 12:26:01 +02:00
|
|
|
nir_intrinsic_desc_set(instr);
|
2023-12-05 15:27:29 -08:00
|
|
|
ntb.ssa_bind_infos[instr->def.index].binding =
|
2023-01-13 12:26:01 +02:00
|
|
|
nir_intrinsic_binding(instr);
|
2023-02-09 15:07:36 +02:00
|
|
|
|
2024-12-12 16:41:51 -08:00
|
|
|
dest = retype(dest, BRW_TYPE_UD);
|
|
|
|
|
ntb.ssa_values[instr->def.index] = dest;
|
|
|
|
|
|
|
|
|
|
xbld.MOV(dest,
|
2025-01-15 13:27:05 -08:00
|
|
|
bld.emit_uniformize(get_nir_src(ntb, instr->src[1], 0)));
|
2023-01-13 12:26:01 +02:00
|
|
|
break;
|
2024-07-06 12:11:33 -07:00
|
|
|
}
|
2023-01-13 12:26:01 +02:00
|
|
|
|
2023-05-18 14:14:04 -05:00
|
|
|
case nir_intrinsic_load_reg:
|
|
|
|
|
case nir_intrinsic_store_reg:
|
|
|
|
|
/* Nothing to do with these. */
|
|
|
|
|
break;
|
|
|
|
|
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
case nir_intrinsic_load_global_constant_uniform_block_intel:
|
|
|
|
|
case nir_intrinsic_load_ssbo_uniform_block_intel:
|
|
|
|
|
case nir_intrinsic_load_shared_uniform_block_intel:
|
|
|
|
|
case nir_intrinsic_load_global_block_intel:
|
|
|
|
|
case nir_intrinsic_store_global_block_intel:
|
|
|
|
|
case nir_intrinsic_load_shared_block_intel:
|
|
|
|
|
case nir_intrinsic_store_shared_block_intel:
|
|
|
|
|
case nir_intrinsic_load_ssbo_block_intel:
|
|
|
|
|
case nir_intrinsic_store_ssbo_block_intel:
|
2018-08-16 16:23:10 -05:00
|
|
|
case nir_intrinsic_image_load:
|
|
|
|
|
case nir_intrinsic_image_store:
|
2023-05-13 02:22:47 +03:00
|
|
|
case nir_intrinsic_image_atomic:
|
|
|
|
|
case nir_intrinsic_image_atomic_swap:
|
2019-02-12 00:47:54 -06:00
|
|
|
case nir_intrinsic_bindless_image_load:
|
|
|
|
|
case nir_intrinsic_bindless_image_store:
|
2023-05-13 02:22:47 +03:00
|
|
|
case nir_intrinsic_bindless_image_atomic:
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
case nir_intrinsic_bindless_image_atomic_swap:
|
|
|
|
|
case nir_intrinsic_load_shared:
|
|
|
|
|
case nir_intrinsic_store_shared:
|
|
|
|
|
case nir_intrinsic_shared_atomic:
|
|
|
|
|
case nir_intrinsic_shared_atomic_swap:
|
|
|
|
|
case nir_intrinsic_load_ssbo:
|
2025-05-29 17:05:10 +03:00
|
|
|
case nir_intrinsic_load_ssbo_intel:
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
case nir_intrinsic_store_ssbo:
|
2025-05-29 17:05:10 +03:00
|
|
|
case nir_intrinsic_store_ssbo_intel:
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
case nir_intrinsic_ssbo_atomic:
|
|
|
|
|
case nir_intrinsic_ssbo_atomic_swap:
|
|
|
|
|
case nir_intrinsic_load_global:
|
|
|
|
|
case nir_intrinsic_load_global_constant:
|
|
|
|
|
case nir_intrinsic_store_global:
|
|
|
|
|
case nir_intrinsic_global_atomic:
|
|
|
|
|
case nir_intrinsic_global_atomic_swap:
|
|
|
|
|
case nir_intrinsic_load_scratch:
|
|
|
|
|
case nir_intrinsic_store_scratch:
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_memory_access(ntb, bld, xbld, instr);
|
2015-07-27 16:26:52 +03:00
|
|
|
break;
|
|
|
|
|
|
2019-02-12 00:47:54 -06:00
|
|
|
case nir_intrinsic_image_size:
|
|
|
|
|
case nir_intrinsic_bindless_image_size: {
|
2021-02-03 10:52:04 -08:00
|
|
|
/* Cube image sizes should have previously been lowered to a 2D array */
|
|
|
|
|
assert(nir_intrinsic_image_dim(instr) != GLSL_SAMPLER_DIM_CUBE);
|
|
|
|
|
|
2018-08-16 11:01:24 -05:00
|
|
|
/* Unlike the [un]typed load and store opcodes, the TXS that this turns
|
|
|
|
|
* into will handle the binding table index for us in the geneerator.
|
2019-02-12 00:47:54 -06:00
|
|
|
* Incidentally, this means that we can handle bindless with exactly the
|
|
|
|
|
* same code.
|
2018-08-16 11:01:24 -05:00
|
|
|
*/
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg image = retype(get_nir_src_imm(ntb, instr->src[0]), BRW_TYPE_UD);
|
2018-08-16 11:01:24 -05:00
|
|
|
image = bld.emit_uniformize(image);
|
|
|
|
|
|
2020-08-19 18:21:33 -05:00
|
|
|
assert(nir_src_as_uint(instr->src[1]) == 0);
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg srcs[TEX_LOGICAL_NUM_SRCS];
|
2025-09-01 23:16:40 +03:00
|
|
|
srcs[TEX_LOGICAL_SRC_SURFACE] = image;
|
2018-10-31 09:52:33 -05:00
|
|
|
srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_d(0);
|
|
|
|
|
|
2018-08-16 11:01:24 -05:00
|
|
|
/* Since the image size is always uniform, we can just emit a SIMD8
|
|
|
|
|
* query instruction and splat the result out.
|
|
|
|
|
*/
|
2024-12-29 15:41:04 -08:00
|
|
|
const brw_builder ubld = bld.scalar_group();
|
2018-08-16 11:01:24 -05:00
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg tmp = ubld.vgrf(BRW_TYPE_UD, 4);
|
2025-09-01 13:55:57 +03:00
|
|
|
brw_tex_inst *inst = ubld.emit(SHADER_OPCODE_SAMPLER,
|
2025-08-21 00:02:14 -07:00
|
|
|
tmp, srcs, ARRAY_SIZE(srcs))->as_tex();
|
2025-09-01 13:55:57 +03:00
|
|
|
inst->sampler_opcode = SAMPLER_OPCODE_IMAGE_SIZE_LOGICAL;
|
2025-09-01 23:16:40 +03:00
|
|
|
inst->surface_bindless = instr->intrinsic == nir_intrinsic_bindless_image_size;
|
2023-10-02 20:51:55 -07:00
|
|
|
inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
|
2018-08-16 11:01:24 -05:00
|
|
|
|
2023-08-14 11:56:00 -05:00
|
|
|
for (unsigned c = 0; c < instr->def.num_components; ++c) {
|
2021-02-03 10:52:04 -08:00
|
|
|
bld.MOV(offset(retype(dest, tmp.type), bld, c),
|
|
|
|
|
component(offset(tmp, ubld, c), 0));
|
2018-08-16 11:01:24 -05:00
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2023-07-28 15:08:00 -04:00
|
|
|
case nir_intrinsic_barrier:
|
intel/fs,vec4: Pull stall logic for memory fences up into the IR
Instead of emitting the stall MOV "inside" the
SHADER_OPCODE_MEMORY_FENCE generation, use the scheduling fences when
creating the IR.
For IvyBridge, every (data cache) fence is accompained by a render
cache fence, that now is explicit in the IR, two
SHADER_OPCODE_MEMORY_FENCEs are emitted (with different SFIDs).
Because Begin and End interlock intrinsics are effectively memory
barriers, move its handling alongside the other memory barrier
intrinsics. The SHADER_OPCODE_INTERLOCK is still used to distinguish
if we are going to use a SENDC (for Begin) or regular SEND (for End).
This change is a preparation to allow emitting both SENDs in Gen11+
before we can stall on them.
Shader-db results for IVB (i965):
total instructions in shared programs: 11971190 -> 11971200 (<.01%)
instructions in affected programs: 11482 -> 11492 (0.09%)
helped: 0
HURT: 8
HURT stats (abs) min: 1 max: 3 x̄: 1.25 x̃: 1
HURT stats (rel) min: 0.03% max: 0.50% x̄: 0.14% x̃: 0.10%
95% mean confidence interval for instructions value: 0.66 1.84
95% mean confidence interval for instructions %-change: 0.01% 0.27%
Instructions are HURT.
Unlike the previous code, that used the `mov g1 g2` trick to force
both `g1` and `g2` to stall, the scheduling fence will generate `mov
null g1` and `mov null g2`. During review it was decided it was not
worth keeping the special codepath for the small effect will have.
Shader-db results for HSW (i965), BDW and SKL don't have a change
on instruction count, but do report changes in cycles count, showing
SKL results below
total cycles in shared programs: 341738444 -> 341710570 (<.01%)
cycles in affected programs: 7240002 -> 7212128 (-0.38%)
helped: 46
HURT: 5
helped stats (abs) min: 14 max: 1940 x̄: 676.22 x̃: 154
helped stats (rel) min: <.01% max: 2.62% x̄: 1.28% x̃: 0.95%
HURT stats (abs) min: 2 max: 1768 x̄: 646.40 x̃: 362
HURT stats (rel) min: <.01% max: 0.83% x̄: 0.28% x̃: 0.08%
95% mean confidence interval for cycles value: -777.71 -315.38
95% mean confidence interval for cycles %-change: -1.42% -0.83%
Cycles are helped.
This seems to be the effect of allocating two registers separatedly
instead of a single one with size 2, which causes different register
allocation, affecting the cycle estimates.
while ICL also has not change on instruction count but report changes
negative changes in cycles
total cycles in shared programs: 352665369 -> 352707484 (0.01%)
cycles in affected programs: 9608288 -> 9650403 (0.44%)
helped: 4
HURT: 104
helped stats (abs) min: 24 max: 128 x̄: 88.50 x̃: 101
helped stats (rel) min: <.01% max: 0.85% x̄: 0.46% x̃: 0.49%
HURT stats (abs) min: 2 max: 2016 x̄: 408.36 x̃: 48
HURT stats (rel) min: <.01% max: 3.31% x̄: 0.88% x̃: 0.45%
95% mean confidence interval for cycles value: 256.67 523.24
95% mean confidence interval for cycles %-change: 0.63% 1.03%
Cycles are HURT.
AFAICT this is the result of the case above.
Shader-db results for TGL have similar cycles result as ICL, but also
affect instructions
total instructions in shared programs: 17690586 -> 17690597 (<.01%)
instructions in affected programs: 64617 -> 64628 (0.02%)
helped: 55
HURT: 32
helped stats (abs) min: 1 max: 16 x̄: 4.13 x̃: 3
helped stats (rel) min: 0.05% max: 2.78% x̄: 0.86% x̃: 0.74%
HURT stats (abs) min: 1 max: 65 x̄: 7.44 x̃: 2
HURT stats (rel) min: 0.05% max: 4.58% x̄: 1.13% x̃: 0.69%
95% mean confidence interval for instructions value: -2.03 2.28
95% mean confidence interval for instructions %-change: -0.41% 0.15%
Inconclusive result (value mean confidence interval includes 0).
Now that more is done in the IR, more dependencies are visible and
more SWSB annotations are emitted. Mixed with different register
allocation decisions like above, some shaders will see more `sync
nops` while others able to avoid them.
Most of the new `sync nops` are also redundant and could be dropped,
which will be fixed in a separate change.
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3278>
2020-01-17 15:07:44 -08:00
|
|
|
case nir_intrinsic_begin_invocation_interlock:
|
|
|
|
|
case nir_intrinsic_end_invocation_interlock: {
|
intel/fs: Rework fence handling in brw_fs_nir.cpp
Start off making everything look like LSC where we have three types of
fences: TGM, UGM, and SLM. Then, emit the actual code in a generation-
aware way. There are three HW generation cases we care about:
XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a
lot to deduplicate, it only increases the number of ubld.emit() calls
from 5 to 7 and entirely gets rid of the SFID juggling and other
weirdness we've introduced along the way to make those cases "general".
While we're here, also clean up the code for stalling after fences and
clearly document every case where we insert a stall.
There are only three known functional changes from this commit:
1. We now avoid the render cache fence on IVB if we don't need image
barriers.
2. On ICL+, we no longer unconditionally stall on barriers. We still
stall if we have more than one to help tie them together but
independent barriers are independent. Barrier instructions will
still operate in write-commit mode and still be scheduling barriers
but won't necessarily stall.
3. We now assert-fail for URB fences on LSC platforms. We'll be adding
in the new URB fence message for those platforms in a follow-on
commit.
It is a big enough refactor, however, that other minor changes may be
present.
Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
|
|
|
bool ugm_fence, slm_fence, tgm_fence, urb_fence;
|
2023-03-02 14:26:53 -08:00
|
|
|
enum opcode opcode = BRW_OPCODE_NOP;
|
2023-02-24 02:25:44 -08:00
|
|
|
|
|
|
|
|
/* Handling interlock intrinsics here will allow the logic for IVB
|
|
|
|
|
* render cache (see below) to be reused.
|
|
|
|
|
*/
|
intel/fs,vec4: Pull stall logic for memory fences up into the IR
Instead of emitting the stall MOV "inside" the
SHADER_OPCODE_MEMORY_FENCE generation, use the scheduling fences when
creating the IR.
For IvyBridge, every (data cache) fence is accompained by a render
cache fence, that now is explicit in the IR, two
SHADER_OPCODE_MEMORY_FENCEs are emitted (with different SFIDs).
Because Begin and End interlock intrinsics are effectively memory
barriers, move its handling alongside the other memory barrier
intrinsics. The SHADER_OPCODE_INTERLOCK is still used to distinguish
if we are going to use a SENDC (for Begin) or regular SEND (for End).
This change is a preparation to allow emitting both SENDs in Gen11+
before we can stall on them.
Shader-db results for IVB (i965):
total instructions in shared programs: 11971190 -> 11971200 (<.01%)
instructions in affected programs: 11482 -> 11492 (0.09%)
helped: 0
HURT: 8
HURT stats (abs) min: 1 max: 3 x̄: 1.25 x̃: 1
HURT stats (rel) min: 0.03% max: 0.50% x̄: 0.14% x̃: 0.10%
95% mean confidence interval for instructions value: 0.66 1.84
95% mean confidence interval for instructions %-change: 0.01% 0.27%
Instructions are HURT.
Unlike the previous code, that used the `mov g1 g2` trick to force
both `g1` and `g2` to stall, the scheduling fence will generate `mov
null g1` and `mov null g2`. During review it was decided it was not
worth keeping the special codepath for the small effect will have.
Shader-db results for HSW (i965), BDW and SKL don't have a change
on instruction count, but do report changes in cycles count, showing
SKL results below
total cycles in shared programs: 341738444 -> 341710570 (<.01%)
cycles in affected programs: 7240002 -> 7212128 (-0.38%)
helped: 46
HURT: 5
helped stats (abs) min: 14 max: 1940 x̄: 676.22 x̃: 154
helped stats (rel) min: <.01% max: 2.62% x̄: 1.28% x̃: 0.95%
HURT stats (abs) min: 2 max: 1768 x̄: 646.40 x̃: 362
HURT stats (rel) min: <.01% max: 0.83% x̄: 0.28% x̃: 0.08%
95% mean confidence interval for cycles value: -777.71 -315.38
95% mean confidence interval for cycles %-change: -1.42% -0.83%
Cycles are helped.
This seems to be the effect of allocating two registers separatedly
instead of a single one with size 2, which causes different register
allocation, affecting the cycle estimates.
while ICL also has not change on instruction count but report changes
negative changes in cycles
total cycles in shared programs: 352665369 -> 352707484 (0.01%)
cycles in affected programs: 9608288 -> 9650403 (0.44%)
helped: 4
HURT: 104
helped stats (abs) min: 24 max: 128 x̄: 88.50 x̃: 101
helped stats (rel) min: <.01% max: 0.85% x̄: 0.46% x̃: 0.49%
HURT stats (abs) min: 2 max: 2016 x̄: 408.36 x̃: 48
HURT stats (rel) min: <.01% max: 3.31% x̄: 0.88% x̃: 0.45%
95% mean confidence interval for cycles value: 256.67 523.24
95% mean confidence interval for cycles %-change: 0.63% 1.03%
Cycles are HURT.
AFAICT this is the result of the case above.
Shader-db results for TGL have similar cycles result as ICL, but also
affect instructions
total instructions in shared programs: 17690586 -> 17690597 (<.01%)
instructions in affected programs: 64617 -> 64628 (0.02%)
helped: 55
HURT: 32
helped stats (abs) min: 1 max: 16 x̄: 4.13 x̃: 3
helped stats (rel) min: 0.05% max: 2.78% x̄: 0.86% x̃: 0.74%
HURT stats (abs) min: 1 max: 65 x̄: 7.44 x̃: 2
HURT stats (rel) min: 0.05% max: 4.58% x̄: 1.13% x̃: 0.69%
95% mean confidence interval for instructions value: -2.03 2.28
95% mean confidence interval for instructions %-change: -0.41% 0.15%
Inconclusive result (value mean confidence interval includes 0).
Now that more is done in the IR, more dependencies are visible and
more SWSB annotations are emitted. Mixed with different register
allocation decisions like above, some shaders will see more `sync
nops` while others able to avoid them.
Most of the new `sync nops` are also redundant and could be dropped,
which will be fixed in a separate change.
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3278>
2020-01-17 15:07:44 -08:00
|
|
|
|
|
|
|
|
switch (instr->intrinsic) {
|
2023-07-28 15:08:00 -04:00
|
|
|
case nir_intrinsic_barrier: {
|
2023-03-02 14:26:53 -08:00
|
|
|
/* Note we only care about the memory part of the
|
|
|
|
|
* barrier. The execution part will be taken care
|
|
|
|
|
* of by the stage specific intrinsic handler functions.
|
|
|
|
|
*/
|
2019-09-05 11:08:05 -07:00
|
|
|
nir_variable_mode modes = nir_intrinsic_memory_modes(instr);
|
intel/fs: Rework fence handling in brw_fs_nir.cpp
Start off making everything look like LSC where we have three types of
fences: TGM, UGM, and SLM. Then, emit the actual code in a generation-
aware way. There are three HW generation cases we care about:
XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a
lot to deduplicate, it only increases the number of ubld.emit() calls
from 5 to 7 and entirely gets rid of the SFID juggling and other
weirdness we've introduced along the way to make those cases "general".
While we're here, also clean up the code for stalling after fences and
clearly document every case where we insert a stall.
There are only three known functional changes from this commit:
1. We now avoid the render cache fence on IVB if we don't need image
barriers.
2. On ICL+, we no longer unconditionally stall on barriers. We still
stall if we have more than one to help tie them together but
independent barriers are independent. Barrier instructions will
still operate in write-commit mode and still be scheduling barriers
but won't necessarily stall.
3. We now assert-fail for URB fences on LSC platforms. We'll be adding
in the new URB fence message for those platforms in a follow-on
commit.
It is a big enough refactor, however, that other minor changes may be
present.
Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
|
|
|
ugm_fence = modes & (nir_var_mem_ssbo | nir_var_mem_global);
|
2019-12-31 01:01:27 -08:00
|
|
|
slm_fence = modes & nir_var_mem_shared;
|
2021-10-15 12:58:22 -05:00
|
|
|
tgm_fence = modes & nir_var_image;
|
2022-05-16 12:10:00 +02:00
|
|
|
urb_fence = modes & (nir_var_shader_out | nir_var_mem_task_payload);
|
2024-12-16 13:26:22 +02:00
|
|
|
|
|
|
|
|
/* When image accesses have been lowered to global intrinsics and a
|
|
|
|
|
* typed fence is requested, we also need to include the untyped
|
|
|
|
|
* global memory fence.
|
|
|
|
|
*/
|
|
|
|
|
if (tgm_fence && s.nir->info.use_lowered_image_to_global)
|
|
|
|
|
ugm_fence = true;
|
|
|
|
|
|
2023-05-30 12:05:30 -07:00
|
|
|
if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE)
|
2023-03-02 14:26:53 -08:00
|
|
|
opcode = SHADER_OPCODE_MEMORY_FENCE;
|
intel/fs,vec4: Pull stall logic for memory fences up into the IR
Instead of emitting the stall MOV "inside" the
SHADER_OPCODE_MEMORY_FENCE generation, use the scheduling fences when
creating the IR.
For IvyBridge, every (data cache) fence is accompained by a render
cache fence, that now is explicit in the IR, two
SHADER_OPCODE_MEMORY_FENCEs are emitted (with different SFIDs).
Because Begin and End interlock intrinsics are effectively memory
barriers, move its handling alongside the other memory barrier
intrinsics. The SHADER_OPCODE_INTERLOCK is still used to distinguish
if we are going to use a SENDC (for Begin) or regular SEND (for End).
This change is a preparation to allow emitting both SENDs in Gen11+
before we can stall on them.
Shader-db results for IVB (i965):
total instructions in shared programs: 11971190 -> 11971200 (<.01%)
instructions in affected programs: 11482 -> 11492 (0.09%)
helped: 0
HURT: 8
HURT stats (abs) min: 1 max: 3 x̄: 1.25 x̃: 1
HURT stats (rel) min: 0.03% max: 0.50% x̄: 0.14% x̃: 0.10%
95% mean confidence interval for instructions value: 0.66 1.84
95% mean confidence interval for instructions %-change: 0.01% 0.27%
Instructions are HURT.
Unlike the previous code, that used the `mov g1 g2` trick to force
both `g1` and `g2` to stall, the scheduling fence will generate `mov
null g1` and `mov null g2`. During review it was decided it was not
worth keeping the special codepath for the small effect will have.
Shader-db results for HSW (i965), BDW and SKL don't have a change
on instruction count, but do report changes in cycles count, showing
SKL results below
total cycles in shared programs: 341738444 -> 341710570 (<.01%)
cycles in affected programs: 7240002 -> 7212128 (-0.38%)
helped: 46
HURT: 5
helped stats (abs) min: 14 max: 1940 x̄: 676.22 x̃: 154
helped stats (rel) min: <.01% max: 2.62% x̄: 1.28% x̃: 0.95%
HURT stats (abs) min: 2 max: 1768 x̄: 646.40 x̃: 362
HURT stats (rel) min: <.01% max: 0.83% x̄: 0.28% x̃: 0.08%
95% mean confidence interval for cycles value: -777.71 -315.38
95% mean confidence interval for cycles %-change: -1.42% -0.83%
Cycles are helped.
This seems to be the effect of allocating two registers separatedly
instead of a single one with size 2, which causes different register
allocation, affecting the cycle estimates.
while ICL also has not change on instruction count but report changes
negative changes in cycles
total cycles in shared programs: 352665369 -> 352707484 (0.01%)
cycles in affected programs: 9608288 -> 9650403 (0.44%)
helped: 4
HURT: 104
helped stats (abs) min: 24 max: 128 x̄: 88.50 x̃: 101
helped stats (rel) min: <.01% max: 0.85% x̄: 0.46% x̃: 0.49%
HURT stats (abs) min: 2 max: 2016 x̄: 408.36 x̃: 48
HURT stats (rel) min: <.01% max: 3.31% x̄: 0.88% x̃: 0.45%
95% mean confidence interval for cycles value: 256.67 523.24
95% mean confidence interval for cycles %-change: 0.63% 1.03%
Cycles are HURT.
AFAICT this is the result of the case above.
Shader-db results for TGL have similar cycles result as ICL, but also
affect instructions
total instructions in shared programs: 17690586 -> 17690597 (<.01%)
instructions in affected programs: 64617 -> 64628 (0.02%)
helped: 55
HURT: 32
helped stats (abs) min: 1 max: 16 x̄: 4.13 x̃: 3
helped stats (rel) min: 0.05% max: 2.78% x̄: 0.86% x̃: 0.74%
HURT stats (abs) min: 1 max: 65 x̄: 7.44 x̃: 2
HURT stats (rel) min: 0.05% max: 4.58% x̄: 1.13% x̃: 0.69%
95% mean confidence interval for instructions value: -2.03 2.28
95% mean confidence interval for instructions %-change: -0.41% 0.15%
Inconclusive result (value mean confidence interval includes 0).
Now that more is done in the IR, more dependencies are visible and
more SWSB annotations are emitted. Mixed with different register
allocation decisions like above, some shaders will see more `sync
nops` while others able to avoid them.
Most of the new `sync nops` are also redundant and could be dropped,
which will be fixed in a separate change.
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3278>
2020-01-17 15:07:44 -08:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
case nir_intrinsic_begin_invocation_interlock:
|
|
|
|
|
/* For beginInvocationInterlockARB(), we will generate a memory fence
|
|
|
|
|
* but with a different opcode so that generator can pick SENDC
|
|
|
|
|
* instead of SEND.
|
2023-02-24 02:25:44 -08:00
|
|
|
*/
|
2023-12-05 17:16:34 -08:00
|
|
|
assert(s.stage == MESA_SHADER_FRAGMENT);
|
2023-02-24 02:25:44 -08:00
|
|
|
ugm_fence = tgm_fence = true;
|
|
|
|
|
slm_fence = urb_fence = false;
|
|
|
|
|
opcode = SHADER_OPCODE_INTERLOCK;
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_intrinsic_end_invocation_interlock:
|
|
|
|
|
/* For endInvocationInterlockARB(), we need to insert a memory fence which
|
intel/fs,vec4: Pull stall logic for memory fences up into the IR
Instead of emitting the stall MOV "inside" the
SHADER_OPCODE_MEMORY_FENCE generation, use the scheduling fences when
creating the IR.
For IvyBridge, every (data cache) fence is accompained by a render
cache fence, that now is explicit in the IR, two
SHADER_OPCODE_MEMORY_FENCEs are emitted (with different SFIDs).
Because Begin and End interlock intrinsics are effectively memory
barriers, move its handling alongside the other memory barrier
intrinsics. The SHADER_OPCODE_INTERLOCK is still used to distinguish
if we are going to use a SENDC (for Begin) or regular SEND (for End).
This change is a preparation to allow emitting both SENDs in Gen11+
before we can stall on them.
Shader-db results for IVB (i965):
total instructions in shared programs: 11971190 -> 11971200 (<.01%)
instructions in affected programs: 11482 -> 11492 (0.09%)
helped: 0
HURT: 8
HURT stats (abs) min: 1 max: 3 x̄: 1.25 x̃: 1
HURT stats (rel) min: 0.03% max: 0.50% x̄: 0.14% x̃: 0.10%
95% mean confidence interval for instructions value: 0.66 1.84
95% mean confidence interval for instructions %-change: 0.01% 0.27%
Instructions are HURT.
Unlike the previous code, that used the `mov g1 g2` trick to force
both `g1` and `g2` to stall, the scheduling fence will generate `mov
null g1` and `mov null g2`. During review it was decided it was not
worth keeping the special codepath for the small effect will have.
Shader-db results for HSW (i965), BDW and SKL don't have a change
on instruction count, but do report changes in cycles count, showing
SKL results below
total cycles in shared programs: 341738444 -> 341710570 (<.01%)
cycles in affected programs: 7240002 -> 7212128 (-0.38%)
helped: 46
HURT: 5
helped stats (abs) min: 14 max: 1940 x̄: 676.22 x̃: 154
helped stats (rel) min: <.01% max: 2.62% x̄: 1.28% x̃: 0.95%
HURT stats (abs) min: 2 max: 1768 x̄: 646.40 x̃: 362
HURT stats (rel) min: <.01% max: 0.83% x̄: 0.28% x̃: 0.08%
95% mean confidence interval for cycles value: -777.71 -315.38
95% mean confidence interval for cycles %-change: -1.42% -0.83%
Cycles are helped.
This seems to be the effect of allocating two registers separatedly
instead of a single one with size 2, which causes different register
allocation, affecting the cycle estimates.
while ICL also has not change on instruction count but report changes
negative changes in cycles
total cycles in shared programs: 352665369 -> 352707484 (0.01%)
cycles in affected programs: 9608288 -> 9650403 (0.44%)
helped: 4
HURT: 104
helped stats (abs) min: 24 max: 128 x̄: 88.50 x̃: 101
helped stats (rel) min: <.01% max: 0.85% x̄: 0.46% x̃: 0.49%
HURT stats (abs) min: 2 max: 2016 x̄: 408.36 x̃: 48
HURT stats (rel) min: <.01% max: 3.31% x̄: 0.88% x̃: 0.45%
95% mean confidence interval for cycles value: 256.67 523.24
95% mean confidence interval for cycles %-change: 0.63% 1.03%
Cycles are HURT.
AFAICT this is the result of the case above.
Shader-db results for TGL have similar cycles result as ICL, but also
affect instructions
total instructions in shared programs: 17690586 -> 17690597 (<.01%)
instructions in affected programs: 64617 -> 64628 (0.02%)
helped: 55
HURT: 32
helped stats (abs) min: 1 max: 16 x̄: 4.13 x̃: 3
helped stats (rel) min: 0.05% max: 2.78% x̄: 0.86% x̃: 0.74%
HURT stats (abs) min: 1 max: 65 x̄: 7.44 x̃: 2
HURT stats (rel) min: 0.05% max: 4.58% x̄: 1.13% x̃: 0.69%
95% mean confidence interval for instructions value: -2.03 2.28
95% mean confidence interval for instructions %-change: -0.41% 0.15%
Inconclusive result (value mean confidence interval includes 0).
Now that more is done in the IR, more dependencies are visible and
more SWSB annotations are emitted. Mixed with different register
allocation decisions like above, some shaders will see more `sync
nops` while others able to avoid them.
Most of the new `sync nops` are also redundant and could be dropped,
which will be fixed in a separate change.
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3278>
2020-01-17 15:07:44 -08:00
|
|
|
* stalls in the shader until the memory transactions prior to that
|
|
|
|
|
* fence are complete. This ensures that the shader does not end before
|
|
|
|
|
* any writes from its critical section have landed. Otherwise, you can
|
|
|
|
|
* end up with a case where the next invocation on that pixel properly
|
|
|
|
|
* stalls for previous FS invocation on its pixel to complete but
|
|
|
|
|
* doesn't actually wait for the dataport memory transactions from that
|
|
|
|
|
* thread to land before submitting its own.
|
|
|
|
|
*/
|
2023-12-05 17:16:34 -08:00
|
|
|
assert(s.stage == MESA_SHADER_FRAGMENT);
|
intel/fs: Rework fence handling in brw_fs_nir.cpp
Start off making everything look like LSC where we have three types of
fences: TGM, UGM, and SLM. Then, emit the actual code in a generation-
aware way. There are three HW generation cases we care about:
XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a
lot to deduplicate, it only increases the number of ubld.emit() calls
from 5 to 7 and entirely gets rid of the SFID juggling and other
weirdness we've introduced along the way to make those cases "general".
While we're here, also clean up the code for stalling after fences and
clearly document every case where we insert a stall.
There are only three known functional changes from this commit:
1. We now avoid the render cache fence on IVB if we don't need image
barriers.
2. On ICL+, we no longer unconditionally stall on barriers. We still
stall if we have more than one to help tie them together but
independent barriers are independent. Barrier instructions will
still operate in write-commit mode and still be scheduling barriers
but won't necessarily stall.
3. We now assert-fail for URB fences on LSC platforms. We'll be adding
in the new URB fence message for those platforms in a follow-on
commit.
It is a big enough refactor, however, that other minor changes may be
present.
Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
|
|
|
ugm_fence = tgm_fence = true;
|
|
|
|
|
slm_fence = urb_fence = false;
|
2023-02-24 02:25:44 -08:00
|
|
|
opcode = SHADER_OPCODE_MEMORY_FENCE;
|
intel/fs,vec4: Pull stall logic for memory fences up into the IR
Instead of emitting the stall MOV "inside" the
SHADER_OPCODE_MEMORY_FENCE generation, use the scheduling fences when
creating the IR.
For IvyBridge, every (data cache) fence is accompained by a render
cache fence, that now is explicit in the IR, two
SHADER_OPCODE_MEMORY_FENCEs are emitted (with different SFIDs).
Because Begin and End interlock intrinsics are effectively memory
barriers, move its handling alongside the other memory barrier
intrinsics. The SHADER_OPCODE_INTERLOCK is still used to distinguish
if we are going to use a SENDC (for Begin) or regular SEND (for End).
This change is a preparation to allow emitting both SENDs in Gen11+
before we can stall on them.
Shader-db results for IVB (i965):
total instructions in shared programs: 11971190 -> 11971200 (<.01%)
instructions in affected programs: 11482 -> 11492 (0.09%)
helped: 0
HURT: 8
HURT stats (abs) min: 1 max: 3 x̄: 1.25 x̃: 1
HURT stats (rel) min: 0.03% max: 0.50% x̄: 0.14% x̃: 0.10%
95% mean confidence interval for instructions value: 0.66 1.84
95% mean confidence interval for instructions %-change: 0.01% 0.27%
Instructions are HURT.
Unlike the previous code, that used the `mov g1 g2` trick to force
both `g1` and `g2` to stall, the scheduling fence will generate `mov
null g1` and `mov null g2`. During review it was decided it was not
worth keeping the special codepath for the small effect will have.
Shader-db results for HSW (i965), BDW and SKL don't have a change
on instruction count, but do report changes in cycles count, showing
SKL results below
total cycles in shared programs: 341738444 -> 341710570 (<.01%)
cycles in affected programs: 7240002 -> 7212128 (-0.38%)
helped: 46
HURT: 5
helped stats (abs) min: 14 max: 1940 x̄: 676.22 x̃: 154
helped stats (rel) min: <.01% max: 2.62% x̄: 1.28% x̃: 0.95%
HURT stats (abs) min: 2 max: 1768 x̄: 646.40 x̃: 362
HURT stats (rel) min: <.01% max: 0.83% x̄: 0.28% x̃: 0.08%
95% mean confidence interval for cycles value: -777.71 -315.38
95% mean confidence interval for cycles %-change: -1.42% -0.83%
Cycles are helped.
This seems to be the effect of allocating two registers separatedly
instead of a single one with size 2, which causes different register
allocation, affecting the cycle estimates.
while ICL also has not change on instruction count but report changes
negative changes in cycles
total cycles in shared programs: 352665369 -> 352707484 (0.01%)
cycles in affected programs: 9608288 -> 9650403 (0.44%)
helped: 4
HURT: 104
helped stats (abs) min: 24 max: 128 x̄: 88.50 x̃: 101
helped stats (rel) min: <.01% max: 0.85% x̄: 0.46% x̃: 0.49%
HURT stats (abs) min: 2 max: 2016 x̄: 408.36 x̃: 48
HURT stats (rel) min: <.01% max: 3.31% x̄: 0.88% x̃: 0.45%
95% mean confidence interval for cycles value: 256.67 523.24
95% mean confidence interval for cycles %-change: 0.63% 1.03%
Cycles are HURT.
AFAICT this is the result of the case above.
Shader-db results for TGL have similar cycles result as ICL, but also
affect instructions
total instructions in shared programs: 17690586 -> 17690597 (<.01%)
instructions in affected programs: 64617 -> 64628 (0.02%)
helped: 55
HURT: 32
helped stats (abs) min: 1 max: 16 x̄: 4.13 x̃: 3
helped stats (rel) min: 0.05% max: 2.78% x̄: 0.86% x̃: 0.74%
HURT stats (abs) min: 1 max: 65 x̄: 7.44 x̃: 2
HURT stats (rel) min: 0.05% max: 4.58% x̄: 1.13% x̃: 0.69%
95% mean confidence interval for instructions value: -2.03 2.28
95% mean confidence interval for instructions %-change: -0.41% 0.15%
Inconclusive result (value mean confidence interval includes 0).
Now that more is done in the IR, more dependencies are visible and
more SWSB annotations are emitted. Mixed with different register
allocation decisions like above, some shaders will see more `sync
nops` while others able to avoid them.
Most of the new `sync nops` are also redundant and could be dropped,
which will be fixed in a separate change.
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3278>
2020-01-17 15:07:44 -08:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
default:
|
2025-07-23 09:17:35 +02:00
|
|
|
UNREACHABLE("invalid intrinsic");
|
2019-07-10 12:02:23 -07:00
|
|
|
}
|
|
|
|
|
|
2023-03-02 14:26:53 -08:00
|
|
|
if (opcode == BRW_OPCODE_NOP)
|
|
|
|
|
break;
|
|
|
|
|
|
2023-12-05 17:16:34 -08:00
|
|
|
if (s.nir->info.shared_size > 0) {
|
2025-08-05 16:50:43 +08:00
|
|
|
assert(mesa_shader_stage_uses_workgroup(s.stage));
|
2021-09-17 07:45:46 -05:00
|
|
|
} else {
|
2020-01-13 15:48:12 -08:00
|
|
|
slm_fence = false;
|
2021-09-17 07:45:46 -05:00
|
|
|
}
|
2020-01-13 15:48:12 -08:00
|
|
|
|
2019-12-31 01:01:27 -08:00
|
|
|
/* If the workgroup fits in a single HW thread, the messages for SLM are
|
|
|
|
|
* processed in-order and the shader itself is already synchronized so
|
|
|
|
|
* the memory fence is not necessary.
|
|
|
|
|
*
|
|
|
|
|
* TODO: Check if applies for many HW threads sharing same Data Port.
|
|
|
|
|
*/
|
2023-12-05 17:16:34 -08:00
|
|
|
if (!s.nir->info.workgroup_size_variable &&
|
2024-07-12 16:36:39 -07:00
|
|
|
slm_fence && brw_workgroup_size(s) <= s.dispatch_width)
|
2019-12-31 01:01:27 -08:00
|
|
|
slm_fence = false;
|
|
|
|
|
|
2023-12-05 17:16:34 -08:00
|
|
|
switch (s.stage) {
|
2022-05-16 12:10:00 +02:00
|
|
|
case MESA_SHADER_TESS_CTRL:
|
|
|
|
|
case MESA_SHADER_TASK:
|
|
|
|
|
case MESA_SHADER_MESH:
|
|
|
|
|
break;
|
|
|
|
|
default:
|
|
|
|
|
urb_fence = false;
|
|
|
|
|
break;
|
|
|
|
|
}
|
2019-07-10 12:02:23 -07:00
|
|
|
|
2020-01-17 14:17:58 -08:00
|
|
|
unsigned fence_regs_count = 0;
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg fence_regs[4] = {};
|
2020-01-17 14:17:58 -08:00
|
|
|
|
2025-04-03 01:14:03 -07:00
|
|
|
const brw_builder ubld1 = bld.uniform();
|
2019-07-10 12:02:23 -07:00
|
|
|
|
2023-01-09 15:31:33 -08:00
|
|
|
/* A memory barrier with acquire semantics requires us to
|
|
|
|
|
* guarantee that memory operations of the specified storage
|
|
|
|
|
* class sequenced-after the barrier aren't reordered before the
|
|
|
|
|
* barrier, nor before any previous atomic operation
|
|
|
|
|
* sequenced-before the barrier which may be synchronizing this
|
|
|
|
|
* acquire barrier with a prior release sequence.
|
|
|
|
|
*
|
|
|
|
|
* In order to guarantee the latter we must make sure that any
|
|
|
|
|
* such previous operation has completed execution before
|
|
|
|
|
* invalidating the relevant caches, since otherwise some cache
|
|
|
|
|
* could be polluted by a concurrent thread after its
|
|
|
|
|
* invalidation but before the previous atomic completes, which
|
|
|
|
|
* could lead to a violation of the expected memory ordering if
|
|
|
|
|
* a subsequent memory read hits the polluted cacheline, which
|
|
|
|
|
* would return a stale value read from memory before the
|
|
|
|
|
* completion of the atomic sequenced-before the barrier.
|
|
|
|
|
*
|
|
|
|
|
* This ordering inversion can be avoided trivially if the
|
|
|
|
|
* operations we need to order are all handled by a single
|
|
|
|
|
* in-order cache, since the flush implied by the memory fence
|
|
|
|
|
* occurs after any pending operations have completed, however
|
|
|
|
|
* that doesn't help us when dealing with multiple caches
|
|
|
|
|
* processing requests out of order, in which case we need to
|
|
|
|
|
* explicitly stall the EU until any pending memory operations
|
|
|
|
|
* have executed.
|
|
|
|
|
*
|
|
|
|
|
* Note that that might be somewhat heavy handed in some cases.
|
|
|
|
|
* In particular when this memory fence was inserted by
|
|
|
|
|
* spirv_to_nir() lowering an atomic with acquire semantics into
|
|
|
|
|
* an atomic+barrier sequence we could do a better job by
|
|
|
|
|
* synchronizing with respect to that one atomic *only*, but
|
|
|
|
|
* that would require additional information not currently
|
|
|
|
|
* available to the backend.
|
|
|
|
|
*
|
|
|
|
|
* XXX - Use an alternative workaround on IVB and ICL, since
|
|
|
|
|
* SYNC.ALLWR is only available on Gfx12+.
|
|
|
|
|
*/
|
|
|
|
|
if (devinfo->ver >= 12 &&
|
|
|
|
|
(!nir_intrinsic_has_memory_scope(instr) ||
|
|
|
|
|
(nir_intrinsic_memory_semantics(instr) & NIR_MEMORY_ACQUIRE))) {
|
2025-01-17 22:56:24 -08:00
|
|
|
ubld1.SYNC(TGL_SYNC_ALLWR);
|
2023-01-09 15:31:33 -08:00
|
|
|
}
|
|
|
|
|
|
intel/fs: Rework fence handling in brw_fs_nir.cpp
Start off making everything look like LSC where we have three types of
fences: TGM, UGM, and SLM. Then, emit the actual code in a generation-
aware way. There are three HW generation cases we care about:
XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a
lot to deduplicate, it only increases the number of ubld.emit() calls
from 5 to 7 and entirely gets rid of the SFID juggling and other
weirdness we've introduced along the way to make those cases "general".
While we're here, also clean up the code for stalling after fences and
clearly document every case where we insert a stall.
There are only three known functional changes from this commit:
1. We now avoid the render cache fence on IVB if we don't need image
barriers.
2. On ICL+, we no longer unconditionally stall on barriers. We still
stall if we have more than one to help tie them together but
independent barriers are independent. Barrier instructions will
still operate in write-commit mode and still be scheduling barriers
but won't necessarily stall.
3. We now assert-fail for URB fences on LSC platforms. We'll be adding
in the new URB fence message for those platforms in a follow-on
commit.
It is a big enough refactor, however, that other minor changes may be
present.
Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
|
|
|
if (devinfo->has_lsc) {
|
|
|
|
|
assert(devinfo->verx10 >= 125);
|
2022-04-05 07:59:51 +03:00
|
|
|
uint32_t desc =
|
|
|
|
|
lsc_fence_descriptor_for_intrinsic(devinfo, instr);
|
intel/fs: Rework fence handling in brw_fs_nir.cpp
Start off making everything look like LSC where we have three types of
fences: TGM, UGM, and SLM. Then, emit the actual code in a generation-
aware way. There are three HW generation cases we care about:
XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a
lot to deduplicate, it only increases the number of ubld.emit() calls
from 5 to 7 and entirely gets rid of the SFID juggling and other
weirdness we've introduced along the way to make those cases "general".
While we're here, also clean up the code for stalling after fences and
clearly document every case where we insert a stall.
There are only three known functional changes from this commit:
1. We now avoid the render cache fence on IVB if we don't need image
barriers.
2. On ICL+, we no longer unconditionally stall on barriers. We still
stall if we have more than one to help tie them together but
independent barriers are independent. Barrier instructions will
still operate in write-commit mode and still be scheduling barriers
but won't necessarily stall.
3. We now assert-fail for URB fences on LSC platforms. We'll be adding
in the new URB fence message for those platforms in a follow-on
commit.
It is a big enough refactor, however, that other minor changes may be
present.
Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
|
|
|
if (ugm_fence) {
|
|
|
|
|
fence_regs[fence_regs_count++] =
|
brw: Rename shared function enums for clarity
Our name for this enum was brw_message_target, but it's better known as
shared function ID or SFID. Call it brw_sfid to make it easier to find.
Now that brw only supports Gfx9+, we don't particularly care whether
SFIDs were introduced on Gfx4, Gfx6, or Gfx7.5. Also, the LSC SFIDs
were confusingly tagged "GFX12" but aren't available on Gfx12.0; they
were introduced with Alchemist/Meteorlake.
GFX6_SFID_DATAPORT_SAMPLER_CACHE in particular was confusing. It sounds
like the SFID to use for the sampler on Gfx6+, however it has nothing to
do with the sampler at all. BRW_SFID_SAMPLER remains the sampler SFID.
On Haswell, we ran out of messages on the main data cache data port, and
so they introduced two additional ones, for more messages. The modern
Tigerlake PRMs simply call these DP_DC0, DP_DC1, and DP_DC2. I think
the "sampler" name came from some idea about reorganizing messages that
never materialized (instead, the LSC came as a much larger cleanup).
Recently we've adopted the term "HDC" for the legacy data cluster, as
opposed to "LSC" for the modern Load/Store Cache. To make clear which
SFIDs target the legacy HDC dataports, we use BRW_SFID_HDC0/1/2.
We were also citing the G45, Sandybridge, and Ivybridge PRMs for a
compiler that supports none of those platforms. Cite modern docs.
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33650>
2025-02-10 16:28:48 -08:00
|
|
|
emit_fence(ubld1, opcode, BRW_SFID_UGM, desc,
|
2025-01-18 00:48:10 -08:00
|
|
|
true /* commit_enable */);
|
intel/fs,vec4: Pull stall logic for memory fences up into the IR
Instead of emitting the stall MOV "inside" the
SHADER_OPCODE_MEMORY_FENCE generation, use the scheduling fences when
creating the IR.
For IvyBridge, every (data cache) fence is accompanied by a render
cache fence, that now is explicit in the IR, two
SHADER_OPCODE_MEMORY_FENCEs are emitted (with different SFIDs).
Because Begin and End interlock intrinsics are effectively memory
barriers, move its handling alongside the other memory barrier
intrinsics. The SHADER_OPCODE_INTERLOCK is still used to distinguish
if we are going to use a SENDC (for Begin) or regular SEND (for End).
This change is a preparation to allow emitting both SENDs in Gen11+
before we can stall on them.
Shader-db results for IVB (i965):
total instructions in shared programs: 11971190 -> 11971200 (<.01%)
instructions in affected programs: 11482 -> 11492 (0.09%)
helped: 0
HURT: 8
HURT stats (abs) min: 1 max: 3 x̄: 1.25 x̃: 1
HURT stats (rel) min: 0.03% max: 0.50% x̄: 0.14% x̃: 0.10%
95% mean confidence interval for instructions value: 0.66 1.84
95% mean confidence interval for instructions %-change: 0.01% 0.27%
Instructions are HURT.
Unlike the previous code, that used the `mov g1 g2` trick to force
both `g1` and `g2` to stall, the scheduling fence will generate `mov
null g1` and `mov null g2`. During review it was decided it was not
worth keeping the special codepath for the small effect will have.
Shader-db results for HSW (i965), BDW and SKL don't have a change
on instruction count, but do report changes in cycles count, showing
SKL results below
total cycles in shared programs: 341738444 -> 341710570 (<.01%)
cycles in affected programs: 7240002 -> 7212128 (-0.38%)
helped: 46
HURT: 5
helped stats (abs) min: 14 max: 1940 x̄: 676.22 x̃: 154
helped stats (rel) min: <.01% max: 2.62% x̄: 1.28% x̃: 0.95%
HURT stats (abs) min: 2 max: 1768 x̄: 646.40 x̃: 362
HURT stats (rel) min: <.01% max: 0.83% x̄: 0.28% x̃: 0.08%
95% mean confidence interval for cycles value: -777.71 -315.38
95% mean confidence interval for cycles %-change: -1.42% -0.83%
Cycles are helped.
This seems to be the effect of allocating two registers separately
instead of a single one with size 2, which causes different register
allocation, affecting the cycle estimates.
while ICL also has no change in instruction count, but reports
negative changes in cycles
total cycles in shared programs: 352665369 -> 352707484 (0.01%)
cycles in affected programs: 9608288 -> 9650403 (0.44%)
helped: 4
HURT: 104
helped stats (abs) min: 24 max: 128 x̄: 88.50 x̃: 101
helped stats (rel) min: <.01% max: 0.85% x̄: 0.46% x̃: 0.49%
HURT stats (abs) min: 2 max: 2016 x̄: 408.36 x̃: 48
HURT stats (rel) min: <.01% max: 3.31% x̄: 0.88% x̃: 0.45%
95% mean confidence interval for cycles value: 256.67 523.24
95% mean confidence interval for cycles %-change: 0.63% 1.03%
Cycles are HURT.
AFAICT this is the result of the case above.
Shader-db results for TGL have similar cycles result as ICL, but also
affect instructions
total instructions in shared programs: 17690586 -> 17690597 (<.01%)
instructions in affected programs: 64617 -> 64628 (0.02%)
helped: 55
HURT: 32
helped stats (abs) min: 1 max: 16 x̄: 4.13 x̃: 3
helped stats (rel) min: 0.05% max: 2.78% x̄: 0.86% x̃: 0.74%
HURT stats (abs) min: 1 max: 65 x̄: 7.44 x̃: 2
HURT stats (rel) min: 0.05% max: 4.58% x̄: 1.13% x̃: 0.69%
95% mean confidence interval for instructions value: -2.03 2.28
95% mean confidence interval for instructions %-change: -0.41% 0.15%
Inconclusive result (value mean confidence interval includes 0).
Now that more is done in the IR, more dependencies are visible and
more SWSB annotations are emitted. Mixed with different register
allocation decisions like above, some shaders will see more `sync
nops` while others able to avoid them.
Most of the new `sync nops` are also redundant and could be dropped,
which will be fixed in a separate change.
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3278>
2020-01-17 15:07:44 -08:00
|
|
|
}
|
2020-07-11 18:33:05 -07:00
|
|
|
|
intel/fs: Rework fence handling in brw_fs_nir.cpp
Start off making everything look like LSC where we have three types of
fences: TGM, UGM, and SLM. Then, emit the actual code in a generation-
aware way. There are three HW generation cases we care about:
XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a
lot to deduplicate, it only increases the number of ubld.emit() calls
from 5 to 7 and entirely gets rid of the SFID juggling and other
weirdness we've introduced along the way to make those cases "general".
While we're here, also clean up the code for stalling after fences and
clearly document every case where we insert a stall.
There are only three known functional changes from this commit:
1. We now avoid the render cache fence on IVB if we don't need image
barriers.
2. On ICL+, we no longer unconditionally stall on barriers. We still
stall if we have more than one to help tie them together but
independent barriers are independent. Barrier instructions will
still operate in write-commit mode and still be scheduling barriers
but won't necessarily stall.
3. We now assert-fail for URB fences on LSC platforms. We'll be adding
in the new URB fence message for those platforms in a follow-on
commit.
It is a big enough refactor, however, that other minor changes may be
present.
Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
|
|
|
if (tgm_fence) {
|
|
|
|
|
fence_regs[fence_regs_count++] =
|
brw: Rename shared function enums for clarity
Our name for this enum was brw_message_target, but it's better known as
shared function ID or SFID. Call it brw_sfid to make it easier to find.
Now that brw only supports Gfx9+, we don't particularly care whether
SFIDs were introduced on Gfx4, Gfx6, or Gfx7.5. Also, the LSC SFIDs
were confusingly tagged "GFX12" but aren't available on Gfx12.0; they
were introduced with Alchemist/Meteorlake.
GFX6_SFID_DATAPORT_SAMPLER_CACHE in particular was confusing. It sounds
like the SFID to use for the sampler on Gfx6+, however it has nothing to
do with the sampler at all. BRW_SFID_SAMPLER remains the sampler SFID.
On Haswell, we ran out of messages on the main data cache data port, and
so they introduced two additional ones, for more messages. The modern
Tigerlake PRMs simply call these DP_DC0, DP_DC1, and DP_DC2. I think
the "sampler" name came from some idea about reorganizing messages that
never materialized (instead, the LSC came as a much larger cleanup).
Recently we've adopted the term "HDC" for the legacy data cluster, as
opposed to "LSC" for the modern Load/Store Cache. To make clear which
SFIDs target the legacy HDC dataports, we use BRW_SFID_HDC0/1/2.
We were also citing the G45, Sandybridge, and Ivybridge PRMs for a
compiler that supports none of those platforms. Cite modern docs.
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33650>
2025-02-10 16:28:48 -08:00
|
|
|
emit_fence(ubld1, opcode, BRW_SFID_TGM, desc,
|
2025-01-18 00:48:10 -08:00
|
|
|
true /* commit_enable */);
|
2020-07-11 18:33:05 -07:00
|
|
|
}
|
2019-07-10 12:02:23 -07:00
|
|
|
|
intel/fs: Rework fence handling in brw_fs_nir.cpp
Start off making everything look like LSC where we have three types of
fences: TGM, UGM, and SLM. Then, emit the actual code in a generation-
aware way. There are three HW generation cases we care about:
XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a
lot to deduplicate, it only increases the number of ubld.emit() calls
from 5 to 7 and entirely gets rid of the SFID juggling and other
weirdness we've introduced along the way to make those cases "general".
While we're here, also clean up the code for stalling after fences and
clearly document every case where we insert a stall.
There are only three known functional changes from this commit:
1. We now avoid the render cache fence on IVB if we don't need image
barriers.
2. On ICL+, we no longer unconditionally stall on barriers. We still
stall if we have more than one to help tie them together but
independent barriers are independent. Barrier instructions will
still operate in write-commit mode and still be scheduling barriers
but won't necessarily stall.
3. We now assert-fail for URB fences on LSC platforms. We'll be adding
in the new URB fence message for those platforms in a follow-on
commit.
It is a big enough refactor, however, that other minor changes may be
present.
Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
|
|
|
if (slm_fence) {
|
|
|
|
|
assert(opcode == SHADER_OPCODE_MEMORY_FENCE);
|
2021-05-20 23:48:47 -07:00
|
|
|
if (intel_needs_workaround(devinfo, 14014063774)) {
|
|
|
|
|
/* Wa_14014063774
|
|
|
|
|
*
|
|
|
|
|
* Before SLM fence compiler needs to insert SYNC.ALLWR in order
|
|
|
|
|
* to avoid the SLM data race.
|
|
|
|
|
*/
|
2025-01-17 22:56:24 -08:00
|
|
|
ubld1.SYNC(TGL_SYNC_ALLWR);
|
2021-05-20 23:48:47 -07:00
|
|
|
}
|
intel/fs: Rework fence handling in brw_fs_nir.cpp
Start off making everything look like LSC where we have three types of
fences: TGM, UGM, and SLM. Then, emit the actual code in a generation-
aware way. There are three HW generation cases we care about:
XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a
lot to deduplicate, it only increases the number of ubld.emit() calls
from 5 to 7 and entirely gets rid of the SFID juggling and other
weirdness we've introduced along the way to make those cases "general".
While we're here, also clean up the code for stalling after fences and
clearly document every case where we insert a stall.
There are only three known functional changes from this commit:
1. We now avoid the render cache fence on IVB if we don't need image
barriers.
2. On ICL+, we no longer unconditionally stall on barriers. We still
stall if we have more than one to help tie them together but
independent barriers are independent. Barrier instructions will
still operate in write-commit mode and still be scheduling barriers
but won't necessarily stall.
3. We now assert-fail for URB fences on LSC platforms. We'll be adding
in the new URB fence message for those platforms in a follow-on
commit.
It is a big enough refactor, however, that other minor changes may be
present.
Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
|
|
|
fence_regs[fence_regs_count++] =
|
brw: Rename shared function enums for clarity
Our name for this enum was brw_message_target, but it's better known as
shared function ID or SFID. Call it brw_sfid to make it easier to find.
Now that brw only supports Gfx9+, we don't particularly care whether
SFIDs were introduced on Gfx4, Gfx6, or Gfx7.5. Also, the LSC SFIDs
were confusingly tagged "GFX12" but aren't available on Gfx12.0; they
were introduced with Alchemist/Meteorlake.
GFX6_SFID_DATAPORT_SAMPLER_CACHE in particular was confusing. It sounds
like the SFID to use for the sampler on Gfx6+, however it has nothing to
do with the sampler at all. BRW_SFID_SAMPLER remains the sampler SFID.
On Haswell, we ran out of messages on the main data cache data port, and
so they introduced two additional ones, for more messages. The modern
Tigerlake PRMs simply call these DP_DC0, DP_DC1, and DP_DC2. I think
the "sampler" name came from some idea about reorganizing messages that
never materialized (instead, the LSC came as a much larger cleanup).
Recently we've adopted the term "HDC" for the legacy data cluster, as
opposed to "LSC" for the modern Load/Store Cache. To make clear which
SFIDs target the legacy HDC dataports, we use BRW_SFID_HDC0/1/2.
We were also citing the G45, Sandybridge, and Ivybridge PRMs for a
compiler that supports none of those platforms. Cite modern docs.
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33650>
2025-02-10 16:28:48 -08:00
|
|
|
emit_fence(ubld1, opcode, BRW_SFID_SLM, desc,
|
2025-01-18 00:48:10 -08:00
|
|
|
true /* commit_enable */);
|
intel/fs: Rework fence handling in brw_fs_nir.cpp
Start off making everything look like LSC where we have three types of
fences: TGM, UGM, and SLM. Then, emit the actual code in a generation-
aware way. There are three HW generation cases we care about:
XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a
lot to deduplicate, it only increases the number of ubld.emit() calls
from 5 to 7 and entirely gets rid of the SFID juggling and other
weirdness we've introduced along the way to make those cases "general".
While we're here, also clean up the code for stalling after fences and
clearly document every case where we insert a stall.
There are only three known functional changes from this commit:
1. We now avoid the render cache fence on IVB if we don't need image
barriers.
2. On ICL+, we no longer unconditionally stall on barriers. We still
stall if we have more than one to help tie them together but
independent barriers are independent. Barrier instructions will
still operate in write-commit mode and still be scheduling barriers
but won't necessarily stall.
3. We now assert-fail for URB fences on LSC platforms. We'll be adding
in the new URB fence message for those platforms in a follow-on
commit.
It is a big enough refactor, however, that other minor changes may be
present.
Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (urb_fence) {
|
2021-09-15 16:24:22 -05:00
|
|
|
assert(opcode == SHADER_OPCODE_MEMORY_FENCE);
|
|
|
|
|
fence_regs[fence_regs_count++] =
|
2025-01-17 22:56:24 -08:00
|
|
|
emit_fence(ubld1, opcode, BRW_SFID_URB, desc,
|
2025-01-18 00:48:10 -08:00
|
|
|
true /* commit_enable */);
|
intel/fs: Rework fence handling in brw_fs_nir.cpp
Start off making everything look like LSC where we have three types of
fences: TGM, UGM, and SLM. Then, emit the actual code in a generation-
aware way. There are three HW generation cases we care about:
XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a
lot to deduplicate, it only increases the number of ubld.emit() calls
from 5 to 7 and entirely gets rid of the SFID juggling and other
weirdness we've introduced along the way to make those cases "general".
While we're here, also clean up the code for stalling after fences and
clearly document every case where we insert a stall.
There are only three known functional changes from this commit:
1. We now avoid the render cache fence on IVB if we don't need image
barriers.
2. On ICL+, we no longer unconditionally stall on barriers. We still
stall if we have more than one to help tie them together but
independent barriers are independent. Barrier instructions will
still operate in write-commit mode and still be scheduling barriers
but won't necessarily stall.
3. We now assert-fail for URB fences on LSC platforms. We'll be adding
in the new URB fence message for those platforms in a follow-on
commit.
It is a big enough refactor, however, that other minor changes may be
present.
Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
|
|
|
}
|
|
|
|
|
} else if (devinfo->ver >= 11) {
|
|
|
|
|
if (tgm_fence || ugm_fence || urb_fence) {
|
|
|
|
|
fence_regs[fence_regs_count++] =
|
brw: Rename shared function enums for clarity
Our name for this enum was brw_message_target, but it's better known as
shared function ID or SFID. Call it brw_sfid to make it easier to find.
Now that brw only supports Gfx9+, we don't particularly care whether
SFIDs were introduced on Gfx4, Gfx6, or Gfx7.5. Also, the LSC SFIDs
were confusingly tagged "GFX12" but aren't available on Gfx12.0; they
were introduced with Alchemist/Meteorlake.
GFX6_SFID_DATAPORT_SAMPLER_CACHE in particular was confusing. It sounds
like the SFID to use for the sampler on Gfx6+, however it has nothing to
do with the sampler at all. BRW_SFID_SAMPLER remains the sampler SFID.
On Haswell, we ran out of messages on the main data cache data port, and
so they introduced two additional ones, for more messages. The modern
Tigerlake PRMs simply call these DP_DC0, DP_DC1, and DP_DC2. I think
the "sampler" name came from some idea about reorganizing messages that
never materialized (instead, the LSC came as a much larger cleanup).
Recently we've adopted the term "HDC" for the legacy data cluster, as
opposed to "LSC" for the modern Load/Store Cache. To make clear which
SFIDs target the legacy HDC dataports, we use BRW_SFID_HDC0/1/2.
We were also citing the G45, Sandybridge, and Ivybridge PRMs for a
compiler that supports none of those platforms. Cite modern docs.
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33650>
2025-02-10 16:28:48 -08:00
|
|
|
emit_fence(ubld1, opcode, BRW_SFID_HDC0, 0,
|
2025-01-18 00:48:10 -08:00
|
|
|
true /* commit_enable HSD ES # 1404612949 */);
|
intel/fs: Rework fence handling in brw_fs_nir.cpp
Start off making everything look like LSC where we have three types of
fences: TGM, UGM, and SLM. Then, emit the actual code in a generation-
aware way. There are three HW generation cases we care about:
XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a
lot to deduplicate, it only increases the number of ubld.emit() calls
from 5 to 7 and entirely gets rid of the SFID juggling and other
weirdness we've introduced along the way to make those cases "general".
While we're here, also clean up the code for stalling after fences and
clearly document every case where we insert a stall.
There are only three known functional changes from this commit:
1. We now avoid the render cache fence on IVB if we don't need image
barriers.
2. On ICL+, we no longer unconditionally stall on barriers. We still
stall if we have more than one to help tie them together but
independent barriers are independent. Barrier instructions will
still operate in write-commit mode and still be scheduling barriers
but won't necessarily stall.
3. We now assert-fail for URB fences on LSC platforms. We'll be adding
in the new URB fence message for those platforms in a follow-on
commit.
It is a big enough refactor, however, that other minor changes may be
present.
Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (slm_fence) {
|
|
|
|
|
assert(opcode == SHADER_OPCODE_MEMORY_FENCE);
|
2025-01-18 00:48:10 -08:00
|
|
|
/* We use the "SLM" SFID here even though it doesn't exist;
|
|
|
|
|
* the logical send lowering will replace it with the SLM
|
|
|
|
|
* special binding table index and the normal DATA_CACHE SFID.
|
|
|
|
|
*/
|
intel/fs: Rework fence handling in brw_fs_nir.cpp
Start off making everything look like LSC where we have three types of
fences: TGM, UGM, and SLM. Then, emit the actual code in a generation-
aware way. There are three HW generation cases we care about:
XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a
lot to deduplicate, it only increases the number of ubld.emit() calls
from 5 to 7 and entirely gets rid of the SFID juggling and other
weirdness we've introduced along the way to make those cases "general".
While we're here, also clean up the code for stalling after fences and
clearly document every case where we insert a stall.
There are only three known functional changes from this commit:
1. We now avoid the render cache fence on IVB if we don't need image
barriers.
2. On ICL+, we no longer unconditionally stall on barriers. We still
stall if we have more than one to help tie them together but
independent barriers are independent. Barrier instructions will
still operate in write-commit mode and still be scheduling barriers
but won't necessarily stall.
3. We now assert-fail for URB fences on LSC platforms. We'll be adding
in the new URB fence message for those platforms in a follow-on
commit.
It is a big enough refactor, however, that other minor changes may be
present.
Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
|
|
|
fence_regs[fence_regs_count++] =
|
brw: Rename shared function enums for clarity
Our name for this enum was brw_message_target, but it's better known as
shared function ID or SFID. Call it brw_sfid to make it easier to find.
Now that brw only supports Gfx9+, we don't particularly care whether
SFIDs were introduced on Gfx4, Gfx6, or Gfx7.5. Also, the LSC SFIDs
were confusingly tagged "GFX12" but aren't available on Gfx12.0; they
were introduced with Alchemist/Meteorlake.
GFX6_SFID_DATAPORT_SAMPLER_CACHE in particular was confusing. It sounds
like the SFID to use for the sampler on Gfx6+, however it has nothing to
do with the sampler at all. BRW_SFID_SAMPLER remains the sampler SFID.
On Haswell, we ran out of messages on the main data cache data port, and
so they introduced two additional ones, for more messages. The modern
Tigerlake PRMs simply call these DP_DC0, DP_DC1, and DP_DC2. I think
the "sampler" name came from some idea about reorganizing messages that
never materialized (instead, the LSC came as a much larger cleanup).
Recently we've adopted the term "HDC" for the legacy data cluster, as
opposed to "LSC" for the modern Load/Store Cache. To make clear which
SFIDs target the legacy HDC dataports, we use BRW_SFID_HDC0/1/2.
We were also citing the G45, Sandybridge, and Ivybridge PRMs for a
compiler that supports none of those platforms. Cite modern docs.
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33650>
2025-02-10 16:28:48 -08:00
|
|
|
emit_fence(ubld1, opcode, BRW_SFID_SLM, 0,
|
2025-01-18 00:48:10 -08:00
|
|
|
true /* commit_enable HSD ES # 1404612949 */);
|
intel/fs: Rework fence handling in brw_fs_nir.cpp
Start off making everything look like LSC where we have three types of
fences: TGM, UGM, and SLM. Then, emit the actual code in a generation-
aware way. There are three HW generation cases we care about:
XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a
lot to deduplicate, it only increases the number of ubld.emit() calls
from 5 to 7 and entirely gets rid of the SFID juggling and other
weirdness we've introduced along the way to make those cases "general".
While we're here, also clean up the code for stalling after fences and
clearly document every case where we insert a stall.
There are only three known functional changes from this commit:
1. We now avoid the render cache fence on IVB if we don't need image
barriers.
2. On ICL+, we no longer unconditionally stall on barriers. We still
stall if we have more than one to help tie them together but
independent barriers are independent. Barrier instructions will
still operate in write-commit mode and still be scheduling barriers
but won't necessarily stall.
3. We now assert-fail for URB fences on LSC platforms. We'll be adding
in the new URB fence message for those platforms in a follow-on
commit.
It is a big enough refactor, however, that other minor changes may be
present.
Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
|
|
|
}
|
|
|
|
|
} else {
|
2022-03-29 11:47:23 +03:00
|
|
|
/* Simulation also complains on Gfx9 if we do not enable commit.
|
|
|
|
|
*/
|
2024-02-15 02:51:39 -08:00
|
|
|
const bool commit_enable =
|
2022-03-29 11:47:23 +03:00
|
|
|
instr->intrinsic == nir_intrinsic_end_invocation_interlock ||
|
|
|
|
|
devinfo->ver == 9;
|
intel/fs: Rework fence handling in brw_fs_nir.cpp
Start off making everything look like LSC where we have three types of
fences: TGM, UGM, and SLM. Then, emit the actual code in a generation-
aware way. There are three HW generation cases we care about:
XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a
lot to deduplicate, it only increases the number of ubld.emit() calls
from 5 to 7 and entirely gets rid of the SFID juggling and other
weirdness we've introduced along the way to make those cases "general".
While we're here, also clean up the code for stalling after fences and
clearly document every case where we insert a stall.
There are only three known functional changes from this commit:
1. We now avoid the render cache fence on IVB if we don't need image
barriers.
2. On ICL+, we no longer unconditionally stall on barriers. We still
stall if we have more than one to help tie them together but
independent barriers are independent. Barrier instructions will
still operate in write-commit mode and still be scheduling barriers
but won't necessarily stall.
3. We now assert-fail for URB fences on LSC platforms. We'll be adding
in the new URB fence message for those platforms in a follow-on
commit.
It is a big enough refactor, however, that other minor changes may be
present.
Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
|
|
|
|
|
|
|
|
if (tgm_fence || ugm_fence || slm_fence || urb_fence) {
|
|
|
|
|
fence_regs[fence_regs_count++] =
|
brw: Rename shared function enums for clarity
Our name for this enum was brw_message_target, but it's better known as
shared function ID or SFID. Call it brw_sfid to make it easier to find.
Now that brw only supports Gfx9+, we don't particularly care whether
SFIDs were introduced on Gfx4, Gfx6, or Gfx7.5. Also, the LSC SFIDs
were confusingly tagged "GFX12" but aren't available on Gfx12.0; they
were introduced with Alchemist/Meteorlake.
GFX6_SFID_DATAPORT_SAMPLER_CACHE in particular was confusing. It sounds
like the SFID to use for the sampler on Gfx6+, however it has nothing to
do with the sampler at all. BRW_SFID_SAMPLER remains the sampler SFID.
On Haswell, we ran out of messages on the main data cache data port, and
so they introduced two additional ones, for more messages. The modern
Tigerlake PRMs simply call these DP_DC0, DP_DC1, and DP_DC2. I think
the "sampler" name came from some idea about reorganizing messages that
never materialized (instead, the LSC came as a much larger cleanup).
Recently we've adopted the term "HDC" for the legacy data cluster, as
opposed to "LSC" for the modern Load/Store Cache. To make clear which
SFIDs target the legacy HDC dataports, we use BRW_SFID_HDC0/1/2.
We were also citing the G45, Sandybridge, and Ivybridge PRMs for a
compiler that supports none of those platforms. Cite modern docs.
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33650>
2025-02-10 16:28:48 -08:00
|
|
|
emit_fence(ubld1, opcode, BRW_SFID_HDC0, 0, commit_enable);
|
intel/fs: Rework fence handling in brw_fs_nir.cpp
Start off making everything look like LSC where we have three types of
fences: TGM, UGM, and SLM. Then, emit the actual code in a generation-
aware way. There are three HW generation cases we care about:
XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a
lot to deduplicate, it only increases the number of ubld.emit() calls
from 5 to 7 and entirely gets rid of the SFID juggling and other
weirdness we've introduced along the way to make those cases "general".
While we're here, also clean up the code for stalling after fences and
clearly document every case where we insert a stall.
There are only three known functional changes from this commit:
1. We now avoid the render cache fence on IVB if we don't need image
barriers.
2. On ICL+, we no longer unconditionally stall on barriers. We still
stall if we have more than one to help tie them together but
independent barriers are independent. Barrier instructions will
still operate in write-commit mode and still be scheduling barriers
but won't necessarily stall.
3. We now assert-fail for URB fences on LSC platforms. We'll be adding
in the new URB fence message for those platforms in a follow-on
commit.
It is a big enough refactor, however, that other minor changes may be
present.
Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
|
|
|
}
|
2019-07-10 12:02:23 -07:00
|
|
|
}
|
|
|
|
|
|
intel/fs: Rework fence handling in brw_fs_nir.cpp
Start off making everything look like LSC where we have three types of
fences: TGM, UGM, and SLM. Then, emit the actual code in a generation-
aware way. There are three HW generation cases we care about:
XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a
lot to deduplicate, it only increases the number of ubld.emit() calls
from 5 to 7 and entirely gets rid of the SFID juggling and other
weirdness we've introduced along the way to make those cases "general".
While we're here, also clean up the code for stalling after fences and
clearly document every case where we insert a stall.
There are only three known functional changes from this commit:
1. We now avoid the render cache fence on IVB if we don't need image
barriers.
2. On ICL+, we no longer unconditionally stall on barriers. We still
stall if we have more than one to help tie them together but
independent barriers are independent. Barrier instructions will
still operate in write-commit mode and still be scheduling barriers
but won't necessarily stall.
3. We now assert-fail for URB fences on LSC platforms. We'll be adding
in the new URB fence message for those platforms in a follow-on
commit.
It is a big enough refactor, however, that other minor changes may be
present.
Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
|
|
|
assert(fence_regs_count <= ARRAY_SIZE(fence_regs));
|
2020-01-17 14:17:58 -08:00
|
|
|
|
2023-01-30 10:41:37 -08:00
|
|
|
/* Be conservative in Gen11+ and always stall in a fence. Since
|
|
|
|
|
* there are two different fences, and the shader might want to
|
|
|
|
|
* synchronize between them.
|
|
|
|
|
*
|
|
|
|
|
* TODO: Use scope and visibility information for the barriers from NIR
|
|
|
|
|
* to make a better decision on whether we need to stall.
|
|
|
|
|
*/
|
|
|
|
|
bool force_stall = devinfo->ver >= 11;
|
|
|
|
|
|
2022-04-27 09:20:21 +03:00
|
|
|
/* There are four cases where we want to insert a stall:
|
intel/fs: Rework fence handling in brw_fs_nir.cpp
Start off making everything look like LSC where we have three types of
fences: TGM, UGM, and SLM. Then, emit the actual code in a generation-
aware way. There are three HW generation cases we care about:
XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a
lot to deduplicate, it only increases the number of ubld.emit() calls
from 5 to 7 and entirely gets rid of the SFID juggling and other
weirdness we've introduced along the way to make those cases "general".
While we're here, also clean up the code for stalling after fences and
clearly document every case where we insert a stall.
There are only three known functional changes from this commit:
1. We now avoid the render cache fence on IVB if we don't need image
barriers.
2. On ICL+, we no longer unconditionally stall on barriers. We still
stall if we have more than one to help tie them together but
independent barriers are independent. Barrier instructions will
still operate in write-commit mode and still be scheduling barriers
but won't necessarily stall.
3. We now assert-fail for URB fences on LSC platforms. We'll be adding
in the new URB fence message for those platforms in a follow-on
commit.
It is a big enough refactor, however, that other minor changes may be
present.
Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
|
|
|
*
|
|
|
|
|
* 1. If we're a nir_intrinsic_end_invocation_interlock. This is
|
|
|
|
|
* required to ensure that the shader EOT doesn't happen until
|
|
|
|
|
* after the fence returns. Otherwise, we might end up with the
|
|
|
|
|
* next shader invocation for that pixel not respecting our fence
|
|
|
|
|
* because it may happen on a different HW thread.
|
|
|
|
|
*
|
|
|
|
|
* 2. If we have multiple fences. This is required to ensure that
|
|
|
|
|
* they all complete and nothing gets weirdly out-of-order.
|
|
|
|
|
*
|
|
|
|
|
* 3. If we have no fences. In this case, we need at least a
|
|
|
|
|
* scheduling barrier to keep the compiler from moving things
|
|
|
|
|
* around in an invalid way.
|
2022-04-27 09:20:21 +03:00
|
|
|
*
|
2023-01-30 10:41:37 -08:00
|
|
|
* 4. On Gen11+ and platforms with LSC, we have multiple fence types;
|
|
|
|
|
* without further information about the fence, we need to force a
|
|
|
|
|
* stall.
|
intel/fs: Rework fence handling in brw_fs_nir.cpp
Start off making everything look like LSC where we have three types of
fences: TGM, UGM, and SLM. Then, emit the actual code in a generation-
aware way. There are three HW generation cases we care about:
XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a
lot to deduplicate, it only increases the number of ubld.emit() calls
from 5 to 7 and entirely gets rid of the SFID juggling and other
weirdness we've introduced along the way to make those cases "general".
While we're here, also clean up the code for stalling after fences and
clearly document every case where we insert a stall.
There are only three known functional changes from this commit:
1. We now avoid the render cache fence on IVB if we don't need image
barriers.
2. On ICL+, we no longer unconditionally stall on barriers. We still
stall if we have more than one to help tie them together but
independent barriers are independent. Barrier instructions will
still operate in write-commit mode and still be scheduling barriers
but won't necessarily stall.
3. We now assert-fail for URB fences on LSC platforms. We'll be adding
in the new URB fence message for those platforms in a follow-on
commit.
It is a big enough refactor, however, that other minor changes may be
present.
Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
|
|
|
*/
|
|
|
|
|
if (instr->intrinsic == nir_intrinsic_end_invocation_interlock ||
|
2023-01-30 10:41:37 -08:00
|
|
|
fence_regs_count != 1 || devinfo->has_lsc || force_stall) {
|
2025-01-17 22:56:24 -08:00
|
|
|
ubld1.emit(FS_OPCODE_SCHEDULING_FENCE,
|
|
|
|
|
retype(brw_null_reg(), BRW_TYPE_UW),
|
|
|
|
|
fence_regs, fence_regs_count);
|
2020-01-17 14:17:58 -08:00
|
|
|
}
|
2019-12-31 01:01:27 -08:00
|
|
|
|
2015-07-27 16:25:55 +03:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2015-10-07 11:50:01 +01:00
|
|
|
case nir_intrinsic_shader_clock: {
|
|
|
|
|
/* We cannot do anything if there is an event, so ignore it for now */
|
2024-06-18 23:42:59 -07:00
|
|
|
const brw_reg shader_clock = get_timestamp(bld);
|
|
|
|
|
const brw_reg srcs[] = { component(shader_clock, 0),
|
2016-09-01 00:35:03 -07:00
|
|
|
component(shader_clock, 1) };
|
2015-10-07 11:50:01 +01:00
|
|
|
bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2020-08-08 13:56:16 -05:00
|
|
|
case nir_intrinsic_load_reloc_const_intel: {
|
|
|
|
|
uint32_t id = nir_intrinsic_param_idx(instr);
|
2023-09-08 00:05:13 +03:00
|
|
|
uint32_t base = nir_intrinsic_base(instr);
|
2023-05-05 13:12:25 +03:00
|
|
|
|
2025-09-15 17:03:02 -07:00
|
|
|
/* Emit the reloc in the smallest SIMD size to limit register usage. */
|
|
|
|
|
const brw_builder ubld = dest.is_scalar ? xbld : bld.exec_all().group(1, 0);
|
|
|
|
|
brw_reg small_dest = dest.is_scalar ? dest : ubld.vgrf(dest.type);
|
2023-05-05 13:12:25 +03:00
|
|
|
|
2025-09-15 17:03:02 -07:00
|
|
|
if (!dest.is_scalar)
|
|
|
|
|
ubld.UNDEF(small_dest);
|
|
|
|
|
|
|
|
|
|
ubld.emit(SHADER_OPCODE_MOV_RELOC_IMM, retype(small_dest, BRW_TYPE_D),
|
brw/nir: Treat load_reloc_const_intel as convergent
shader-db:
Lunar Lake, Meteor Lake, DG2, and Tiger Lake had similar results. (Lunar Lake shown)
Lunar Lake
total instructions in shared programs: 18096549 -> 18096537 (<.01%)
instructions in affected programs: 26128 -> 26116 (-0.05%)
helped: 7 / HURT: 2
total cycles in shared programs: 922073090 -> 922093922 (<.01%)
cycles in affected programs: 10574198 -> 10595030 (0.20%)
helped: 19 / HURT: 76
Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20503943 -> 20504053 (<.01%)
instructions in affected programs: 23378 -> 23488 (0.47%)
helped: 6 / HURT: 5
total cycles in shared programs: 875477036 -> 875480112 (<.01%)
cycles in affected programs: 13840528 -> 13843604 (0.02%)
helped: 22 / HURT: 55
total spills in shared programs: 4546 -> 4552 (0.13%)
spills in affected programs: 8 -> 14 (75.00%)
helped: 0 / HURT: 1
total fills in shared programs: 5280 -> 5298 (0.34%)
fills in affected programs: 24 -> 42 (75.00%)
helped: 0 / HURT: 1
One compute shader in Tomb Raider was hurt for spills and fills.
fossil-db:
Lunar Lake
Totals:
Instrs: 141808815 -> 141808714 (-0.00%); split: -0.00%, +0.00%
Cycle count: 22185066952 -> 22177889310 (-0.03%); split: -0.05%, +0.02%
Spill count: 69859 -> 69892 (+0.05%); split: -0.03%, +0.07%
Fill count: 128344 -> 128313 (-0.02%); split: -0.04%, +0.01%
Scratch Memory Size: 5833728 -> 5829632 (-0.07%)
Totals from 13384 (2.43% of 551446) affected shaders:
Instrs: 13852162 -> 13852061 (-0.00%); split: -0.00%, +0.00%
Cycle count: 7691993336 -> 7684815694 (-0.09%); split: -0.15%, +0.06%
Spill count: 53266 -> 53299 (+0.06%); split: -0.03%, +0.10%
Fill count: 96492 -> 96461 (-0.03%); split: -0.05%, +0.02%
Scratch Memory Size: 3827712 -> 3823616 (-0.11%)
Meteor Lake and DG2 had similar results. (Meteor Lake shown)
Totals:
Instrs: 152744735 -> 152744298 (-0.00%); split: -0.00%, +0.00%
Cycle count: 17400199290 -> 17410258529 (+0.06%); split: -0.01%, +0.07%
Max live registers: 31887208 -> 31887206 (-0.00%)
Totals from 12435 (1.96% of 633315) affected shaders:
Instrs: 13445310 -> 13444873 (-0.00%); split: -0.00%, +0.00%
Cycle count: 6941685096 -> 6951744335 (+0.14%); split: -0.03%, +0.18%
Max live registers: 1071302 -> 1071300 (-0.00%)
Tiger Lake and Ice Lake had similar results. (Tiger Lake shown)
Totals:
Instrs: 150644063 -> 150643944 (-0.00%); split: -0.00%, +0.00%
Cycle count: 15618718733 -> 15622092285 (+0.02%); split: -0.01%, +0.03%
Spill count: 58816 -> 58790 (-0.04%)
Fill count: 101054 -> 101065 (+0.01%)
Max live registers: 31792771 -> 31792766 (-0.00%); split: -0.00%, +0.00%
Totals from 13383 (2.12% of 632544) affected shaders:
Instrs: 12016285 -> 12016166 (-0.00%); split: -0.00%, +0.00%
Cycle count: 5239956851 -> 5243330403 (+0.06%); split: -0.02%, +0.08%
Spill count: 28977 -> 28951 (-0.09%)
Fill count: 47568 -> 47579 (+0.02%)
Max live registers: 1001554 -> 1001549 (-0.00%); split: -0.00%, +0.00%
Skylake
Totals:
Instrs: 140943195 -> 140943154 (-0.00%); split: -0.00%, +0.00%
Cycle count: 14818940190 -> 14816706154 (-0.02%); split: -0.02%, +0.00%
Max live registers: 31663173 -> 31663168 (-0.00%); split: -0.00%, +0.00%
Totals from 12625 (2.01% of 629351) affected shaders:
Instrs: 11598223 -> 11598182 (-0.00%); split: -0.00%, +0.00%
Cycle count: 4519027823 -> 4516793787 (-0.05%); split: -0.05%, +0.00%
Max live registers: 970275 -> 970270 (-0.00%); split: -0.00%, +0.00%
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-07-06 09:18:44 -07:00
|
|
|
brw_imm_ud(id), brw_imm_ud(base));
|
2025-09-15 17:03:02 -07:00
|
|
|
|
|
|
|
|
/* Copy propagation will get rid of this MOV. */
|
|
|
|
|
if (!dest.is_scalar)
|
|
|
|
|
bld.MOV(dest, component(small_dest, 0));
|
|
|
|
|
|
2020-08-08 13:56:16 -05:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2025-04-16 08:54:14 +03:00
|
|
|
case nir_intrinsic_load_uniform:
|
|
|
|
|
case nir_intrinsic_load_push_constant: {
|
2018-02-20 10:28:41 +01:00
|
|
|
/* Offsets are in bytes but they should always be aligned to
|
|
|
|
|
* the type size
|
|
|
|
|
*/
|
2022-08-13 01:11:58 -07:00
|
|
|
unsigned base_offset = nir_intrinsic_base(instr);
|
2024-04-21 00:57:59 -07:00
|
|
|
assert(base_offset % 4 == 0 || base_offset % brw_type_size_bytes(dest.type) == 0);
|
2015-11-10 21:12:47 -08:00
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg src = brw_uniform_reg(base_offset / 4, dest.type);
|
2015-03-18 15:18:54 -07:00
|
|
|
|
2018-10-20 09:55:28 -05:00
|
|
|
if (nir_src_is_const(instr->src[0])) {
|
|
|
|
|
unsigned load_offset = nir_src_as_uint(instr->src[0]);
|
2024-04-21 00:57:59 -07:00
|
|
|
assert(load_offset % brw_type_size_bytes(dest.type) == 0);
|
2022-08-13 01:11:58 -07:00
|
|
|
/* The base offset can only handle 32-bit units, so for 16-bit
|
|
|
|
|
* data take the modulo of the offset with 4 bytes and add it to
|
|
|
|
|
* the offset to read from within the source register.
|
2018-02-20 10:28:41 +01:00
|
|
|
*/
|
2022-08-13 01:11:58 -07:00
|
|
|
src.offset = load_offset + base_offset % 4;
|
2015-11-24 15:12:20 -08:00
|
|
|
|
|
|
|
|
for (unsigned j = 0; j < instr->num_components; j++) {
|
2024-02-01 15:02:37 -08:00
|
|
|
xbld.MOV(offset(dest, xbld, j), offset(src, xbld, j));
|
2015-11-24 15:12:20 -08:00
|
|
|
}
|
2015-11-25 14:14:05 -08:00
|
|
|
} else {
|
2025-01-15 13:27:05 -08:00
|
|
|
brw_reg indirect = retype(get_nir_src(ntb, instr->src[0], 0),
|
2024-04-20 17:08:02 -07:00
|
|
|
BRW_TYPE_UD);
|
2015-05-19 16:57:43 -07:00
|
|
|
|
2015-11-24 15:12:20 -08:00
|
|
|
/* We need to pass a size to the MOV_INDIRECT but we don't want it to
|
|
|
|
|
* go past the end of the uniform. In order to keep the n'th
|
|
|
|
|
* component from running past, we subtract off the size of all but
|
|
|
|
|
* one component of the vector.
|
|
|
|
|
*/
|
2022-08-13 01:11:58 -07:00
|
|
|
assert(nir_intrinsic_range(instr) >=
|
2024-04-21 00:57:59 -07:00
|
|
|
instr->num_components * brw_type_size_bytes(dest.type));
|
2022-08-13 01:11:58 -07:00
|
|
|
unsigned read_size = nir_intrinsic_range(instr) -
|
2024-04-21 00:57:59 -07:00
|
|
|
(instr->num_components - 1) * brw_type_size_bytes(dest.type);
|
2015-11-24 15:12:20 -08:00
|
|
|
|
2024-02-15 02:51:39 -08:00
|
|
|
bool supports_64bit_indirects = !intel_device_info_is_9lp(devinfo);
|
2016-06-13 08:29:53 +02:00
|
|
|
|
2024-04-21 00:57:59 -07:00
|
|
|
if (brw_type_size_bytes(dest.type) != 8 || supports_64bit_indirects) {
|
2017-02-13 13:24:18 +01:00
|
|
|
for (unsigned j = 0; j < instr->num_components; j++) {
|
2024-02-01 15:02:37 -08:00
|
|
|
xbld.emit(SHADER_OPCODE_MOV_INDIRECT,
|
|
|
|
|
offset(dest, xbld, j), offset(src, xbld, j),
|
|
|
|
|
indirect, brw_imm_ud(read_size));
|
2017-02-13 13:24:18 +01:00
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
const unsigned num_mov_indirects =
|
2024-04-21 00:57:59 -07:00
|
|
|
brw_type_size_bytes(dest.type) / brw_type_size_bytes(BRW_TYPE_UD);
|
2017-02-13 13:24:18 +01:00
|
|
|
/* We read a little bit less per MOV INDIRECT, as they are now
|
|
|
|
|
* 32-bits ones instead of 64-bit. Fix read_size then.
|
|
|
|
|
*/
|
|
|
|
|
const unsigned read_size_32bit = read_size -
|
2024-04-21 00:57:59 -07:00
|
|
|
(num_mov_indirects - 1) * brw_type_size_bytes(BRW_TYPE_UD);
|
2017-02-13 13:24:18 +01:00
|
|
|
for (unsigned j = 0; j < instr->num_components; j++) {
|
|
|
|
|
for (unsigned i = 0; i < num_mov_indirects; i++) {
|
2024-02-01 15:02:37 -08:00
|
|
|
xbld.emit(SHADER_OPCODE_MOV_INDIRECT,
|
|
|
|
|
subscript(offset(dest, xbld, j), BRW_TYPE_UD, i),
|
|
|
|
|
subscript(offset(src, xbld, j), BRW_TYPE_UD, i),
|
|
|
|
|
indirect, brw_imm_ud(read_size_32bit));
|
2017-02-13 13:24:18 +01:00
|
|
|
}
|
2016-06-13 08:29:53 +02:00
|
|
|
}
|
2015-11-24 15:12:20 -08:00
|
|
|
}
|
2014-08-15 10:32:07 -07:00
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2023-06-06 18:03:26 +03:00
|
|
|
case nir_intrinsic_load_ubo:
|
|
|
|
|
case nir_intrinsic_load_ubo_uniform_block_intel: {
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg surface, surface_handle;
|
2024-06-07 18:50:04 +03:00
|
|
|
bool no_mask_handle = false;
|
2023-01-13 12:29:30 +02:00
|
|
|
|
2023-11-20 21:36:14 -08:00
|
|
|
if (get_nir_src_bindless(ntb, instr->src[0]))
|
2024-06-07 18:50:04 +03:00
|
|
|
surface_handle = get_nir_buffer_intrinsic_index(ntb, bld, instr, &no_mask_handle);
|
2023-01-13 12:29:30 +02:00
|
|
|
else
|
2024-06-07 18:50:04 +03:00
|
|
|
surface = get_nir_buffer_intrinsic_index(ntb, bld, instr, &no_mask_handle);
|
2014-08-15 10:32:07 -07:00
|
|
|
|
2024-12-27 02:47:51 -08:00
|
|
|
const unsigned first_component =
|
|
|
|
|
nir_def_first_component_read(&instr->def);
|
|
|
|
|
const unsigned last_component =
|
|
|
|
|
nir_def_last_component_read(&instr->def);
|
|
|
|
|
const unsigned num_components = last_component - first_component + 1;
|
2024-09-10 02:15:10 -07:00
|
|
|
|
2018-10-20 09:55:28 -05:00
|
|
|
if (!nir_src_is_const(instr->src[1])) {
|
2024-08-21 15:26:11 -07:00
|
|
|
s.prog_data->has_ubo_pull = true;
|
|
|
|
|
|
2023-06-06 18:03:26 +03:00
|
|
|
if (instr->intrinsic == nir_intrinsic_load_ubo) {
|
brw/nir: Treat some load_ubo as convergent
v2: Fix for Xe2.
No changes in shader-db or fossil-db on Lunar Lake, Meteor Lake, or DG2.
shader-db:
Tiger Lake, Ice Lake, and Skylake had similar results. (Tiger Lake shown)
total instructions in shared programs: 19626547 -> 19634353 (0.04%)
instructions in affected programs: 1591181 -> 1598987 (0.49%)
helped: 925 / HURT: 3595
total cycles in shared programs: 865236718 -> 866682659 (0.17%)
cycles in affected programs: 151284264 -> 152730205 (0.96%)
helped: 3430 / HURT: 5510
total sends in shared programs: 1032237 -> 1032233 (<.01%)
sends in affected programs: 20 -> 16 (-20.00%)
helped: 4 / HURT: 0
LOST: 48
GAINED: 141
fossil-db:
Tiger Lake, Ice Lake, and Skylake had similar results. (Tiger Lake shown)
Totals:
Instrs: 150662952 -> 150641175 (-0.01%); split: -0.03%, +0.02%
Subgroup size: 7768880 -> 7768888 (+0.00%)
Send messages: 7502265 -> 7502044 (-0.00%)
Cycle count: 15621785298 -> 15618640525 (-0.02%); split: -0.06%, +0.04%
Spill count: 58818 -> 58816 (-0.00%)
Fill count: 101063 -> 101054 (-0.01%)
Max live registers: 31795403 -> 31792179 (-0.01%); split: -0.01%, +0.00%
Max dispatch width: 5572160 -> 5571488 (-0.01%); split: +0.00%, -0.01%
Totals from 10278 (1.62% of 632539) affected shaders:
Instrs: 5276493 -> 5254716 (-0.41%); split: -0.89%, +0.48%
Subgroup size: 156432 -> 156440 (+0.01%)
Send messages: 279259 -> 279038 (-0.08%)
Cycle count: 6483576378 -> 6480431605 (-0.05%); split: -0.16%, +0.11%
Spill count: 27133 -> 27131 (-0.01%)
Fill count: 49384 -> 49375 (-0.02%)
Max live registers: 675781 -> 672557 (-0.48%); split: -0.49%, +0.01%
Max dispatch width: 97256 -> 96584 (-0.69%); split: +0.08%, -0.77%
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-02-14 16:22:45 -08:00
|
|
|
/* load_ubo with non-constant offset. The offset might still be
|
|
|
|
|
* uniform on non-LSC platforms when loading fewer than 4
|
|
|
|
|
* components.
|
|
|
|
|
*/
|
2025-01-15 13:27:05 -08:00
|
|
|
brw_reg base_offset = retype(get_nir_src(ntb, instr->src[1], 0),
|
2024-04-20 17:08:02 -07:00
|
|
|
BRW_TYPE_UD);
|
2025-05-29 11:30:59 +03:00
|
|
|
if (nir_intrinsic_has_base(instr)) {
|
2025-05-30 09:06:40 +03:00
|
|
|
struct brw_reg imm = brw_imm_int(base_offset.type,
|
|
|
|
|
nir_intrinsic_base(instr));
|
2025-05-29 11:30:59 +03:00
|
|
|
base_offset = bld.ADD(base_offset, imm);
|
|
|
|
|
}
|
2023-06-06 18:03:26 +03:00
|
|
|
|
2024-04-21 00:57:59 -07:00
|
|
|
const unsigned comps_per_load = brw_type_size_bytes(dest.type) == 8 ? 2 : 4;
|
intel/fs: Don't rely on CSE for VARYING_PULL_CONSTANT_LOAD
In the past, we didn't have a good solution for combining scalar loads
with a variable index plus a constant offset. To handle that, we took
our load offset and rounded it down to the nearest vec4, loaded an
entire vec4, and trusted in the backend CSE pass to detect loads from
the same address and remove redundant ones.
These days, nir_opt_load_store_vectorize() does a good job of taking
those scalar loads and combining them into vector loads for us, so we
no longer need to do this trick. In fact, it can be better not to:
our offset need only be 4 byte (scalar) aligned, but we were making it
16 byte (vec4) aligned. So if you wanted to load an unaligned vec2,
we might actually load two vec4's (___X | Y___) instead of doing a
single load at the starting offset.
This should also reduce the work the backend CSE pass has to do,
since we just emit a single VARYING_PULL_CONSTANT_LOAD instead of 4.
shader-db results on Alchemist:
- No changes in SEND count or spills/fills
- Instructions: helped 95, hurt 100, +/- 1-3 instructions
- Cycles: helped 3411 hurt 1868, -0.01% (-0.28% in affected)
- SIMD32: gained 5, lost 3
fossil-db results on Alchemist:
- Instrs: 161381427 -> 161384130 (+0.00%); split: -0.00%, +0.00%
- Cycles: 14258305873 -> 14145884365 (-0.79%); split: -0.95%, +0.16%
- SIMD32: Gained 42, lost 26
- Totals from 56285 (8.63% of 652236) affected shaders:
- Instrs: 13318308 -> 13321011 (+0.02%); split: -0.01%, +0.03%
- Cycles: 7464985282 -> 7352563774 (-1.51%); split: -1.82%, +0.31%
From this we can see that we aren't doing more loads than before
and the change is pretty inconsequential, but it requires less
optimizing to produce similar results.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27568>
2024-02-01 09:45:46 -08:00
|
|
|
|
2024-12-27 02:47:51 -08:00
|
|
|
for (unsigned i = first_component;
|
|
|
|
|
i <= last_component;
|
|
|
|
|
i += comps_per_load) {
|
|
|
|
|
const unsigned remaining = last_component + 1 - i;
|
brw/nir: Treat some load_ubo as convergent
v2: Fix for Xe2.
No changes in shader-db or fossil-db on Lunar Lake, Meteor Lake, or DG2.
shader-db:
Tiger Lake, Ice Lake, and Skylake had similar results. (Tiger Lake shown)
total instructions in shared programs: 19626547 -> 19634353 (0.04%)
instructions in affected programs: 1591181 -> 1598987 (0.49%)
helped: 925 / HURT: 3595
total cycles in shared programs: 865236718 -> 866682659 (0.17%)
cycles in affected programs: 151284264 -> 152730205 (0.96%)
helped: 3430 / HURT: 5510
total sends in shared programs: 1032237 -> 1032233 (<.01%)
sends in affected programs: 20 -> 16 (-20.00%)
helped: 4 / HURT: 0
LOST: 48
GAINED: 141
fossil-db:
Tiger Lake, Ice Lake, and Skylake had similar results. (Tiger Lake shown)
Totals:
Instrs: 150662952 -> 150641175 (-0.01%); split: -0.03%, +0.02%
Subgroup size: 7768880 -> 7768888 (+0.00%)
Send messages: 7502265 -> 7502044 (-0.00%)
Cycle count: 15621785298 -> 15618640525 (-0.02%); split: -0.06%, +0.04%
Spill count: 58818 -> 58816 (-0.00%)
Fill count: 101063 -> 101054 (-0.01%)
Max live registers: 31795403 -> 31792179 (-0.01%); split: -0.01%, +0.00%
Max dispatch width: 5572160 -> 5571488 (-0.01%); split: +0.00%, -0.01%
Totals from 10278 (1.62% of 632539) affected shaders:
Instrs: 5276493 -> 5254716 (-0.41%); split: -0.89%, +0.48%
Subgroup size: 156432 -> 156440 (+0.01%)
Send messages: 279259 -> 279038 (-0.08%)
Cycle count: 6483576378 -> 6480431605 (-0.05%); split: -0.16%, +0.11%
Spill count: 27133 -> 27131 (-0.01%)
Fill count: 49384 -> 49375 (-0.02%)
Max live registers: 675781 -> 672557 (-0.48%); split: -0.49%, +0.01%
Max dispatch width: 97256 -> 96584 (-0.69%); split: +0.08%, -0.77%
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-02-14 16:22:45 -08:00
|
|
|
xbld.VARYING_PULL_CONSTANT_LOAD(offset(dest, xbld, i),
|
|
|
|
|
surface, surface_handle,
|
|
|
|
|
base_offset,
|
|
|
|
|
i * brw_type_size_bytes(dest.type),
|
|
|
|
|
instr->def.bit_size / 8,
|
|
|
|
|
MIN2(remaining, comps_per_load));
|
intel/fs: Don't rely on CSE for VARYING_PULL_CONSTANT_LOAD
In the past, we didn't have a good solution for combining scalar loads
with a variable index plus a constant offset. To handle that, we took
our load offset and rounded it down to the nearest vec4, loaded an
entire vec4, and trusted in the backend CSE pass to detect loads from
the same address and remove redundant ones.
These days, nir_opt_load_store_vectorize() does a good job of taking
those scalar loads and combining them into vector loads for us, so we
no longer need to do this trick. In fact, it can be better not to:
our offset need only be 4 byte (scalar) aligned, but we were making it
16 byte (vec4) aligned. So if you wanted to load an unaligned vec2,
we might actually load two vec4's (___X | Y___) instead of doing a
single load at the starting offset.
This should also reduce the work the backend CSE pass has to do,
since we just emit a single VARYING_PULL_CONSTANT_LOAD instead of 4.
shader-db results on Alchemist:
- No changes in SEND count or spills/fills
- Instructions: helped 95, hurt 100, +/- 1-3 instructions
- Cycles: helped 3411 hurt 1868, -0.01% (-0.28% in affected)
- SIMD32: gained 5, lost 3
fossil-db results on Alchemist:
- Instrs: 161381427 -> 161384130 (+0.00%); split: -0.00%, +0.00%
- Cycles: 14258305873 -> 14145884365 (-0.79%); split: -0.95%, +0.16%
- SIMD32: Gained 42, lost 26
- Totals from 56285 (8.63% of 652236) affected shaders:
- Instrs: 13318308 -> 13321011 (+0.02%); split: -0.01%, +0.03%
- Cycles: 7464985282 -> 7352563774 (-1.51%); split: -1.82%, +0.31%
From this we can see that we aren't doing more loads than before
and the change is pretty inconsequential, but it requires less
optimizing to produce similar results.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27568>
2024-02-01 09:45:46 -08:00
|
|
|
}
|
2023-06-06 18:03:26 +03:00
|
|
|
} else {
|
2024-08-21 15:26:11 -07:00
|
|
|
/* load_ubo_uniform_block_intel with non-constant offset */
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_memory_access(ntb, bld, xbld, instr);
|
2023-06-06 18:03:26 +03:00
|
|
|
}
|
2014-12-08 17:34:52 -08:00
|
|
|
} else {
|
2016-01-13 10:17:10 +01:00
|
|
|
/* Even if we are loading doubles, a pull constant load will load
|
|
|
|
|
* a 32-bit vec4, so should only reserve vgrf space for that. If we
|
|
|
|
|
* need to load a full dvec4 we will have to emit 2 loads. This is
|
|
|
|
|
* similar to demote_pull_constants(), except that in that case we
|
|
|
|
|
* see individual accesses to each component of the vector and then
|
|
|
|
|
* we let CSE deal with duplicate loads. Here we see a vector access
|
|
|
|
|
* and we have to split it if necessary.
|
|
|
|
|
*/
|
2024-04-21 00:57:59 -07:00
|
|
|
const unsigned type_size = brw_type_size_bytes(dest.type);
|
2024-12-27 02:47:51 -08:00
|
|
|
const unsigned load_offset =
|
2025-05-29 11:30:59 +03:00
|
|
|
nir_src_as_uint(instr->src[1]) + first_component * type_size +
|
|
|
|
|
(nir_intrinsic_has_base(instr) ? nir_intrinsic_base(instr) : 0);
|
2024-12-27 02:47:51 -08:00
|
|
|
const unsigned end_offset = load_offset + num_components * type_size;
|
2022-12-27 11:26:02 +02:00
|
|
|
const unsigned ubo_block =
|
|
|
|
|
brw_nir_ubo_surface_index_get_push_block(instr->src[0]);
|
|
|
|
|
const unsigned offset_256b = load_offset / 32;
|
2024-12-27 02:47:51 -08:00
|
|
|
const unsigned end_256b = DIV_ROUND_UP(end_offset, 32);
|
2016-11-29 05:20:20 -08:00
|
|
|
|
|
|
|
|
/* See if we've selected this as a push constant candidate */
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg push_reg;
|
2022-12-27 11:26:02 +02:00
|
|
|
for (int i = 0; i < 4; i++) {
|
2023-12-05 17:16:34 -08:00
|
|
|
const struct brw_ubo_range *range = &s.prog_data->ubo_ranges[i];
|
2022-12-27 11:26:02 +02:00
|
|
|
if (range->block == ubo_block &&
|
|
|
|
|
offset_256b >= range->start &&
|
2023-09-07 13:15:41 +03:00
|
|
|
end_256b <= range->start + range->length) {
|
2022-12-27 11:26:02 +02:00
|
|
|
|
2024-06-18 15:25:22 -07:00
|
|
|
push_reg = brw_uniform_reg(UBO_START + i, dest.type);
|
2022-12-27 11:26:02 +02:00
|
|
|
push_reg.offset = load_offset - 32 * range->start;
|
|
|
|
|
break;
|
2016-11-29 05:20:20 -08:00
|
|
|
}
|
2022-12-27 11:26:02 +02:00
|
|
|
}
|
2016-11-29 05:20:20 -08:00
|
|
|
|
2022-12-27 11:26:02 +02:00
|
|
|
if (push_reg.file != BAD_FILE) {
|
2024-12-27 02:47:51 -08:00
|
|
|
for (unsigned i = first_component; i <= last_component; i++) {
|
brw/nir: Treat some load_ubo as convergent
v2: Fix for Xe2.
No changes in shader-db or fossil-db on Lunar Lake, Meteor Lake, or DG2.
shader-db:
Tiger Lake, Ice Lake, and Skylake had similar results. (Tiger Lake shown)
total instructions in shared programs: 19626547 -> 19634353 (0.04%)
instructions in affected programs: 1591181 -> 1598987 (0.49%)
helped: 925 / HURT: 3595
total cycles in shared programs: 865236718 -> 866682659 (0.17%)
cycles in affected programs: 151284264 -> 152730205 (0.96%)
helped: 3430 / HURT: 5510
total sends in shared programs: 1032237 -> 1032233 (<.01%)
sends in affected programs: 20 -> 16 (-20.00%)
helped: 4 / HURT: 0
LOST: 48
GAINED: 141
fossil-db:
Tiger Lake, Ice Lake, and Skylake had similar results. (Tiger Lake shown)
Totals:
Instrs: 150662952 -> 150641175 (-0.01%); split: -0.03%, +0.02%
Subgroup size: 7768880 -> 7768888 (+0.00%)
Send messages: 7502265 -> 7502044 (-0.00%)
Cycle count: 15621785298 -> 15618640525 (-0.02%); split: -0.06%, +0.04%
Spill count: 58818 -> 58816 (-0.00%)
Fill count: 101063 -> 101054 (-0.01%)
Max live registers: 31795403 -> 31792179 (-0.01%); split: -0.01%, +0.00%
Max dispatch width: 5572160 -> 5571488 (-0.01%); split: +0.00%, -0.01%
Totals from 10278 (1.62% of 632539) affected shaders:
Instrs: 5276493 -> 5254716 (-0.41%); split: -0.89%, +0.48%
Subgroup size: 156432 -> 156440 (+0.01%)
Send messages: 279259 -> 279038 (-0.08%)
Cycle count: 6483576378 -> 6480431605 (-0.05%); split: -0.16%, +0.11%
Spill count: 27133 -> 27131 (-0.01%)
Fill count: 49384 -> 49375 (-0.02%)
Max live registers: 675781 -> 672557 (-0.48%); split: -0.49%, +0.01%
Max dispatch width: 97256 -> 96584 (-0.69%); split: +0.08%, -0.77%
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-02-14 16:22:45 -08:00
|
|
|
xbld.MOV(offset(dest, xbld, i),
|
2024-12-27 02:47:51 -08:00
|
|
|
byte_offset(push_reg,
|
|
|
|
|
(i - first_component) * type_size));
|
2016-11-29 05:20:20 -08:00
|
|
|
}
|
2022-12-27 11:26:02 +02:00
|
|
|
break;
|
2016-11-29 05:20:20 -08:00
|
|
|
}
|
|
|
|
|
|
2023-12-05 17:16:34 -08:00
|
|
|
s.prog_data->has_ubo_pull = true;
|
2019-09-09 22:21:17 -07:00
|
|
|
|
brw: Always use MEMORY_LOAD for load_ubo_uniform_block_intel intrinsics
Rather than emitting FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD to do block
loads that were cacheline aligned, loading entire cachelines at a time,
we now rely on NIR passes to group, CSE, and vectorize things into
appropriately sized blocks. This means that we'll usually still load
a cacheline, but we may load only 32B if we don't actually need anything
from the full 64B. Prior to Xe2, this saves us registers, and it ought
to save us some bandwidth as well as the response length can be lowered.
The cacheline-aligning hack was the main reason not to simply call
fs_nir_emit_memory_access(), so now we do that instead, porting yet
one more thing to the common memory opcode framework.
We unfortunately still emit the old FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD
opcode for non-block intrinsics. We'd have to clean up 16-bit handling
among other things in order to eliminate this, but we should in the
future.
fossil-db results on Alchemist for this and the previous patch together:
Instrs: 161481888 -> 161297588 (-0.11%); split: -0.12%, +0.01%
Subgroup size: 8102976 -> 8103000 (+0.00%)
Send messages: 7895489 -> 7846178 (-0.62%); split: -0.67%, +0.05%
Cycle count: 16583127302 -> 16703162264 (+0.72%); split: -0.57%, +1.29%
Spill count: 72316 -> 67212 (-7.06%); split: -7.25%, +0.19%
Fill count: 134457 -> 125970 (-6.31%); split: -6.83%, +0.52%
Scratch Memory Size: 4093952 -> 3787776 (-7.48%); split: -7.53%, +0.05%
Max live registers: 33037765 -> 32947425 (-0.27%); split: -0.28%, +0.00%
Max dispatch width: 5780288 -> 5778536 (-0.03%); split: +0.17%, -0.20%
Non SSA regs after NIR: 177862542 -> 178816944 (+0.54%); split: -0.06%, +0.60%
In particular, several titles see incredible reductions in spill/fills:
Shadow of the Tomb Raider: -65.96% / -65.44%
Batman: Arkham City GOTY: -53.49% / -28.57%
Witcher 3: -16.33% / -14.29%
Total War: Warhammer III: -9.60% / -10.14%
Assassins Creed Odyssey: -6.50% / -9.92%
Red Dead Redemption 2: -6.77% / -8.88%
Far Cry: New Dawn: -7.97% / -4.53%
Improves performance in many games on Arc A750:
Cyberpunk 2077: 5.8%
Witcher 3: 4%
Shadow of the Tomb Raider: 3.3%
Assassins Creed: Valhalla: 3%
Spiderman Remastered: 2.75%
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32888>
2025-01-02 01:16:16 -08:00
|
|
|
if (instr->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel) {
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_memory_access(ntb, bld, xbld, instr);
|
brw: Always use MEMORY_LOAD for load_ubo_uniform_block_intel intrinsics
Rather than emitting FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD to do block
loads that were cacheline aligned, loading entire cachelines at a time,
we now rely on NIR passes to group, CSE, and vectorize things into
appropriately sized blocks. This means that we'll usually still load
a cacheline, but we may load only 32B if we don't actually need anything
from the full 64B. Prior to Xe2, this saves us registers, and it ought
to save us some bandwidth as well as the response length can be lowered.
The cacheline-aligning hack was the main reason not to simply call
fs_nir_emit_memory_access(), so now we do that instead, porting yet
one more thing to the common memory opcode framework.
We unfortunately still emit the old FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD
opcode for non-block intrinsics. We'd have to clean up 16-bit handling
among other things in order to eliminate this, but we should in the
future.
fossil-db results on Alchemist for this and the previous patch together:
Instrs: 161481888 -> 161297588 (-0.11%); split: -0.12%, +0.01%
Subgroup size: 8102976 -> 8103000 (+0.00%)
Send messages: 7895489 -> 7846178 (-0.62%); split: -0.67%, +0.05%
Cycle count: 16583127302 -> 16703162264 (+0.72%); split: -0.57%, +1.29%
Spill count: 72316 -> 67212 (-7.06%); split: -7.25%, +0.19%
Fill count: 134457 -> 125970 (-6.31%); split: -6.83%, +0.52%
Scratch Memory Size: 4093952 -> 3787776 (-7.48%); split: -7.53%, +0.05%
Max live registers: 33037765 -> 32947425 (-0.27%); split: -0.28%, +0.00%
Max dispatch width: 5780288 -> 5778536 (-0.03%); split: +0.17%, -0.20%
Non SSA regs after NIR: 177862542 -> 178816944 (+0.54%); split: -0.06%, +0.60%
In particular, several titles see incredible reductions in spill/fills:
Shadow of the Tomb Raider: -65.96% / -65.44%
Batman: Arkham City GOTY: -53.49% / -28.57%
Witcher 3: -16.33% / -14.29%
Total War: Warhammer III: -9.60% / -10.14%
Assassins Creed Odyssey: -6.50% / -9.92%
Red Dead Redemption 2: -6.77% / -8.88%
Far Cry: New Dawn: -7.97% / -4.53%
Improves performance in many games on Arc A750:
Cyberpunk 2077: 5.8%
Witcher 3: 4%
Shadow of the Tomb Raider: 3.3%
Assassins Creed: Valhalla: 3%
Spiderman Remastered: 2.75%
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32888>
2025-01-02 01:16:16 -08:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2016-12-08 19:18:00 -08:00
|
|
|
const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
|
2024-12-29 15:41:04 -08:00
|
|
|
const brw_builder ubld = bld.exec_all().group(block_sz / 4, 0);
|
2016-12-08 20:05:18 -08:00
|
|
|
|
2024-09-10 02:15:10 -07:00
|
|
|
for (unsigned c = 0; c < num_components;) {
|
2018-10-20 09:55:28 -05:00
|
|
|
const unsigned base = load_offset + c * type_size;
|
2016-12-08 19:18:00 -08:00
|
|
|
/* Number of usable components in the next block-aligned load. */
|
2024-09-10 02:15:10 -07:00
|
|
|
const unsigned count = MIN2(num_components - c,
|
2016-12-08 19:18:00 -08:00
|
|
|
(block_sz - base % block_sz) / type_size);
|
2016-01-13 10:17:10 +01:00
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
const brw_reg packed_consts = ubld.vgrf(BRW_TYPE_UD);
|
|
|
|
|
brw_reg srcs[PULL_UNIFORM_CONSTANT_SRCS];
|
2023-01-13 12:29:30 +02:00
|
|
|
srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = surface;
|
|
|
|
|
srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE_HANDLE] = surface_handle;
|
|
|
|
|
srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = brw_imm_ud(base & ~(block_sz - 1));
|
|
|
|
|
srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = brw_imm_ud(block_sz);
|
2022-12-21 20:16:27 +02:00
|
|
|
|
|
|
|
|
ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, packed_consts,
|
|
|
|
|
srcs, PULL_UNIFORM_CONSTANT_SRCS);
|
2016-01-13 10:17:10 +01:00
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
const brw_reg consts =
|
2016-12-08 19:18:00 -08:00
|
|
|
retype(byte_offset(packed_consts, base & (block_sz - 1)),
|
|
|
|
|
dest.type);
|
2014-12-08 17:34:52 -08:00
|
|
|
|
2024-12-27 02:47:51 -08:00
|
|
|
for (unsigned d = 0; d < count; d++) {
|
|
|
|
|
xbld.MOV(offset(dest, xbld, first_component + c + d),
|
|
|
|
|
component(consts, d));
|
|
|
|
|
}
|
2014-12-08 17:34:52 -08:00
|
|
|
|
2016-05-19 12:50:01 +02:00
|
|
|
c += count;
|
2016-01-13 10:17:10 +01:00
|
|
|
}
|
2014-08-15 10:32:07 -07:00
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2014-12-03 17:03:19 -08:00
|
|
|
case nir_intrinsic_store_output: {
|
2019-07-19 17:38:04 -05:00
|
|
|
assert(nir_src_bit_size(instr->src[0]) == 32);
|
2024-02-12 08:43:34 -08:00
|
|
|
brw_reg src = get_nir_src(ntb, instr->src[0], -1);
|
2015-12-07 22:41:50 -08:00
|
|
|
|
2018-10-20 09:55:28 -05:00
|
|
|
unsigned store_offset = nir_src_as_uint(instr->src[1]);
|
2016-05-09 10:14:48 +02:00
|
|
|
unsigned num_components = instr->num_components;
|
2016-05-23 16:48:05 +10:00
|
|
|
unsigned first_component = nir_intrinsic_component(instr);
|
2016-05-09 10:14:48 +02:00
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg new_dest = retype(offset(s.outputs[instr->const_index[0]], bld,
|
2018-10-20 09:55:28 -05:00
|
|
|
4 * store_offset), src.type);
|
2024-08-15 23:04:23 -07:00
|
|
|
|
|
|
|
|
brw_combine_with_vec(bld, offset(new_dest, bld, first_component),
|
|
|
|
|
src, num_components);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2020-09-22 03:24:45 -05:00
|
|
|
case nir_intrinsic_get_ssbo_size: {
|
2019-01-12 10:58:33 -06:00
|
|
|
assert(nir_src_num_components(instr->src[0]) == 1);
|
2015-06-01 09:45:51 +02:00
|
|
|
|
2016-05-18 14:27:20 -07:00
|
|
|
/* A resinfo's sampler message is used to get the buffer size. The
|
|
|
|
|
* SIMD8's writeback message consists of four registers and SIMD16's
|
|
|
|
|
* writeback message consists of 8 destination registers (two per each
|
|
|
|
|
* component). Because we are only interested on the first channel of
|
|
|
|
|
* the first returned component, where resinfo returns the buffer size
|
|
|
|
|
* for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of
|
|
|
|
|
* the dispatch width.
|
|
|
|
|
*/
|
2024-12-29 15:41:04 -08:00
|
|
|
const brw_builder ubld = bld.scalar_group();
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg ret_payload = ubld.vgrf(BRW_TYPE_UD, 4);
|
2015-06-01 09:45:51 +02:00
|
|
|
|
2016-05-18 14:27:20 -07:00
|
|
|
/* Set LOD = 0 */
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg src_payload = ubld.MOV(brw_imm_ud(0));
|
2015-11-10 13:45:21 +01:00
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg srcs[GET_BUFFER_SIZE_SRCS];
|
2023-11-20 21:36:14 -08:00
|
|
|
srcs[get_nir_src_bindless(ntb, instr->src[0]) ?
|
2022-12-27 15:57:53 +02:00
|
|
|
GET_BUFFER_SIZE_SRC_SURFACE_HANDLE :
|
|
|
|
|
GET_BUFFER_SIZE_SRC_SURFACE] =
|
2023-11-20 21:21:54 -08:00
|
|
|
get_nir_buffer_intrinsic_index(ntb, bld, instr);
|
2022-12-27 15:32:24 +02:00
|
|
|
srcs[GET_BUFFER_SIZE_SRC_LOD] = src_payload;
|
2024-12-07 00:23:07 -08:00
|
|
|
brw_inst *inst = ubld.emit(SHADER_OPCODE_GET_BUFFER_SIZE, ret_payload,
|
2022-12-27 15:32:24 +02:00
|
|
|
srcs, GET_BUFFER_SIZE_SRCS);
|
2022-09-28 17:37:18 -07:00
|
|
|
inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
|
2015-10-30 11:10:02 +01:00
|
|
|
|
2018-01-30 09:59:34 +01:00
|
|
|
/* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting:
|
|
|
|
|
*
|
|
|
|
|
* "Out-of-bounds checking is always performed at a DWord granularity. If
|
|
|
|
|
* any part of the DWord is out-of-bounds then the whole DWord is
|
|
|
|
|
* considered out-of-bounds."
|
|
|
|
|
*
|
|
|
|
|
* This implies that types with size smaller than 4-bytes need to be
|
|
|
|
|
* padded if they don't complete the last dword of the buffer. But as we
|
|
|
|
|
* need to maintain the original size we need to reverse the padding
|
|
|
|
|
* calculation to return the correct size to know the number of elements
|
|
|
|
|
* of an unsized array. As we stored in the last two bits of the surface
|
|
|
|
|
* size the needed padding for the buffer, we calculate here the
|
|
|
|
|
* original buffer_size reversing the surface_size calculation:
|
|
|
|
|
*
|
|
|
|
|
* surface_size = isl_align(buffer_size, 4) +
|
|
|
|
|
* (isl_align(buffer_size) - buffer_size)
|
|
|
|
|
*
|
|
|
|
|
* buffer_size = surface_size & ~3 - surface_size & 3
|
|
|
|
|
*/
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg size_padding = ubld.AND(ret_payload, brw_imm_ud(3));
|
|
|
|
|
brw_reg size_aligned4 = ubld.AND(ret_payload, brw_imm_ud(~3));
|
|
|
|
|
brw_reg buffer_size = ubld.ADD(size_aligned4, negate(size_padding));
|
2018-01-30 09:59:34 +01:00
|
|
|
|
|
|
|
|
bld.MOV(retype(dest, ret_payload.type), component(buffer_size, 0));
|
2015-06-01 09:45:51 +02:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2019-02-22 15:28:24 -06:00
|
|
|
case nir_intrinsic_load_subgroup_size:
|
|
|
|
|
/* This should only happen for fragment shaders because every other case
|
|
|
|
|
* is lowered in NIR so we can optimize on it.
|
|
|
|
|
*/
|
2023-12-05 17:16:34 -08:00
|
|
|
assert(s.stage == MESA_SHADER_FRAGMENT);
|
2024-04-20 17:08:02 -07:00
|
|
|
bld.MOV(retype(dest, BRW_TYPE_D), brw_imm_d(s.dispatch_width));
|
2019-02-22 15:28:24 -06:00
|
|
|
break;
|
|
|
|
|
|
2017-08-31 21:56:43 -07:00
|
|
|
case nir_intrinsic_load_subgroup_invocation:
|
intel/brw: Use CSE for LOAD_SUBGROUP_INVOCATION
Instead of emitting a single one at the top, and making reference to it,
emit the virtual instruction as needed and let CSE do its job.
Since load_subgroup_invocation now can appear not at the start of the
shader, use UNDEF in all cases to ensure that the liveness of the
destination doesn't extend to the first partial write done here (it was
being used only for SIMD > 8 before).
Note this option was considered in the past
6132992cdb858268af0e985727d80e4140be389c but at the time dismissed. The
difference now is that the lowering of the virtual instruction happens
earlier than the scheduling.
The motivation for this change is to allow passes other than the NIR
conversion to use this value. The alternative of storing a `brw_reg` in
the shader (instead of NIR state) gets complicated by passes like
compact_vgrfs, that move VGRFs around (and update the instructions).
This and maybe other passes would have to care about the brw_reg.
Fossil-db numbers, TGL
```
*** Shaders only in 'after' results are ignored:
steam-native/shadow_of_the_tomb_raider/c683ea5067ee157d/fs.32/0, steam-native/shadow_of_the_tomb_raider/f4df450c3cef40b4/fs.32/0, steam-native/shadow_of_the_tomb_raider/94b708fb8e3d9597/fs.32/0, steam-native/shadow_of_the_tomb_raider/19d44c328edabd30/fs.32/0, steam-native/shadow_of_the_tomb_raider/8a7dcbd5a74a19bf/fs.32/0, and 366 more
from 4 apps: steam-dxvk/alan_wake, steam-dxvk/batman_arkham_city_goty, steam-dxvk/batman_arkham_origins, steam-native/shadow_of_the_tomb_raider
*** Shaders only in 'before' results are ignored:
steam-dxvk/octopath_traveler/aaa3d10acb726906/fs.32/0, steam-dxvk/batman_arkham_origins/e6872ae23569c35f/fs.32/0, steam-dxvk/octopath_traveler/fd33a99fa5c271a8/fs.32/0, steam-dxvk/octopath_traveler/9a077cdc16f24520/fs.32/0, steam-dxvk/batman_arkham_city_goty/fac7b438ad52f622/fs.32/0, and 12 more
from 4 apps: steam-dxvk/batman_arkham_city_goty, steam-dxvk/batman_arkham_origins, steam-dxvk/octopath_traveler, steam-native/shadow_of_the_tomb_raider
Totals:
Instrs: 149752381 -> 149751337 (-0.00%); split: -0.00%, +0.00%
Cycle count: 11553609349 -> 11549970294 (-0.03%); split: -0.06%, +0.03%
Spill count: 42763 -> 42764 (+0.00%); split: -0.01%, +0.01%
Fill count: 75650 -> 75651 (+0.00%); split: -0.00%, +0.01%
Max live registers: 31725096 -> 31671792 (-0.17%)
Max dispatch width: 5546008 -> 5551672 (+0.10%); split: +0.11%, -0.00%
Totals from 52574 (8.34% of 630441) affected shaders:
Instrs: 9535159 -> 9534115 (-0.01%); split: -0.03%, +0.02%
Cycle count: 1006627109 -> 1002988054 (-0.36%); split: -0.65%, +0.29%
Spill count: 11588 -> 11589 (+0.01%); split: -0.03%, +0.03%
Fill count: 21057 -> 21058 (+0.00%); split: -0.01%, +0.02%
Max live registers: 1992493 -> 1939189 (-2.68%)
Max dispatch width: 559696 -> 565360 (+1.01%); split: +1.06%, -0.05%
```
and DG2
```
*** Shaders only in 'after' results are ignored:
steam-native/shadow_of_the_tomb_raider/1f95a9d3db21df85/fs.32/0, steam-native/shadow_of_the_tomb_raider/56b87c4a46613a2a/fs.32/0, steam-native/shadow_of_the_tomb_raider/a74b4137f85dbbd3/fs.32/0, steam-native/shadow_of_the_tomb_raider/e07e38d3f48e8402/fs.32/0, steam-native/shadow_of_the_tomb_raider/206336789c48996c/fs.32/0, and 268 more
from 4 apps: steam-dxvk/alan_wake, steam-dxvk/batman_arkham_city_goty, steam-dxvk/batman_arkham_origins, steam-native/shadow_of_the_tomb_raider
*** Shaders only in 'before' results are ignored:
steam-native/shadow_of_the_tomb_raider/0420d7c3a2ea99ec/fs.32/0, steam-native/shadow_of_the_tomb_raider/2ff39f8bf7d24abb/fs.32/0, steam-native/shadow_of_the_tomb_raider/92d7be2824bd9659/fs.32/0, steam-native/shadow_of_the_tomb_raider/f09ca6d2ecf18015/fs.32/0, steam-native/shadow_of_the_tomb_raider/490f8ffd59e52949/fs.32/0, and 205 more
from 3 apps: steam-dxvk/batman_arkham_city_goty, steam-dxvk/batman_arkham_origins, steam-native/shadow_of_the_tomb_raider
Totals:
Instrs: 151597619 -> 151599914 (+0.00%); split: -0.00%, +0.00%
Subgroup size: 7699776 -> 7699784 (+0.00%)
Cycle count: 12738501989 -> 12739841170 (+0.01%); split: -0.01%, +0.02%
Spill count: 61283 -> 61274 (-0.01%)
Fill count: 119886 -> 119849 (-0.03%)
Max live registers: 31810432 -> 31758920 (-0.16%)
Max dispatch width: 5540128 -> 5541136 (+0.02%); split: +0.08%, -0.06%
Totals from 49286 (7.81% of 631231) affected shaders:
Instrs: 8607753 -> 8610048 (+0.03%); split: -0.01%, +0.04%
Subgroup size: 857752 -> 857760 (+0.00%)
Cycle count: 305939495 -> 307278676 (+0.44%); split: -0.28%, +0.72%
Spill count: 6339 -> 6330 (-0.14%)
Fill count: 12571 -> 12534 (-0.29%)
Max live registers: 1788346 -> 1736834 (-2.88%)
Max dispatch width: 510920 -> 511928 (+0.20%); split: +0.85%, -0.66%
```
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30489>
2024-07-31 22:46:20 -07:00
|
|
|
bld.MOV(retype(dest, BRW_TYPE_UD), bld.LOAD_SUBGROUP_INVOCATION());
|
2016-05-22 16:33:44 -07:00
|
|
|
break;
|
|
|
|
|
|
2017-06-22 16:46:39 -07:00
|
|
|
case nir_intrinsic_load_subgroup_eq_mask:
|
|
|
|
|
case nir_intrinsic_load_subgroup_ge_mask:
|
|
|
|
|
case nir_intrinsic_load_subgroup_gt_mask:
|
|
|
|
|
case nir_intrinsic_load_subgroup_le_mask:
|
|
|
|
|
case nir_intrinsic_load_subgroup_lt_mask:
|
2025-07-23 09:17:35 +02:00
|
|
|
UNREACHABLE("not reached");
|
2017-06-22 16:46:39 -07:00
|
|
|
|
2024-07-24 13:33:23 -04:00
|
|
|
case nir_intrinsic_ddx_fine:
|
|
|
|
|
bld.emit(FS_OPCODE_DDX_FINE, retype(dest, BRW_TYPE_F),
|
2025-01-15 13:27:05 -08:00
|
|
|
retype(get_nir_src(ntb, instr->src[0], 0), BRW_TYPE_F));
|
2024-07-24 13:33:23 -04:00
|
|
|
break;
|
|
|
|
|
case nir_intrinsic_ddx:
|
|
|
|
|
case nir_intrinsic_ddx_coarse:
|
|
|
|
|
bld.emit(FS_OPCODE_DDX_COARSE, retype(dest, BRW_TYPE_F),
|
2025-01-15 13:27:05 -08:00
|
|
|
retype(get_nir_src(ntb, instr->src[0], 0), BRW_TYPE_F));
|
2024-07-24 13:33:23 -04:00
|
|
|
break;
|
|
|
|
|
case nir_intrinsic_ddy_fine:
|
|
|
|
|
bld.emit(FS_OPCODE_DDY_FINE, retype(dest, BRW_TYPE_F),
|
2025-01-15 13:27:05 -08:00
|
|
|
retype(get_nir_src(ntb, instr->src[0], 0), BRW_TYPE_F));
|
2024-07-24 13:33:23 -04:00
|
|
|
break;
|
|
|
|
|
case nir_intrinsic_ddy:
|
|
|
|
|
case nir_intrinsic_ddy_coarse:
|
|
|
|
|
bld.emit(FS_OPCODE_DDY_COARSE, retype(dest, BRW_TYPE_F),
|
2025-01-15 13:27:05 -08:00
|
|
|
retype(get_nir_src(ntb, instr->src[0], 0), BRW_TYPE_F));
|
2024-07-24 13:33:23 -04:00
|
|
|
break;
|
|
|
|
|
|
2024-09-04 10:07:52 -07:00
|
|
|
case nir_intrinsic_vote_any:
|
|
|
|
|
case nir_intrinsic_vote_all:
|
2024-03-11 14:21:12 -07:00
|
|
|
case nir_intrinsic_quad_vote_any:
|
|
|
|
|
case nir_intrinsic_quad_vote_all: {
|
2024-09-04 10:07:52 -07:00
|
|
|
const bool any = instr->intrinsic == nir_intrinsic_vote_any ||
|
|
|
|
|
instr->intrinsic == nir_intrinsic_quad_vote_any;
|
|
|
|
|
const bool quad = instr->intrinsic == nir_intrinsic_quad_vote_any ||
|
|
|
|
|
instr->intrinsic == nir_intrinsic_quad_vote_all;
|
2024-03-11 14:21:12 -07:00
|
|
|
|
2025-01-15 13:27:05 -08:00
|
|
|
brw_reg cond = get_nir_src(ntb, instr->src[0], 0);
|
2024-09-04 10:07:52 -07:00
|
|
|
const unsigned cluster_size = quad ? 4 : s.dispatch_width;
|
2024-03-11 14:21:12 -07:00
|
|
|
|
2024-09-04 10:07:52 -07:00
|
|
|
bld.emit(any ? SHADER_OPCODE_VOTE_ANY : SHADER_OPCODE_VOTE_ALL,
|
|
|
|
|
retype(dest, BRW_TYPE_UD), cond, brw_imm_ud(cluster_size));
|
2024-03-11 14:21:12 -07:00
|
|
|
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2017-08-28 17:38:53 -07:00
|
|
|
case nir_intrinsic_vote_feq:
|
2017-08-28 17:33:33 -07:00
|
|
|
case nir_intrinsic_vote_ieq: {
|
2025-01-15 13:27:05 -08:00
|
|
|
brw_reg value = get_nir_src(ntb, instr->src[0], 0);
|
2017-08-28 17:38:53 -07:00
|
|
|
if (instr->intrinsic == nir_intrinsic_vote_feq) {
|
|
|
|
|
const unsigned bit_size = nir_src_bit_size(instr->src[0]);
|
2024-04-20 17:08:02 -07:00
|
|
|
value.type = bit_size == 8 ? BRW_TYPE_B :
|
2024-04-21 00:33:52 -07:00
|
|
|
brw_type_with_size(BRW_TYPE_F, bit_size);
|
2017-08-28 17:38:53 -07:00
|
|
|
}
|
2024-09-04 10:07:52 -07:00
|
|
|
bld.emit(SHADER_OPCODE_VOTE_EQUAL, retype(dest, BRW_TYPE_D), value);
|
2017-06-20 22:39:22 -07:00
|
|
|
break;
|
|
|
|
|
}
|
2017-06-22 16:46:39 -07:00
|
|
|
|
|
|
|
|
case nir_intrinsic_ballot: {
|
2024-01-05 09:19:38 -08:00
|
|
|
if (instr->def.bit_size > 32) {
|
2024-04-20 17:08:02 -07:00
|
|
|
dest.type = BRW_TYPE_UQ;
|
2024-01-05 09:19:38 -08:00
|
|
|
} else {
|
2024-04-20 17:08:02 -07:00
|
|
|
dest.type = BRW_TYPE_UD;
|
2024-01-05 09:19:38 -08:00
|
|
|
}
|
|
|
|
|
|
2025-01-15 13:27:05 -08:00
|
|
|
brw_reg value = get_nir_src(ntb, instr->src[0], 0);
|
brw/nir: Treat some ballot as convergent
v2: Fix for Xe2.
v3: Add a comment explaining the use of bld instead of xbld. Suggested
by Ken. Fix a bug in handing is_scalar source. Noticed by me while
applying Ken's review feedback.
shader-db:
Lunar Lake, Meteor Lake, DG2, and Tiger Lake had similar results. (Lunar Lake shown)
total instructions in shared programs: 18228657 -> 18228689 (<.01%)
instructions in affected programs: 9333 -> 9365 (0.34%)
helped: 2 / HURT: 26
total cycles in shared programs: 932511560 -> 932542994 (<.01%)
cycles in affected programs: 2263040 -> 2294474 (1.39%)
helped: 7 / HURT: 27
Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20700370 -> 20700392 (<.01%)
instructions in affected programs: 18579 -> 18601 (0.12%)
helped: 1 / HURT: 28
total cycles in shared programs: 888385851 -> 888386325 (<.01%)
cycles in affected programs: 2571368 -> 2571842 (0.02%)
helped: 14 / HURT: 6
total spills in shared programs: 4373 -> 4371 (-0.05%)
spills in affected programs: 71 -> 69 (-2.82%)
helped: 1 / HURT: 0
total fills in shared programs: 4657 -> 4653 (-0.09%)
fills in affected programs: 196 -> 192 (-2.04%)
helped: 1 / HURT: 0
fossil-db:
Lunar Lake
Totals:
Instrs: 142887258 -> 142890605 (+0.00%); split: -0.00%, +0.00%
Cycle count: 21653599282 -> 21655049536 (+0.01%); split: -0.00%, +0.01%
Max live registers: 47942973 -> 47942837 (-0.00%)
Totals from 22209 (4.01% of 553251) affected shaders:
Instrs: 4337679 -> 4341026 (+0.08%); split: -0.00%, +0.08%
Cycle count: 261852040 -> 263302294 (+0.55%); split: -0.38%, +0.93%
Max live registers: 1299670 -> 1299534 (-0.01%)
Meteor Lake, DG2, Tiger Lake, and Skylake had similar results. (Meteor Lake shown)
Totals:
Instrs: 156599915 -> 156590882 (-0.01%); split: -0.01%, +0.00%
Cycle count: 16940072009 -> 16940902317 (+0.00%); split: -0.01%, +0.01%
Max live registers: 32610801 -> 32610488 (-0.00%)
Max dispatch width: 5730736 -> 5731744 (+0.02%); split: +0.12%, -0.11%
Totals from 35528 (5.52% of 643617) affected shaders:
Instrs: 6175409 -> 6166376 (-0.15%); split: -0.21%, +0.06%
Cycle count: 230679923 -> 231510231 (+0.36%); split: -0.46%, +0.82%
Max live registers: 1354716 -> 1354403 (-0.02%)
Max dispatch width: 167648 -> 168656 (+0.60%); split: +4.26%, -3.66%
Ice Lake
Totals:
Instrs: 155330276 -> 155318037 (-0.01%); split: -0.01%, +0.00%
Cycle count: 15019092327 -> 15019637026 (+0.00%); split: -0.00%, +0.01%
Max live registers: 32640341 -> 32637305 (-0.01%)
Max dispatch width: 5780720 -> 5780688 (-0.00%); split: +0.02%, -0.02%
Totals from 37773 (5.85% of 645641) affected shaders:
Instrs: 6643030 -> 6630791 (-0.18%); split: -0.24%, +0.05%
Cycle count: 223589025 -> 224133724 (+0.24%); split: -0.29%, +0.53%
Max live registers: 1491781 -> 1488745 (-0.20%)
Max dispatch width: 167600 -> 167568 (-0.02%); split: +0.75%, -0.77%
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-02-28 13:05:08 -08:00
|
|
|
|
|
|
|
|
/* A ballot will always be at the full dispatch width even if the
|
|
|
|
|
* use of the ballot result is smaller. If the source is_scalar,
|
|
|
|
|
* it may be allocated at less than the full dispatch width (e.g.,
|
|
|
|
|
* allocated at SIMD8 with SIMD32 dispatch). The input may or may
|
|
|
|
|
* not be stride=0. If it is not, the generated ballot
|
|
|
|
|
*
|
|
|
|
|
* ballot(32) dst, value<1>
|
|
|
|
|
*
|
|
|
|
|
* is invalid because it will read out of bounds from value.
|
|
|
|
|
*
|
|
|
|
|
* To account for this, modify the stride of an is_scalar input to be
|
|
|
|
|
* zero.
|
|
|
|
|
*/
|
|
|
|
|
if (value.is_scalar)
|
|
|
|
|
value = component(value, 0);
|
|
|
|
|
|
|
|
|
|
/* Note the use of bld here instead of xbld. As mentioned above, the
|
|
|
|
|
* ballot must execute on all SIMD lanes regardless of the amount of
|
|
|
|
|
* data (i.e., scalar or not scalar) generated.
|
|
|
|
|
*/
|
2024-12-07 00:23:07 -08:00
|
|
|
brw_inst *inst = bld.emit(SHADER_OPCODE_BALLOT, dest, value);
|
brw/nir: Treat some ballot as convergent
v2: Fix for Xe2.
v3: Add a comment explaining the use of bld instead of xbld. Suggested
by Ken. Fix a bug in handing is_scalar source. Noticed by me while
applying Ken's review feedback.
shader-db:
Lunar Lake, Meteor Lake, DG2, and Tiger Lake had similar results. (Lunar Lake shown)
total instructions in shared programs: 18228657 -> 18228689 (<.01%)
instructions in affected programs: 9333 -> 9365 (0.34%)
helped: 2 / HURT: 26
total cycles in shared programs: 932511560 -> 932542994 (<.01%)
cycles in affected programs: 2263040 -> 2294474 (1.39%)
helped: 7 / HURT: 27
Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20700370 -> 20700392 (<.01%)
instructions in affected programs: 18579 -> 18601 (0.12%)
helped: 1 / HURT: 28
total cycles in shared programs: 888385851 -> 888386325 (<.01%)
cycles in affected programs: 2571368 -> 2571842 (0.02%)
helped: 14 / HURT: 6
total spills in shared programs: 4373 -> 4371 (-0.05%)
spills in affected programs: 71 -> 69 (-2.82%)
helped: 1 / HURT: 0
total fills in shared programs: 4657 -> 4653 (-0.09%)
fills in affected programs: 196 -> 192 (-2.04%)
helped: 1 / HURT: 0
fossil-db:
Lunar Lake
Totals:
Instrs: 142887258 -> 142890605 (+0.00%); split: -0.00%, +0.00%
Cycle count: 21653599282 -> 21655049536 (+0.01%); split: -0.00%, +0.01%
Max live registers: 47942973 -> 47942837 (-0.00%)
Totals from 22209 (4.01% of 553251) affected shaders:
Instrs: 4337679 -> 4341026 (+0.08%); split: -0.00%, +0.08%
Cycle count: 261852040 -> 263302294 (+0.55%); split: -0.38%, +0.93%
Max live registers: 1299670 -> 1299534 (-0.01%)
Meteor Lake, DG2, Tiger Lake, and Skylake had similar results. (Meteor Lake shown)
Totals:
Instrs: 156599915 -> 156590882 (-0.01%); split: -0.01%, +0.00%
Cycle count: 16940072009 -> 16940902317 (+0.00%); split: -0.01%, +0.01%
Max live registers: 32610801 -> 32610488 (-0.00%)
Max dispatch width: 5730736 -> 5731744 (+0.02%); split: +0.12%, -0.11%
Totals from 35528 (5.52% of 643617) affected shaders:
Instrs: 6175409 -> 6166376 (-0.15%); split: -0.21%, +0.06%
Cycle count: 230679923 -> 231510231 (+0.36%); split: -0.46%, +0.82%
Max live registers: 1354716 -> 1354403 (-0.02%)
Max dispatch width: 167648 -> 168656 (+0.60%); split: +4.26%, -3.66%
Ice Lake
Totals:
Instrs: 155330276 -> 155318037 (-0.01%); split: -0.01%, +0.00%
Cycle count: 15019092327 -> 15019637026 (+0.00%); split: -0.00%, +0.01%
Max live registers: 32640341 -> 32637305 (-0.01%)
Max dispatch width: 5780720 -> 5780688 (-0.00%); split: +0.02%, -0.02%
Totals from 37773 (5.85% of 645641) affected shaders:
Instrs: 6643030 -> 6630791 (-0.18%); split: -0.24%, +0.05%
Cycle count: 223589025 -> 224133724 (+0.24%); split: -0.29%, +0.53%
Max live registers: 1491781 -> 1488745 (-0.20%)
Max dispatch width: 167600 -> 167568 (-0.02%); split: +0.75%, -0.77%
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-02-28 13:05:08 -08:00
|
|
|
|
|
|
|
|
if (dest.is_scalar)
|
|
|
|
|
inst->size_written = dest.component_size(xbld.dispatch_width());
|
|
|
|
|
|
2017-06-22 16:46:39 -07:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
case nir_intrinsic_read_invocation: {
|
2025-01-15 13:27:05 -08:00
|
|
|
const brw_reg value = get_nir_src(ntb, instr->src[0], 0);
|
2024-06-18 23:42:59 -07:00
|
|
|
const brw_reg invocation = get_nir_src_imm(ntb, instr->src[1]);
|
intel/brw: Emit better code for read_invocation(x, constant)
For something as basic as read_invocation(x, 0), we were emitting:
mov(8) vgrf67:D, 0d
find_live_channel(8) vgrf236:UD, NoMask
broadcast(8) vgrf237:D, vgrf67:D, vgrf236+0.0<0>:UD NoMask
broadcast(8) vgrf235+0.0:W, vgrf197+0.0:W, vgrf237+0.0<0>:D NoMask
mov(8) vgrf234+0.0:W, vgrf235+0.0<0>:W
This is way overcomplicated - if the invocation is a constant, we can
simply emit a single MOV which reads the desired channel index. Not
only that, but it's difficult to clean up:
1. If this expression appears multiple times, CSE will find all the
redundant emit_uniformize(invocation) and get rid of the duplicate
(find_live_channel+broadcast) on future instructions.
2. Copy propagation will put the 0d directly in the first broadcast.
3. Dead code elimination will get rid of the vgrf67 temp holding 0.
4. Algebraic will replace the first broadcast(x, 0) with a MOV.
5. Copy propagation will put the 0d directly in the second broadcast.
6. Dead code elimination will get rid of the vgrf237 temp.
7. Algebraic will replace the second broadcast(x, 0) with a MOV.
8. Copy propagation will finally combine the two MOVs
That's at least 7-8 optimization passes and several loops through the
same passes just to clean up something we can do trivially.
Cuts 25% of the of the optimizer steps in pipeline 22200210259a2c9c
of fossil-db/google-meet-clvk/BgBlur.1f58fdf742c27594.1 (31 to 23).
Shortens compilation time of the google-meet-clvk/Relight pipeline by
-2.87717% +/- 0.509162% (n=150).
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28097>
2024-03-10 01:00:50 -08:00
|
|
|
|
2024-11-29 15:31:05 -08:00
|
|
|
bld.emit(SHADER_OPCODE_READ_FROM_CHANNEL, retype(dest, value.type),
|
|
|
|
|
value, invocation);
|
2017-06-22 16:46:39 -07:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
case nir_intrinsic_read_first_invocation: {
|
2025-01-15 13:27:05 -08:00
|
|
|
const brw_reg value = get_nir_src(ntb, instr->src[0], 0);
|
2024-11-29 15:31:05 -08:00
|
|
|
|
|
|
|
|
bld.emit(SHADER_OPCODE_READ_FROM_LIVE_CHANNEL, retype(dest, value.type), value);
|
2017-06-22 16:46:39 -07:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2017-08-29 09:21:32 -07:00
|
|
|
case nir_intrinsic_shuffle: {
|
2025-01-15 13:27:05 -08:00
|
|
|
const brw_reg value = get_nir_src(ntb, instr->src[0], 0);
|
|
|
|
|
brw_reg index = get_nir_src(ntb, instr->src[1], 0);
|
2025-01-23 20:53:16 -08:00
|
|
|
|
|
|
|
|
if (devinfo->ver >= 30) {
|
|
|
|
|
/* Mask index to constrain it to be within the valid range in
|
|
|
|
|
* order to avoid potentially reading past the end of the GRF
|
|
|
|
|
* file, which can lead to hangs on Xe3+ with VRT enabled.
|
|
|
|
|
*/
|
|
|
|
|
const brw_reg tmp = bld.vgrf(BRW_TYPE_UD);
|
|
|
|
|
bld.AND(tmp, index, brw_imm_ud(s.dispatch_width - 1));
|
|
|
|
|
index = tmp;
|
|
|
|
|
}
|
2017-08-29 09:21:32 -07:00
|
|
|
|
|
|
|
|
bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, index);
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2017-08-21 22:17:37 -07:00
|
|
|
case nir_intrinsic_first_invocation: {
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg tmp = bld.vgrf(BRW_TYPE_UD);
|
2017-08-21 22:17:37 -07:00
|
|
|
bld.exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, tmp);
|
2024-04-20 17:08:02 -07:00
|
|
|
bld.MOV(retype(dest, BRW_TYPE_UD),
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg(component(tmp, 0)));
|
2017-08-21 22:17:37 -07:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2022-03-17 00:46:21 -07:00
|
|
|
case nir_intrinsic_last_invocation: {
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg tmp = bld.vgrf(BRW_TYPE_UD);
|
2022-03-17 00:46:21 -07:00
|
|
|
bld.exec_all().emit(SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL, tmp);
|
2024-04-20 17:08:02 -07:00
|
|
|
bld.MOV(retype(dest, BRW_TYPE_UD),
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg(component(tmp, 0)));
|
2022-03-17 00:46:21 -07:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2017-09-01 15:18:02 -07:00
|
|
|
case nir_intrinsic_quad_broadcast: {
|
2025-01-15 13:27:05 -08:00
|
|
|
const brw_reg value = get_nir_src(ntb, instr->src[0], 0);
|
2018-10-20 09:55:28 -05:00
|
|
|
const unsigned index = nir_src_as_uint(instr->src[1]);
|
2017-09-01 15:18:02 -07:00
|
|
|
|
|
|
|
|
bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, retype(dest, value.type),
|
2018-10-20 09:55:28 -05:00
|
|
|
value, brw_imm_ud(index), brw_imm_ud(4));
|
2017-09-01 15:18:02 -07:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2024-09-05 17:37:25 -07:00
|
|
|
case nir_intrinsic_quad_swap_horizontal:
|
|
|
|
|
case nir_intrinsic_quad_swap_vertical:
|
|
|
|
|
case nir_intrinsic_quad_swap_diagonal: {
|
2025-01-15 13:27:05 -08:00
|
|
|
const brw_reg value = get_nir_src(ntb, instr->src[0], 0);
|
2017-09-01 15:18:02 -07:00
|
|
|
|
2024-09-05 17:37:25 -07:00
|
|
|
enum brw_swap_direction dir;
|
|
|
|
|
switch (instr->intrinsic) {
|
|
|
|
|
case nir_intrinsic_quad_swap_horizontal: dir = BRW_SWAP_HORIZONTAL; break;
|
|
|
|
|
case nir_intrinsic_quad_swap_vertical: dir = BRW_SWAP_VERTICAL; break;
|
|
|
|
|
case nir_intrinsic_quad_swap_diagonal: dir = BRW_SWAP_DIAGONAL; break;
|
2025-07-23 09:17:35 +02:00
|
|
|
default: UNREACHABLE("invalid quad swap");
|
2017-09-01 15:18:02 -07:00
|
|
|
}
|
|
|
|
|
|
2024-09-05 17:37:25 -07:00
|
|
|
bld.emit(SHADER_OPCODE_QUAD_SWAP, retype(dest, value.type),
|
|
|
|
|
value, brw_imm_ud(dir));
|
2017-09-01 15:18:02 -07:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2017-08-31 22:12:48 -07:00
|
|
|
case nir_intrinsic_reduce: {
|
2025-01-15 13:27:05 -08:00
|
|
|
brw_reg src = get_nir_src(ntb, instr->src[0], 0);
|
2024-07-15 15:09:12 -07:00
|
|
|
nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
|
|
|
|
|
enum brw_reduce_op brw_op = brw_reduce_op_for_nir_reduction_op(op);
|
2017-08-31 22:12:48 -07:00
|
|
|
unsigned cluster_size = nir_intrinsic_cluster_size(instr);
|
2023-12-05 17:16:34 -08:00
|
|
|
if (cluster_size == 0 || cluster_size > s.dispatch_width)
|
|
|
|
|
cluster_size = s.dispatch_width;
|
2017-08-31 22:12:48 -07:00
|
|
|
|
|
|
|
|
/* Figure out the source type */
|
|
|
|
|
src.type = brw_type_for_nir_type(devinfo,
|
2024-07-15 15:09:12 -07:00
|
|
|
(nir_alu_type)(nir_op_infos[op].input_types[0] |
|
2017-08-31 22:12:48 -07:00
|
|
|
nir_src_bit_size(instr->src[0])));
|
|
|
|
|
|
2024-07-15 15:09:12 -07:00
|
|
|
bld.emit(SHADER_OPCODE_REDUCE, retype(dest, src.type), src,
|
|
|
|
|
brw_imm_ud(brw_op), brw_imm_ud(cluster_size));
|
2017-08-31 22:12:48 -07:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
case nir_intrinsic_inclusive_scan:
|
|
|
|
|
case nir_intrinsic_exclusive_scan: {
|
2025-01-15 13:27:05 -08:00
|
|
|
brw_reg src = get_nir_src(ntb, instr->src[0], 0);
|
2024-07-16 14:06:12 -07:00
|
|
|
nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
|
|
|
|
|
enum brw_reduce_op brw_op = brw_reduce_op_for_nir_reduction_op(op);
|
2017-08-31 22:12:48 -07:00
|
|
|
|
|
|
|
|
/* Figure out the source type */
|
|
|
|
|
src.type = brw_type_for_nir_type(devinfo,
|
2024-07-16 14:06:12 -07:00
|
|
|
(nir_alu_type)(nir_op_infos[op].input_types[0] |
|
2017-08-31 22:12:48 -07:00
|
|
|
nir_src_bit_size(instr->src[0])));
|
|
|
|
|
|
2024-07-16 14:06:12 -07:00
|
|
|
enum opcode opcode = instr->intrinsic == nir_intrinsic_exclusive_scan ?
|
|
|
|
|
SHADER_OPCODE_EXCLUSIVE_SCAN : SHADER_OPCODE_INCLUSIVE_SCAN;
|
2017-08-31 22:12:48 -07:00
|
|
|
|
2024-07-16 14:06:12 -07:00
|
|
|
bld.emit(opcode, retype(dest, src.type), src, brw_imm_ud(brw_op));
|
2017-08-31 22:12:48 -07:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-18 14:10:06 +03:00
|
|
|
case nir_intrinsic_load_topology_id_intel: {
|
2023-01-31 15:59:31 -08:00
|
|
|
/* These move around basically every hardware generation, so don't
|
|
|
|
|
* do any unbounded checks and fail if the platform hasn't explicitly
|
|
|
|
|
* been enabled here.
|
|
|
|
|
*/
|
2023-09-14 15:25:23 -07:00
|
|
|
assert(devinfo->ver >= 12 && devinfo->ver <= 30);
|
2021-06-18 14:10:06 +03:00
|
|
|
|
2023-01-31 15:59:31 -08:00
|
|
|
/* Here is what the layout of SR0 looks like on Gfx12
|
|
|
|
|
* https://gfxspecs.intel.com/Predator/Home/Index/47256
|
2021-06-18 14:10:06 +03:00
|
|
|
* [13:11] : Slice ID.
|
|
|
|
|
* [10:9] : Dual-SubSlice ID
|
|
|
|
|
* [8] : SubSlice ID
|
|
|
|
|
* [7] : EUID[2] (aka EU Row ID)
|
|
|
|
|
* [6] : Reserved
|
|
|
|
|
* [5:4] : EUID[1:0]
|
|
|
|
|
* [2:0] : Thread ID
|
2023-01-31 15:59:31 -08:00
|
|
|
*
|
|
|
|
|
* Xe2: Engine 3D and GPGPU Programs, EU Overview, Registers and
|
|
|
|
|
* Register Regions, ARF Registers, State Register,
|
|
|
|
|
* https://gfxspecs.intel.com/Predator/Home/Index/56623
|
|
|
|
|
* [15:11] : Slice ID.
|
|
|
|
|
* [9:8] : SubSlice ID
|
|
|
|
|
* [6:4] : EUID
|
|
|
|
|
* [2:0] : Thread ID
|
2023-09-14 15:25:23 -07:00
|
|
|
*
|
|
|
|
|
* Xe3: Engine 3D and GPGPU Programs, EU Overview, Registers and
|
|
|
|
|
* Register Regions, ARF Registers, State Register.
|
|
|
|
|
* Bspec 56623 (r55736)
|
|
|
|
|
*
|
|
|
|
|
* [17:14] : Slice ID.
|
|
|
|
|
* [11:8] : SubSlice ID
|
|
|
|
|
* [6:4] : EUID
|
|
|
|
|
* [3:0] : Thread ID
|
2021-06-18 14:10:06 +03:00
|
|
|
*/
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg raw_id = bld.vgrf(BRW_TYPE_UD);
|
2024-05-28 16:43:43 +03:00
|
|
|
bld.UNDEF(raw_id);
|
|
|
|
|
bld.emit(SHADER_OPCODE_READ_ARCH_REG, raw_id, retype(brw_sr0_reg(0),
|
|
|
|
|
BRW_TYPE_UD));
|
2021-06-18 14:10:06 +03:00
|
|
|
switch (nir_intrinsic_base(instr)) {
|
|
|
|
|
case BRW_TOPOLOGY_ID_DSS:
|
2023-01-31 15:59:31 -08:00
|
|
|
if (devinfo->ver >= 20) {
|
|
|
|
|
/* Xe2+: 3D and GPGPU Programs, Shared Functions, Ray Tracing:
|
|
|
|
|
* https://gfxspecs.intel.com/Predator/Home/Index/56936
|
|
|
|
|
*
|
|
|
|
|
* Note: DSSID in all formulas below is a logical identifier of an
|
|
|
|
|
* XeCore (a value that goes from 0 to (number_of_slices *
|
|
|
|
|
* number_of_XeCores_per_slice -1). SW can get this value from
|
|
|
|
|
* either:
|
|
|
|
|
*
|
|
|
|
|
* - Message Control Register LogicalSSID field (only in shaders
|
|
|
|
|
* eligible for Mid-Thread Preemption).
|
|
|
|
|
* - Calculated based of State Register with the following formula:
|
|
|
|
|
* DSSID = StateRegister.SliceID * GT_ARCH_SS_PER_SLICE +
|
|
|
|
|
* StateRRegister.SubSliceID where GT_SS_PER_SLICE is an
|
|
|
|
|
* architectural parameter defined per product SKU.
|
|
|
|
|
*
|
|
|
|
|
* We are using the state register to calculate the DSSID.
|
|
|
|
|
*/
|
2023-09-14 15:25:23 -07:00
|
|
|
const uint32_t slice_id_mask = devinfo->ver >= 30 ?
|
|
|
|
|
INTEL_MASK(17, 14) :
|
|
|
|
|
INTEL_MASK(15, 11);
|
|
|
|
|
const uint32_t slice_id_shift = devinfo->ver >= 30 ? 14 : 11;
|
|
|
|
|
|
|
|
|
|
const uint32_t subslice_id_mask = devinfo->ver >= 30 ?
|
|
|
|
|
INTEL_MASK(11, 8) :
|
|
|
|
|
INTEL_MASK(9, 8);
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg slice_id =
|
2023-09-14 15:25:23 -07:00
|
|
|
bld.SHR(bld.AND(raw_id, brw_imm_ud(slice_id_mask)),
|
|
|
|
|
brw_imm_ud(slice_id_shift));
|
2023-01-31 15:59:31 -08:00
|
|
|
|
|
|
|
|
/* Assert that max subslices covers at least 2 bits that we use for
|
|
|
|
|
* subslices.
|
|
|
|
|
*/
|
2024-04-12 17:43:22 -07:00
|
|
|
unsigned slice_stride = devinfo->max_subslices_per_slice;
|
|
|
|
|
assert(slice_stride >= (1 << 2));
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg subslice_id =
|
2023-09-14 15:25:23 -07:00
|
|
|
bld.SHR(bld.AND(raw_id, brw_imm_ud(subslice_id_mask)),
|
2024-04-12 17:43:22 -07:00
|
|
|
brw_imm_ud(8));
|
|
|
|
|
bld.ADD(retype(dest, BRW_TYPE_UD),
|
|
|
|
|
bld.MUL(slice_id, brw_imm_ud(slice_stride)), subslice_id);
|
2023-01-31 15:59:31 -08:00
|
|
|
} else {
|
|
|
|
|
/* Get rid of anything below dualsubslice */
|
2024-04-12 17:43:22 -07:00
|
|
|
bld.SHR(retype(dest, BRW_TYPE_UD),
|
|
|
|
|
bld.AND(raw_id, brw_imm_ud(0x3fff)), brw_imm_ud(9));
|
2023-01-31 15:59:31 -08:00
|
|
|
}
|
2021-06-18 14:10:06 +03:00
|
|
|
break;
|
2021-06-18 14:12:03 +03:00
|
|
|
case BRW_TOPOLOGY_ID_EU_THREAD_SIMD: {
|
2023-12-05 17:16:34 -08:00
|
|
|
s.limit_dispatch_width(16, "Topology helper for Ray queries, "
|
2021-06-18 14:12:03 +03:00
|
|
|
"not supported in SIMD32 mode.");
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg dst = retype(dest, BRW_TYPE_UD);
|
|
|
|
|
brw_reg eu;
|
2021-06-18 14:12:03 +03:00
|
|
|
|
2023-01-31 15:59:31 -08:00
|
|
|
if (devinfo->ver >= 20) {
|
|
|
|
|
/* Xe2+: Graphics Engine, 3D and GPGPU Programs, Shared Functions
|
|
|
|
|
* Ray Tracing,
|
|
|
|
|
* https://gfxspecs.intel.com/Predator/Home/Index/56936
|
|
|
|
|
*
|
|
|
|
|
* SyncStackID = (EUID[2:0] << 8) | (ThreadID[2:0] << 4) |
|
|
|
|
|
* SIMDLaneID[3:0];
|
|
|
|
|
*
|
|
|
|
|
* This section just deals with the EUID part.
|
|
|
|
|
*
|
|
|
|
|
* The 3bit EU[2:0] we need to build for ray query memory addresses
|
|
|
|
|
* computations is a bit odd :
|
|
|
|
|
*
|
|
|
|
|
* EU[2:0] = raw_id[6:4] (identified as EUID[2:0])
|
|
|
|
|
*/
|
2024-04-12 17:43:22 -07:00
|
|
|
eu = bld.SHL(bld.AND(raw_id, brw_imm_ud(INTEL_MASK(6, 4))),
|
|
|
|
|
brw_imm_ud(4));
|
2023-01-31 15:59:31 -08:00
|
|
|
} else {
|
|
|
|
|
/* EU[3:0] << 7
|
|
|
|
|
*
|
|
|
|
|
* The 4bit EU[3:0] we need to build for ray query memory addresses
|
|
|
|
|
* computations is a bit odd :
|
|
|
|
|
*
|
|
|
|
|
* EU[1:0] = raw_id[5:4] (identified as EUID[1:0])
|
|
|
|
|
* EU[2] = raw_id[8] (identified as SubSlice ID)
|
|
|
|
|
* EU[3] = raw_id[7] (identified as EUID[2] or Row ID)
|
|
|
|
|
*/
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg raw5_4 = bld.AND(raw_id, brw_imm_ud(INTEL_MASK(5, 4)));
|
|
|
|
|
brw_reg raw7 = bld.AND(raw_id, brw_imm_ud(INTEL_MASK(7, 7)));
|
|
|
|
|
brw_reg raw8 = bld.AND(raw_id, brw_imm_ud(INTEL_MASK(8, 8)));
|
2024-04-12 17:43:22 -07:00
|
|
|
eu = bld.OR(bld.SHL(raw5_4, brw_imm_ud(3)),
|
|
|
|
|
bld.OR(bld.SHL(raw7, brw_imm_ud(3)),
|
|
|
|
|
bld.SHL(raw8, brw_imm_ud(1))));
|
2021-06-18 14:12:03 +03:00
|
|
|
}
|
|
|
|
|
|
2023-09-14 15:25:23 -07:00
|
|
|
brw_reg tid;
|
|
|
|
|
/* Xe3: Graphics Engine, 3D and GPGPU Programs, Shared Functions
|
|
|
|
|
* Ray Tracing, (Bspec 56936 (r56740))
|
|
|
|
|
*
|
|
|
|
|
* SyncStackID = (EUID[2:0] << 8) | (ThreadID[3:0] << 4) |
|
|
|
|
|
* SIMDLaneID[3:0];
|
|
|
|
|
*
|
|
|
|
|
* ThreadID[3:0] << 4 (ThreadID comes from raw_id[3:0])
|
|
|
|
|
*
|
|
|
|
|
* On older platforms (< Xe3):
|
|
|
|
|
* ThreadID[2:0] << 4 (ThreadID comes from raw_id[2:0])
|
|
|
|
|
*/
|
|
|
|
|
const uint32_t raw_id_mask = devinfo->ver >= 30 ?
|
|
|
|
|
INTEL_MASK(3, 0) :
|
|
|
|
|
INTEL_MASK(2, 0);
|
|
|
|
|
tid = bld.SHL(bld.AND(raw_id, brw_imm_ud(raw_id_mask)),
|
|
|
|
|
brw_imm_ud(4));
|
2021-06-18 14:12:03 +03:00
|
|
|
|
intel/brw: Use CSE for LOAD_SUBGROUP_INVOCATION
Instead of emitting a single one at the top, and making reference to it,
emit the virtual instruction as needed and let CSE do its job.
Since load_subgroup_invocation now can appear not at the start of the
shader, use UNDEF in all cases to ensure that the liveness of the
destination doesn't extend to the first partial write done here (it was
being used only for SIMD > 8 before).
Note this option was considered in the past
6132992cdb858268af0e985727d80e4140be389c but at the time dismissed. The
difference now is that the lowering of the virtual instruction happens
earlier than the scheduling.
The motivation for this change is to allow passes other than the NIR
conversion to use this value. The alternative of storing a `brw_reg` in
the shader (instead of NIR state) gets complicated by passes like
compact_vgrfs, that move VGRFs around (and update the instructions).
This and maybe other passes would have to care about the brw_reg.
Fossil-db numbers, TGL
```
*** Shaders only in 'after' results are ignored:
steam-native/shadow_of_the_tomb_raider/c683ea5067ee157d/fs.32/0, steam-native/shadow_of_the_tomb_raider/f4df450c3cef40b4/fs.32/0, steam-native/shadow_of_the_tomb_raider/94b708fb8e3d9597/fs.32/0, steam-native/shadow_of_the_tomb_raider/19d44c328edabd30/fs.32/0, steam-native/shadow_of_the_tomb_raider/8a7dcbd5a74a19bf/fs.32/0, and 366 more
from 4 apps: steam-dxvk/alan_wake, steam-dxvk/batman_arkham_city_goty, steam-dxvk/batman_arkham_origins, steam-native/shadow_of_the_tomb_raider
*** Shaders only in 'before' results are ignored:
steam-dxvk/octopath_traveler/aaa3d10acb726906/fs.32/0, steam-dxvk/batman_arkham_origins/e6872ae23569c35f/fs.32/0, steam-dxvk/octopath_traveler/fd33a99fa5c271a8/fs.32/0, steam-dxvk/octopath_traveler/9a077cdc16f24520/fs.32/0, steam-dxvk/batman_arkham_city_goty/fac7b438ad52f622/fs.32/0, and 12 more
from 4 apps: steam-dxvk/batman_arkham_city_goty, steam-dxvk/batman_arkham_origins, steam-dxvk/octopath_traveler, steam-native/shadow_of_the_tomb_raider
Totals:
Instrs: 149752381 -> 149751337 (-0.00%); split: -0.00%, +0.00%
Cycle count: 11553609349 -> 11549970294 (-0.03%); split: -0.06%, +0.03%
Spill count: 42763 -> 42764 (+0.00%); split: -0.01%, +0.01%
Fill count: 75650 -> 75651 (+0.00%); split: -0.00%, +0.01%
Max live registers: 31725096 -> 31671792 (-0.17%)
Max dispatch width: 5546008 -> 5551672 (+0.10%); split: +0.11%, -0.00%
Totals from 52574 (8.34% of 630441) affected shaders:
Instrs: 9535159 -> 9534115 (-0.01%); split: -0.03%, +0.02%
Cycle count: 1006627109 -> 1002988054 (-0.36%); split: -0.65%, +0.29%
Spill count: 11588 -> 11589 (+0.01%); split: -0.03%, +0.03%
Fill count: 21057 -> 21058 (+0.00%); split: -0.01%, +0.02%
Max live registers: 1992493 -> 1939189 (-2.68%)
Max dispatch width: 559696 -> 565360 (+1.01%); split: +1.06%, -0.05%
```
and DG2
```
*** Shaders only in 'after' results are ignored:
steam-native/shadow_of_the_tomb_raider/1f95a9d3db21df85/fs.32/0, steam-native/shadow_of_the_tomb_raider/56b87c4a46613a2a/fs.32/0, steam-native/shadow_of_the_tomb_raider/a74b4137f85dbbd3/fs.32/0, steam-native/shadow_of_the_tomb_raider/e07e38d3f48e8402/fs.32/0, steam-native/shadow_of_the_tomb_raider/206336789c48996c/fs.32/0, and 268 more
from 4 apps: steam-dxvk/alan_wake, steam-dxvk/batman_arkham_city_goty, steam-dxvk/batman_arkham_origins, steam-native/shadow_of_the_tomb_raider
*** Shaders only in 'before' results are ignored:
steam-native/shadow_of_the_tomb_raider/0420d7c3a2ea99ec/fs.32/0, steam-native/shadow_of_the_tomb_raider/2ff39f8bf7d24abb/fs.32/0, steam-native/shadow_of_the_tomb_raider/92d7be2824bd9659/fs.32/0, steam-native/shadow_of_the_tomb_raider/f09ca6d2ecf18015/fs.32/0, steam-native/shadow_of_the_tomb_raider/490f8ffd59e52949/fs.32/0, and 205 more
from 3 apps: steam-dxvk/batman_arkham_city_goty, steam-dxvk/batman_arkham_origins, steam-native/shadow_of_the_tomb_raider
Totals:
Instrs: 151597619 -> 151599914 (+0.00%); split: -0.00%, +0.00%
Subgroup size: 7699776 -> 7699784 (+0.00%)
Cycle count: 12738501989 -> 12739841170 (+0.01%); split: -0.01%, +0.02%
Spill count: 61283 -> 61274 (-0.01%)
Fill count: 119886 -> 119849 (-0.03%)
Max live registers: 31810432 -> 31758920 (-0.16%)
Max dispatch width: 5540128 -> 5541136 (+0.02%); split: +0.08%, -0.06%
Totals from 49286 (7.81% of 631231) affected shaders:
Instrs: 8607753 -> 8610048 (+0.03%); split: -0.01%, +0.04%
Subgroup size: 857752 -> 857760 (+0.00%)
Cycle count: 305939495 -> 307278676 (+0.44%); split: -0.28%, +0.72%
Spill count: 6339 -> 6330 (-0.14%)
Fill count: 12571 -> 12534 (-0.29%)
Max live registers: 1788346 -> 1736834 (-2.88%)
Max dispatch width: 510920 -> 511928 (+0.20%); split: +0.85%, -0.66%
```
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30489>
2024-07-31 22:46:20 -07:00
|
|
|
/* LaneID[0:3] << 0 (Use subgroup invocation) */
|
2023-03-06 20:14:53 -08:00
|
|
|
assert(bld.dispatch_width() <= 16); /* Limit to 4 bits */
|
intel/brw: Use CSE for LOAD_SUBGROUP_INVOCATION
Instead of emitting a single one at the top, and making reference to it,
emit the virtual instruction as needed and let CSE do its job.
Since load_subgroup_invocation now can appear not at the start of the
shader, use UNDEF in all cases to ensure that the liveness of the
destination doesn't extend to the first partial write done here (it was
being used only for SIMD > 8 before).
Note this option was considered in the past
6132992cdb858268af0e985727d80e4140be389c but at the time dismissed. The
difference now is that the lowering of the virtual instruction happens
earlier than the scheduling.
The motivation for this change is to allow passes other than the NIR
conversion to use this value. The alternative of storing a `brw_reg` in
the shader (instead of NIR state) gets complicated by passes like
compact_vgrfs, that move VGRFs around (and update the instructions).
This and maybe other passes would have to care about the brw_reg.
Fossil-db numbers, TGL
```
*** Shaders only in 'after' results are ignored:
steam-native/shadow_of_the_tomb_raider/c683ea5067ee157d/fs.32/0, steam-native/shadow_of_the_tomb_raider/f4df450c3cef40b4/fs.32/0, steam-native/shadow_of_the_tomb_raider/94b708fb8e3d9597/fs.32/0, steam-native/shadow_of_the_tomb_raider/19d44c328edabd30/fs.32/0, steam-native/shadow_of_the_tomb_raider/8a7dcbd5a74a19bf/fs.32/0, and 366 more
from 4 apps: steam-dxvk/alan_wake, steam-dxvk/batman_arkham_city_goty, steam-dxvk/batman_arkham_origins, steam-native/shadow_of_the_tomb_raider
*** Shaders only in 'before' results are ignored:
steam-dxvk/octopath_traveler/aaa3d10acb726906/fs.32/0, steam-dxvk/batman_arkham_origins/e6872ae23569c35f/fs.32/0, steam-dxvk/octopath_traveler/fd33a99fa5c271a8/fs.32/0, steam-dxvk/octopath_traveler/9a077cdc16f24520/fs.32/0, steam-dxvk/batman_arkham_city_goty/fac7b438ad52f622/fs.32/0, and 12 more
from 4 apps: steam-dxvk/batman_arkham_city_goty, steam-dxvk/batman_arkham_origins, steam-dxvk/octopath_traveler, steam-native/shadow_of_the_tomb_raider
Totals:
Instrs: 149752381 -> 149751337 (-0.00%); split: -0.00%, +0.00%
Cycle count: 11553609349 -> 11549970294 (-0.03%); split: -0.06%, +0.03%
Spill count: 42763 -> 42764 (+0.00%); split: -0.01%, +0.01%
Fill count: 75650 -> 75651 (+0.00%); split: -0.00%, +0.01%
Max live registers: 31725096 -> 31671792 (-0.17%)
Max dispatch width: 5546008 -> 5551672 (+0.10%); split: +0.11%, -0.00%
Totals from 52574 (8.34% of 630441) affected shaders:
Instrs: 9535159 -> 9534115 (-0.01%); split: -0.03%, +0.02%
Cycle count: 1006627109 -> 1002988054 (-0.36%); split: -0.65%, +0.29%
Spill count: 11588 -> 11589 (+0.01%); split: -0.03%, +0.03%
Fill count: 21057 -> 21058 (+0.00%); split: -0.01%, +0.02%
Max live registers: 1992493 -> 1939189 (-2.68%)
Max dispatch width: 559696 -> 565360 (+1.01%); split: +1.06%, -0.05%
```
and DG2
```
*** Shaders only in 'after' results are ignored:
steam-native/shadow_of_the_tomb_raider/1f95a9d3db21df85/fs.32/0, steam-native/shadow_of_the_tomb_raider/56b87c4a46613a2a/fs.32/0, steam-native/shadow_of_the_tomb_raider/a74b4137f85dbbd3/fs.32/0, steam-native/shadow_of_the_tomb_raider/e07e38d3f48e8402/fs.32/0, steam-native/shadow_of_the_tomb_raider/206336789c48996c/fs.32/0, and 268 more
from 4 apps: steam-dxvk/alan_wake, steam-dxvk/batman_arkham_city_goty, steam-dxvk/batman_arkham_origins, steam-native/shadow_of_the_tomb_raider
*** Shaders only in 'before' results are ignored:
steam-native/shadow_of_the_tomb_raider/0420d7c3a2ea99ec/fs.32/0, steam-native/shadow_of_the_tomb_raider/2ff39f8bf7d24abb/fs.32/0, steam-native/shadow_of_the_tomb_raider/92d7be2824bd9659/fs.32/0, steam-native/shadow_of_the_tomb_raider/f09ca6d2ecf18015/fs.32/0, steam-native/shadow_of_the_tomb_raider/490f8ffd59e52949/fs.32/0, and 205 more
from 3 apps: steam-dxvk/batman_arkham_city_goty, steam-dxvk/batman_arkham_origins, steam-native/shadow_of_the_tomb_raider
Totals:
Instrs: 151597619 -> 151599914 (+0.00%); split: -0.00%, +0.00%
Subgroup size: 7699776 -> 7699784 (+0.00%)
Cycle count: 12738501989 -> 12739841170 (+0.01%); split: -0.01%, +0.02%
Spill count: 61283 -> 61274 (-0.01%)
Fill count: 119886 -> 119849 (-0.03%)
Max live registers: 31810432 -> 31758920 (-0.16%)
Max dispatch width: 5540128 -> 5541136 (+0.02%); split: +0.08%, -0.06%
Totals from 49286 (7.81% of 631231) affected shaders:
Instrs: 8607753 -> 8610048 (+0.03%); split: -0.01%, +0.04%
Subgroup size: 857752 -> 857760 (+0.00%)
Cycle count: 305939495 -> 307278676 (+0.44%); split: -0.28%, +0.72%
Spill count: 6339 -> 6330 (-0.14%)
Fill count: 12571 -> 12534 (-0.29%)
Max live registers: 1788346 -> 1736834 (-2.88%)
Max dispatch width: 510920 -> 511928 (+0.20%); split: +0.85%, -0.66%
```
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30489>
2024-07-31 22:46:20 -07:00
|
|
|
bld.ADD(dst, bld.OR(eu, tid), bld.LOAD_SUBGROUP_INVOCATION());
|
2021-06-18 14:12:03 +03:00
|
|
|
break;
|
|
|
|
|
}
|
2021-06-18 14:10:06 +03:00
|
|
|
default:
|
2025-07-23 09:17:35 +02:00
|
|
|
UNREACHABLE("Invalid topology id type");
|
2021-06-18 14:10:06 +03:00
|
|
|
}
|
2020-10-21 14:46:50 -05:00
|
|
|
break;
|
2021-06-18 14:10:06 +03:00
|
|
|
}
|
2020-10-21 14:46:50 -05:00
|
|
|
|
|
|
|
|
case nir_intrinsic_load_btd_stack_id_intel:
|
2023-12-05 17:16:34 -08:00
|
|
|
if (s.stage == MESA_SHADER_COMPUTE) {
|
|
|
|
|
assert(brw_cs_prog_data(s.prog_data)->uses_btd_stack_ids);
|
2020-10-21 14:46:50 -05:00
|
|
|
} else {
|
2023-12-05 17:16:34 -08:00
|
|
|
assert(brw_shader_stage_is_bindless(s.stage));
|
2020-10-21 14:46:50 -05:00
|
|
|
}
|
|
|
|
|
/* Stack IDs are always in R1 regardless of whether we're coming from a
|
|
|
|
|
* bindless shader or a regular compute shader.
|
|
|
|
|
*/
|
2024-04-20 17:08:02 -07:00
|
|
|
bld.MOV(retype(dest, BRW_TYPE_UD),
|
|
|
|
|
retype(brw_vec8_grf(1 * reg_unit(devinfo), 0), BRW_TYPE_UW));
|
2020-10-21 14:46:50 -05:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_intrinsic_btd_spawn_intel:
|
2023-12-05 17:16:34 -08:00
|
|
|
if (s.stage == MESA_SHADER_COMPUTE) {
|
|
|
|
|
assert(brw_cs_prog_data(s.prog_data)->uses_btd_stack_ids);
|
2020-10-21 14:46:50 -05:00
|
|
|
} else {
|
2023-12-05 17:16:34 -08:00
|
|
|
assert(brw_shader_stage_is_bindless(s.stage));
|
2020-10-21 14:46:50 -05:00
|
|
|
}
|
2022-04-05 13:23:13 +00:00
|
|
|
/* Make sure all the pointers to resume shaders have landed where other
|
|
|
|
|
* threads can see them.
|
|
|
|
|
*/
|
|
|
|
|
emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_NONE);
|
|
|
|
|
|
2020-10-21 14:46:50 -05:00
|
|
|
bld.emit(SHADER_OPCODE_BTD_SPAWN_LOGICAL, bld.null_reg_ud(),
|
2024-02-12 08:43:34 -08:00
|
|
|
bld.emit_uniformize(get_nir_src(ntb, instr->src[0], -1)),
|
2025-01-15 13:27:05 -08:00
|
|
|
get_nir_src(ntb, instr->src[1], 0));
|
2020-10-21 14:46:50 -05:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_intrinsic_btd_retire_intel:
|
2023-12-05 17:16:34 -08:00
|
|
|
if (s.stage == MESA_SHADER_COMPUTE) {
|
|
|
|
|
assert(brw_cs_prog_data(s.prog_data)->uses_btd_stack_ids);
|
2020-10-21 14:46:50 -05:00
|
|
|
} else {
|
2023-12-05 17:16:34 -08:00
|
|
|
assert(brw_shader_stage_is_bindless(s.stage));
|
2020-10-21 14:46:50 -05:00
|
|
|
}
|
2022-04-05 13:23:13 +00:00
|
|
|
/* Make sure all the pointers to resume shaders have landed where other
|
|
|
|
|
* threads can see them.
|
|
|
|
|
*/
|
|
|
|
|
emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_NONE);
|
2020-10-21 14:46:50 -05:00
|
|
|
bld.emit(SHADER_OPCODE_BTD_RETIRE_LOGICAL);
|
|
|
|
|
break;
|
|
|
|
|
|
2021-06-14 17:30:31 +03:00
|
|
|
case nir_intrinsic_trace_ray_intel: {
|
|
|
|
|
const bool synchronous = nir_intrinsic_synchronous(instr);
|
2023-12-05 17:16:34 -08:00
|
|
|
assert(brw_shader_stage_is_bindless(s.stage) || synchronous);
|
2022-04-05 13:23:13 +00:00
|
|
|
|
|
|
|
|
/* Make sure all the previous RT structure writes are visible to the RT
|
|
|
|
|
* fixed function within the DSS, as well as stack pointers to resume
|
|
|
|
|
* shaders.
|
|
|
|
|
*/
|
|
|
|
|
emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_NONE);
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg srcs[RT_LOGICAL_NUM_SRCS];
|
2022-06-23 14:15:51 +03:00
|
|
|
|
2024-02-12 08:43:34 -08:00
|
|
|
brw_reg globals = get_nir_src(ntb, instr->src[0], -1);
|
2022-06-23 14:15:51 +03:00
|
|
|
srcs[RT_LOGICAL_SRC_GLOBALS] = bld.emit_uniformize(globals);
|
2025-01-15 13:27:05 -08:00
|
|
|
srcs[RT_LOGICAL_SRC_BVH_LEVEL] = get_nir_src(ntb, instr->src[1], 0);
|
|
|
|
|
srcs[RT_LOGICAL_SRC_TRACE_RAY_CONTROL] = get_nir_src(ntb, instr->src[2], 0);
|
2021-06-14 17:30:31 +03:00
|
|
|
srcs[RT_LOGICAL_SRC_SYNCHRONOUS] = brw_imm_ud(synchronous);
|
2024-08-09 22:59:59 -07:00
|
|
|
|
|
|
|
|
/* Bspec 57508: Structure_SIMD16TraceRayMessage:: RayQuery Enable
|
|
|
|
|
*
|
|
|
|
|
* "When this bit is set in the header, Trace Ray Message behaves like
|
|
|
|
|
* a Ray Query. This message requires a write-back message indicating
|
|
|
|
|
* RayQuery for all valid Rays (SIMD lanes) have completed."
|
|
|
|
|
*/
|
|
|
|
|
brw_reg dst = (devinfo->ver >= 20 && synchronous) ?
|
|
|
|
|
bld.vgrf(BRW_TYPE_UD) :
|
|
|
|
|
bld.null_reg_ud();
|
|
|
|
|
|
|
|
|
|
bld.emit(RT_OPCODE_TRACE_RAY_LOGICAL, dst, srcs, RT_LOGICAL_NUM_SRCS);
|
2021-06-14 17:30:31 +03:00
|
|
|
|
|
|
|
|
/* There is no actual value to use in the destination register of the
|
|
|
|
|
* synchronous trace instruction. All of the communication with the HW
|
|
|
|
|
* unit happens through memory reads/writes. So to ensure that the
|
|
|
|
|
* operation has completed before we go read the results in memory, we
|
|
|
|
|
* need a barrier followed by an invalidate before accessing memory.
|
|
|
|
|
*/
|
|
|
|
|
if (synchronous) {
|
2024-04-11 01:31:54 -07:00
|
|
|
bld.SYNC(TGL_SYNC_ALLWR);
|
2022-04-05 13:23:13 +00:00
|
|
|
emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_INVALIDATE);
|
2021-06-14 17:30:31 +03:00
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2014-08-15 10:32:07 -07:00
|
|
|
default:
|
2022-04-25 10:55:06 +02:00
|
|
|
#ifndef NDEBUG
|
|
|
|
|
assert(instr->intrinsic < nir_num_intrinsics);
|
|
|
|
|
fprintf(stderr, "intrinsic: %s\n", nir_intrinsic_infos[instr->intrinsic].name);
|
|
|
|
|
#endif
|
2025-07-23 09:17:35 +02:00
|
|
|
UNREACHABLE("unknown intrinsic");
|
2014-08-15 10:32:07 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
static enum lsc_data_size
|
|
|
|
|
lsc_bits_to_data_size(unsigned bit_size)
|
2022-10-17 15:53:50 +02:00
|
|
|
{
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
switch (bit_size / 8) {
|
|
|
|
|
case 1: return LSC_DATA_SIZE_D8U32;
|
|
|
|
|
case 2: return LSC_DATA_SIZE_D16U32;
|
|
|
|
|
case 4: return LSC_DATA_SIZE_D32;
|
|
|
|
|
case 8: return LSC_DATA_SIZE_D64;
|
|
|
|
|
default:
|
2025-07-23 09:17:35 +02:00
|
|
|
UNREACHABLE("Unsupported data size.");
|
2022-10-17 15:53:50 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
brw/nir: Treat load_*_uniform_block_intel as convergent
Between 5 and 10 shaders (depending on the platform) from Blender are
massively helped for spills and fills (e.g., from 45 spills to 0, and
180 fills to 0).
Previously this commit cause a lot of spill and fill damage to
Wolfenstein Youngblood and Red Dead Redemption 2. I believe due to
!32041 and !32097, this is no longer the case. RDR2 is helped, and
Wolfenstein Youngblood has no changes.
However, q2rtx/q2rtx-rt-pipeline is hurt:
Spill count: 126 -> 175 (+38.89%); split: -0.79%, +39.68%
Fill count: 156 -> 235 (+50.64%); split: -1.92%, +52.56%
By the end of this series this damage is fixed, and q2rtx is helped
overall by -0.79% spills and -1.92% fills.
v2: Fix for Xe2.
v3: Just keep using bld for the group(1, 0) call. Suggested by Ken.
v4: Major re-write. Pass bld and xbld to fs_emit_memory_access. The big
fix is changing the way srcs[MEMORY_LOGICAL_ADDRESS] is calculated
(around line 7180). In previous versions of the commit, the address
would be calculated using bld (which is now xbld) even if the address
source was not is_scalar. This could cause the emit_uniformize (later in
the function) to fetch garbage. This also drops the special case
handling of constant offset. Constant propagation and algebraic will
handle this.
v5: Fix a subtle bug that was ultimately caused by the removal of
offset_to_component. The MEMORY_LOGICAL_ADDRESS for
load_shared_uniform_block_intel was being calculated as SIMD16 on LNL,
but the later emit_uniformize would treat it as SIMD32. This caused GPU
hangs in Assassin's Creed Valhalla.
v6: Fix a bug in D16 to D16U32 expansion. Noticed by Ken. Add a comment
explaining bld vs xbld vs ubld in fs_nir_emit_memory_access. Suggested
by Ken.
v7: Revert some of the v6 changes related to D16 to D16U32
expansion. This code was mostly correct. xbld is correct because DATA0
needs to be generated in size of the eventual SEND instruction. Using
offset(nir_src, xbld, c) will cause offset() to correctly added
component(..., 0) if nir_src.is_scalar but xbld is not scalar_group().
v8: nir_intrinsic_load_shared_uniform_block_intel was removed. This
caused reproducible hangs in Assassin's Creed: Valhalla. There are some
other compiler issues related to this game, and we're not yet sure
exactly what the cause of any of it is.
shader-db:
Lunar Lake
total instructions in shared programs: 18058270 -> 18068886 (0.06%)
instructions in affected programs: 5196846 -> 5207462 (0.20%)
helped: 4442 / HURT: 11416
total cycles in shared programs: 921324492 -> 919819398 (-0.16%)
cycles in affected programs: 733274162 -> 731769068 (-0.21%)
helped: 11312 / HURT: 31788
total spills in shared programs: 3633 -> 3585 (-1.32%)
spills in affected programs: 48 -> 0
helped: 5 / HURT: 0
total fills in shared programs: 2277 -> 2198 (-3.47%)
fills in affected programs: 79 -> 0
helped: 5 / HURT: 0
LOST: 123
GAINED: 377
Meteor Lake, DG2, and Tiger Lake had similar results. (Meteor Lake shown)
total instructions in shared programs: 19703458 -> 19699173 (-0.02%)
instructions in affected programs: 5885251 -> 5880966 (-0.07%)
helped: 4545 / HURT: 14971
total cycles in shared programs: 903497253 -> 902054570 (-0.16%)
cycles in affected programs: 691762248 -> 690319565 (-0.21%)
helped: 16412 / HURT: 28080
total spills in shared programs: 4894 -> 4646 (-5.07%)
spills in affected programs: 248 -> 0
helped: 7 / HURT: 0
total fills in shared programs: 6638 -> 5581 (-15.92%)
fills in affected programs: 1057 -> 0
helped: 7 / HURT: 0
LOST: 427
GAINED: 978
Ice Lake and Skylake had similar results. (Ice Lake shonw)
total instructions in shared programs: 20384200 -> 20384889 (<.01%)
instructions in affected programs: 5295084 -> 5295773 (0.01%)
helped: 5309 / HURT: 12564
total cycles in shared programs: 873002832 -> 872515246 (-0.06%)
cycles in affected programs: 463413458 -> 462925872 (-0.11%)
helped: 16079 / HURT: 13339
total spills in shared programs: 4552 -> 4373 (-3.93%)
spills in affected programs: 546 -> 367 (-32.78%)
helped: 11 / HURT: 0
total fills in shared programs: 5298 -> 4657 (-12.10%)
fills in affected programs: 1798 -> 1157 (-35.65%)
helped: 10 / HURT: 0
LOST: 380
GAINED: 925
fossil-db:
All Intel platforms had similar results. (Lunar Lake shown)
Totals:
Instrs: 141528822 -> 141728392 (+0.14%); split: -0.21%, +0.35%
Subgroup size: 10968048 -> 10968144 (+0.00%)
Send messages: 6567930 -> 6567909 (-0.00%)
Cycle count: 22165780202 -> 21624534624 (-2.44%); split: -3.09%, +0.65%
Spill count: 69890 -> 66665 (-4.61%); split: -5.06%, +0.44%
Fill count: 128331 -> 120189 (-6.34%); split: -7.44%, +1.09%
Scratch Memory Size: 5829632 -> 5664768 (-2.83%); split: -2.86%, +0.04%
Max live registers: 47928290 -> 47611371 (-0.66%); split: -0.71%, +0.05%
Totals from 364369 (66.18% of 550563) affected shaders:
Instrs: 113448842 -> 113648412 (+0.18%); split: -0.26%, +0.44%
Subgroup size: 7694080 -> 7694176 (+0.00%)
Send messages: 5308287 -> 5308266 (-0.00%)
Cycle count: 21885237842 -> 21343992264 (-2.47%); split: -3.13%, +0.65%
Spill count: 65152 -> 61927 (-4.95%); split: -5.42%, +0.47%
Fill count: 122811 -> 114669 (-6.63%); split: -7.77%, +1.14%
Scratch Memory Size: 5438464 -> 5273600 (-3.03%); split: -3.07%, +0.04%
Max live registers: 34355310 -> 34038391 (-0.92%); split: -1.00%, +0.07%
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-02-28 10:52:47 -08:00
|
|
|
/**
|
|
|
|
|
*
|
|
|
|
|
* \param bld "Normal" builder. This is the full dispatch width of the shader.
|
|
|
|
|
*
|
|
|
|
|
* \param xbld Builder for the intrinsic. If the intrinsic is convergent, this
|
|
|
|
|
* builder will be scalar_group(). Otherwise it will be the same
|
|
|
|
|
* as bld.
|
|
|
|
|
*
|
|
|
|
|
* Some places in the function will also use \c ubld. There are two cases of
|
|
|
|
|
* this. Sometimes it is to generate intermediate values as SIMD1. Other
|
|
|
|
|
* places that use \c ubld need a scalar_group() builder to operate on sources
|
|
|
|
|
* to the intrinsic that are is_scalar.
|
|
|
|
|
*/
|
2023-11-20 15:05:03 -08:00
|
|
|
static void
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_memory_access(nir_to_brw_state &ntb,
|
2024-12-29 15:41:04 -08:00
|
|
|
const brw_builder &bld,
|
|
|
|
|
const brw_builder &xbld,
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
nir_intrinsic_instr *instr)
|
2015-06-01 09:41:47 +02:00
|
|
|
{
|
2023-12-05 15:27:29 -08:00
|
|
|
const intel_device_info *devinfo = ntb.devinfo;
|
2024-12-07 10:25:45 -08:00
|
|
|
brw_shader &s = ntb.s;
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
|
|
|
|
|
brw_reg srcs[MEMORY_LOGICAL_NUM_SRCS];
|
|
|
|
|
|
|
|
|
|
/* Start with some default values for most cases */
|
2023-11-20 15:05:03 -08:00
|
|
|
|
2024-08-05 19:36:02 -07:00
|
|
|
enum lsc_opcode op = lsc_op_for_nir_intrinsic(instr);
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
const bool is_store = !nir_intrinsic_infos[instr->intrinsic].has_dest;
|
|
|
|
|
const bool is_atomic = lsc_opcode_is_atomic(op);
|
|
|
|
|
const bool is_load = !is_store && !is_atomic;
|
|
|
|
|
const bool include_helpers = nir_intrinsic_has_access(instr) &&
|
|
|
|
|
(nir_intrinsic_access(instr) & ACCESS_INCLUDE_HELPERS);
|
2025-07-15 11:09:49 -07:00
|
|
|
const bool volatile_access = nir_intrinsic_has_access(instr) &&
|
|
|
|
|
(nir_intrinsic_access(instr) & ACCESS_VOLATILE);
|
2025-08-06 08:17:14 +00:00
|
|
|
const bool coherent_access = nir_intrinsic_has_access(instr) &&
|
|
|
|
|
(nir_intrinsic_access(instr) & ACCESS_COHERENT);
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
const unsigned align =
|
|
|
|
|
nir_intrinsic_has_align(instr) ? nir_intrinsic_align(instr) : 0;
|
2025-08-21 16:20:49 -07:00
|
|
|
uint8_t flags =
|
2025-07-15 11:09:49 -07:00
|
|
|
(include_helpers ? MEMORY_FLAG_INCLUDE_HELPERS : 0) |
|
2025-08-06 08:17:14 +00:00
|
|
|
(volatile_access ? MEMORY_FLAG_VOLATILE_ACCESS : 0) |
|
|
|
|
|
(coherent_access ? MEMORY_FLAG_COHERENT_ACCESS : 0);
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
bool no_mask_handle = false;
|
|
|
|
|
int data_src = -1;
|
|
|
|
|
|
2025-08-21 16:20:49 -07:00
|
|
|
uint8_t coord_components = 1;
|
|
|
|
|
|
|
|
|
|
int32_t address_offset = 0;
|
2022-12-25 00:40:37 -08:00
|
|
|
|
2025-08-21 16:20:49 -07:00
|
|
|
std::optional<memory_logical_mode> mode;
|
|
|
|
|
std::optional<lsc_addr_surface_type> binding_type;
|
2024-10-21 16:40:51 +02:00
|
|
|
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
switch (instr->intrinsic) {
|
|
|
|
|
case nir_intrinsic_bindless_image_load:
|
|
|
|
|
case nir_intrinsic_bindless_image_store:
|
|
|
|
|
case nir_intrinsic_bindless_image_atomic:
|
|
|
|
|
case nir_intrinsic_bindless_image_atomic_swap:
|
2025-08-21 16:20:49 -07:00
|
|
|
binding_type = LSC_ADDR_SURFTYPE_BSS;
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
FALLTHROUGH;
|
|
|
|
|
case nir_intrinsic_image_load:
|
|
|
|
|
case nir_intrinsic_image_store:
|
|
|
|
|
case nir_intrinsic_image_atomic:
|
|
|
|
|
case nir_intrinsic_image_atomic_swap:
|
2023-08-01 16:07:57 -07:00
|
|
|
/* Bspec 73734 (r50040):
|
|
|
|
|
*
|
|
|
|
|
* Instruction_StoreCmaskMSRT::Src0 Length:
|
|
|
|
|
*
|
|
|
|
|
* "num_coordinates is the number of address coordinates used in
|
|
|
|
|
* message. For TGM it will be 4 (U, V, R, SAMPLE_INDEX)."
|
|
|
|
|
*
|
|
|
|
|
*/
|
2025-08-21 16:20:49 -07:00
|
|
|
coord_components =
|
2023-08-01 16:07:57 -07:00
|
|
|
(devinfo->ver >= 30 &&
|
|
|
|
|
nir_intrinsic_image_dim(instr) == GLSL_SAMPLER_DIM_MS) ? 4 :
|
2025-08-21 16:20:49 -07:00
|
|
|
nir_image_intrinsic_coord_components(instr);
|
2023-08-01 16:07:57 -07:00
|
|
|
|
|
|
|
|
/* MSAA image atomic accesses not supported, must be lowered to UGM */
|
|
|
|
|
assert((instr->intrinsic != nir_intrinsic_bindless_image_atomic &&
|
|
|
|
|
instr->intrinsic != nir_intrinsic_bindless_image_atomic_swap) ||
|
|
|
|
|
nir_intrinsic_image_dim(instr) != GLSL_SAMPLER_DIM_MS);
|
|
|
|
|
|
2025-08-21 16:20:49 -07:00
|
|
|
mode = MEMORY_MODE_TYPED;
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
srcs[MEMORY_LOGICAL_BINDING] =
|
|
|
|
|
get_nir_image_intrinsic_image(ntb, bld, instr);
|
2023-01-09 16:23:08 -08:00
|
|
|
|
2025-08-21 16:20:49 -07:00
|
|
|
if (!binding_type.has_value())
|
|
|
|
|
binding_type = LSC_ADDR_SURFTYPE_BTI;
|
2019-01-12 18:30:47 -06:00
|
|
|
|
2025-01-15 13:27:05 -08:00
|
|
|
srcs[MEMORY_LOGICAL_ADDRESS] = get_nir_src(ntb, instr->src[1], 0);
|
2015-06-01 09:41:47 +02:00
|
|
|
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
data_src = 3;
|
|
|
|
|
break;
|
2019-02-11 16:11:35 -06:00
|
|
|
|
2024-08-21 15:26:11 -07:00
|
|
|
case nir_intrinsic_load_ubo_uniform_block_intel:
|
2025-08-21 16:20:49 -07:00
|
|
|
mode = MEMORY_MODE_CONSTANT;
|
2025-01-02 01:03:46 -08:00
|
|
|
FALLTHROUGH;
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
case nir_intrinsic_load_ssbo:
|
2025-05-29 17:05:10 +03:00
|
|
|
case nir_intrinsic_load_ssbo_intel:
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
case nir_intrinsic_store_ssbo:
|
2025-05-29 17:05:10 +03:00
|
|
|
case nir_intrinsic_store_ssbo_intel:
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
case nir_intrinsic_ssbo_atomic:
|
|
|
|
|
case nir_intrinsic_ssbo_atomic_swap:
|
|
|
|
|
case nir_intrinsic_load_ssbo_block_intel:
|
|
|
|
|
case nir_intrinsic_store_ssbo_block_intel:
|
|
|
|
|
case nir_intrinsic_load_ssbo_uniform_block_intel:
|
2025-08-21 16:20:49 -07:00
|
|
|
if (!mode.has_value())
|
|
|
|
|
mode = MEMORY_MODE_UNTYPED;
|
|
|
|
|
binding_type = get_nir_src_bindless(ntb, instr->src[is_store ? 1 : 0]) ?
|
|
|
|
|
LSC_ADDR_SURFTYPE_BSS : LSC_ADDR_SURFTYPE_BTI;
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
srcs[MEMORY_LOGICAL_BINDING] =
|
|
|
|
|
get_nir_buffer_intrinsic_index(ntb, bld, instr, &no_mask_handle);
|
2025-08-21 16:20:49 -07:00
|
|
|
srcs[MEMORY_LOGICAL_ADDRESS] =
|
|
|
|
|
memory_address(ntb, bld, instr, *binding_type, &address_offset);
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
data_src = is_atomic ? 2 : 0;
|
|
|
|
|
break;
|
|
|
|
|
case nir_intrinsic_load_shared:
|
|
|
|
|
case nir_intrinsic_store_shared:
|
|
|
|
|
case nir_intrinsic_shared_atomic:
|
|
|
|
|
case nir_intrinsic_shared_atomic_swap:
|
|
|
|
|
case nir_intrinsic_load_shared_block_intel:
|
|
|
|
|
case nir_intrinsic_store_shared_block_intel:
|
|
|
|
|
case nir_intrinsic_load_shared_uniform_block_intel: {
|
2025-08-21 16:20:49 -07:00
|
|
|
mode = MEMORY_MODE_SHARED_LOCAL;
|
|
|
|
|
binding_type = LSC_ADDR_SURFTYPE_FLAT;
|
|
|
|
|
srcs[MEMORY_LOGICAL_ADDRESS] =
|
|
|
|
|
memory_address(ntb, bld, instr, *binding_type, &address_offset);
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
data_src = is_atomic ? 1 : 0;
|
|
|
|
|
no_mask_handle = true;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case nir_intrinsic_load_scratch:
|
|
|
|
|
case nir_intrinsic_store_scratch: {
|
2025-08-21 16:20:49 -07:00
|
|
|
mode = MEMORY_MODE_SCRATCH;
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
|
|
|
|
|
const nir_src &addr = instr->src[is_store ? 1 : 0];
|
|
|
|
|
|
|
|
|
|
if (devinfo->verx10 >= 125) {
|
2025-08-21 16:20:49 -07:00
|
|
|
binding_type = LSC_ADDR_SURFTYPE_SS;
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
|
2024-12-29 15:41:04 -08:00
|
|
|
const brw_builder ubld = bld.exec_all().group(8 * reg_unit(devinfo), 0);
|
2024-12-10 17:23:48 +02:00
|
|
|
brw_reg bind = ubld.AND(retype(brw_vec1_grf(0, 5), BRW_TYPE_UD),
|
|
|
|
|
brw_imm_ud(INTEL_MASK(31, 10)));
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
if (devinfo->ver >= 20)
|
2024-12-10 17:23:48 +02:00
|
|
|
bind = ubld.SHR(bind, brw_imm_ud(4));
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
|
brw/nir: Treat load_*_uniform_block_intel as convergent
Between 5 and 10 shaders (depending on the platform) from Blender are
massively helped for spills and fills (e.g., from 45 spills to 0, and
180 fills to 0).
Previously this commit caused a lot of spill and fill damage to
Wolfenstein Youngblood and Red Dead Redemption 2. I believe due to
!32041 and !32097, this is no longer the case. RDR2 is helped, and
Wolfenstein Youngblood has no changes.
However, q2rtx/q2rtx-rt-pipeline is hurt:
Spill count: 126 -> 175 (+38.89%); split: -0.79%, +39.68%
Fill count: 156 -> 235 (+50.64%); split: -1.92%, +52.56%
By the end of this series this damage is fixed, and q2rtx is helped
overall by -0.79% spills and -1.92% fills.
v2: Fix for Xe2.
v3: Just keep using bld for the group(1, 0) call. Suggested by Ken.
v4: Major re-write. Pass bld and xbld to fs_emit_memory_access. The big
fix is changing the way srcs[MEMORY_LOGICAL_ADDRESS] is calculated
(around line 7180). In previous versions of the commit, the address
would be calculated using bld (which is now xbld) even if the address
source was not is_scalar. This could cause the emit_uniformize (later in
the function) to fetch garbage. This also drops the special case
handling of constant offset. Constant propagation and algebraic will
handle this.
v5: Fix a subtle bug that was ultimately caused by the removal of
offset_to_component. The MEMORY_LOGICAL_ADDRESS for
load_shared_uniform_block_intel was being calculated as SIMD16 on LNL,
but the later emit_uniformize would treat it as SIMD32. This caused GPU
hangs in Assassin's Creed Valhalla.
v6: Fix a bug in D16 to D16U32 expansion. Noticed by Ken. Add a comment
explaining bld vs xbld vs ubld in fs_nir_emit_memory_access. Suggested
by Ken.
v7: Revert some of the v6 changes related to D16 to D16U32
expansion. This code was mostly correct. xbld is correct because DATA0
needs to be generated in size of the eventual SEND instruction. Using
offset(nir_src, xbld, c) will cause offset() to correctly add
component(..., 0) if nir_src.is_scalar but xbld is not scalar_group().
v8: nir_intrinsic_load_shared_uniform_block_intel was removed. This
caused reproducible hangs in Assassin's Creed: Valhalla. There are some
other compiler issues related to this game, and we're not yet sure
exactly what the cause of any of it is.
shader-db:
Lunar Lake
total instructions in shared programs: 18058270 -> 18068886 (0.06%)
instructions in affected programs: 5196846 -> 5207462 (0.20%)
helped: 4442 / HURT: 11416
total cycles in shared programs: 921324492 -> 919819398 (-0.16%)
cycles in affected programs: 733274162 -> 731769068 (-0.21%)
helped: 11312 / HURT: 31788
total spills in shared programs: 3633 -> 3585 (-1.32%)
spills in affected programs: 48 -> 0
helped: 5 / HURT: 0
total fills in shared programs: 2277 -> 2198 (-3.47%)
fills in affected programs: 79 -> 0
helped: 5 / HURT: 0
LOST: 123
GAINED: 377
Meteor Lake, DG2, and Tiger Lake had similar results. (Meteor Lake shown)
total instructions in shared programs: 19703458 -> 19699173 (-0.02%)
instructions in affected programs: 5885251 -> 5880966 (-0.07%)
helped: 4545 / HURT: 14971
total cycles in shared programs: 903497253 -> 902054570 (-0.16%)
cycles in affected programs: 691762248 -> 690319565 (-0.21%)
helped: 16412 / HURT: 28080
total spills in shared programs: 4894 -> 4646 (-5.07%)
spills in affected programs: 248 -> 0
helped: 7 / HURT: 0
total fills in shared programs: 6638 -> 5581 (-15.92%)
fills in affected programs: 1057 -> 0
helped: 7 / HURT: 0
LOST: 427
GAINED: 978
Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20384200 -> 20384889 (<.01%)
instructions in affected programs: 5295084 -> 5295773 (0.01%)
helped: 5309 / HURT: 12564
total cycles in shared programs: 873002832 -> 872515246 (-0.06%)
cycles in affected programs: 463413458 -> 462925872 (-0.11%)
helped: 16079 / HURT: 13339
total spills in shared programs: 4552 -> 4373 (-3.93%)
spills in affected programs: 546 -> 367 (-32.78%)
helped: 11 / HURT: 0
total fills in shared programs: 5298 -> 4657 (-12.10%)
fills in affected programs: 1798 -> 1157 (-35.65%)
helped: 10 / HURT: 0
LOST: 380
GAINED: 925
fossil-db:
All Intel platforms had similar results. (Lunar Lake shown)
Totals:
Instrs: 141528822 -> 141728392 (+0.14%); split: -0.21%, +0.35%
Subgroup size: 10968048 -> 10968144 (+0.00%)
Send messages: 6567930 -> 6567909 (-0.00%)
Cycle count: 22165780202 -> 21624534624 (-2.44%); split: -3.09%, +0.65%
Spill count: 69890 -> 66665 (-4.61%); split: -5.06%, +0.44%
Fill count: 128331 -> 120189 (-6.34%); split: -7.44%, +1.09%
Scratch Memory Size: 5829632 -> 5664768 (-2.83%); split: -2.86%, +0.04%
Max live registers: 47928290 -> 47611371 (-0.66%); split: -0.71%, +0.05%
Totals from 364369 (66.18% of 550563) affected shaders:
Instrs: 113448842 -> 113648412 (+0.18%); split: -0.26%, +0.44%
Subgroup size: 7694080 -> 7694176 (+0.00%)
Send messages: 5308287 -> 5308266 (-0.00%)
Cycle count: 21885237842 -> 21343992264 (-2.47%); split: -3.13%, +0.65%
Spill count: 65152 -> 61927 (-4.95%); split: -5.42%, +0.47%
Fill count: 122811 -> 114669 (-6.63%); split: -7.77%, +1.14%
Scratch Memory Size: 5438464 -> 5273600 (-3.03%); split: -3.07%, +0.04%
Max live registers: 34355310 -> 34038391 (-0.92%); split: -1.00%, +0.07%
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-02-28 10:52:47 -08:00
|
|
|
/* load_scratch / store_scratch cannot be is_scalar yet. */
|
|
|
|
|
assert(xbld.dispatch_width() == bld.dispatch_width());
|
|
|
|
|
|
2024-12-10 17:23:48 +02:00
|
|
|
srcs[MEMORY_LOGICAL_BINDING] = component(bind, 0);
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
srcs[MEMORY_LOGICAL_ADDRESS] =
|
|
|
|
|
swizzle_nir_scratch_addr(ntb, bld, addr, false);
|
2023-01-09 16:23:08 -08:00
|
|
|
} else {
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
unsigned bit_size =
|
|
|
|
|
is_store ? nir_src_bit_size(instr->src[0]) : instr->def.bit_size;
|
|
|
|
|
bool dword_aligned = align >= 4 && bit_size == 32;
|
brw/nir: Treat load_*_uniform_block_intel as convergent
Between 5 and 10 shaders (depending on the platform) from Blender are
massively helped for spills and fills (e.g., from 45 spills to 0, and
180 fills to 0).
Previously this commit cause a lot of spill and fill damage to
Wolfenstein Youngblood and Red Dead Redemption 2. I believe due to
!32041 and !32097, this is no longer the case. RDR2 is helped, and
Wolfenstein Youngblood has no changes.
However, q2rtx/q2rtx-rt-pipeline is hurt:
Spill count: 126 -> 175 (+38.89%); split: -0.79%, +39.68%
Fill count: 156 -> 235 (+50.64%); split: -1.92%, +52.56%
By the end of this series this damage is fixed, and q2rtx is helped
overall by -0.79% spills and -1.92% fills.
v2: Fix for Xe2.
v3: Just keep using bld for the group(1, 0) call. Suggested by Ken.
v4: Major re-write. Pass bld and xbld to fs_emit_memory_access. The big
fix is changing the way srcs[MEMORY_LOGICAL_ADDRESS] is calculated
(around line 7180). In previous versions of the commit, the address
would be calculated using bld (which is now xbld) even if the address
source was not is_scalar. This could cause the emit_uniformize (later in
the function) to fetch garbage. This also drops the special case
handling of constant offset. Constant propagation and algebraic will
handle this.
v5: Fix a subtle bug that was ultimately caused by the removal of
offset_to_component. The MEMORY_LOGICAL_ADDRESS for
load_shared_uniform_block_intel was being calculated as SIMD16 on LNL,
but the later emit_uniformize would treat it as SIMD32. This caused GPU
hangs in Assassin's Creed Valhalla.
v6: Fix a bug in D16 to D16U32 expansion. Noticed by Ken. Add a comment
explaining bld vs xbld vs ubld in fs_nir_emit_memory_access. Suggested
by Ken.
v7: Revert some of the v6 changes related to D16 to D16U32
expansion. This code was mostly correct. xbld is correct because DATA0
needs to be generated in size of the eventual SEND instruction. Using
offset(nir_src, xbld, c) will cause offset() to correctly added
component(..., 0) if nir_src.is_scalar but xbld is not scalar_group().
v8: nir_intrinsic_load_shared_uniform_block_intel was removed. This
caused reproducible hangs in Assassin's Creed: Valhalla. There are some
other compiler issues related to this game, and we're not yet sure
exactly what the cause of any of it is.
shader-db:
Lunar Lake
total instructions in shared programs: 18058270 -> 18068886 (0.06%)
instructions in affected programs: 5196846 -> 5207462 (0.20%)
helped: 4442 / HURT: 11416
total cycles in shared programs: 921324492 -> 919819398 (-0.16%)
cycles in affected programs: 733274162 -> 731769068 (-0.21%)
helped: 11312 / HURT: 31788
total spills in shared programs: 3633 -> 3585 (-1.32%)
spills in affected programs: 48 -> 0
helped: 5 / HURT: 0
total fills in shared programs: 2277 -> 2198 (-3.47%)
fills in affected programs: 79 -> 0
helped: 5 / HURT: 0
LOST: 123
GAINED: 377
Meteor Lake, DG2, and Tiger Lake had similar results. (Meteor Lake shown)
total instructions in shared programs: 19703458 -> 19699173 (-0.02%)
instructions in affected programs: 5885251 -> 5880966 (-0.07%)
helped: 4545 / HURT: 14971
total cycles in shared programs: 903497253 -> 902054570 (-0.16%)
cycles in affected programs: 691762248 -> 690319565 (-0.21%)
helped: 16412 / HURT: 28080
total spills in shared programs: 4894 -> 4646 (-5.07%)
spills in affected programs: 248 -> 0
helped: 7 / HURT: 0
total fills in shared programs: 6638 -> 5581 (-15.92%)
fills in affected programs: 1057 -> 0
helped: 7 / HURT: 0
LOST: 427
GAINED: 978
Ice Lake and Skylake had similar results. (Ice Lake shonw)
total instructions in shared programs: 20384200 -> 20384889 (<.01%)
instructions in affected programs: 5295084 -> 5295773 (0.01%)
helped: 5309 / HURT: 12564
total cycles in shared programs: 873002832 -> 872515246 (-0.06%)
cycles in affected programs: 463413458 -> 462925872 (-0.11%)
helped: 16079 / HURT: 13339
total spills in shared programs: 4552 -> 4373 (-3.93%)
spills in affected programs: 546 -> 367 (-32.78%)
helped: 11 / HURT: 0
total fills in shared programs: 5298 -> 4657 (-12.10%)
fills in affected programs: 1798 -> 1157 (-35.65%)
helped: 10 / HURT: 0
LOST: 380
GAINED: 925
fossil-db:
All Intel platforms had similar results. (Lunar Lake shown)
Totals:
Instrs: 141528822 -> 141728392 (+0.14%); split: -0.21%, +0.35%
Subgroup size: 10968048 -> 10968144 (+0.00%)
Send messages: 6567930 -> 6567909 (-0.00%)
Cycle count: 22165780202 -> 21624534624 (-2.44%); split: -3.09%, +0.65%
Spill count: 69890 -> 66665 (-4.61%); split: -5.06%, +0.44%
Fill count: 128331 -> 120189 (-6.34%); split: -7.44%, +1.09%
Scratch Memory Size: 5829632 -> 5664768 (-2.83%); split: -2.86%, +0.04%
Max live registers: 47928290 -> 47611371 (-0.66%); split: -0.71%, +0.05%
Totals from 364369 (66.18% of 550563) affected shaders:
Instrs: 113448842 -> 113648412 (+0.18%); split: -0.26%, +0.44%
Subgroup size: 7694080 -> 7694176 (+0.00%)
Send messages: 5308287 -> 5308266 (-0.00%)
Cycle count: 21885237842 -> 21343992264 (-2.47%); split: -3.13%, +0.65%
Spill count: 65152 -> 61927 (-4.95%); split: -5.42%, +0.47%
Fill count: 122811 -> 114669 (-6.63%); split: -7.77%, +1.14%
Scratch Memory Size: 5438464 -> 5273600 (-3.03%); split: -3.07%, +0.04%
Max live registers: 34355310 -> 34038391 (-0.92%); split: -1.00%, +0.07%
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-02-28 10:52:47 -08:00
|
|
|
|
|
|
|
|
/* load_scratch / store_scratch cannot be is_scalar yet. */
|
|
|
|
|
assert(xbld.dispatch_width() == bld.dispatch_width());
|
|
|
|
|
|
2025-08-21 16:20:49 -07:00
|
|
|
binding_type = LSC_ADDR_SURFTYPE_FLAT;
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
srcs[MEMORY_LOGICAL_ADDRESS] =
|
|
|
|
|
swizzle_nir_scratch_addr(ntb, bld, addr, dword_aligned);
|
2022-10-17 15:53:50 +02:00
|
|
|
}
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
|
|
|
|
|
if (is_store)
|
2024-10-02 02:32:26 -07:00
|
|
|
++s.shader_stats.spill_count;
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
else
|
2024-10-02 02:32:26 -07:00
|
|
|
++s.shader_stats.fill_count;
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
|
|
|
|
|
data_src = 0;
|
|
|
|
|
break;
|
2022-10-17 15:53:50 +02:00
|
|
|
}
|
2019-02-11 16:11:35 -06:00
|
|
|
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
case nir_intrinsic_load_global_constant_uniform_block_intel:
|
|
|
|
|
case nir_intrinsic_load_global:
|
|
|
|
|
case nir_intrinsic_load_global_constant:
|
|
|
|
|
case nir_intrinsic_store_global:
|
|
|
|
|
case nir_intrinsic_global_atomic:
|
|
|
|
|
case nir_intrinsic_global_atomic_swap:
|
|
|
|
|
case nir_intrinsic_load_global_block_intel:
|
|
|
|
|
case nir_intrinsic_store_global_block_intel:
|
2025-08-21 16:20:49 -07:00
|
|
|
mode = MEMORY_MODE_UNTYPED;
|
|
|
|
|
binding_type = LSC_ADDR_SURFTYPE_FLAT;
|
|
|
|
|
srcs[MEMORY_LOGICAL_ADDRESS] =
|
|
|
|
|
memory_address(ntb, bld, instr, *binding_type, &address_offset);
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
data_src = is_atomic ? 1 : 0;
|
2024-09-17 09:20:11 +02:00
|
|
|
no_mask_handle = srcs[MEMORY_LOGICAL_ADDRESS].is_scalar;
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
break;
|
2022-10-17 15:53:50 +02:00
|
|
|
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
default:
|
2025-07-23 09:17:35 +02:00
|
|
|
UNREACHABLE("unknown memory intrinsic");
|
2019-02-11 16:11:35 -06:00
|
|
|
}
|
2018-04-18 14:02:33 -07:00
|
|
|
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
unsigned components = is_store ? instr->src[data_src].ssa->num_components
|
|
|
|
|
: instr->def.num_components;
|
|
|
|
|
if (components == 0)
|
|
|
|
|
components = instr->num_components;
|
2021-01-11 22:18:11 -06:00
|
|
|
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
const unsigned nir_bit_size =
|
|
|
|
|
is_store ? instr->src[data_src].ssa->bit_size : instr->def.bit_size;
|
2025-08-21 16:20:49 -07:00
|
|
|
const enum lsc_data_size data_size = lsc_bits_to_data_size(nir_bit_size);
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
uint32_t data_bit_size = lsc_data_size_bytes(data_size) * 8;
|
2018-11-26 15:15:04 -06:00
|
|
|
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
const brw_reg_type data_type =
|
|
|
|
|
brw_type_with_size(BRW_TYPE_UD, data_bit_size);
|
|
|
|
|
const brw_reg_type nir_data_type =
|
|
|
|
|
brw_type_with_size(BRW_TYPE_UD, nir_bit_size);
|
|
|
|
|
assert(data_bit_size >= nir_bit_size);
|
2018-11-26 15:15:04 -06:00
|
|
|
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
if (!is_load) {
|
|
|
|
|
for (unsigned i = 0; i < lsc_op_num_data_values(op); i++) {
|
|
|
|
|
brw_reg nir_src =
|
2024-02-12 08:43:34 -08:00
|
|
|
retype(get_nir_src(ntb, instr->src[data_src + i], -1), nir_data_type);
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
|
|
|
|
|
if (data_bit_size > nir_bit_size) {
|
|
|
|
|
/* Expand e.g. D16 to D16U32 */
|
brw/nir: Treat load_*_uniform_block_intel as convergent
Between 5 and 10 shaders (depending on the platform) from Blender are
massively helped for spills and fills (e.g., from 45 spills to 0, and
180 fills to 0).
Previously this commit cause a lot of spill and fill damage to
Wolfenstein Youngblood and Red Dead Redemption 2. I believe due to
!32041 and !32097, this is no longer the case. RDR2 is helped, and
Wolfenstein Youngblood has no changes.
However, q2rtx/q2rtx-rt-pipeline is hurt:
Spill count: 126 -> 175 (+38.89%); split: -0.79%, +39.68%
Fill count: 156 -> 235 (+50.64%); split: -1.92%, +52.56%
By the end of this series this damage is fixed, and q2rtx is helped
overall by -0.79% spills and -1.92% fills.
v2: Fix for Xe2.
v3: Just keep using bld for the group(1, 0) call. Suggested by Ken.
v4: Major re-write. Pass bld and xbld to fs_emit_memory_access. The big
fix is changing the way srcs[MEMORY_LOGICAL_ADDRESS] is calculated
(around line 7180). In previous versions of the commit, the address
would be calculated using bld (which is now xbld) even if the address
source was not is_scalar. This could cause the emit_uniformize (later in
the function) to fetch garbage. This also drops the special case
handling of constant offset. Constant propagation and algebraic will
handle this.
v5: Fix a subtle bug that was ultimately caused by the removal of
offset_to_component. The MEMORY_LOGICAL_ADDRESS for
load_shared_uniform_block_intel was being calculated as SIMD16 on LNL,
but the later emit_uniformize would treat it as SIMD32. This caused GPU
hangs in Assassin's Creed Valhalla.
v6: Fix a bug in D16 to D16U32 expansion. Noticed by Ken. Add a comment
explaining bld vs xbld vs ubld in fs_nir_emit_memory_access. Suggested
by Ken.
v7: Revert some of the v6 changes related to D16 to D16U32
expansion. This code was mostly correct. xbld is correct because DATA0
needs to be generated in size of the eventual SEND instruction. Using
offset(nir_src, xbld, c) will cause offset() to correctly added
component(..., 0) if nir_src.is_scalar but xbld is not scalar_group().
v8: nir_intrinsic_load_shared_uniform_block_intel was removed. This
caused reproducible hangs in Assassin's Creed: Valhalla. There are some
other compiler issues related to this game, and we're not yet sure
exactly what the cause of any of it is.
shader-db:
Lunar Lake
total instructions in shared programs: 18058270 -> 18068886 (0.06%)
instructions in affected programs: 5196846 -> 5207462 (0.20%)
helped: 4442 / HURT: 11416
total cycles in shared programs: 921324492 -> 919819398 (-0.16%)
cycles in affected programs: 733274162 -> 731769068 (-0.21%)
helped: 11312 / HURT: 31788
total spills in shared programs: 3633 -> 3585 (-1.32%)
spills in affected programs: 48 -> 0
helped: 5 / HURT: 0
total fills in shared programs: 2277 -> 2198 (-3.47%)
fills in affected programs: 79 -> 0
helped: 5 / HURT: 0
LOST: 123
GAINED: 377
Meteor Lake, DG2, and Tiger Lake had similar results. (Meteor Lake shown)
total instructions in shared programs: 19703458 -> 19699173 (-0.02%)
instructions in affected programs: 5885251 -> 5880966 (-0.07%)
helped: 4545 / HURT: 14971
total cycles in shared programs: 903497253 -> 902054570 (-0.16%)
cycles in affected programs: 691762248 -> 690319565 (-0.21%)
helped: 16412 / HURT: 28080
total spills in shared programs: 4894 -> 4646 (-5.07%)
spills in affected programs: 248 -> 0
helped: 7 / HURT: 0
total fills in shared programs: 6638 -> 5581 (-15.92%)
fills in affected programs: 1057 -> 0
helped: 7 / HURT: 0
LOST: 427
GAINED: 978
Ice Lake and Skylake had similar results. (Ice Lake shonw)
total instructions in shared programs: 20384200 -> 20384889 (<.01%)
instructions in affected programs: 5295084 -> 5295773 (0.01%)
helped: 5309 / HURT: 12564
total cycles in shared programs: 873002832 -> 872515246 (-0.06%)
cycles in affected programs: 463413458 -> 462925872 (-0.11%)
helped: 16079 / HURT: 13339
total spills in shared programs: 4552 -> 4373 (-3.93%)
spills in affected programs: 546 -> 367 (-32.78%)
helped: 11 / HURT: 0
total fills in shared programs: 5298 -> 4657 (-12.10%)
fills in affected programs: 1798 -> 1157 (-35.65%)
helped: 10 / HURT: 0
LOST: 380
GAINED: 925
fossil-db:
All Intel platforms had similar results. (Lunar Lake shown)
Totals:
Instrs: 141528822 -> 141728392 (+0.14%); split: -0.21%, +0.35%
Subgroup size: 10968048 -> 10968144 (+0.00%)
Send messages: 6567930 -> 6567909 (-0.00%)
Cycle count: 22165780202 -> 21624534624 (-2.44%); split: -3.09%, +0.65%
Spill count: 69890 -> 66665 (-4.61%); split: -5.06%, +0.44%
Fill count: 128331 -> 120189 (-6.34%); split: -7.44%, +1.09%
Scratch Memory Size: 5829632 -> 5664768 (-2.83%); split: -2.86%, +0.04%
Max live registers: 47928290 -> 47611371 (-0.66%); split: -0.71%, +0.05%
Totals from 364369 (66.18% of 550563) affected shaders:
Instrs: 113448842 -> 113648412 (+0.18%); split: -0.26%, +0.44%
Subgroup size: 7694080 -> 7694176 (+0.00%)
Send messages: 5308287 -> 5308266 (-0.00%)
Cycle count: 21885237842 -> 21343992264 (-2.47%); split: -3.13%, +0.65%
Spill count: 65152 -> 61927 (-4.95%); split: -5.42%, +0.47%
Fill count: 122811 -> 114669 (-6.63%); split: -7.77%, +1.14%
Scratch Memory Size: 5438464 -> 5273600 (-3.03%); split: -3.07%, +0.04%
Max live registers: 34355310 -> 34038391 (-0.92%); split: -1.00%, +0.07%
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-02-28 10:52:47 -08:00
|
|
|
srcs[MEMORY_LOGICAL_DATA0 + i] = xbld.vgrf(data_type, components);
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
for (unsigned c = 0; c < components; c++) {
|
brw/nir: Treat load_*_uniform_block_intel as convergent
Between 5 and 10 shaders (depending on the platform) from Blender are
massively helped for spills and fills (e.g., from 45 spills to 0, and
180 fills to 0).
Previously this commit cause a lot of spill and fill damage to
Wolfenstein Youngblood and Red Dead Redemption 2. I believe due to
!32041 and !32097, this is no longer the case. RDR2 is helped, and
Wolfenstein Youngblood has no changes.
However, q2rtx/q2rtx-rt-pipeline is hurt:
Spill count: 126 -> 175 (+38.89%); split: -0.79%, +39.68%
Fill count: 156 -> 235 (+50.64%); split: -1.92%, +52.56%
By the end of this series this damage is fixed, and q2rtx is helped
overall by -0.79% spills and -1.92% fills.
v2: Fix for Xe2.
v3: Just keep using bld for the group(1, 0) call. Suggested by Ken.
v4: Major re-write. Pass bld and xbld to fs_emit_memory_access. The big
fix is changing the way srcs[MEMORY_LOGICAL_ADDRESS] is calculated
(around line 7180). In previous versions of the commit, the address
would be calculated using bld (which is now xbld) even if the address
source was not is_scalar. This could cause the emit_uniformize (later in
the function) to fetch garbage. This also drops the special case
handling of constant offset. Constant propagation and algebraic will
handle this.
v5: Fix a subtle bug that was ultimately caused by the removal of
offset_to_component. The MEMORY_LOGICAL_ADDRESS for
load_shared_uniform_block_intel was being calculated as SIMD16 on LNL,
but the later emit_uniformize would treat it as SIMD32. This caused GPU
hangs in Assassin's Creed Valhalla.
v6: Fix a bug in D16 to D16U32 expansion. Noticed by Ken. Add a comment
explaining bld vs xbld vs ubld in fs_nir_emit_memory_access. Suggested
by Ken.
v7: Revert some of the v6 changes related to D16 to D16U32
expansion. This code was mostly correct. xbld is correct because DATA0
needs to be generated in size of the eventual SEND instruction. Using
offset(nir_src, xbld, c) will cause offset() to correctly added
component(..., 0) if nir_src.is_scalar but xbld is not scalar_group().
v8: nir_intrinsic_load_shared_uniform_block_intel was removed. This
caused reproducible hangs in Assassin's Creed: Valhalla. There are some
other compiler issues related to this game, and we're not yet sure
exactly what the cause of any of it is.
shader-db:
Lunar Lake
total instructions in shared programs: 18058270 -> 18068886 (0.06%)
instructions in affected programs: 5196846 -> 5207462 (0.20%)
helped: 4442 / HURT: 11416
total cycles in shared programs: 921324492 -> 919819398 (-0.16%)
cycles in affected programs: 733274162 -> 731769068 (-0.21%)
helped: 11312 / HURT: 31788
total spills in shared programs: 3633 -> 3585 (-1.32%)
spills in affected programs: 48 -> 0
helped: 5 / HURT: 0
total fills in shared programs: 2277 -> 2198 (-3.47%)
fills in affected programs: 79 -> 0
helped: 5 / HURT: 0
LOST: 123
GAINED: 377
Meteor Lake, DG2, and Tiger Lake had similar results. (Meteor Lake shown)
total instructions in shared programs: 19703458 -> 19699173 (-0.02%)
instructions in affected programs: 5885251 -> 5880966 (-0.07%)
helped: 4545 / HURT: 14971
total cycles in shared programs: 903497253 -> 902054570 (-0.16%)
cycles in affected programs: 691762248 -> 690319565 (-0.21%)
helped: 16412 / HURT: 28080
total spills in shared programs: 4894 -> 4646 (-5.07%)
spills in affected programs: 248 -> 0
helped: 7 / HURT: 0
total fills in shared programs: 6638 -> 5581 (-15.92%)
fills in affected programs: 1057 -> 0
helped: 7 / HURT: 0
LOST: 427
GAINED: 978
Ice Lake and Skylake had similar results. (Ice Lake shonw)
total instructions in shared programs: 20384200 -> 20384889 (<.01%)
instructions in affected programs: 5295084 -> 5295773 (0.01%)
helped: 5309 / HURT: 12564
total cycles in shared programs: 873002832 -> 872515246 (-0.06%)
cycles in affected programs: 463413458 -> 462925872 (-0.11%)
helped: 16079 / HURT: 13339
total spills in shared programs: 4552 -> 4373 (-3.93%)
spills in affected programs: 546 -> 367 (-32.78%)
helped: 11 / HURT: 0
total fills in shared programs: 5298 -> 4657 (-12.10%)
fills in affected programs: 1798 -> 1157 (-35.65%)
helped: 10 / HURT: 0
LOST: 380
GAINED: 925
fossil-db:
All Intel platforms had similar results. (Lunar Lake shown)
Totals:
Instrs: 141528822 -> 141728392 (+0.14%); split: -0.21%, +0.35%
Subgroup size: 10968048 -> 10968144 (+0.00%)
Send messages: 6567930 -> 6567909 (-0.00%)
Cycle count: 22165780202 -> 21624534624 (-2.44%); split: -3.09%, +0.65%
Spill count: 69890 -> 66665 (-4.61%); split: -5.06%, +0.44%
Fill count: 128331 -> 120189 (-6.34%); split: -7.44%, +1.09%
Scratch Memory Size: 5829632 -> 5664768 (-2.83%); split: -2.86%, +0.04%
Max live registers: 47928290 -> 47611371 (-0.66%); split: -0.71%, +0.05%
Totals from 364369 (66.18% of 550563) affected shaders:
Instrs: 113448842 -> 113648412 (+0.18%); split: -0.26%, +0.44%
Subgroup size: 7694080 -> 7694176 (+0.00%)
Send messages: 5308287 -> 5308266 (-0.00%)
Cycle count: 21885237842 -> 21343992264 (-2.47%); split: -3.13%, +0.65%
Spill count: 65152 -> 61927 (-4.95%); split: -5.42%, +0.47%
Fill count: 122811 -> 114669 (-6.63%); split: -7.77%, +1.14%
Scratch Memory Size: 5438464 -> 5273600 (-3.03%); split: -3.07%, +0.04%
Max live registers: 34355310 -> 34038391 (-0.92%); split: -1.00%, +0.07%
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-02-28 10:52:47 -08:00
|
|
|
xbld.MOV(offset(srcs[MEMORY_LOGICAL_DATA0 + i], xbld, c),
|
|
|
|
|
offset(nir_src, xbld, c));
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
srcs[MEMORY_LOGICAL_DATA0 + i] = nir_src;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
brw_reg dest, nir_dest;
|
|
|
|
|
if (!is_store) {
|
|
|
|
|
nir_dest = retype(get_nir_def(ntb, instr->def), nir_data_type);
|
brw/nir: Treat load_*_uniform_block_intel as convergent
Between 5 and 10 shaders (depending on the platform) from Blender are
massively helped for spills and fills (e.g., from 45 spills to 0, and
180 fills to 0).
Previously this commit cause a lot of spill and fill damage to
Wolfenstein Youngblood and Red Dead Redemption 2. I believe due to
!32041 and !32097, this is no longer the case. RDR2 is helped, and
Wolfenstein Youngblood has no changes.
However, q2rtx/q2rtx-rt-pipeline is hurt:
Spill count: 126 -> 175 (+38.89%); split: -0.79%, +39.68%
Fill count: 156 -> 235 (+50.64%); split: -1.92%, +52.56%
By the end of this series this damage is fixed, and q2rtx is helped
overall by -0.79% spills and -1.92% fills.
v2: Fix for Xe2.
v3: Just keep using bld for the group(1, 0) call. Suggested by Ken.
v4: Major re-write. Pass bld and xbld to fs_emit_memory_access. The big
fix is changing the way srcs[MEMORY_LOGICAL_ADDRESS] is calculated
(around line 7180). In previous versions of the commit, the address
would be calculated using bld (which is now xbld) even if the address
source was not is_scalar. This could cause the emit_uniformize (later in
the function) to fetch garbage. This also drops the special case
handling of constant offset. Constant propagation and algebraic will
handle this.
v5: Fix a subtle bug that was ultimately caused by the removal of
offset_to_component. The MEMORY_LOGICAL_ADDRESS for
load_shared_uniform_block_intel was being calculated as SIMD16 on LNL,
but the later emit_uniformize would treat it as SIMD32. This caused GPU
hangs in Assassin's Creed Valhalla.
v6: Fix a bug in D16 to D16U32 expansion. Noticed by Ken. Add a comment
explaining bld vs xbld vs ubld in fs_nir_emit_memory_access. Suggested
by Ken.
v7: Revert some of the v6 changes related to D16 to D16U32
expansion. This code was mostly correct. xbld is correct because DATA0
needs to be generated in size of the eventual SEND instruction. Using
offset(nir_src, xbld, c) will cause offset() to correctly add
component(..., 0) if nir_src.is_scalar but xbld is not scalar_group().
v8: nir_intrinsic_load_shared_uniform_block_intel was removed. This
caused reproducible hangs in Assassin's Creed: Valhalla. There are some
other compiler issues related to this game, and we're not yet sure
exactly what the cause of any of it is.
shader-db:
Lunar Lake
total instructions in shared programs: 18058270 -> 18068886 (0.06%)
instructions in affected programs: 5196846 -> 5207462 (0.20%)
helped: 4442 / HURT: 11416
total cycles in shared programs: 921324492 -> 919819398 (-0.16%)
cycles in affected programs: 733274162 -> 731769068 (-0.21%)
helped: 11312 / HURT: 31788
total spills in shared programs: 3633 -> 3585 (-1.32%)
spills in affected programs: 48 -> 0
helped: 5 / HURT: 0
total fills in shared programs: 2277 -> 2198 (-3.47%)
fills in affected programs: 79 -> 0
helped: 5 / HURT: 0
LOST: 123
GAINED: 377
Meteor Lake, DG2, and Tiger Lake had similar results. (Meteor Lake shown)
total instructions in shared programs: 19703458 -> 19699173 (-0.02%)
instructions in affected programs: 5885251 -> 5880966 (-0.07%)
helped: 4545 / HURT: 14971
total cycles in shared programs: 903497253 -> 902054570 (-0.16%)
cycles in affected programs: 691762248 -> 690319565 (-0.21%)
helped: 16412 / HURT: 28080
total spills in shared programs: 4894 -> 4646 (-5.07%)
spills in affected programs: 248 -> 0
helped: 7 / HURT: 0
total fills in shared programs: 6638 -> 5581 (-15.92%)
fills in affected programs: 1057 -> 0
helped: 7 / HURT: 0
LOST: 427
GAINED: 978
Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20384200 -> 20384889 (<.01%)
instructions in affected programs: 5295084 -> 5295773 (0.01%)
helped: 5309 / HURT: 12564
total cycles in shared programs: 873002832 -> 872515246 (-0.06%)
cycles in affected programs: 463413458 -> 462925872 (-0.11%)
helped: 16079 / HURT: 13339
total spills in shared programs: 4552 -> 4373 (-3.93%)
spills in affected programs: 546 -> 367 (-32.78%)
helped: 11 / HURT: 0
total fills in shared programs: 5298 -> 4657 (-12.10%)
fills in affected programs: 1798 -> 1157 (-35.65%)
helped: 10 / HURT: 0
LOST: 380
GAINED: 925
fossil-db:
All Intel platforms had similar results. (Lunar Lake shown)
Totals:
Instrs: 141528822 -> 141728392 (+0.14%); split: -0.21%, +0.35%
Subgroup size: 10968048 -> 10968144 (+0.00%)
Send messages: 6567930 -> 6567909 (-0.00%)
Cycle count: 22165780202 -> 21624534624 (-2.44%); split: -3.09%, +0.65%
Spill count: 69890 -> 66665 (-4.61%); split: -5.06%, +0.44%
Fill count: 128331 -> 120189 (-6.34%); split: -7.44%, +1.09%
Scratch Memory Size: 5829632 -> 5664768 (-2.83%); split: -2.86%, +0.04%
Max live registers: 47928290 -> 47611371 (-0.66%); split: -0.71%, +0.05%
Totals from 364369 (66.18% of 550563) affected shaders:
Instrs: 113448842 -> 113648412 (+0.18%); split: -0.26%, +0.44%
Subgroup size: 7694080 -> 7694176 (+0.00%)
Send messages: 5308287 -> 5308266 (-0.00%)
Cycle count: 21885237842 -> 21343992264 (-2.47%); split: -3.13%, +0.65%
Spill count: 65152 -> 61927 (-4.95%); split: -5.42%, +0.47%
Fill count: 122811 -> 114669 (-6.63%); split: -7.77%, +1.14%
Scratch Memory Size: 5438464 -> 5273600 (-3.03%); split: -3.07%, +0.04%
Max live registers: 34355310 -> 34038391 (-0.92%); split: -1.00%, +0.07%
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-02-28 10:52:47 -08:00
|
|
|
dest = data_bit_size > nir_bit_size ? xbld.vgrf(data_type, components)
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
: nir_dest;
|
2018-11-26 15:15:04 -06:00
|
|
|
}
|
|
|
|
|
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
enum opcode opcode = is_load ? SHADER_OPCODE_MEMORY_LOAD_LOGICAL :
|
|
|
|
|
is_store ? SHADER_OPCODE_MEMORY_STORE_LOGICAL :
|
|
|
|
|
SHADER_OPCODE_MEMORY_ATOMIC_LOGICAL;
|
|
|
|
|
|
|
|
|
|
const bool convergent_block_load =
|
|
|
|
|
instr->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel ||
|
|
|
|
|
instr->intrinsic == nir_intrinsic_load_ssbo_uniform_block_intel ||
|
|
|
|
|
instr->intrinsic == nir_intrinsic_load_shared_uniform_block_intel ||
|
|
|
|
|
instr->intrinsic == nir_intrinsic_load_global_constant_uniform_block_intel;
|
|
|
|
|
const bool block = convergent_block_load ||
|
|
|
|
|
instr->intrinsic == nir_intrinsic_load_global_block_intel ||
|
|
|
|
|
instr->intrinsic == nir_intrinsic_load_shared_block_intel ||
|
|
|
|
|
instr->intrinsic == nir_intrinsic_load_ssbo_block_intel ||
|
|
|
|
|
instr->intrinsic == nir_intrinsic_store_global_block_intel ||
|
|
|
|
|
instr->intrinsic == nir_intrinsic_store_shared_block_intel ||
|
|
|
|
|
instr->intrinsic == nir_intrinsic_store_ssbo_block_intel;
|
|
|
|
|
|
2025-08-21 16:20:49 -07:00
|
|
|
brw_mem_inst *mem;
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
|
|
|
|
|
if (!block) {
|
2025-08-21 16:20:49 -07:00
|
|
|
mem = xbld.emit(opcode, dest, srcs, MEMORY_LOGICAL_NUM_SRCS)->as_mem();
|
|
|
|
|
mem->size_written *= components;
|
|
|
|
|
mem->lsc_op = op;
|
|
|
|
|
mem->mode = *mode;
|
|
|
|
|
mem->binding_type = *binding_type;
|
|
|
|
|
mem->address_offset = address_offset;
|
|
|
|
|
mem->coord_components = coord_components;
|
|
|
|
|
mem->data_size = data_size;
|
|
|
|
|
mem->components = components;
|
|
|
|
|
mem->alignment = align;
|
|
|
|
|
mem->flags = flags;
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
|
|
|
|
|
if (dest.file != BAD_FILE && data_bit_size > nir_bit_size) {
|
|
|
|
|
/* Shrink e.g. D16U32 result back to D16 */
|
|
|
|
|
for (unsigned i = 0; i < components; i++) {
|
brw/nir: Treat load_*_uniform_block_intel as convergent
Between 5 and 10 shaders (depending on the platform) from Blender are
massively helped for spills and fills (e.g., from 45 spills to 0, and
180 fills to 0).
Previously this commit caused a lot of spill and fill damage to
Wolfenstein Youngblood and Red Dead Redemption 2. I believe due to
!32041 and !32097, this is no longer the case. RDR2 is helped, and
Wolfenstein Youngblood has no changes.
However, q2rtx/q2rtx-rt-pipeline is hurt:
Spill count: 126 -> 175 (+38.89%); split: -0.79%, +39.68%
Fill count: 156 -> 235 (+50.64%); split: -1.92%, +52.56%
By the end of this series this damage is fixed, and q2rtx is helped
overall by -0.79% spills and -1.92% fills.
v2: Fix for Xe2.
v3: Just keep using bld for the group(1, 0) call. Suggested by Ken.
v4: Major re-write. Pass bld and xbld to fs_emit_memory_access. The big
fix is changing the way srcs[MEMORY_LOGICAL_ADDRESS] is calculated
(around line 7180). In previous versions of the commit, the address
would be calculated using bld (which is now xbld) even if the address
source was not is_scalar. This could cause the emit_uniformize (later in
the function) to fetch garbage. This also drops the special case
handling of constant offset. Constant propagation and algebraic will
handle this.
v5: Fix a subtle bug that was ultimately caused by the removal of
offset_to_component. The MEMORY_LOGICAL_ADDRESS for
load_shared_uniform_block_intel was being calculated as SIMD16 on LNL,
but the later emit_uniformize would treat it as SIMD32. This caused GPU
hangs in Assassin's Creed Valhalla.
v6: Fix a bug in D16 to D16U32 expansion. Noticed by Ken. Add a comment
explaining bld vs xbld vs ubld in fs_nir_emit_memory_access. Suggested
by Ken.
v7: Revert some of the v6 changes related to D16 to D16U32
expansion. This code was mostly correct. xbld is correct because DATA0
needs to be generated in size of the eventual SEND instruction. Using
offset(nir_src, xbld, c) will cause offset() to correctly add
component(..., 0) if nir_src.is_scalar but xbld is not scalar_group().
v8: nir_intrinsic_load_shared_uniform_block_intel was removed. This
caused reproducible hangs in Assassin's Creed: Valhalla. There are some
other compiler issues related to this game, and we're not yet sure
exactly what the cause of any of it is.
shader-db:
Lunar Lake
total instructions in shared programs: 18058270 -> 18068886 (0.06%)
instructions in affected programs: 5196846 -> 5207462 (0.20%)
helped: 4442 / HURT: 11416
total cycles in shared programs: 921324492 -> 919819398 (-0.16%)
cycles in affected programs: 733274162 -> 731769068 (-0.21%)
helped: 11312 / HURT: 31788
total spills in shared programs: 3633 -> 3585 (-1.32%)
spills in affected programs: 48 -> 0
helped: 5 / HURT: 0
total fills in shared programs: 2277 -> 2198 (-3.47%)
fills in affected programs: 79 -> 0
helped: 5 / HURT: 0
LOST: 123
GAINED: 377
Meteor Lake, DG2, and Tiger Lake had similar results. (Meteor Lake shown)
total instructions in shared programs: 19703458 -> 19699173 (-0.02%)
instructions in affected programs: 5885251 -> 5880966 (-0.07%)
helped: 4545 / HURT: 14971
total cycles in shared programs: 903497253 -> 902054570 (-0.16%)
cycles in affected programs: 691762248 -> 690319565 (-0.21%)
helped: 16412 / HURT: 28080
total spills in shared programs: 4894 -> 4646 (-5.07%)
spills in affected programs: 248 -> 0
helped: 7 / HURT: 0
total fills in shared programs: 6638 -> 5581 (-15.92%)
fills in affected programs: 1057 -> 0
helped: 7 / HURT: 0
LOST: 427
GAINED: 978
Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20384200 -> 20384889 (<.01%)
instructions in affected programs: 5295084 -> 5295773 (0.01%)
helped: 5309 / HURT: 12564
total cycles in shared programs: 873002832 -> 872515246 (-0.06%)
cycles in affected programs: 463413458 -> 462925872 (-0.11%)
helped: 16079 / HURT: 13339
total spills in shared programs: 4552 -> 4373 (-3.93%)
spills in affected programs: 546 -> 367 (-32.78%)
helped: 11 / HURT: 0
total fills in shared programs: 5298 -> 4657 (-12.10%)
fills in affected programs: 1798 -> 1157 (-35.65%)
helped: 10 / HURT: 0
LOST: 380
GAINED: 925
fossil-db:
All Intel platforms had similar results. (Lunar Lake shown)
Totals:
Instrs: 141528822 -> 141728392 (+0.14%); split: -0.21%, +0.35%
Subgroup size: 10968048 -> 10968144 (+0.00%)
Send messages: 6567930 -> 6567909 (-0.00%)
Cycle count: 22165780202 -> 21624534624 (-2.44%); split: -3.09%, +0.65%
Spill count: 69890 -> 66665 (-4.61%); split: -5.06%, +0.44%
Fill count: 128331 -> 120189 (-6.34%); split: -7.44%, +1.09%
Scratch Memory Size: 5829632 -> 5664768 (-2.83%); split: -2.86%, +0.04%
Max live registers: 47928290 -> 47611371 (-0.66%); split: -0.71%, +0.05%
Totals from 364369 (66.18% of 550563) affected shaders:
Instrs: 113448842 -> 113648412 (+0.18%); split: -0.26%, +0.44%
Subgroup size: 7694080 -> 7694176 (+0.00%)
Send messages: 5308287 -> 5308266 (-0.00%)
Cycle count: 21885237842 -> 21343992264 (-2.47%); split: -3.13%, +0.65%
Spill count: 65152 -> 61927 (-4.95%); split: -5.42%, +0.47%
Fill count: 122811 -> 114669 (-6.63%); split: -7.77%, +1.14%
Scratch Memory Size: 5438464 -> 5273600 (-3.03%); split: -3.07%, +0.04%
Max live registers: 34355310 -> 34038391 (-0.92%); split: -1.00%, +0.07%
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-02-28 10:52:47 -08:00
|
|
|
xbld.MOV(offset(nir_dest, xbld, i),
|
|
|
|
|
subscript(offset(dest, xbld, i), nir_dest.type, 0));
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
assert(nir_bit_size == 32);
|
|
|
|
|
|
2025-08-21 16:20:49 -07:00
|
|
|
flags |= MEMORY_FLAG_TRANSPOSE;
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
srcs[MEMORY_LOGICAL_ADDRESS] =
|
|
|
|
|
bld.emit_uniformize(srcs[MEMORY_LOGICAL_ADDRESS]);
|
|
|
|
|
|
2025-04-03 01:14:03 -07:00
|
|
|
const brw_builder ubld = bld.uniform();
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
unsigned total, done;
|
2025-01-02 00:42:36 -08:00
|
|
|
unsigned first_read_component = 0;
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
|
|
|
|
|
if (convergent_block_load) {
|
2025-01-02 00:42:36 -08:00
|
|
|
/* If the address is a constant and alignment permits, skip unread
|
|
|
|
|
* leading and trailing components. (It's probably not worth the
|
|
|
|
|
* extra address math for non-constant addresses.)
|
|
|
|
|
*
|
|
|
|
|
* Note that SLM block loads on HDC platforms need to be 16B aligned.
|
|
|
|
|
*/
|
|
|
|
|
if (srcs[MEMORY_LOGICAL_ADDRESS].file == IMM &&
|
|
|
|
|
align >= data_bit_size / 8 &&
|
2025-08-21 16:20:49 -07:00
|
|
|
(devinfo->has_lsc || mode != MEMORY_MODE_SHARED_LOCAL)) {
|
2025-01-02 00:42:36 -08:00
|
|
|
first_read_component = nir_def_first_component_read(&instr->def);
|
|
|
|
|
unsigned last_component = nir_def_last_component_read(&instr->def);
|
|
|
|
|
srcs[MEMORY_LOGICAL_ADDRESS].u64 +=
|
|
|
|
|
first_read_component * (data_bit_size / 8);
|
|
|
|
|
components = last_component - first_read_component + 1;
|
|
|
|
|
}
|
|
|
|
|
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
total = ALIGN(components, REG_SIZE * reg_unit(devinfo) / 4);
|
|
|
|
|
dest = ubld.vgrf(BRW_TYPE_UD, total);
|
|
|
|
|
} else {
|
|
|
|
|
total = components * bld.dispatch_width();
|
|
|
|
|
dest = nir_dest;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
brw_reg src = srcs[MEMORY_LOGICAL_DATA0];
|
|
|
|
|
|
|
|
|
|
unsigned block_comps = components;
|
|
|
|
|
|
|
|
|
|
for (done = 0; done < total; done += block_comps) {
|
2024-09-09 16:27:29 -07:00
|
|
|
block_comps = choose_block_size_dwords(devinfo, total - done);
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
const unsigned block_bytes = block_comps * (nir_bit_size / 8);
|
|
|
|
|
|
|
|
|
|
brw_reg dst_offset = is_store ? brw_reg() :
|
|
|
|
|
retype(byte_offset(dest, done * 4), BRW_TYPE_UD);
|
|
|
|
|
if (is_store) {
|
|
|
|
|
srcs[MEMORY_LOGICAL_DATA0] =
|
|
|
|
|
retype(byte_offset(src, done * 4), BRW_TYPE_UD);
|
|
|
|
|
}
|
|
|
|
|
|
2025-08-21 16:20:49 -07:00
|
|
|
mem = ubld.emit(opcode, dst_offset, srcs, MEMORY_LOGICAL_NUM_SRCS)->as_mem();
|
|
|
|
|
mem->has_no_mask_send_params = no_mask_handle;
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
if (is_load)
|
2025-08-21 16:20:49 -07:00
|
|
|
mem->size_written = block_bytes;
|
|
|
|
|
mem->lsc_op = op;
|
|
|
|
|
mem->mode = *mode;
|
|
|
|
|
mem->binding_type = *binding_type;
|
|
|
|
|
mem->address_offset = address_offset;
|
|
|
|
|
mem->coord_components = coord_components;
|
|
|
|
|
mem->data_size = data_size;
|
|
|
|
|
mem->components = block_comps;
|
|
|
|
|
mem->alignment = align;
|
|
|
|
|
mem->flags = flags;
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
|
|
|
|
|
if (brw_type_size_bits(srcs[MEMORY_LOGICAL_ADDRESS].type) == 64) {
|
|
|
|
|
increment_a64_address(ubld, srcs[MEMORY_LOGICAL_ADDRESS],
|
|
|
|
|
block_bytes, no_mask_handle);
|
|
|
|
|
} else {
|
|
|
|
|
srcs[MEMORY_LOGICAL_ADDRESS] =
|
|
|
|
|
ubld.ADD(retype(srcs[MEMORY_LOGICAL_ADDRESS], BRW_TYPE_UD),
|
|
|
|
|
brw_imm_ud(block_bytes));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
assert(done == total);
|
|
|
|
|
|
|
|
|
|
if (convergent_block_load) {
|
|
|
|
|
for (unsigned c = 0; c < components; c++) {
|
2025-01-02 00:42:36 -08:00
|
|
|
xbld.MOV(retype(offset(nir_dest, xbld, first_read_component + c),
|
|
|
|
|
BRW_TYPE_UD),
|
brw/nir: Treat load_*_uniform_block_intel as convergent
Between 5 and 10 shaders (depending on the platform) from Blender are
massively helped for spills and fills (e.g., from 45 spills to 0, and
180 fills to 0).
Previously this commit caused a lot of spill and fill damage to
Wolfenstein Youngblood and Red Dead Redemption 2. I believe due to
!32041 and !32097, this is no longer the case. RDR2 is helped, and
Wolfenstein Youngblood has no changes.
However, q2rtx/q2rtx-rt-pipeline is hurt:
Spill count: 126 -> 175 (+38.89%); split: -0.79%, +39.68%
Fill count: 156 -> 235 (+50.64%); split: -1.92%, +52.56%
By the end of this series this damage is fixed, and q2rtx is helped
overall by -0.79% spills and -1.92% fills.
v2: Fix for Xe2.
v3: Just keep using bld for the group(1, 0) call. Suggested by Ken.
v4: Major re-write. Pass bld and xbld to fs_emit_memory_access. The big
fix is changing the way srcs[MEMORY_LOGICAL_ADDRESS] is calculated
(around line 7180). In previous versions of the commit, the address
would be calculated using bld (which is now xbld) even if the address
source was not is_scalar. This could cause the emit_uniformize (later in
the function) to fetch garbage. This also drops the special case
handling of constant offset. Constant propagation and algebraic will
handle this.
v5: Fix a subtle bug that was ultimately caused by the removal of
offset_to_component. The MEMORY_LOGICAL_ADDRESS for
load_shared_uniform_block_intel was being calculated as SIMD16 on LNL,
but the later emit_uniformize would treat it as SIMD32. This caused GPU
hangs in Assassin's Creed Valhalla.
v6: Fix a bug in D16 to D16U32 expansion. Noticed by Ken. Add a comment
explaining bld vs xbld vs ubld in fs_nir_emit_memory_access. Suggested
by Ken.
v7: Revert some of the v6 changes related to D16 to D16U32
expansion. This code was mostly correct. xbld is correct because DATA0
needs to be generated in size of the eventual SEND instruction. Using
offset(nir_src, xbld, c) will cause offset() to correctly add
component(..., 0) if nir_src.is_scalar but xbld is not scalar_group().
v8: nir_intrinsic_load_shared_uniform_block_intel was removed. This
caused reproducible hangs in Assassin's Creed: Valhalla. There are some
other compiler issues related to this game, and we're not yet sure
exactly what the cause of any of it is.
shader-db:
Lunar Lake
total instructions in shared programs: 18058270 -> 18068886 (0.06%)
instructions in affected programs: 5196846 -> 5207462 (0.20%)
helped: 4442 / HURT: 11416
total cycles in shared programs: 921324492 -> 919819398 (-0.16%)
cycles in affected programs: 733274162 -> 731769068 (-0.21%)
helped: 11312 / HURT: 31788
total spills in shared programs: 3633 -> 3585 (-1.32%)
spills in affected programs: 48 -> 0
helped: 5 / HURT: 0
total fills in shared programs: 2277 -> 2198 (-3.47%)
fills in affected programs: 79 -> 0
helped: 5 / HURT: 0
LOST: 123
GAINED: 377
Meteor Lake, DG2, and Tiger Lake had similar results. (Meteor Lake shown)
total instructions in shared programs: 19703458 -> 19699173 (-0.02%)
instructions in affected programs: 5885251 -> 5880966 (-0.07%)
helped: 4545 / HURT: 14971
total cycles in shared programs: 903497253 -> 902054570 (-0.16%)
cycles in affected programs: 691762248 -> 690319565 (-0.21%)
helped: 16412 / HURT: 28080
total spills in shared programs: 4894 -> 4646 (-5.07%)
spills in affected programs: 248 -> 0
helped: 7 / HURT: 0
total fills in shared programs: 6638 -> 5581 (-15.92%)
fills in affected programs: 1057 -> 0
helped: 7 / HURT: 0
LOST: 427
GAINED: 978
Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20384200 -> 20384889 (<.01%)
instructions in affected programs: 5295084 -> 5295773 (0.01%)
helped: 5309 / HURT: 12564
total cycles in shared programs: 873002832 -> 872515246 (-0.06%)
cycles in affected programs: 463413458 -> 462925872 (-0.11%)
helped: 16079 / HURT: 13339
total spills in shared programs: 4552 -> 4373 (-3.93%)
spills in affected programs: 546 -> 367 (-32.78%)
helped: 11 / HURT: 0
total fills in shared programs: 5298 -> 4657 (-12.10%)
fills in affected programs: 1798 -> 1157 (-35.65%)
helped: 10 / HURT: 0
LOST: 380
GAINED: 925
fossil-db:
All Intel platforms had similar results. (Lunar Lake shown)
Totals:
Instrs: 141528822 -> 141728392 (+0.14%); split: -0.21%, +0.35%
Subgroup size: 10968048 -> 10968144 (+0.00%)
Send messages: 6567930 -> 6567909 (-0.00%)
Cycle count: 22165780202 -> 21624534624 (-2.44%); split: -3.09%, +0.65%
Spill count: 69890 -> 66665 (-4.61%); split: -5.06%, +0.44%
Fill count: 128331 -> 120189 (-6.34%); split: -7.44%, +1.09%
Scratch Memory Size: 5829632 -> 5664768 (-2.83%); split: -2.86%, +0.04%
Max live registers: 47928290 -> 47611371 (-0.66%); split: -0.71%, +0.05%
Totals from 364369 (66.18% of 550563) affected shaders:
Instrs: 113448842 -> 113648412 (+0.18%); split: -0.26%, +0.44%
Subgroup size: 7694080 -> 7694176 (+0.00%)
Send messages: 5308287 -> 5308266 (-0.00%)
Cycle count: 21885237842 -> 21343992264 (-2.47%); split: -3.13%, +0.65%
Spill count: 65152 -> 61927 (-4.95%); split: -5.42%, +0.47%
Fill count: 122811 -> 114669 (-6.63%); split: -7.77%, +1.14%
Scratch Memory Size: 5438464 -> 5273600 (-3.03%); split: -3.07%, +0.04%
Max live registers: 34355310 -> 34038391 (-0.92%); split: -1.00%, +0.07%
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-02-28 10:52:47 -08:00
|
|
|
component(dest, c));
|
intel/brw: Switch to emitting MEMORY_*_LOGICAL opcodes
We introduce a new fs_nir_emit_memory_access() helper that can handle
image, bindless image, SSBO, shared, global, and scratch memory, and
handles loads, stores, atomics, and block loads. It translates each
of these NIR intrinsics into the new MEMORY_*_LOGICAL intrinsics.
As a result, we delete a lot of similar surface access emitter code.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2023-02-16 21:21:13 -08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
2018-11-26 15:15:04 -06:00
|
|
|
}
|
|
|
|
|
|
2023-11-20 14:55:21 -08:00
|
|
|
static void
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_texture(nir_to_brw_state &ntb,
|
2023-11-20 22:11:23 -08:00
|
|
|
nir_tex_instr *instr)
|
2014-08-15 10:32:07 -07:00
|
|
|
{
|
2023-12-05 15:27:29 -08:00
|
|
|
const intel_device_info *devinfo = ntb.devinfo;
|
2024-12-29 15:41:04 -08:00
|
|
|
const brw_builder &bld = ntb.bld;
|
2023-11-20 14:55:21 -08:00
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg srcs[TEX_LOGICAL_NUM_SRCS];
|
2014-08-15 10:32:07 -07:00
|
|
|
|
2023-05-23 13:11:02 +03:00
|
|
|
/* SKL PRMs: Volume 7: 3D-Media-GPGPU:
|
|
|
|
|
*
|
|
|
|
|
* "The Pixel Null Mask field, when enabled via the Pixel Null Mask
|
|
|
|
|
* Enable will be incorect for sample_c when applied to a surface with
|
|
|
|
|
* 64-bit per texel format such as R16G16BA16_UNORM. Pixel Null mask
|
|
|
|
|
* Enable may incorrectly report pixels as referencing a Null surface."
|
|
|
|
|
*
|
|
|
|
|
* We'll take care of this in NIR.
|
|
|
|
|
*/
|
|
|
|
|
assert(!instr->is_sparse || srcs[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE);
|
|
|
|
|
|
2015-06-10 09:50:47 -07:00
|
|
|
int lod_components = 0;
|
2014-08-15 10:32:07 -07:00
|
|
|
|
2016-03-25 14:02:50 -07:00
|
|
|
/* The hardware requires a LOD for buffer textures */
|
|
|
|
|
if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
|
2016-05-03 10:41:38 -07:00
|
|
|
srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_d(0);
|
2016-03-25 14:02:50 -07:00
|
|
|
|
2022-06-28 13:51:42 -07:00
|
|
|
ASSERTED bool got_lod = false;
|
|
|
|
|
ASSERTED bool got_bias = false;
|
2023-03-05 15:27:08 -08:00
|
|
|
bool pack_lod_bias_and_offset = false;
|
2016-11-28 18:13:02 -08:00
|
|
|
uint32_t header_bits = 0;
|
2025-09-02 14:20:34 +03:00
|
|
|
|
|
|
|
|
brw_reg_type default_src_type;
|
|
|
|
|
switch (instr->op) {
|
|
|
|
|
case nir_texop_txf_ms:
|
|
|
|
|
case nir_texop_txf_ms_mcs_intel:
|
|
|
|
|
default_src_type = devinfo->verx10 >= 125 ? BRW_TYPE_W : BRW_TYPE_D;
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_texop_txf:
|
|
|
|
|
case nir_texop_txs:
|
|
|
|
|
default_src_type = BRW_TYPE_D;
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
default:
|
|
|
|
|
default_src_type = BRW_TYPE_F;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2014-08-15 10:32:07 -07:00
|
|
|
for (unsigned i = 0; i < instr->num_srcs; i++) {
|
2023-02-09 15:07:36 +02:00
|
|
|
nir_src nir_src = instr->src[i].src;
|
2024-02-12 08:43:34 -08:00
|
|
|
brw_reg src = get_nir_src(ntb, nir_src, -1);
|
|
|
|
|
|
|
|
|
|
/* If the source is not a vector (e.g., a 1D texture coordinate), then
|
|
|
|
|
* the eventual LOAD_PAYLOAD lowering will not properly adjust the
|
|
|
|
|
* stride, etc., so do it now.
|
|
|
|
|
*/
|
|
|
|
|
if (nir_tex_instr_src_size(instr, i) == 1)
|
|
|
|
|
src = offset(src, bld, 0);
|
|
|
|
|
|
2025-09-02 14:20:34 +03:00
|
|
|
brw_reg_type src_type = BRW_TYPE_F;
|
|
|
|
|
switch (instr->src[i].src_type) {
|
|
|
|
|
case nir_tex_src_sampler_offset:
|
|
|
|
|
case nir_tex_src_texture_offset:
|
|
|
|
|
case nir_tex_src_sampler_handle:
|
|
|
|
|
case nir_tex_src_texture_handle:
|
|
|
|
|
case nir_tex_src_offset:
|
|
|
|
|
src_type = BRW_TYPE_D;
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
default:
|
|
|
|
|
src_type = default_src_type;
|
2025-09-23 23:28:25 +03:00
|
|
|
break;
|
2025-09-02 14:20:34 +03:00
|
|
|
}
|
|
|
|
|
|
2015-01-09 20:01:13 -08:00
|
|
|
switch (instr->src[i].src_type) {
|
2014-08-15 10:32:07 -07:00
|
|
|
case nir_tex_src_bias:
|
2022-06-28 13:51:42 -07:00
|
|
|
assert(!got_lod);
|
|
|
|
|
got_bias = true;
|
2016-05-04 15:10:25 -07:00
|
|
|
srcs[TEX_LOGICAL_SRC_LOD] =
|
2025-09-02 14:20:34 +03:00
|
|
|
retype(get_nir_src_imm(ntb, instr->src[i].src), src_type);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
2016-12-12 08:32:38 -05:00
|
|
|
case nir_tex_src_comparator:
|
2025-09-02 14:20:34 +03:00
|
|
|
srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, src_type);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
case nir_tex_src_coord:
|
2025-09-02 14:20:34 +03:00
|
|
|
srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, src_type);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
case nir_tex_src_ddx:
|
2025-09-02 14:20:34 +03:00
|
|
|
srcs[TEX_LOGICAL_SRC_LOD] = retype(src, src_type);
|
2014-08-15 10:32:07 -07:00
|
|
|
lod_components = nir_tex_instr_src_size(instr, i);
|
|
|
|
|
break;
|
|
|
|
|
case nir_tex_src_ddy:
|
2025-09-02 14:20:34 +03:00
|
|
|
srcs[TEX_LOGICAL_SRC_LOD2] = retype(src, src_type);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
|
|
|
|
case nir_tex_src_lod:
|
2022-06-28 13:51:42 -07:00
|
|
|
assert(!got_bias);
|
|
|
|
|
got_lod = true;
|
2025-09-02 14:20:34 +03:00
|
|
|
srcs[TEX_LOGICAL_SRC_LOD] =
|
|
|
|
|
retype(get_nir_src_imm(ntb, instr->src[i].src), src_type);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
2018-10-11 15:57:50 -05:00
|
|
|
case nir_tex_src_min_lod:
|
|
|
|
|
srcs[TEX_LOGICAL_SRC_MIN_LOD] =
|
2025-09-02 14:20:34 +03:00
|
|
|
retype(get_nir_src_imm(ntb, instr->src[i].src), src_type);
|
2018-10-11 15:57:50 -05:00
|
|
|
break;
|
2014-08-15 10:32:07 -07:00
|
|
|
case nir_tex_src_ms_index:
|
2025-09-02 14:20:34 +03:00
|
|
|
srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = retype(src, src_type);
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
2016-02-09 14:51:28 -08:00
|
|
|
|
2025-08-28 14:28:53 +03:00
|
|
|
case nir_tex_src_offset: {
|
|
|
|
|
uint32_t offset_bits = 0;
|
|
|
|
|
if (brw_texture_offset(instr, i, &offset_bits)) {
|
|
|
|
|
header_bits |= offset_bits;
|
|
|
|
|
} else {
|
|
|
|
|
/* On gfx12.5+, if the offsets are not both constant and in the
|
|
|
|
|
* {-8,7} range, nir_lower_tex() will have already lowered the
|
|
|
|
|
* source offset. So we should never reach this point.
|
|
|
|
|
*/
|
|
|
|
|
assert(devinfo->verx10 < 125);
|
|
|
|
|
srcs[TEX_LOGICAL_SRC_TG4_OFFSET] =
|
2025-09-02 14:20:34 +03:00
|
|
|
retype(src, src_type);
|
2025-08-28 14:28:53 +03:00
|
|
|
}
|
2014-08-15 10:32:07 -07:00
|
|
|
break;
|
2025-08-28 14:28:53 +03:00
|
|
|
}
|
2016-02-09 14:51:28 -08:00
|
|
|
|
2014-08-15 10:32:07 -07:00
|
|
|
case nir_tex_src_projector:
|
2025-07-23 09:17:35 +02:00
|
|
|
UNREACHABLE("should be lowered");
|
2014-12-05 16:43:56 -08:00
|
|
|
|
2025-09-01 23:16:40 +03:00
|
|
|
case nir_tex_src_texture_offset:
|
2023-02-09 15:07:36 +02:00
|
|
|
assert(srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE);
|
|
|
|
|
/* Emit code to evaluate the actual indexing expression */
|
2024-12-12 16:41:51 -08:00
|
|
|
srcs[TEX_LOGICAL_SRC_SURFACE] =
|
|
|
|
|
bld.emit_uniformize(bld.ADD(retype(src, BRW_TYPE_UD),
|
|
|
|
|
brw_imm_ud(instr->texture_index)));
|
2014-12-05 16:43:56 -08:00
|
|
|
break;
|
|
|
|
|
|
2025-09-01 23:16:40 +03:00
|
|
|
case nir_tex_src_sampler_offset:
|
|
|
|
|
assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_handle) == -1);
|
|
|
|
|
assert(srcs[TEX_LOGICAL_SRC_SAMPLER].file == BAD_FILE);
|
2023-02-09 15:07:36 +02:00
|
|
|
/* Emit code to evaluate the actual indexing expression */
|
2024-12-12 16:41:51 -08:00
|
|
|
srcs[TEX_LOGICAL_SRC_SAMPLER] =
|
|
|
|
|
bld.emit_uniformize(bld.ADD(retype(src, BRW_TYPE_UD),
|
|
|
|
|
brw_imm_ud(instr->sampler_index)));
|
2016-02-05 18:24:02 -08:00
|
|
|
break;
|
2015-11-02 17:58:29 -08:00
|
|
|
|
2019-02-06 15:42:17 -06:00
|
|
|
case nir_tex_src_texture_handle:
|
|
|
|
|
assert(nir_tex_instr_src_index(instr, nir_tex_src_texture_offset) == -1);
|
2025-09-01 23:16:40 +03:00
|
|
|
assert(srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE);
|
|
|
|
|
srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(src);
|
2019-02-06 15:42:17 -06:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_tex_src_sampler_handle:
|
|
|
|
|
assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1);
|
2025-09-01 23:16:40 +03:00
|
|
|
assert(srcs[TEX_LOGICAL_SRC_SAMPLER].file == BAD_FILE);
|
|
|
|
|
srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(src);
|
2019-02-06 15:42:17 -06:00
|
|
|
break;
|
|
|
|
|
|
2021-07-07 17:06:46 -05:00
|
|
|
case nir_tex_src_ms_mcs_intel:
|
2016-05-03 12:34:51 -07:00
|
|
|
assert(instr->op == nir_texop_txf_ms);
|
2025-09-02 14:20:34 +03:00
|
|
|
srcs[TEX_LOGICAL_SRC_MCS] = retype(src, src_type);
|
2016-05-03 12:34:51 -07:00
|
|
|
break;
|
|
|
|
|
|
2023-03-05 15:27:08 -08:00
|
|
|
/* If this parameter is present, we are packing offset U, V and LOD/Bias
|
|
|
|
|
* into a single (32-bit) value.
|
|
|
|
|
*/
|
|
|
|
|
case nir_tex_src_backend2:
|
2025-08-28 14:28:53 +03:00
|
|
|
assert(instr->op == nir_texop_tg4);
|
|
|
|
|
pack_lod_bias_and_offset = true;
|
|
|
|
|
srcs[TEX_LOGICAL_SRC_LOD] =
|
2025-09-02 14:20:34 +03:00
|
|
|
retype(get_nir_src_imm(ntb, instr->src[i].src), src_type);
|
2023-03-05 15:27:08 -08:00
|
|
|
break;
|
|
|
|
|
|
2024-02-02 20:39:23 -08:00
|
|
|
/* If this parameter is present, we are packing either the explicit LOD
|
|
|
|
|
* or LOD bias and the array index into a single (32-bit) value when
|
|
|
|
|
* 32-bit texture coordinates are used.
|
|
|
|
|
*/
|
|
|
|
|
case nir_tex_src_backend1:
|
2022-06-28 13:51:42 -07:00
|
|
|
assert(!got_lod && !got_bias);
|
|
|
|
|
got_lod = true;
|
|
|
|
|
assert(instr->op == nir_texop_txl || instr->op == nir_texop_txb);
|
|
|
|
|
srcs[TEX_LOGICAL_SRC_LOD] =
|
2025-09-02 14:20:34 +03:00
|
|
|
retype(get_nir_src_imm(ntb, instr->src[i].src), src_type);
|
2022-06-28 13:51:42 -07:00
|
|
|
break;
|
|
|
|
|
|
2014-08-15 10:32:07 -07:00
|
|
|
default:
|
2025-07-23 09:17:35 +02:00
|
|
|
UNREACHABLE("unknown texture source");
|
2014-08-15 10:32:07 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-09-01 23:16:40 +03:00
|
|
|
const bool surface_bindless = nir_tex_instr_src_index(
|
|
|
|
|
instr, nir_tex_src_texture_handle) >= 0;
|
|
|
|
|
const bool sampler_bindless = nir_tex_instr_src_index(
|
|
|
|
|
instr, nir_tex_src_sampler_handle) >= 0;
|
|
|
|
|
|
2023-02-09 15:07:36 +02:00
|
|
|
/* If the surface or sampler were not specified through sources, use the
|
|
|
|
|
* instruction index.
|
|
|
|
|
*/
|
2025-09-01 23:16:40 +03:00
|
|
|
if (srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE)
|
2023-02-09 15:07:36 +02:00
|
|
|
srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(instr->texture_index);
|
2025-09-01 23:16:40 +03:00
|
|
|
if (srcs[TEX_LOGICAL_SRC_SAMPLER].file == BAD_FILE)
|
2023-02-09 15:07:36 +02:00
|
|
|
srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(instr->sampler_index);
|
|
|
|
|
|
2025-09-02 14:20:34 +03:00
|
|
|
assert(srcs[TEX_LOGICAL_SRC_MCS].file != BAD_FILE ||
|
|
|
|
|
instr->op != nir_texop_txf_ms);
|
2014-08-15 10:32:07 -07:00
|
|
|
|
2025-09-01 13:55:57 +03:00
|
|
|
enum sampler_opcode opcode;
|
2014-08-15 10:32:07 -07:00
|
|
|
switch (instr->op) {
|
2016-05-03 10:41:38 -07:00
|
|
|
case nir_texop_tex:
|
2025-09-01 13:55:57 +03:00
|
|
|
opcode = SAMPLER_OPCODE_TEX_LOGICAL;
|
2016-05-03 10:41:38 -07:00
|
|
|
break;
|
|
|
|
|
case nir_texop_txb:
|
2025-09-01 13:55:57 +03:00
|
|
|
opcode = SAMPLER_OPCODE_TXB_LOGICAL;
|
2016-05-03 10:41:38 -07:00
|
|
|
break;
|
|
|
|
|
case nir_texop_txl:
|
2025-09-01 13:55:57 +03:00
|
|
|
opcode = SAMPLER_OPCODE_TXL_LOGICAL;
|
2016-05-03 10:41:38 -07:00
|
|
|
break;
|
|
|
|
|
case nir_texop_txd:
|
2025-09-01 13:55:57 +03:00
|
|
|
opcode = SAMPLER_OPCODE_TXD_LOGICAL;
|
2016-05-03 10:41:38 -07:00
|
|
|
break;
|
|
|
|
|
case nir_texop_txf:
|
2025-09-01 13:55:57 +03:00
|
|
|
opcode = SAMPLER_OPCODE_TXF_LOGICAL;
|
2016-05-03 10:41:38 -07:00
|
|
|
break;
|
|
|
|
|
case nir_texop_txf_ms:
|
2020-07-07 23:23:36 -07:00
|
|
|
/* On Gfx12HP there is only CMS_W available. From the Bspec: Shared
|
|
|
|
|
* Functions - 3D Sampler - Messages - Message Format:
|
|
|
|
|
*
|
|
|
|
|
* ld2dms REMOVEDBY(GEN:HAS:1406788836)
|
|
|
|
|
*/
|
|
|
|
|
if (devinfo->verx10 >= 125)
|
2025-09-01 13:55:57 +03:00
|
|
|
opcode = SAMPLER_OPCODE_TXF_CMS_W_GFX12_LOGICAL;
|
2016-05-03 10:41:38 -07:00
|
|
|
else
|
2025-09-01 13:55:57 +03:00
|
|
|
opcode = SAMPLER_OPCODE_TXF_CMS_W_LOGICAL;
|
2016-05-03 10:41:38 -07:00
|
|
|
break;
|
2021-07-07 17:06:46 -05:00
|
|
|
case nir_texop_txf_ms_mcs_intel:
|
2025-09-01 13:55:57 +03:00
|
|
|
opcode = SAMPLER_OPCODE_TXF_MCS_LOGICAL;
|
2016-05-03 12:34:51 -07:00
|
|
|
break;
|
2016-05-03 10:41:38 -07:00
|
|
|
case nir_texop_query_levels:
|
|
|
|
|
case nir_texop_txs:
|
2025-09-01 13:55:57 +03:00
|
|
|
opcode = SAMPLER_OPCODE_TXS_LOGICAL;
|
2016-05-03 10:41:38 -07:00
|
|
|
break;
|
|
|
|
|
case nir_texop_lod:
|
2025-09-01 13:55:57 +03:00
|
|
|
opcode = SAMPLER_OPCODE_LOD_LOGICAL;
|
2016-05-03 10:41:38 -07:00
|
|
|
break;
|
2023-02-16 20:30:30 -08:00
|
|
|
case nir_texop_tg4: {
|
|
|
|
|
if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE) {
|
2025-09-01 13:55:57 +03:00
|
|
|
opcode = SAMPLER_OPCODE_TG4_OFFSET_LOGICAL;
|
2023-02-16 20:30:30 -08:00
|
|
|
} else {
|
2025-09-01 13:55:57 +03:00
|
|
|
opcode = SAMPLER_OPCODE_TG4_LOGICAL;
|
2023-02-16 20:30:30 -08:00
|
|
|
if (devinfo->ver >= 20) {
|
|
|
|
|
/* If SPV_AMD_texture_gather_bias_lod extension is enabled, all
|
|
|
|
|
* texture gather functions (ie. the ones which do not take the
|
|
|
|
|
* extra bias argument and the ones that do) fetch texels from
|
|
|
|
|
* implicit LOD in fragment shader stage. In all other shader
|
|
|
|
|
* stages, base level is used instead.
|
|
|
|
|
*/
|
|
|
|
|
if (instr->is_gather_implicit_lod)
|
2025-09-01 13:55:57 +03:00
|
|
|
opcode = SAMPLER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL;
|
2023-02-16 20:30:30 -08:00
|
|
|
|
|
|
|
|
if (got_bias)
|
2025-09-01 13:55:57 +03:00
|
|
|
opcode = SAMPLER_OPCODE_TG4_BIAS_LOGICAL;
|
2023-02-16 20:30:30 -08:00
|
|
|
|
|
|
|
|
if (got_lod)
|
2025-09-01 13:55:57 +03:00
|
|
|
opcode = SAMPLER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL;
|
2023-03-05 15:27:08 -08:00
|
|
|
|
|
|
|
|
if (pack_lod_bias_and_offset) {
|
|
|
|
|
if (got_lod)
|
2025-09-01 13:55:57 +03:00
|
|
|
opcode = SAMPLER_OPCODE_TG4_OFFSET_LOD_LOGICAL;
|
2023-03-05 15:27:08 -08:00
|
|
|
if (got_bias)
|
2025-09-01 13:55:57 +03:00
|
|
|
opcode = SAMPLER_OPCODE_TG4_OFFSET_BIAS_LOGICAL;
|
2023-03-05 15:27:08 -08:00
|
|
|
}
|
2023-02-16 20:30:30 -08:00
|
|
|
}
|
|
|
|
|
}
|
2016-05-03 10:41:38 -07:00
|
|
|
break;
|
2023-02-16 20:30:30 -08:00
|
|
|
}
|
2016-05-20 00:37:37 -07:00
|
|
|
case nir_texop_texture_samples:
|
2025-09-01 13:55:57 +03:00
|
|
|
opcode = SAMPLER_OPCODE_SAMPLEINFO_LOGICAL;
|
2016-05-20 00:37:37 -07:00
|
|
|
break;
|
2014-08-15 10:32:07 -07:00
|
|
|
default:
|
2025-07-23 09:17:35 +02:00
|
|
|
UNREACHABLE("unknown texture opcode");
|
2014-08-15 10:32:07 -07:00
|
|
|
}
|
|
|
|
|
|
2016-11-28 18:13:02 -08:00
|
|
|
if (instr->op == nir_texop_tg4) {
|
2024-02-23 16:24:53 -08:00
|
|
|
header_bits |= instr->component << 16;
|
2016-11-28 18:13:02 -08:00
|
|
|
}
|
|
|
|
|
|
2024-07-30 19:27:31 -07:00
|
|
|
brw_reg nir_def_reg = get_nir_def(ntb, instr->def);
|
|
|
|
|
|
2016-05-03 10:41:38 -07:00
|
|
|
const unsigned dest_size = nir_tex_instr_dest_size(instr);
|
2024-09-22 20:28:50 +03:00
|
|
|
unsigned dest_comp;
|
2024-02-15 02:51:39 -08:00
|
|
|
if (instr->op != nir_texop_tg4 && instr->op != nir_texop_query_levels) {
|
2023-08-14 11:56:00 -05:00
|
|
|
unsigned write_mask = nir_def_components_read(&instr->def);
|
i965/fs: Reduce the response length of sampler messages on Skylake.
Often, we don't need a full 4 channels worth of data from the sampler.
For example, depth comparisons and red textures only return one value.
To handle this, the sampler message header contains a mask which can
be used to disable channels, and reduce the message length (in SIMD16
mode on all hardware, and SIMD8 mode on Broadwell and later).
We've never used it before, since it required setting up a message
header. This meant trading a smaller response length for a larger
message length and additional MOVs to set it up.
However, Skylake introduces a terrific new feature: for headerless
messages, you can simply reduce the response length, and it makes
the implicit header contain an appropriate mask. So to read only
RG, you would simply set the message length to 2 or 4 (SIMD8/16).
This means we can finally take advantage of this at no cost.
total instructions in shared programs: 9091831 -> 9073067 (-0.21%)
instructions in affected programs: 191370 -> 172606 (-9.81%)
helped: 2609
HURT: 0
total cycles in shared programs: 70868114 -> 68454752 (-3.41%)
cycles in affected programs: 35841154 -> 33427792 (-6.73%)
helped: 16357
HURT: 8188
total spills in shared programs: 3492 -> 1707 (-51.12%)
spills in affected programs: 2749 -> 964 (-64.93%)
helped: 74
HURT: 0
total fills in shared programs: 4266 -> 2647 (-37.95%)
fills in affected programs: 3029 -> 1410 (-53.45%)
helped: 74
HURT: 0
LOST: 1
GAINED: 143
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2016-04-23 01:54:33 -07:00
|
|
|
assert(write_mask != 0); /* dead code should have been eliminated */
|
2024-09-22 20:28:50 +03:00
|
|
|
|
|
|
|
|
dest_comp = util_last_bit(write_mask) - instr->is_sparse;
|
2016-05-03 10:41:38 -07:00
|
|
|
} else {
|
2024-09-22 20:28:50 +03:00
|
|
|
dest_comp = 4;
|
2016-05-03 10:41:38 -07:00
|
|
|
}
|
|
|
|
|
|
2024-09-22 20:28:50 +03:00
|
|
|
/* Compute the number of physical registers needed to hold a single
|
|
|
|
|
* component and round it up to a physical register count.
|
|
|
|
|
*/
|
|
|
|
|
brw_reg_type dst_type = brw_type_for_nir_type(devinfo, instr->dest_type);
|
|
|
|
|
const unsigned grf_size = reg_unit(devinfo) * REG_SIZE;
|
|
|
|
|
const unsigned per_component_regs =
|
|
|
|
|
DIV_ROUND_UP(brw_type_size_bytes(dst_type) * bld.dispatch_width(),
|
|
|
|
|
grf_size);
|
|
|
|
|
const unsigned total_regs =
|
|
|
|
|
dest_comp * per_component_regs + instr->is_sparse;
|
|
|
|
|
/* Allocate enough space for the components + one physical register for the
|
|
|
|
|
* residency data.
|
|
|
|
|
*/
|
2025-01-31 12:50:20 -08:00
|
|
|
brw_reg dst = retype(
|
|
|
|
|
brw_allocate_vgrf_units(*bld.shader, total_regs * reg_unit(devinfo)),
|
2024-09-22 20:28:50 +03:00
|
|
|
dst_type);
|
|
|
|
|
|
2025-09-01 13:55:57 +03:00
|
|
|
brw_tex_inst *tex = bld.emit(SHADER_OPCODE_SAMPLER, dst, srcs, ARRAY_SIZE(srcs))->as_tex();
|
|
|
|
|
tex->sampler_opcode = opcode;
|
2025-09-01 23:16:40 +03:00
|
|
|
tex->surface_bindless = surface_bindless;
|
|
|
|
|
tex->sampler_bindless = sampler_bindless;
|
2025-08-21 00:02:14 -07:00
|
|
|
tex->offset = header_bits;
|
|
|
|
|
tex->size_written = total_regs * grf_size;
|
|
|
|
|
tex->residency = instr->is_sparse;
|
|
|
|
|
tex->coord_components = instr->coord_components;
|
|
|
|
|
tex->grad_components = lod_components;
|
2024-09-22 20:28:50 +03:00
|
|
|
|
2023-10-31 20:45:31 -07:00
|
|
|
/* Wa_14012688258:
|
|
|
|
|
*
|
|
|
|
|
* Don't trim zeros at the end of payload for sample operations
|
|
|
|
|
* in cube and cube arrays.
|
|
|
|
|
*/
|
|
|
|
|
if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
|
|
|
|
|
intel_needs_workaround(devinfo, 14012688258)) {
|
|
|
|
|
|
|
|
|
|
/* Compiler should send U,V,R parameters even if V,R are 0. */
|
|
|
|
|
if (srcs[TEX_LOGICAL_SRC_COORDINATE].file != BAD_FILE)
|
|
|
|
|
assert(instr->coord_components >= 3u);
|
|
|
|
|
|
|
|
|
|
/* See opt_zero_samples(). */
|
2025-08-21 00:02:14 -07:00
|
|
|
tex->keep_payload_trailing_zeros = true;
|
2023-10-31 20:45:31 -07:00
|
|
|
}
|
|
|
|
|
|
2024-09-22 20:28:50 +03:00
|
|
|
/* With half-floats returns, the stride into a GRF allocation for each
|
|
|
|
|
* component might be different than where the sampler is storing each
|
|
|
|
|
* component. For example in SIMD8 on DG2 the layout of the data returned
|
|
|
|
|
* by the sampler is as follow for 2 components load:
|
|
|
|
|
*
|
|
|
|
|
* _______________________________________________________________
|
|
|
|
|
* g0 : | unused |hf7|hf6|hf5|hf4|hf3|hf2|hf1|hf0|
|
|
|
|
|
* g1 : | unused |hf7|hf6|hf5|hf4|hf3|hf2|hf1|hf0|
|
|
|
|
|
*
|
|
|
|
|
* The same issue also happens in SIMD16 on Xe2 because the physical
|
|
|
|
|
* register size has doubled but we're still loading data only on half the
|
|
|
|
|
* register.
|
|
|
|
|
*
|
|
|
|
|
* In those cases we need the special remapping case below.
|
|
|
|
|
*/
|
|
|
|
|
const bool non_aligned_component_stride =
|
|
|
|
|
(brw_type_size_bytes(dst_type) * bld.dispatch_width()) % grf_size != 0;
|
|
|
|
|
if (instr->op != nir_texop_query_levels && !instr->is_sparse &&
|
|
|
|
|
!non_aligned_component_stride) {
|
2024-02-25 03:57:45 -08:00
|
|
|
/* In most cases we can write directly to the result. */
|
2025-08-21 00:02:14 -07:00
|
|
|
tex->dst = nir_def_reg;
|
2024-02-25 03:57:45 -08:00
|
|
|
} else {
|
|
|
|
|
/* In other cases, we have to reorganize the sampler message's results
|
|
|
|
|
* a bit to match the NIR intrinsic's expectations.
|
|
|
|
|
*/
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg nir_dest[5];
|
2024-09-22 20:28:50 +03:00
|
|
|
for (unsigned i = 0; i < dest_comp; i++)
|
|
|
|
|
nir_dest[i] = byte_offset(dst, i * per_component_regs * grf_size);
|
2024-02-25 03:57:45 -08:00
|
|
|
|
2024-09-22 20:28:50 +03:00
|
|
|
for (unsigned i = dest_comp; i < dest_size; i++)
|
intel/brw: Set appropriate types for 16-bit sampler trailing components
16-bit SIMD8 sampler writeback messages come with a bit of padding in
them, requiring us to emit a LOAD_PAYLOAD to reorganize the data into
the padding-free format expected by NIR. Additionally, we may reduce
the response length on the sampler messages based on which components
of the (always vec4) NIR destination are actually in use. When we do
that, dest_size > read_size, and the trailing components are all empty
BAD_FILE registers, indicating the contents are undefined.
Unfortunately, we can't ignore those trailing components entirely.
In the past, we left them default-initialized, giving us a BAD_FILE
register with UD type (which didn't matter, since all sampler returns
were 32-bit). But with 16-bit, this was confusing the LOAD_PAYLOAD.
For example, writing RGB and skipping A (without sparse) would produce
read_size = 3 and dest_size = 4 and nir_dest[5] containing:
nir_dest[] = <R:hf, G:hf, B:hf, blank-A:ud, blank-sparse:ud>
We'd then call LOAD_PAYLOAD on the first 4 sources, causing it to see
3 HF's and a UD, and try to copy the full 32-bit value at the end,
instead of 16-bits of pad like we intended. This meant it would
overflow the destination register's size, triggering validation errors.
Thanks to Ian Romanick for noticing this, writing a test, and also
coming up with a nearly identical fix.
Fixes: 0116430d394 ("intel/brw: Handle 16-bit sampler return payloads")
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/11617
References: https://gitlab.freedesktop.org/mesa/crucible/-/merge_requests/152
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Sushma Venkatesh Reddy <sushma.venkatesh.reddy@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30529>
2024-08-05 14:57:37 -07:00
|
|
|
nir_dest[i].type = dst.type;
|
|
|
|
|
|
2024-02-25 03:57:45 -08:00
|
|
|
if (instr->op == nir_texop_query_levels) {
|
|
|
|
|
/* # levels is in .w */
|
|
|
|
|
if (devinfo->ver == 9) {
|
|
|
|
|
/**
|
|
|
|
|
* Wa_1940217:
|
|
|
|
|
*
|
|
|
|
|
* When a surface of type SURFTYPE_NULL is accessed by resinfo, the
|
|
|
|
|
* MIPCount returned is undefined instead of 0.
|
|
|
|
|
*/
|
2024-12-07 00:23:07 -08:00
|
|
|
brw_inst *mov = bld.MOV(bld.null_reg_d(), dst);
|
2024-02-25 03:57:45 -08:00
|
|
|
mov->conditional_mod = BRW_CONDITIONAL_NZ;
|
|
|
|
|
nir_dest[0] = bld.vgrf(BRW_TYPE_D);
|
2024-12-07 00:23:07 -08:00
|
|
|
brw_inst *sel =
|
2024-02-25 03:57:45 -08:00
|
|
|
bld.SEL(nir_dest[0], offset(dst, bld, 3), brw_imm_d(0));
|
|
|
|
|
sel->predicate = BRW_PREDICATE_NORMAL;
|
|
|
|
|
} else {
|
|
|
|
|
nir_dest[0] = offset(dst, bld, 3);
|
|
|
|
|
}
|
2021-04-06 15:54:07 -07:00
|
|
|
}
|
2014-08-15 10:32:07 -07:00
|
|
|
|
2024-02-25 03:57:45 -08:00
|
|
|
/* The residency bits are only in the first component. */
|
|
|
|
|
if (instr->is_sparse) {
|
|
|
|
|
nir_dest[dest_size - 1] =
|
|
|
|
|
component(offset(dst, bld, dest_size - 1), 0);
|
|
|
|
|
}
|
2023-05-23 13:11:02 +03:00
|
|
|
|
2024-02-25 03:57:45 -08:00
|
|
|
bld.LOAD_PAYLOAD(nir_def_reg, nir_dest, dest_size, 0);
|
|
|
|
|
}
|
2014-08-15 10:32:07 -07:00
|
|
|
}
|
|
|
|
|
|
2023-11-20 14:42:06 -08:00
|
|
|
static void
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_instr(nir_to_brw_state &ntb, nir_instr *instr)
|
2023-11-20 12:13:47 -08:00
|
|
|
{
|
2024-08-23 10:46:13 -07:00
|
|
|
#ifndef NDEBUG
|
|
|
|
|
if (unlikely(ntb.annotate)) {
|
|
|
|
|
/* Use shader mem_ctx since annotations outlive the NIR conversion. */
|
|
|
|
|
ntb.bld = ntb.bld.annotate(nir_instr_as_str(instr, ntb.s.mem_ctx));
|
|
|
|
|
}
|
|
|
|
|
#endif
|
2023-11-20 12:13:47 -08:00
|
|
|
|
|
|
|
|
switch (instr->type) {
|
|
|
|
|
case nir_instr_type_alu:
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_alu(ntb, nir_instr_as_alu(instr), true);
|
2023-11-20 12:13:47 -08:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_instr_type_deref:
|
2025-07-23 09:17:35 +02:00
|
|
|
UNREACHABLE("All derefs should've been lowered");
|
2023-11-20 12:13:47 -08:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_instr_type_intrinsic:
|
2023-12-05 17:16:34 -08:00
|
|
|
switch (ntb.s.stage) {
|
2023-11-20 12:13:47 -08:00
|
|
|
case MESA_SHADER_VERTEX:
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_vs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
|
2023-11-20 12:13:47 -08:00
|
|
|
break;
|
|
|
|
|
case MESA_SHADER_TESS_CTRL:
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_tcs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
|
2023-11-20 12:13:47 -08:00
|
|
|
break;
|
|
|
|
|
case MESA_SHADER_TESS_EVAL:
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_tes_intrinsic(ntb, nir_instr_as_intrinsic(instr));
|
2023-11-20 12:13:47 -08:00
|
|
|
break;
|
|
|
|
|
case MESA_SHADER_GEOMETRY:
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_gs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
|
2023-11-20 12:13:47 -08:00
|
|
|
break;
|
|
|
|
|
case MESA_SHADER_FRAGMENT:
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_fs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
|
2023-11-20 12:13:47 -08:00
|
|
|
break;
|
|
|
|
|
case MESA_SHADER_COMPUTE:
|
|
|
|
|
case MESA_SHADER_KERNEL:
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_cs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
|
2023-11-20 12:13:47 -08:00
|
|
|
break;
|
|
|
|
|
case MESA_SHADER_RAYGEN:
|
|
|
|
|
case MESA_SHADER_ANY_HIT:
|
|
|
|
|
case MESA_SHADER_CLOSEST_HIT:
|
|
|
|
|
case MESA_SHADER_MISS:
|
|
|
|
|
case MESA_SHADER_INTERSECTION:
|
|
|
|
|
case MESA_SHADER_CALLABLE:
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_bs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
|
2023-11-20 12:13:47 -08:00
|
|
|
break;
|
|
|
|
|
case MESA_SHADER_TASK:
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_task_intrinsic(ntb, nir_instr_as_intrinsic(instr));
|
2023-11-20 12:13:47 -08:00
|
|
|
break;
|
|
|
|
|
case MESA_SHADER_MESH:
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_mesh_intrinsic(ntb, nir_instr_as_intrinsic(instr));
|
2023-11-20 12:13:47 -08:00
|
|
|
break;
|
|
|
|
|
default:
|
2025-07-23 09:17:35 +02:00
|
|
|
UNREACHABLE("unsupported shader stage");
|
2023-11-20 12:13:47 -08:00
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_instr_type_tex:
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_texture(ntb, nir_instr_as_tex(instr));
|
2023-11-20 12:13:47 -08:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_instr_type_load_const:
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_load_const(ntb, nir_instr_as_load_const(instr));
|
2023-11-20 12:13:47 -08:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_instr_type_undef:
|
|
|
|
|
/* We create a new VGRF for undefs on every use (by handling
|
|
|
|
|
* them in get_nir_src()), rather than for each definition.
|
|
|
|
|
* This helps register coalescing eliminate MOVs from undef.
|
|
|
|
|
*/
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case nir_instr_type_jump:
|
2024-12-07 09:36:03 -08:00
|
|
|
brw_from_nir_emit_jump(ntb, nir_instr_as_jump(instr));
|
2023-11-20 12:13:47 -08:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
default:
|
2025-07-23 09:17:35 +02:00
|
|
|
UNREACHABLE("unknown instruction type");
|
2023-11-20 12:13:47 -08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-20 14:50:48 -08:00
|
|
|
static unsigned
|
|
|
|
|
brw_rnd_mode_from_nir(unsigned mode, unsigned *mask)
|
|
|
|
|
{
|
|
|
|
|
unsigned brw_mode = 0;
|
|
|
|
|
*mask = 0;
|
|
|
|
|
|
|
|
|
|
if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
|
|
|
|
|
FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
|
|
|
|
|
FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
|
|
|
|
|
mode) {
|
|
|
|
|
brw_mode |= BRW_RND_MODE_RTZ << BRW_CR0_RND_MODE_SHIFT;
|
|
|
|
|
*mask |= BRW_CR0_RND_MODE_MASK;
|
|
|
|
|
}
|
|
|
|
|
if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
|
|
|
|
|
FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
|
|
|
|
|
FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
|
|
|
|
|
mode) {
|
|
|
|
|
brw_mode |= BRW_RND_MODE_RTNE << BRW_CR0_RND_MODE_SHIFT;
|
|
|
|
|
*mask |= BRW_CR0_RND_MODE_MASK;
|
|
|
|
|
}
|
|
|
|
|
if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP16) {
|
|
|
|
|
brw_mode |= BRW_CR0_FP16_DENORM_PRESERVE;
|
|
|
|
|
*mask |= BRW_CR0_FP16_DENORM_PRESERVE;
|
|
|
|
|
}
|
|
|
|
|
if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP32) {
|
|
|
|
|
brw_mode |= BRW_CR0_FP32_DENORM_PRESERVE;
|
|
|
|
|
*mask |= BRW_CR0_FP32_DENORM_PRESERVE;
|
|
|
|
|
}
|
|
|
|
|
if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP64) {
|
|
|
|
|
brw_mode |= BRW_CR0_FP64_DENORM_PRESERVE;
|
|
|
|
|
*mask |= BRW_CR0_FP64_DENORM_PRESERVE;
|
|
|
|
|
}
|
|
|
|
|
if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16)
|
|
|
|
|
*mask |= BRW_CR0_FP16_DENORM_PRESERVE;
|
|
|
|
|
if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32)
|
|
|
|
|
*mask |= BRW_CR0_FP32_DENORM_PRESERVE;
|
|
|
|
|
if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64)
|
|
|
|
|
*mask |= BRW_CR0_FP64_DENORM_PRESERVE;
|
|
|
|
|
if (mode == FLOAT_CONTROLS_DEFAULT_FLOAT_CONTROL_MODE)
|
|
|
|
|
*mask |= BRW_CR0_FP_MODE_MASK;
|
|
|
|
|
|
|
|
|
|
if (*mask != 0)
|
|
|
|
|
assert((*mask & brw_mode) == brw_mode);
|
|
|
|
|
|
|
|
|
|
return brw_mode;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void
|
2023-12-05 15:27:29 -08:00
|
|
|
emit_shader_float_controls_execution_mode(nir_to_brw_state &ntb)
|
2023-11-20 14:50:48 -08:00
|
|
|
{
|
2024-12-29 15:41:04 -08:00
|
|
|
const brw_builder &bld = ntb.bld;
|
2024-12-07 10:25:45 -08:00
|
|
|
brw_shader &s = ntb.s;
|
2023-11-20 14:50:48 -08:00
|
|
|
|
2023-12-05 17:16:34 -08:00
|
|
|
unsigned execution_mode = s.nir->info.float_controls_execution_mode;
|
2023-11-20 14:50:48 -08:00
|
|
|
if (execution_mode == FLOAT_CONTROLS_DEFAULT_FLOAT_CONTROL_MODE)
|
|
|
|
|
return;
|
|
|
|
|
|
2025-04-03 01:14:03 -07:00
|
|
|
brw_builder abld = bld.uniform().annotate("shader floats control execution mode");
|
2023-11-20 14:50:48 -08:00
|
|
|
unsigned mask, mode = brw_rnd_mode_from_nir(execution_mode, &mask);
|
|
|
|
|
|
|
|
|
|
if (mask == 0)
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
abld.emit(SHADER_OPCODE_FLOAT_CONTROL_MODE, bld.null_reg_ud(),
|
|
|
|
|
brw_imm_d(mode), brw_imm_d(mask));
|
|
|
|
|
}
|
|
|
|
|
|
2024-05-17 01:20:33 -07:00
|
|
|
/**
|
|
|
|
|
* Test the dispatch mask packing assumptions of
|
|
|
|
|
* brw_stage_has_packed_dispatch(). Call this from e.g. the top of
|
|
|
|
|
* nir_to_brw() to cause a GPU hang if any shader invocation is
|
|
|
|
|
* executed with an unexpected dispatch mask.
|
|
|
|
|
*/
|
|
|
|
|
/**
 * Test the dispatch mask packing assumptions of
 * brw_stage_has_packed_dispatch(). Call this from e.g. the top of
 * nir_to_brw() to cause a GPU hang if any shader invocation is
 * executed with an unexpected dispatch mask.
 */
static UNUSED void
brw_test_dispatch_packing(const brw_builder &bld)
{
   const brw_shader *shader = bld.shader;
   const mesa_shader_stage stage = shader->stage;
   const bool uses_vmask =
      stage == MESA_SHADER_FRAGMENT &&
      brw_wm_prog_data(shader->prog_data)->uses_vmask;

   if (!brw_stage_has_packed_dispatch(shader->devinfo, stage,
                                      shader->max_polygons,
                                      shader->prog_data))
      return;

   const brw_builder ubld = bld.uniform();
   const brw_reg tmp = component(bld.vgrf(BRW_TYPE_UD), 0);
   const brw_reg mask = uses_vmask ? brw_vmask_reg() : brw_dmask_reg();

   /* tmp = mask & (mask + 1): zero iff mask has the packed form 2^n-1. */
   ubld.ADD(tmp, mask, brw_imm_ud(1));
   ubld.AND(tmp, mask, tmp);

   /* This will loop forever if the dispatch mask doesn't have the expected
    * form '2^n-1', in which case tmp will be non-zero.
    */
   bld.DO();
   bld.CMP(bld.null_reg_ud(), tmp, brw_imm_ud(0), BRW_CONDITIONAL_NZ);
   set_predicate(BRW_PREDICATE_NORMAL, bld.emit(BRW_OPCODE_WHILE));
}
|
|
|
|
|
|
2025-10-09 02:16:49 -07:00
|
|
|
static void
|
|
|
|
|
set_clip_cull_distance_masks(brw_shader &s)
|
|
|
|
|
{
|
|
|
|
|
const shader_info &info = s.nir->info;
|
|
|
|
|
|
2025-10-10 01:35:20 -07:00
|
|
|
if (info.stage != MESA_SHADER_VERTEX &&
|
|
|
|
|
info.stage != MESA_SHADER_TESS_EVAL &&
|
|
|
|
|
info.stage != MESA_SHADER_GEOMETRY &&
|
|
|
|
|
info.stage != MESA_SHADER_MESH)
|
2025-10-09 02:16:49 -07:00
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
if (info.outputs_written &
|
|
|
|
|
(VARYING_BIT_CLIP_DIST0 | VARYING_BIT_CLIP_DIST1 |
|
|
|
|
|
VARYING_BIT_CULL_DIST0 | VARYING_BIT_CULL_DIST1)) {
|
|
|
|
|
|
2025-10-10 01:35:20 -07:00
|
|
|
uint32_t clip_mask = BITFIELD_MASK(info.clip_distance_array_size);
|
|
|
|
|
|
|
|
|
|
uint32_t cull_mask = BITFIELD_RANGE(info.clip_distance_array_size,
|
|
|
|
|
info.cull_distance_array_size);
|
2025-10-09 02:16:49 -07:00
|
|
|
|
2025-10-10 01:35:20 -07:00
|
|
|
if (info.stage == MESA_SHADER_MESH) {
|
|
|
|
|
struct brw_mesh_prog_data *prog_data = brw_mesh_prog_data(s.prog_data);
|
|
|
|
|
prog_data->clip_distance_mask = clip_mask;
|
|
|
|
|
prog_data->cull_distance_mask = cull_mask;
|
|
|
|
|
} else {
|
|
|
|
|
struct brw_vue_prog_data *prog_data = brw_vue_prog_data(s.prog_data);
|
|
|
|
|
prog_data->clip_distance_mask = clip_mask;
|
|
|
|
|
prog_data->cull_distance_mask = cull_mask;
|
|
|
|
|
}
|
2025-10-09 02:16:49 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-20 14:50:48 -08:00
|
|
|
/**
 * Translate the shader's NIR representation into brw backend IR.
 *
 * Builds a nir_to_brw_state around the given shader, emits the per-stage
 * setup (outputs, system values, float-controls mode), then walks the NIR
 * entrypoint emitting brw instructions.  Ends by appending the HALT target
 * and advancing the shader to BRW_SHADER_PHASE_AFTER_NIR.
 */
void
brw_from_nir(brw_shader *s)
{
   /* Translation-local state; mem_ctx owns temporary allocations made
    * during conversion and is freed before returning.
    */
   nir_to_brw_state ntb = {
      .s = *s,
      .nir = s->nir,
      .devinfo = s->devinfo,
      .mem_ctx = ralloc_context(NULL),
      .bld = brw_builder(s),
   };

   if (INTEL_DEBUG(DEBUG_ANNOTATION))
      ntb.annotate = true;

   /* Compile-time opt-in self-test of the dispatch-mask packing
    * assumptions (see brw_test_dispatch_packing above).
    */
   if (ENABLE_TEST_DISPATCH_PACKING)
      brw_test_dispatch_packing(ntb.bld);

   set_clip_cull_distance_masks(*s);

   emit_shader_float_controls_execution_mode(ntb);

   /* emit the arrays used for inputs and outputs - load/store intrinsics will
    * be converted to reads/writes of these arrays
    */
   brw_from_nir_setup_outputs(ntb);
   brw_from_nir_emit_system_values(ntb);
   /* Scratch is per-channel, so scale the NIR size (dword-aligned) by the
    * number of SIMD channels.
    */
   ntb.s.last_scratch = ALIGN(ntb.nir->scratch_size, 4) * ntb.s.dispatch_width;

   /* Main translation: walk the NIR entrypoint and emit brw IR. */
   brw_from_nir_emit_impl(ntb, nir_shader_get_entrypoint((nir_shader *)ntb.nir));

   /* NOTE(review): presumably the landing point for HALT/discard jumps --
    * emitted once at the end of the shader.
    */
   ntb.bld.emit(SHADER_OPCODE_HALT_TARGET);

   /* Release all translation-temporary allocations. */
   ralloc_free(ntb.mem_ctx);

   brw_shader_phase_update(*s, BRW_SHADER_PHASE_AFTER_NIR);
}
|