/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */
#include "brw_eu.h"
|
2010-10-10 15:42:37 -07:00
|
|
|
#include "brw_fs.h"
|
2025-01-15 08:20:46 -08:00
|
|
|
#include "brw_builder.h"
|
2020-01-23 22:55:33 -08:00
|
|
|
#include "brw_fs_live_variables.h"
|
2015-11-11 10:04:43 -08:00
|
|
|
#include "brw_nir.h"
|
2014-07-12 21:18:39 -07:00
|
|
|
#include "brw_cfg.h"
|
2024-04-24 16:14:16 +03:00
|
|
|
#include "brw_rt.h"
|
2021-10-07 00:23:07 -07:00
|
|
|
#include "brw_private.h"
|
2023-11-01 12:51:33 -07:00
|
|
|
#include "intel_nir.h"
|
2023-09-24 21:38:47 -07:00
|
|
|
#include "shader_enums.h"
|
2021-04-05 10:44:41 -07:00
|
|
|
#include "dev/intel_debug.h"
|
2023-01-20 23:19:34 -08:00
|
|
|
#include "dev/intel_wa.h"
|
2016-01-18 11:35:29 +02:00
|
|
|
#include "compiler/glsl_types.h"
|
2016-07-17 18:37:08 -07:00
|
|
|
#include "compiler/nir/nir_builder.h"
|
2018-08-21 09:46:46 -07:00
|
|
|
#include "util/u_math.h"
|
2010-08-10 20:39:06 -07:00
|
|
|
|
2015-06-03 20:36:47 +03:00
|
|
|
using namespace brw;
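
/**
 * Mark this compile as failed and record a printf-style explanation.
 *
 * Only the first failure is recorded; later calls become no-ops, so
 * fail_msg keeps the original cause.
 */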
void
fs_visitor::vfail(const char *format, va_list va)
{
   char *msg;

   if (failed)
      return;

   failed = true;

   msg = ralloc_vasprintf(mem_ctx, format, va);
   msg = ralloc_asprintf(mem_ctx, "SIMD%d %s compile failed: %s\n",
                         dispatch_width, _mesa_shader_stage_to_abbrev(stage), msg);

   this->fail_msg = msg;

   if (unlikely(debug_enabled)) {
      fprintf(stderr, "%s", msg);
   }
}
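
/** Printf-style convenience wrapper around vfail(). */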
void
fs_visitor::fail(const char *format, ...)
{
   va_list va;

   va_start(va, format);
   vfail(format, va);
   va_end(va);
}

/**
 * Mark this program as impossible to compile with dispatch width greater
 * than n.
 *
 * During the SIMD8 compile (which happens first), we can detect and flag
 * things that are unsupported in SIMD16+ mode, so the compiler can skip the
 * SIMD16+ compile altogether.
 *
 * During a compile of dispatch width greater than n (if one happens anyway),
 * this just calls fail().
 */
void
fs_visitor::limit_dispatch_width(unsigned n, const char *msg)
{
   if (dispatch_width > n) {
      fail("%s", msg);
   } else {
      max_dispatch_width = MIN2(max_dispatch_width, n);
      brw_shader_perf_log(compiler, log_data,
                          "Shader dispatch width limited to SIMD%d: %s\n",
                          n, msg);
   }
}
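
/* A minimal usage sketch (hypothetical condition and message): a pass that
 * detects a SIMD8-only limitation during the SIMD8 compile would call
 *
 *    limit_dispatch_width(8, "feature X unsupported above SIMD8");
 *
 * so that the wider variants are skipped rather than compiled and failed.
 */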

/* For SIMD16, we need to carry over the uniform setup from the SIMD8
 * dispatch; this imports those uniform definitions.
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   this->uniforms = v->uniforms;
}
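
/**
 * Translate a NIR barycentric-load intrinsic into the corresponding
 * intel_barycentric_mode. The nonperspective variants sit 3 entries after
 * their perspective counterparts in the enum, hence the "bary += 3" below.
 */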
enum intel_barycentric_mode
brw_barycentric_mode(const struct brw_wm_prog_key *key,
                     nir_intrinsic_instr *intr)
{
   const glsl_interp_mode mode =
      (enum glsl_interp_mode) nir_intrinsic_interp_mode(intr);

   /* Barycentric modes don't make sense for flat inputs. */
   assert(mode != INTERP_MODE_FLAT);

   unsigned bary;
   switch (intr->intrinsic) {
   case nir_intrinsic_load_barycentric_pixel:
   case nir_intrinsic_load_barycentric_at_offset:
      /* When per sample interpolation is dynamic, assume sample
       * interpolation. We'll dynamically remap things so that the FS thread
       * payload is not affected.
       */
      bary = key->persample_interp == INTEL_SOMETIMES ?
             INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE :
             INTEL_BARYCENTRIC_PERSPECTIVE_PIXEL;
      break;
   case nir_intrinsic_load_barycentric_centroid:
      bary = INTEL_BARYCENTRIC_PERSPECTIVE_CENTROID;
      break;
   case nir_intrinsic_load_barycentric_sample:
   case nir_intrinsic_load_barycentric_at_sample:
      bary = INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE;
      break;
   default:
      unreachable("invalid intrinsic");
   }

   if (mode == INTERP_MODE_NOPERSPECTIVE)
      bary += 3;

   return (enum intel_barycentric_mode) bary;
}

/**
 * Walk backwards from the end of the program looking for a URB write that
 * isn't in control flow, and mark it with EOT.
 *
 * Return true if successful or false if a separate EOT write is needed.
 */
bool
fs_visitor::mark_last_urb_write_with_eot()
{
   foreach_in_list_reverse(brw_inst, prev, &this->instructions) {
      if (prev->opcode == SHADER_OPCODE_URB_WRITE_LOGICAL) {
         prev->eot = true;

         /* Delete now dead instructions. */
         foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
            if (dead == prev)
               break;
            dead->remove();
         }
         return true;
      } else if (prev->is_control_flow() || prev->has_side_effects()) {
         break;
      }
   }

   return false;
}
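
/* Round a dword component count up to a whole number of physical GRFs,
 * returning the size in 32-byte register units. For example, with
 * reg_unit() == 1, 9 components round to DIV_ROUND_UP(9, 8) * 1 == 2
 * registers; with reg_unit() == 2 (64-byte native GRFs), they round to
 * DIV_ROUND_UP(9, 16) * 2 == 2 registers.
 */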
static unsigned
round_components_to_whole_registers(const intel_device_info *devinfo,
                                    unsigned c)
{
   return DIV_ROUND_UP(c, 8 * reg_unit(devinfo)) * reg_unit(devinfo);
}
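
/**
 * Set up the CURB (push constant) section of the thread payload.
 *
 * This computes curb_read_length from the uniform and UBO push ranges, on
 * Gfx12.5+ compute/mesh/RT stages emits LSC loads that pull the constants
 * at the start of the program, rewrites UNIFORM-file sources to fixed GRFs,
 * and zeroes any push registers requested via zero_push_reg.
 */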
void
fs_visitor::assign_curb_setup()
{
   unsigned uniform_push_length =
      round_components_to_whole_registers(devinfo, prog_data->nr_params);

   unsigned ubo_push_length = 0;
   unsigned ubo_push_start[4];
   for (int i = 0; i < 4; i++) {
      ubo_push_start[i] = 8 * (ubo_push_length + uniform_push_length);
      ubo_push_length += prog_data->ubo_ranges[i].length;

      assert(ubo_push_start[i] % (8 * reg_unit(devinfo)) == 0);
      assert(ubo_push_length % (1 * reg_unit(devinfo)) == 0);
   }

   prog_data->curb_read_length = uniform_push_length + ubo_push_length;
   if (stage == MESA_SHADER_FRAGMENT &&
       ((struct brw_wm_prog_key *)key)->null_push_constant_tbimr_workaround)
      prog_data->curb_read_length = MAX2(1, prog_data->curb_read_length);

   uint64_t used = 0;
   const bool pull_constants =
      devinfo->verx10 >= 125 &&
      (gl_shader_stage_is_compute(stage) ||
       gl_shader_stage_is_mesh(stage)) &&
      uniform_push_length;

   if (pull_constants) {
      const bool pull_constants_a64 =
         (gl_shader_stage_is_rt(stage) &&
          brw_bs_prog_data(prog_data)->uses_inline_push_addr) ||
         ((gl_shader_stage_is_compute(stage) ||
           gl_shader_stage_is_mesh(stage)) &&
          brw_cs_prog_data(prog_data)->uses_inline_push_addr);
      assert(devinfo->has_lsc);
      brw_builder ubld = brw_builder(this, 1).exec_all().at(
         cfg->first_block(), cfg->first_block()->start());

      brw_reg base_addr;
      if (pull_constants_a64) {
         /* The address of the push constants is at offset 0 in the inline
          * parameter.
          */
         base_addr =
            gl_shader_stage_is_rt(stage) ?
            retype(bs_payload().inline_parameter, BRW_TYPE_UQ) :
            retype(cs_payload().inline_parameter, BRW_TYPE_UQ);
      } else {
         /* The base offset for our push data is passed in as R0.0[31:6]. We
          * have to mask off the bottom 6 bits.
          */
         base_addr = ubld.AND(retype(brw_vec1_grf(0, 0), BRW_TYPE_UD),
                              brw_imm_ud(INTEL_MASK(31, 6)));
      }

      /* On Gfx12-HP we load constants at the start of the program using A32
       * stateless messages.
       */
      for (unsigned i = 0; i < uniform_push_length;) {
         /* Limit ourselves to LSC HW limit of 8 GRFs (256bytes D32V64). */
         unsigned num_regs = MIN2(uniform_push_length - i, 8);
         assert(num_regs > 0);
         num_regs = 1 << util_logbase2(num_regs);

         brw_reg addr;

         if (i != 0) {
            if (pull_constants_a64) {
               /* We need to do the carry manually as when this pass is run,
                * we're not expecting any 64bit ALUs. Unfortunately all the
                * 64bit lowering is done in NIR.
                */
               addr = ubld.vgrf(BRW_TYPE_UQ);
               brw_reg addr_ldw = subscript(addr, BRW_TYPE_UD, 0);
               brw_reg addr_udw = subscript(addr, BRW_TYPE_UD, 1);
               brw_reg base_addr_ldw = subscript(base_addr, BRW_TYPE_UD, 0);
               brw_reg base_addr_udw = subscript(base_addr, BRW_TYPE_UD, 1);
               ubld.ADD(addr_ldw, base_addr_ldw, brw_imm_ud(i * REG_SIZE));
               ubld.CMP(ubld.null_reg_d(), addr_ldw, base_addr_ldw, BRW_CONDITIONAL_L);
               set_predicate(BRW_PREDICATE_NORMAL,
                             ubld.ADD(addr_udw, base_addr_udw, brw_imm_ud(1)));
               set_predicate_inv(BRW_PREDICATE_NORMAL, true,
                                 ubld.MOV(addr_udw, base_addr_udw));
            } else {
               addr = ubld.ADD(base_addr, brw_imm_ud(i * REG_SIZE));
            }
         } else {
            addr = base_addr;
         }

         brw_reg srcs[4] = {
            brw_imm_ud(0), /* desc */
            brw_imm_ud(0), /* ex_desc */
            addr,          /* payload */
            brw_reg(),     /* payload2 */
         };

         brw_reg dest = retype(brw_vec8_grf(payload().num_regs + i, 0),
                               BRW_TYPE_UD);
         brw_inst *send = ubld.emit(SHADER_OPCODE_SEND, dest, srcs, 4);

         send->sfid = GFX12_SFID_UGM;
         uint32_t desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
                                      LSC_ADDR_SURFTYPE_FLAT,
                                      pull_constants_a64 ?
                                      LSC_ADDR_SIZE_A64 : LSC_ADDR_SIZE_A32,
                                      LSC_DATA_SIZE_D32,
                                      num_regs * 8 /* num_channels */,
                                      true /* transpose */,
                                      LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
         send->header_size = 0;
         send->mlen = lsc_msg_addr_len(
            devinfo, pull_constants_a64 ?
            LSC_ADDR_SIZE_A64 : LSC_ADDR_SIZE_A32, 1);
         send->size_written =
            lsc_msg_dest_len(devinfo, LSC_DATA_SIZE_D32, num_regs * 8) * REG_SIZE;
         assert((payload().num_regs + i + send->size_written / REG_SIZE) <=
                (payload().num_regs + prog_data->curb_read_length));
         send->send_is_volatile = true;

         send->src[0] = brw_imm_ud(desc |
                                   brw_message_desc(devinfo,
                                                    send->mlen,
                                                    send->size_written / REG_SIZE,
                                                    send->header_size));

         i += num_regs;
      }

      invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_block_and_inst(block, brw_inst, inst, cfg) {
      for (unsigned int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == UNIFORM) {
            int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4;
            int constant_nr;
            if (inst->src[i].nr >= UBO_START) {
               /* constant_nr is in 32-bit units, the rest are in bytes */
               constant_nr = ubo_push_start[inst->src[i].nr - UBO_START] +
                             inst->src[i].offset / 4;
            } else if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
               constant_nr = uniform_nr;
            } else {
               /* Section 5.11 of the OpenGL 4.1 spec says:
                *
                *    "Out-of-bounds reads return undefined values, which
                *     include values from other variables of the active
                *     program or zero."
                *
                * Just return the first push constant.
                */
               constant_nr = 0;
            }

            assert(constant_nr / 8 < 64);
            used |= BITFIELD64_BIT(constant_nr / 8);

            struct brw_reg brw_reg = brw_vec1_grf(payload().num_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);
            brw_reg.abs = inst->src[i].abs;
            brw_reg.negate = inst->src[i].negate;

            /* The combination of is_scalar for load_uniform, copy prop, and
             * lower_btd_logical_send can generate a MOV from a UNIFORM with
             * exec size 2 and stride of 1.
             */
            assert(inst->src[i].stride == 0 || inst->exec_size == 2);
            inst->src[i] = byte_offset(
               retype(brw_reg, inst->src[i].type),
               inst->src[i].offset % 4);
         }
      }
   }

   uint64_t want_zero = used & prog_data->zero_push_reg;
   if (want_zero) {
      brw_builder ubld = brw_builder(this, 8).exec_all().at(
         cfg->first_block(), cfg->first_block()->start());

      /* push_reg_mask_param is in 32-bit units */
      unsigned mask_param = prog_data->push_reg_mask_param;
      struct brw_reg mask = brw_vec1_grf(payload().num_regs + mask_param / 8,
                                         mask_param % 8);

      brw_reg b32;
      for (unsigned i = 0; i < 64; i++) {
         if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) {
            brw_reg shifted = ubld.vgrf(BRW_TYPE_W, 2);
            ubld.SHL(horiz_offset(shifted, 8),
                     byte_offset(retype(mask, BRW_TYPE_W), i / 8),
                     brw_imm_v(0x01234567));
            ubld.SHL(shifted, horiz_offset(shifted, 8), brw_imm_w(8));

            brw_builder ubld16 = ubld.group(16, 0);
            b32 = ubld16.vgrf(BRW_TYPE_D);
            ubld16.group(16, 0).ASR(b32, shifted, brw_imm_w(15));
         }

         if (want_zero & BITFIELD64_BIT(i)) {
            assert(i < prog_data->curb_read_length);
            struct brw_reg push_reg =
               retype(brw_vec8_grf(payload().num_regs + i, 0), BRW_TYPE_D);

            ubld.AND(push_reg, push_reg, component(b32, i % 16));
         }
      }

      invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
   }

   /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
   this->first_non_payload_grf = payload().num_regs + prog_data->curb_read_length;
}

/*
 * Build up an array of the varying slots with active urb_setup entries, so
 * that each upload can walk just the active entries instead of scanning the
 * whole urb_setup array.
 */
void
brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data)
{
   /* TODO(mesh): Review usage of this in the context of Mesh, we may want to
    * skip per-primitive attributes here.
    */

   /* Make sure uint8_t is sufficient */
   STATIC_ASSERT(VARYING_SLOT_MAX <= 0xff);
   uint8_t index = 0;
   for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) {
      if (wm_prog_data->urb_setup[attr] >= 0) {
         wm_prog_data->urb_setup_attribs[index++] = attr;
      }
   }
   wm_prog_data->urb_setup_attribs_count = index;
}
|
|
|
|
|
|
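/* Usage sketch (added illustration, not part of the original source): a
 * driver can then walk only the active attributes instead of scanning all
 * VARYING_SLOT_MAX entries of urb_setup on every upload:
 *
 *    for (unsigned i = 0; i < wm_prog_data->urb_setup_attribs_count; i++) {
 *       uint8_t attr = wm_prog_data->urb_setup_attribs[i];
 *       int slot = wm_prog_data->urb_setup[attr];
 *       ...
 *    }
 */
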
void
fs_visitor::convert_attr_sources_to_hw_regs(brw_inst *inst)
{
   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].file == ATTR) {
         assert(inst->src[i].nr == 0);
         int grf = payload().num_regs +
                   prog_data->curb_read_length +
                   inst->src[i].offset / REG_SIZE;

         /* As explained in brw_reg_from_fs_reg, from the Haswell PRM:
          *
          *    VertStride must be used to cross GRF register boundaries.  This
          *    rule implies that elements within a 'Width' cannot cross GRF
          *    boundaries.
          *
          * So, for registers that are large enough, we have to split the exec
          * size in two and trust the compression state to sort it out.
          */
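         /* Worked example (added illustration): a SIMD16 source of 32-bit
          * type with stride 1 covers 16 * 1 * 4 = 64 bytes, i.e. two GRFs on
          * a 32-byte-GRF platform, so exec_size is halved from 16 to 8 and
          * the region below becomes <8;8,1>, leaving the second GRF to
          * instruction compression.
          */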
         unsigned total_size = inst->exec_size *
                               inst->src[i].stride *
                               brw_type_size_bytes(inst->src[i].type);

         assert(total_size <= 2 * REG_SIZE);
         const unsigned exec_size =
            (total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2;

         unsigned width = inst->src[i].stride == 0 ? 1 : exec_size;
         struct brw_reg reg =
            stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
                               inst->src[i].offset % REG_SIZE),
                   exec_size * inst->src[i].stride,
                   width, inst->src[i].stride);
         reg.abs = inst->src[i].abs;
         reg.negate = inst->src[i].negate;

         inst->src[i] = reg;
      }
   }
}

int
brw_get_subgroup_id_param_index(const intel_device_info *devinfo,
                                const brw_stage_prog_data *prog_data)
{
   if (prog_data->nr_params == 0)
      return -1;

   if (devinfo->verx10 >= 125)
      return -1;

   /* The local thread id is always the last parameter in the list */
   uint32_t last_param = prog_data->param[prog_data->nr_params - 1];
   if (last_param == BRW_PARAM_BUILTIN_SUBGROUP_ID)
      return prog_data->nr_params - 1;

   return -1;
}

uint32_t
brw_fb_write_msg_control(const brw_inst *inst,
                         const struct brw_wm_prog_data *prog_data)
{
   uint32_t mctl;

   if (prog_data->dual_src_blend) {
      assert(inst->exec_size < 32);

      if (inst->group % 16 == 0)
         mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
      else if (inst->group % 16 == 8)
         mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
      else
         unreachable("Invalid dual-source FB write instruction group");
   } else {
      assert(inst->group == 0 || (inst->group == 16 && inst->exec_size == 16));

      if (inst->exec_size == 16)
         mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
      else if (inst->exec_size == 8)
         mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
      else if (inst->exec_size == 32)
         mctl = XE2_DATAPORT_RENDER_TARGET_WRITE_SIMD32_SINGLE_SOURCE;
      else
         unreachable("Invalid FB write execution size");
   }

   return mctl;
}

brw::register_pressure::register_pressure(const fs_visitor *v)
{
   const fs_live_variables &live = v->live_analysis.require();
   const unsigned num_instructions = v->cfg->num_blocks ?
      v->cfg->blocks[v->cfg->num_blocks - 1]->end_ip + 1 : 0;

   regs_live_at_ip = new unsigned[num_instructions]();

   for (unsigned reg = 0; reg < v->alloc.count; reg++) {
      for (int ip = live.vgrf_start[reg]; ip <= live.vgrf_end[reg]; ip++)
         regs_live_at_ip[ip] += v->alloc.sizes[reg];
   }

   const unsigned payload_count = v->first_non_payload_grf;

   int *payload_last_use_ip = new int[payload_count];
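   /* Added note (summarizing the upstream commit message that relaxed g0's
    * liveness): passing true below computes payload ranges as if spilling
    * were allowed, which keeps g0 live for the whole program.  Spilling can
    * introduce scratch messages that build their headers from g0 at any
    * point, so the pressure estimate reflects the worst case the register
    * allocator may see.
    */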
   v->calculate_payload_ranges(true, payload_count, payload_last_use_ip);

   for (unsigned reg = 0; reg < payload_count; reg++) {
      for (int ip = 0; ip < payload_last_use_ip[reg]; ip++)
         ++regs_live_at_ip[ip];
   }

   delete[] payload_last_use_ip;
}

brw::register_pressure::~register_pressure()
{
   delete[] regs_live_at_ip;
}

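/* Added note (illustrating the analysis-pass pattern, not original text):
 * each analysis below caches its result and records which dependency classes
 * invalidate it.  Transform passes report what they changed, e.g.
 *
 *    s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
 *
 * and the next require() call recomputes the data only if one of the
 * reported classes matters to that analysis.
 */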
void
fs_visitor::invalidate_analysis(brw::analysis_dependency_class c)
{
   live_analysis.invalidate(c);
   regpressure_analysis.invalidate(c);
   performance_analysis.invalidate(c);
   idom_analysis.invalidate(c);
   def_analysis.invalidate(c);
}

void
fs_visitor::debug_optimizer(const nir_shader *nir,
                            const char *pass_name,
                            int iteration, int pass_num) const
{
   if (!brw_should_print_shader(nir, DEBUG_OPTIMIZER))
      return;

   char *filename;
   int ret = asprintf(&filename, "%s/%s%d-%s-%02d-%02d-%s",
                      debug_get_option("INTEL_SHADER_OPTIMIZER_PATH", "./"),
                      _mesa_shader_stage_to_abbrev(stage), dispatch_width,
                      nir->info.name, iteration, pass_num, pass_name);
   if (ret == -1)
      return;

   FILE *file = stderr;
   if (__normal_user()) {
      file = fopen(filename, "w");
      if (!file)
         file = stderr;
   }

   brw_print_instructions(*this, file);

   if (file != stderr)
      fclose(file);

   free(filename);
}

static uint32_t
brw_compute_max_register_pressure(fs_visitor &s)
{
   const register_pressure &rp = s.regpressure_analysis.require();
   uint32_t ip = 0, max_pressure = 0;
   foreach_block_and_inst(block, brw_inst, inst, s.cfg) {
      max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
      ip++;
   }
   return max_pressure;
}

static brw_inst **
save_instruction_order(const struct cfg_t *cfg)
{
   /* Before we schedule anything, stash off the instruction order as an array
    * of brw_inst *.  This way, we can reset it between scheduling passes to
    * prevent dependencies between the different scheduling modes.
    */
   int num_insts = cfg->last_block()->end_ip + 1;
   brw_inst **inst_arr = new brw_inst * [num_insts];

   int ip = 0;
   foreach_block_and_inst(block, brw_inst, inst, cfg) {
      assert(ip >= block->start_ip && ip <= block->end_ip);
      inst_arr[ip++] = inst;
   }
   assert(ip == num_insts);

   return inst_arr;
}

static void
restore_instruction_order(struct cfg_t *cfg, brw_inst **inst_arr)
{
   ASSERTED int num_insts = cfg->last_block()->end_ip + 1;

   int ip = 0;
   foreach_block (block, cfg) {
      block->instructions.make_empty();

      assert(ip == block->start_ip);
      for (; ip <= block->end_ip; ip++)
         block->instructions.push_tail(inst_arr[ip]);
   }
   assert(ip == num_insts);
}

/* Per-thread scratch space is a power-of-two multiple of 1KB. */
static inline unsigned
brw_get_scratch_size(int size)
{
   return MAX2(1024, util_next_power_of_two(size));
}

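/* Worked example (added illustration): brw_get_scratch_size(1500) rounds up
 * to 2048 bytes per thread, while brw_get_scratch_size(100) returns the
 * 1024-byte minimum, since util_next_power_of_two(100) == 128 < 1024.
 */
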
void
brw_allocate_registers(fs_visitor &s, bool allow_spilling)
{
   const struct intel_device_info *devinfo = s.devinfo;
   const nir_shader *nir = s.nir;
   bool allocated;

   static const enum brw_instruction_scheduler_mode pre_modes[] = {
      BRW_SCHEDULE_PRE,
      BRW_SCHEDULE_PRE_NON_LIFO,
      BRW_SCHEDULE_NONE,
      BRW_SCHEDULE_PRE_LIFO,
   };

   static const char *scheduler_mode_name[] = {
      [BRW_SCHEDULE_PRE]          = "top-down",
      [BRW_SCHEDULE_PRE_NON_LIFO] = "non-lifo",
      [BRW_SCHEDULE_PRE_LIFO]     = "lifo",
      [BRW_SCHEDULE_POST]         = "post",
      [BRW_SCHEDULE_NONE]         = "none",
   };

   uint32_t best_register_pressure = UINT32_MAX;
   enum brw_instruction_scheduler_mode best_sched = BRW_SCHEDULE_NONE;

   brw_opt_compact_virtual_grfs(s);

|
|
|
if (s.needs_register_pressure)
|
|
|
|
|
s.shader_stats.max_register_pressure = brw_compute_max_register_pressure(s);
|
2023-02-03 17:02:28 +01:00
|
|
|
|
2024-07-12 16:55:33 -07:00
|
|
|
s.debug_optimizer(nir, "pre_register_allocate", 90, 90);
|
2023-08-06 15:46:12 +03:00
|
|
|
|
2021-10-13 11:21:41 +02:00
|
|
|
bool spill_all = allow_spilling && INTEL_DEBUG(DEBUG_SPILL_FS);
|
2016-05-16 14:30:25 -07:00
|
|
|
|
2021-11-09 19:03:19 -06:00
|
|
|
/* Before we schedule anything, stash off the instruction order as an array
|
2024-12-07 00:23:07 -08:00
|
|
|
* of brw_inst *. This way, we can reset it between scheduling passes to
|
2021-11-09 19:03:19 -06:00
|
|
|
* prevent dependencies between the different scheduling modes.
|
|
|
|
|
*/
|
2024-12-07 00:23:07 -08:00
|
|
|
brw_inst **orig_order = save_instruction_order(s.cfg);
|
|
|
|
|
brw_inst **best_pressure_order = NULL;
|
2021-11-09 19:03:19 -06:00
|
|
|
|
2023-10-20 10:32:54 -07:00
|
|
|
void *scheduler_ctx = ralloc_context(NULL);
|
2024-12-06 22:39:15 -08:00
|
|
|
brw_instruction_scheduler *sched = brw_prepare_scheduler(s, scheduler_ctx);
|
2023-10-20 10:32:54 -07:00
|
|
|
|
   /* Try each scheduling heuristic to see if it can successfully register
    * allocate without spilling.  They should be ordered by decreasing
    * performance but increasing likelihood of allocating.
    */
   for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
      enum brw_instruction_scheduler_mode sched_mode = pre_modes[i];

      brw_schedule_instructions_pre_ra(s, sched, sched_mode);
      s.shader_stats.scheduler_mode = scheduler_mode_name[sched_mode];

      s.debug_optimizer(nir, s.shader_stats.scheduler_mode, 95, i);

      if (0) {
         brw_assign_regs_trivial(s);
         allocated = true;
         break;
      }

      /* We should only spill registers on the last scheduling. */
      assert(!s.spilled_any_registers);

      allocated = brw_assign_regs(s, false, spill_all);
      if (allocated)
         break;

      /* Save the maximum register pressure */
      uint32_t this_pressure = brw_compute_max_register_pressure(s);

      if (0) {
         fprintf(stderr, "Scheduler mode \"%s\" spilled, max pressure = %u\n",
                 scheduler_mode_name[sched_mode], this_pressure);
      }

      if (this_pressure < best_register_pressure) {
         best_register_pressure = this_pressure;
         best_sched = sched_mode;
         delete[] best_pressure_order;
         best_pressure_order = save_instruction_order(s.cfg);
      }

      /* Reset back to the original order before trying the next mode */
      restore_instruction_order(s.cfg, orig_order);
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
   }

   ralloc_free(scheduler_ctx);

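   /* Added note (condensed from the upstream commit message): if every
    * pre-RA scheduling mode spilled, fall back to the mode with the lowest
    * estimated register pressure rather than the last one tried, on the
    * theory that it will spill the least.
    */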
   if (!allocated) {
      if (0) {
         fprintf(stderr, "Spilling - using lowest-pressure mode \"%s\"\n",
                 scheduler_mode_name[best_sched]);
      }
      restore_instruction_order(s.cfg, best_pressure_order);
      s.shader_stats.scheduler_mode = scheduler_mode_name[best_sched];

      allocated = brw_assign_regs(s, allow_spilling, spill_all);
   }

   delete[] orig_order;
   delete[] best_pressure_order;

|
|
|
if (!allocated) {
|
2024-07-12 16:55:33 -07:00
|
|
|
s.fail("Failure to register allocate. Reduce number of "
|
2020-05-19 14:37:44 -07:00
|
|
|
"live scalar values to avoid this.");
|
2024-07-12 16:55:33 -07:00
|
|
|
} else if (s.spilled_any_registers) {
|
|
|
|
|
brw_shader_perf_log(s.compiler, s.log_data,
|
2021-07-29 14:27:57 -07:00
|
|
|
"%s shader triggered register spilling. "
|
|
|
|
|
"Try reducing the number of live scalar "
|
|
|
|
|
"values to improve performance.\n",
|
2024-07-12 16:55:33 -07:00
|
|
|
_mesa_shader_stage_to_string(s.stage));
|
2014-11-13 16:28:19 -08:00
|
|
|
}
|
|
|
|
|
|
2024-07-12 16:55:33 -07:00
|
|
|
if (s.failed)
|
2014-11-13 16:28:19 -08:00
|
|
|
return;
|
|
|
|
|
|
   int pass_num = 0;

   s.debug_optimizer(nir, "post_ra_alloc", 96, pass_num++);

   brw_opt_bank_conflicts(s);

   s.debug_optimizer(nir, "bank_conflict", 96, pass_num++);

   brw_schedule_instructions_post_ra(s);

   s.debug_optimizer(nir, "post_ra_alloc_scheduling", 96, pass_num++);

   /* Lowering VGRF to FIXED_GRF is currently done as a separate pass instead
    * of being part of assign_regs, since both the bank conflicts optimization
    * and post-RA scheduling take advantage of distinguishing references to
    * registers that were allocated from references that were already fixed.
    *
    * TODO: Change the passes above, then move this lowering to be part of
    * assign_regs.
    */
   brw_lower_vgrfs_to_fixed_grfs(s);

   s.debug_optimizer(nir, "lowered_vgrfs_to_fixed_grfs", 96, pass_num++);

   if (s.devinfo->ver >= 30) {
      brw_lower_send_gather(s);
      s.debug_optimizer(nir, "lower_send_gather", 96, pass_num++);
   }

   brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_REGALLOC);

   if (s.last_scratch > 0) {
      /* We currently only support up to 2MB of scratch space.  If we
       * need to support more eventually, the documentation suggests
       * that we could allocate a larger buffer, and partition it out
       * ourselves.  We'd just have to undo the hardware's address
       * calculation by subtracting (FFTID * Per Thread Scratch Space)
       * and then add FFTID * (Larger Per Thread Scratch Space).
       *
       * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
       * Thread Group Tracking > Local Memory/Scratch Space.
       */
      if (s.last_scratch <= devinfo->max_scratch_size_per_thread) {
         /* Take the max of any previously compiled variant of the shader.  In
          * the case of bindless shaders with return parts, this will also
          * take the max of all parts.
          */
         s.prog_data->total_scratch = MAX2(brw_get_scratch_size(s.last_scratch),
                                           s.prog_data->total_scratch);
      } else {
         s.fail("Scratch space required is larger than supported");
      }
   }

   if (s.failed)
      return;

   brw_lower_scoreboard(s);

   s.debug_optimizer(nir, "scoreboard", 96, pass_num++);
}

unsigned
brw_cs_push_const_total_size(const struct brw_cs_prog_data *cs_prog_data,
                             unsigned threads)
{
   assert(cs_prog_data->push.per_thread.size % REG_SIZE == 0);
   assert(cs_prog_data->push.cross_thread.size % REG_SIZE == 0);
   return cs_prog_data->push.per_thread.size * threads +
          cs_prog_data->push.cross_thread.size;
}

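/* Worked example (added illustration): with one GRF of per-thread data
 * (32 bytes), two GRFs of cross-thread data (64 bytes), and 8 threads, the
 * total push constant size is 8 * 32 + 64 = 320 bytes.
 */
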
struct intel_cs_dispatch_info
brw_cs_get_dispatch_info(const struct intel_device_info *devinfo,
                         const struct brw_cs_prog_data *prog_data,
                         const unsigned *override_local_size)
{
   struct intel_cs_dispatch_info info = {};

   const unsigned *sizes =
      override_local_size ? override_local_size :
      prog_data->local_size;

   const int simd = brw_simd_select_for_workgroup_size(devinfo, prog_data, sizes);
   assert(simd >= 0 && simd < 3);

   info.group_size = sizes[0] * sizes[1] * sizes[2];
   info.simd_size = 8u << simd;
   info.threads = DIV_ROUND_UP(info.group_size, info.simd_size);

   const uint32_t remainder = info.group_size & (info.simd_size - 1);
   if (remainder > 0)
      info.right_mask = ~0u >> (32 - remainder);
   else
      info.right_mask = ~0u >> (32 - info.simd_size);

   return info;
}

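/* Worked example (added illustration): for a 7x1x1 workgroup, supposing the
 * SIMD selection returns 0 (SIMD8), group_size = 7, simd_size = 8, and
 * threads = DIV_ROUND_UP(7, 8) = 1.  The remainder is 7, so right_mask =
 * ~0u >> (32 - 7) = 0x7f, enabling only the 7 valid channels of the last
 * (here, the only) thread.
 */
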
void
brw_shader_phase_update(fs_visitor &s, enum brw_shader_phase phase)
{
   assert(phase == s.phase + 1);
   s.phase = phase;
   brw_validate(s);
}

bool brw_should_print_shader(const nir_shader *shader, uint64_t debug_flag)
{
   return INTEL_DEBUG(debug_flag) &&
          (!shader->info.internal || NIR_DEBUG(PRINT_INTERNAL));
}