/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_fs_builder.h"
#include "brw_fs_live_variables.h"
#include "brw_nir.h"
#include "brw_cfg.h"
#include "brw_private.h"
#include "intel_nir.h"
#include "shader_enums.h"
#include "dev/intel_debug.h"
#include "dev/intel_wa.h"
#include "compiler/glsl_types.h"
#include "compiler/nir/nir_builder.h"
#include "util/u_math.h"

#include <memory>

using namespace brw;

static void
initialize_sources(fs_inst *inst, const fs_reg src[], uint8_t num_sources);
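
/* Common initializer shared by all fs_inst constructors: zero the
 * instruction, copy the sources, and derive size_written from the
 * destination region.
 */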
void
fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
              const fs_reg *src, unsigned sources)
{
   memset((void*)this, 0, sizeof(*this));

   initialize_sources(this, src, sources);

   for (unsigned i = 0; i < sources; i++)
      this->src[i] = src[i];

   this->opcode = opcode;
   this->dst = dst;
   this->exec_size = exec_size;

   assert(dst.file != IMM && dst.file != UNIFORM);
   assert(this->exec_size != 0);

   this->conditional_mod = BRW_CONDITIONAL_NONE;

   /* This will be the case for almost all instructions. */
   switch (dst.file) {
   case VGRF:
   case ARF:
   case FIXED_GRF:
   case ATTR:
      this->size_written = dst.component_size(exec_size);
      break;
   case BAD_FILE:
      this->size_written = 0;
      break;
   case IMM:
   case UNIFORM:
      unreachable("Invalid destination register file");
   }

   this->writes_accumulator = false;
}

fs_inst::fs_inst()
{
   init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
{
   init(opcode, exec_size, reg_undef, NULL, 0);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst)
{
   init(opcode, exec_size, dst, NULL, 0);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
                 const fs_reg &src0)
{
   const fs_reg src[1] = { src0 };
   init(opcode, exec_size, dst, src, 1);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
                 const fs_reg &src0, const fs_reg &src1)
{
   const fs_reg src[2] = { src0, src1 };
   init(opcode, exec_size, dst, src, 2);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
                 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
{
   const fs_reg src[3] = { src0, src1, src2 };
   init(opcode, exec_size, dst, src, 3);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
                 const fs_reg src[], unsigned sources)
{
   init(opcode, exec_width, dst, src, sources);
}

fs_inst::fs_inst(const fs_inst &that)
{
   memcpy((void*)this, &that, sizeof(that));
   initialize_sources(this, that.src, that.sources);
}
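
/* The source array is heap-allocated only when it outgrew the embedded
 * builtin_src storage; otherwise there is nothing to free.
 */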
fs_inst::~fs_inst()
{
   if (this->src != this->builtin_src)
      delete[] this->src;
}
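
/* Point inst->src at the embedded builtin_src array when the sources fit,
 * falling back to a heap allocation otherwise.  Keeping small source lists
 * inline avoids a per-instruction allocation in the common case.
 */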
static void
initialize_sources(fs_inst *inst, const fs_reg src[], uint8_t num_sources)
{
   if (num_sources > ARRAY_SIZE(inst->builtin_src))
      inst->src = new fs_reg[num_sources];
   else
      inst->src = inst->builtin_src;

   for (unsigned i = 0; i < num_sources; i++)
      inst->src[i] = src[i];

   inst->sources = num_sources;
}
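
/* Change the number of sources after construction.  Sources that remain are
 * preserved, and the array only moves between the embedded builtin_src
 * storage and the heap when the new count requires it.
 */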
void
fs_inst::resize_sources(uint8_t num_sources)
{
   if (this->sources == num_sources)
      return;

   fs_reg *old_src = this->src;
   fs_reg *new_src;

   const unsigned builtin_size = ARRAY_SIZE(this->builtin_src);

   if (old_src == this->builtin_src) {
      if (num_sources > builtin_size) {
         new_src = new fs_reg[num_sources];
         for (unsigned i = 0; i < this->sources; i++)
            new_src[i] = old_src[i];

      } else {
         new_src = old_src;
      }
   } else {
      if (num_sources <= builtin_size) {
         new_src = this->builtin_src;
         assert(this->sources > num_sources);
         for (unsigned i = 0; i < num_sources; i++)
            new_src[i] = old_src[i];

      } else if (num_sources < this->sources) {
         new_src = old_src;

      } else {
         new_src = new fs_reg[num_sources];
         /* Only the previously valid sources exist in the old array. */
         for (unsigned i = 0; i < this->sources; i++)
            new_src[i] = old_src[i];
      }

      if (old_src != new_src)
         delete[] old_src;
   }

   this->sources = num_sources;
   this->src = new_src;
}
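
/* Emit a logical varying-offset pull constant load: add the constant offset
 * to the varying offset, load a full vec4 of 32-bit data from the given
 * surface, and shuffle the requested components into dst.
 */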
void
fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
                                       const fs_reg &dst,
                                       const fs_reg &surface,
                                       const fs_reg &surface_handle,
                                       const fs_reg &varying_offset,
                                       uint32_t const_offset,
                                       uint8_t alignment,
                                       unsigned components)
{
   assert(components <= 4);

   /* We have our constant surface use a pitch of 4 bytes, so our index can
    * be any component of a vector, and then we load 4 contiguous
    * components starting from that.  TODO: Support loading fewer than 4.
    */
   fs_reg total_offset = bld.ADD(varying_offset, brw_imm_ud(const_offset));

   /* The pull load message will load a vec4 (16 bytes). If we are loading
    * a double this means we are only loading 2 elements worth of data.
    * We also want to use a 32-bit data type for the dst of the load operation
    * so other parts of the driver don't get confused about the size of the
    * result.
    */
   fs_reg vec4_result = bld.vgrf(BRW_TYPE_F, 4);

   fs_reg srcs[PULL_VARYING_CONSTANT_SRCS];
   srcs[PULL_VARYING_CONSTANT_SRC_SURFACE] = surface;
   srcs[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE] = surface_handle;
   srcs[PULL_VARYING_CONSTANT_SRC_OFFSET] = total_offset;
   srcs[PULL_VARYING_CONSTANT_SRC_ALIGNMENT] = brw_imm_ud(alignment);

   fs_inst *inst = bld.emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
                            vec4_result, srcs, PULL_VARYING_CONSTANT_SRCS);
   inst->size_written = 4 * vec4_result.component_size(inst->exec_size);

   shuffle_from_32bit_read(bld, dst, vec4_result, 0, components);
}
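
/* True for instructions that are (or will be lowered to) SEND messages with
 * a payload sourced from the GRF; can_do_source_mods() relies on this, since
 * send payloads cannot take source modifiers.
 */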
bool
fs_inst::is_send_from_grf() const
{
   switch (opcode) {
   case SHADER_OPCODE_SEND:
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
   case SHADER_OPCODE_INTERLOCK:
   case SHADER_OPCODE_MEMORY_FENCE:
   case SHADER_OPCODE_BARRIER:
      return true;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
      return src[1].file == VGRF;
   default:
      return false;
   }
}
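
/* Control sources carry descriptors, indices, swizzles, or similar metadata
 * rather than the main data operands of the instruction.
 */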
bool
fs_inst::is_control_source(unsigned arg) const
{
   switch (opcode) {
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
      return arg == 0;

   case SHADER_OPCODE_BROADCAST:
   case SHADER_OPCODE_SHUFFLE:
   case SHADER_OPCODE_QUAD_SWIZZLE:
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
      return arg == 1;

   case SHADER_OPCODE_MOV_INDIRECT:
   case SHADER_OPCODE_CLUSTER_BROADCAST:
      return arg == 1 || arg == 2;

   case SHADER_OPCODE_SEND:
      return arg == 0 || arg == 1;

   default:
      return false;
   }
}
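
/* Payload sources hold the message payload of send-like instructions; for
 * SHADER_OPCODE_SEND, sources 2 and 3 are the two payload halves.
 */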
bool
fs_inst::is_payload(unsigned arg) const
{
   switch (opcode) {
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   case SHADER_OPCODE_INTERLOCK:
   case SHADER_OPCODE_MEMORY_FENCE:
   case SHADER_OPCODE_BARRIER:
      return arg == 0;

   case SHADER_OPCODE_SEND:
      return arg == 2 || arg == 3;

   default:
      return false;
   }
}

/**
 * Returns true if this instruction's sources and destinations cannot
 * safely be the same register.
 *
 * In most cases, a register can be written over safely by the same
 * instruction that is its last use.  For a single instruction, the
 * sources are dereferenced before writing of the destination starts
 * (naturally).
 *
 * However, there are a few cases where this can be problematic:
 *
 * - Virtual opcodes that translate to multiple instructions in the
 *   code generator: if src == dst and one instruction writes the
 *   destination before a later instruction reads the source, then
 *   src will have been clobbered.
 *
 * - SIMD16 compressed instructions with certain regioning (see below).
 *
 * The register allocator uses this information to set up conflicts between
 * GRF sources and the destination.
 */
bool
fs_inst::has_source_and_destination_hazard() const
{
   switch (opcode) {
   case FS_OPCODE_PACK_HALF_2x16_SPLIT:
      /* Multiple partial writes to the destination */
      return true;
   case SHADER_OPCODE_SHUFFLE:
      /* This instruction returns an arbitrary channel from the source and
       * gets split into smaller instructions in the generator.  It's possible
       * that one of the instructions will read from a channel corresponding
       * to an earlier instruction.
       */
   case SHADER_OPCODE_SEL_EXEC:
      /* This is implemented as
       *
       * mov(16)      g4<1>D      0D            { align1 WE_all 1H };
       * mov(16)      g4<1>D      g5<8,8,1>D    { align1 1H }
       *
       * Because the source is only read in the second instruction, the first
       * may stomp all over it.
       */
      return true;
   case SHADER_OPCODE_QUAD_SWIZZLE:
      switch (src[1].ud) {
      case BRW_SWIZZLE_XXXX:
      case BRW_SWIZZLE_YYYY:
      case BRW_SWIZZLE_ZZZZ:
      case BRW_SWIZZLE_WWWW:
      case BRW_SWIZZLE_XXZZ:
      case BRW_SWIZZLE_YYWW:
      case BRW_SWIZZLE_XYXY:
      case BRW_SWIZZLE_ZWZW:
         /* These can be implemented as a single Align1 region on all
          * platforms, so there's never a hazard between source and
          * destination.  C.f. fs_generator::generate_quad_swizzle().
          */
         return false;
      default:
         return !is_uniform(src[0]);
      }
   case BRW_OPCODE_DPAS:
      /* This is overly conservative. The actual hazard is more complicated to
       * describe. When the repeat count is N, the single instruction behaves
       * like N instructions with a repeat count of one, but the destination
       * and source registers are incremented (in somewhat complex ways) for
       * each instruction.
       *
       * This means the source and destination registers are actually ranges
       * of registers. The hazard exists if an earlier iteration would write a
       * register that should be read by a later iteration.
       *
       * There may be some advantage to properly modeling this, but for now,
       * be overly conservative.
       */
      return rcount > 1;
   default:
      /* The SIMD16 compressed instruction
       *
       *   add(16)      g4<1>F      g4<8,8,1>F   g6<8,8,1>F
       *
       * is actually decoded in hardware as:
       *
       *   add(8)       g4<1>F      g4<8,8,1>F   g6<8,8,1>F
       *   add(8)       g5<1>F      g5<8,8,1>F   g7<8,8,1>F
       *
       * Which is safe.  However, if we have uniform accesses
       * happening, we get into trouble:
       *
       *   add(8)       g4<1>F      g4<0,1,0>F   g6<8,8,1>F
       *   add(8)       g5<1>F      g4<0,1,0>F   g7<8,8,1>F
       *
       * Now our destination for the first instruction overwrote the
       * second instruction's src0, and we get garbage for those 8
       * pixels.  There's a similar issue for the pre-gfx6
       * pixel_x/pixel_y, which are registers of 16-bit values and thus
       * would get stomped by the first decode as well.
       */
      if (exec_size == 16) {
         for (int i = 0; i < sources; i++) {
            if (src[i].file == VGRF && (src[i].stride == 0 ||
                                        src[i].type == BRW_TYPE_UW ||
                                        src[i].type == BRW_TYPE_W ||
                                        src[i].type == BRW_TYPE_UB ||
                                        src[i].type == BRW_TYPE_B)) {
               return true;
            }
         }
      }
      return false;
   }
}
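
/* Whether negate/abs source modifiers may be applied to this instruction's
 * sources, given opcode and hardware restrictions.
 */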
bool
fs_inst::can_do_source_mods(const struct intel_device_info *devinfo) const
{
   if (is_send_from_grf())
      return false;

   /* From TGL PRM Vol 2a Pg. 1053 and Pg. 1069 MAD and MUL Instructions:
    *
    * "When multiplying a DW and any lower precision integer, source modifier
    *  is not supported."
    */
   if (devinfo->ver >= 12 && (opcode == BRW_OPCODE_MUL ||
                              opcode == BRW_OPCODE_MAD)) {
      const brw_reg_type exec_type = get_exec_type(this);
      const unsigned min_brw_type_size_bytes = opcode == BRW_OPCODE_MAD ?
         MIN2(brw_type_size_bytes(src[1].type), brw_type_size_bytes(src[2].type)) :
         MIN2(brw_type_size_bytes(src[0].type), brw_type_size_bytes(src[1].type));

      if (brw_type_is_int(exec_type) &&
          brw_type_size_bytes(exec_type) >= 4 &&
          brw_type_size_bytes(exec_type) != min_brw_type_size_bytes)
         return false;
   }

   switch (opcode) {
   case BRW_OPCODE_ADDC:
   case BRW_OPCODE_BFE:
   case BRW_OPCODE_BFI1:
   case BRW_OPCODE_BFI2:
   case BRW_OPCODE_BFREV:
   case BRW_OPCODE_CBIT:
   case BRW_OPCODE_FBH:
   case BRW_OPCODE_FBL:
   case BRW_OPCODE_ROL:
   case BRW_OPCODE_ROR:
   case BRW_OPCODE_SUBB:
   case BRW_OPCODE_DP4A:
   case BRW_OPCODE_DPAS:
   case SHADER_OPCODE_BROADCAST:
   case SHADER_OPCODE_CLUSTER_BROADCAST:
   case SHADER_OPCODE_MOV_INDIRECT:
   case SHADER_OPCODE_SHUFFLE:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return false;
   default:
      return true;
   }
}
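
/* Whether a conditional modifier can be attached to this instruction, based
 * on the opcode and on the negated-unsigned-source restriction described
 * below.
 */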
bool
fs_inst::can_do_cmod() const
{
   switch (opcode) {
   case BRW_OPCODE_ADD:
   case BRW_OPCODE_ADD3:
   case BRW_OPCODE_ADDC:
   case BRW_OPCODE_AND:
   case BRW_OPCODE_ASR:
   case BRW_OPCODE_AVG:
   case BRW_OPCODE_CMP:
   case BRW_OPCODE_CMPN:
   case BRW_OPCODE_DP2:
   case BRW_OPCODE_DP3:
   case BRW_OPCODE_DP4:
   case BRW_OPCODE_DPH:
   case BRW_OPCODE_FRC:
   case BRW_OPCODE_LINE:
   case BRW_OPCODE_LRP:
   case BRW_OPCODE_LZD:
   case BRW_OPCODE_MAC:
   case BRW_OPCODE_MACH:
   case BRW_OPCODE_MAD:
   case BRW_OPCODE_MOV:
   case BRW_OPCODE_MUL:
   case BRW_OPCODE_NOT:
   case BRW_OPCODE_OR:
   case BRW_OPCODE_PLN:
   case BRW_OPCODE_RNDD:
   case BRW_OPCODE_RNDE:
   case BRW_OPCODE_RNDU:
   case BRW_OPCODE_RNDZ:
   case BRW_OPCODE_SHL:
   case BRW_OPCODE_SHR:
   case BRW_OPCODE_SUBB:
   case BRW_OPCODE_XOR:
      break;
   default:
      return false;
   }

   /* The accumulator result appears to get used for the conditional modifier
    * generation.  When negating a UD value, there is a 33rd bit generated for
    * the sign in the accumulator value, so now you can't check, for example,
    * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
    */
   for (unsigned i = 0; i < sources; i++) {
      if (brw_type_is_uint(src[i].type) && src[i].negate)
         return false;
   }

   return true;
}
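
/* Whether the destination and source types of this instruction can be
 * changed together (e.g. by copy propagation) without altering its
 * behavior.
 */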
bool
fs_inst::can_change_types() const
{
   return dst.type == src[0].type &&
          !src[0].abs && !src[0].negate && !saturate && src[0].file != ATTR &&
          (opcode == BRW_OPCODE_MOV ||
           (opcode == SHADER_OPCODE_LOAD_PAYLOAD && sources == 1) ||
           (opcode == BRW_OPCODE_SEL &&
            dst.type == src[1].type &&
            predicate != BRW_PREDICATE_NONE &&
            !src[1].abs && !src[1].negate && src[1].file != ATTR));
}

void
fs_reg::init()
{
   memset((void*)this, 0, sizeof(*this));
   type = BRW_TYPE_UD;
   stride = 1;
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

fs_reg::fs_reg(struct ::brw_reg reg) :
   brw_reg(reg)
{
   this->offset = 0;
   this->stride = 1;
   if (this->file == IMM &&
       (this->type != BRW_TYPE_V &&
        this->type != BRW_TYPE_UV &&
        this->type != BRW_TYPE_VF)) {
      this->stride = 0;
   }
}

bool
fs_reg::equals(const fs_reg &r) const
{
   return brw_regs_equal(this, &r) &&
          offset == r.offset &&
          stride == r.stride;
}

bool
fs_reg::negative_equals(const fs_reg &r) const
{
   return brw_regs_negative_equal(this, &r) &&
          offset == r.offset &&
          stride == r.stride;
}

bool
fs_reg::is_contiguous() const
{
   switch (file) {
   case ARF:
   case FIXED_GRF:
      return hstride == BRW_HORIZONTAL_STRIDE_1 &&
             vstride == width + hstride;
   case VGRF:
   case ATTR:
      return stride == 1;
   case UNIFORM:
   case IMM:
   case BAD_FILE:
      return true;
   }

   unreachable("Invalid register file");
}
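
/* Size in bytes of the register region read or written by `width` channels
 * of this register.  Illustrative example with decoded region parameters:
 * an <8;8,1>:F FIXED_GRF region at width 16 gives w = 8, h = 2, vs = 8,
 * hs = 1, i.e. ((2 - 1) * 8 + (8 - 1) * 1 + 1) * 4 = 64 bytes.
 */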
unsigned
fs_reg::component_size(unsigned width) const
{
   if (file == ARF || file == FIXED_GRF) {
      const unsigned w = MIN2(width, 1u << this->width);
      const unsigned h = width >> this->width;
      const unsigned vs = vstride ? 1 << (vstride - 1) : 0;
      const unsigned hs = hstride ? 1 << (hstride - 1) : 0;
      assert(w > 0);
      return ((MAX2(1, h) - 1) * vs + (w - 1) * hs + 1) * brw_type_size_bytes(type);
   } else {
      return MAX2(width * stride, 1) * brw_type_size_bytes(type);
   }
}
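
/* fail() (via vfail()) records a failure message and marks this compile as
 * failed so later stages can bail out; the message is also printed when
 * shader debugging is enabled.
 */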
void
fs_visitor::vfail(const char *format, va_list va)
{
   char *msg;

   if (failed)
      return;

   failed = true;

   msg = ralloc_vasprintf(mem_ctx, format, va);
   msg = ralloc_asprintf(mem_ctx, "SIMD%d %s compile failed: %s\n",
                         dispatch_width, _mesa_shader_stage_to_abbrev(stage), msg);

   this->fail_msg = msg;

   if (unlikely(debug_enabled)) {
      fprintf(stderr, "%s", msg);
   }
}

void
fs_visitor::fail(const char *format, ...)
{
   va_list va;

   va_start(va, format);
   vfail(format, va);
   va_end(va);
}
/**
|
2016-05-18 14:39:52 -07:00
|
|
|
* Mark this program as impossible to compile with dispatch width greater
|
|
|
|
|
* than n.
|
i965: Accurately bail on SIMD16 compiles.
Ideally, we'd like to never even attempt the SIMD16 compile if we could
know ahead of time that it won't succeed---it's purely a waste of time.
This is especially important for state-based recompiles, which happen at
draw time.
The fragment shader compiler has a number of checks like:
if (dispatch_width == 16)
fail("...some reason...");
This patch introduces a new no16() function which replaces the above
pattern. In the SIMD8 compile, it sets a "SIMD16 will never work" flag.
Then, brw_wm_fs_emit can check that flag, skip the SIMD16 compile, and
issue a helpful performance warning if INTEL_DEBUG=perf is set. (In
SIMD16 mode, no16() calls fail(), for safety's sake.)
The great part is that this is not a heuristic---if the flag is set, we
know with 100% certainty that the SIMD16 compile would fail. (It might
fail anyway if we run out of registers, but it's always worth trying.)
v2: Fix missing va_end in early-return case (caught by Ilia Mirkin).
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz> [v1]
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> [v1]
Reviewed-by: Eric Anholt <eric@anholt.net>
2014-03-07 00:49:45 -08:00
|
|
|
*
|
|
|
|
|
* During the SIMD8 compile (which happens first), we can detect and flag
|
2016-05-18 14:39:52 -07:00
|
|
|
* things that are unsupported in SIMD16+ mode, so the compiler can skip the
|
|
|
|
|
* SIMD16+ compile altogether.
|
i965: Accurately bail on SIMD16 compiles.
Ideally, we'd like to never even attempt the SIMD16 compile if we could
know ahead of time that it won't succeed---it's purely a waste of time.
This is especially important for state-based recompiles, which happen at
draw time.
The fragment shader compiler has a number of checks like:
if (dispatch_width == 16)
fail("...some reason...");
This patch introduces a new no16() function which replaces the above
pattern. In the SIMD8 compile, it sets a "SIMD16 will never work" flag.
Then, brw_wm_fs_emit can check that flag, skip the SIMD16 compile, and
issue a helpful performance warning if INTEL_DEBUG=perf is set. (In
SIMD16 mode, no16() calls fail(), for safety's sake.)
The great part is that this is not a heuristic---if the flag is set, we
know with 100% certainty that the SIMD16 compile would fail. (It might
fail anyway if we run out of registers, but it's always worth trying.)
v2: Fix missing va_end in early-return case (caught by Ilia Mirkin).
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz> [v1]
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> [v1]
Reviewed-by: Eric Anholt <eric@anholt.net>
2014-03-07 00:49:45 -08:00
|
|
|
*
|
2016-05-18 14:39:52 -07:00
|
|
|
* During a compile of dispatch width greater than n (if one happens anyway),
|
|
|
|
|
* this just calls fail().
|
i965: Accurately bail on SIMD16 compiles.
Ideally, we'd like to never even attempt the SIMD16 compile if we could
know ahead of time that it won't succeed---it's purely a waste of time.
This is especially important for state-based recompiles, which happen at
draw time.
The fragment shader compiler has a number of checks like:
if (dispatch_width == 16)
fail("...some reason...");
This patch introduces a new no16() function which replaces the above
pattern. In the SIMD8 compile, it sets a "SIMD16 will never work" flag.
Then, brw_wm_fs_emit can check that flag, skip the SIMD16 compile, and
issue a helpful performance warning if INTEL_DEBUG=perf is set. (In
SIMD16 mode, no16() calls fail(), for safety's sake.)
The great part is that this is not a heuristic---if the flag is set, we
know with 100% certainty that the SIMD16 compile would fail. (It might
fail anyway if we run out of registers, but it's always worth trying.)
v2: Fix missing va_end in early-return case (caught by Ilia Mirkin).
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz> [v1]
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> [v1]
Reviewed-by: Eric Anholt <eric@anholt.net>
2014-03-07 00:49:45 -08:00
|
|
|
*/
|
|
|
|
|
void
|
2016-05-18 14:39:52 -07:00
|
|
|
fs_visitor::limit_dispatch_width(unsigned n, const char *msg)
|
i965: Accurately bail on SIMD16 compiles.
Ideally, we'd like to never even attempt the SIMD16 compile if we could
know ahead of time that it won't succeed---it's purely a waste of time.
This is especially important for state-based recompiles, which happen at
draw time.
The fragment shader compiler has a number of checks like:
if (dispatch_width == 16)
fail("...some reason...");
This patch introduces a new no16() function which replaces the above
pattern. In the SIMD8 compile, it sets a "SIMD16 will never work" flag.
Then, brw_wm_fs_emit can check that flag, skip the SIMD16 compile, and
issue a helpful performance warning if INTEL_DEBUG=perf is set. (In
SIMD16 mode, no16() calls fail(), for safety's sake.)
The great part is that this is not a heuristic---if the flag is set, we
know with 100% certainty that the SIMD16 compile would fail. (It might
fail anyway if we run out of registers, but it's always worth trying.)
v2: Fix missing va_end in early-return case (caught by Ilia Mirkin).
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz> [v1]
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> [v1]
Reviewed-by: Eric Anholt <eric@anholt.net>
2014-03-07 00:49:45 -08:00
|
|
|
{
|
2016-05-18 14:39:52 -07:00
|
|
|
if (dispatch_width > n) {
|
2015-06-22 16:30:04 -07:00
|
|
|
fail("%s", msg);
|
i965: Accurately bail on SIMD16 compiles.
Ideally, we'd like to never even attempt the SIMD16 compile if we could
know ahead of time that it won't succeed---it's purely a waste of time.
This is especially important for state-based recompiles, which happen at
draw time.
The fragment shader compiler has a number of checks like:
if (dispatch_width == 16)
fail("...some reason...");
This patch introduces a new no16() function which replaces the above
pattern. In the SIMD8 compile, it sets a "SIMD16 will never work" flag.
Then, brw_wm_fs_emit can check that flag, skip the SIMD16 compile, and
issue a helpful performance warning if INTEL_DEBUG=perf is set. (In
SIMD16 mode, no16() calls fail(), for safety's sake.)
The great part is that this is not a heuristic---if the flag is set, we
know with 100% certainty that the SIMD16 compile would fail. (It might
fail anyway if we run out of registers, but it's always worth trying.)
v2: Fix missing va_end in early-return case (caught by Ilia Mirkin).
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz> [v1]
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> [v1]
Reviewed-by: Eric Anholt <eric@anholt.net>
2014-03-07 00:49:45 -08:00
|
|
|
} else {
|
2020-10-30 17:41:02 +02:00
|
|
|
max_dispatch_width = MIN2(max_dispatch_width, n);
|
2021-07-29 14:27:57 -07:00
|
|
|
brw_shader_perf_log(compiler, log_data,
|
2021-10-03 15:58:36 +03:00
|
|
|
"Shader dispatch width limited to SIMD%d: %s\n",
|
2021-07-29 14:27:57 -07:00
|
|
|
n, msg);
|
2014-03-07 00:49:45 -08:00
|
|
|
}
|
|
|
|
|
}
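/* Usage sketch (an assumption, not taken from this file): a path that is
 * only implemented for SIMD8 can cap the dispatch width instead of failing
 * the whole compile, in the spirit of the no16() helper the commit message
 * above describes.  The flag name below is made up purely for illustration.
 */
static void
cap_to_simd8_example(fs_visitor &s, bool needs_simd8_only_path)
{
   if (needs_simd8_only_path)
      s.limit_dispatch_width(8, "feature only implemented for SIMD8\n");
}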
|
|
|
|
|
|
2012-06-04 08:59:00 -07:00
|
|
|
/**
|
|
|
|
|
* Returns true if the instruction has a flag that means it won't
|
|
|
|
|
* update an entire destination register.
|
|
|
|
|
*
|
|
|
|
|
* For example, dead code elimination and live variable analysis want to know
|
|
|
|
|
* when a write to a variable screens off any preceding values that were in
|
|
|
|
|
* it.
|
|
|
|
|
*/
|
|
|
|
|
bool
|
2019-04-24 12:38:28 +02:00
|
|
|
fs_inst::is_partial_write() const
|
2012-06-04 08:59:00 -07:00
|
|
|
{
|
2023-03-14 18:22:50 +02:00
|
|
|
if (this->predicate && !this->predicate_trivial &&
|
|
|
|
|
this->opcode != BRW_OPCODE_SEL)
|
2023-03-10 16:11:56 +02:00
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
if (this->dst.offset % REG_SIZE != 0)
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
/* SEND instructions always write whole registers */
|
|
|
|
|
if (this->opcode == SHADER_OPCODE_SEND)
|
|
|
|
|
return false;
|
|
|
|
|
|
2023-07-23 18:20:23 +03:00
|
|
|
/* Special case UNDEF since a lot of places in the backend do things like this:
|
|
|
|
|
*
|
|
|
|
|
* fs_builder ubld = bld.exec_all().group(1, 0);
|
2024-04-20 17:08:02 -07:00
|
|
|
* fs_reg tmp = ubld.vgrf(BRW_TYPE_UD);
|
2023-07-23 18:20:23 +03:00
|
|
|
* ubld.UNDEF(tmp); <- partial write, even though the whole register is covered
|
|
|
|
|
*/
|
|
|
|
|
if (this->opcode == SHADER_OPCODE_UNDEF) {
|
|
|
|
|
assert(this->dst.is_contiguous());
|
|
|
|
|
return this->size_written < 32;
|
|
|
|
|
}
|
|
|
|
|
|
2024-04-21 00:57:59 -07:00
|
|
|
return this->exec_size * brw_type_size_bytes(this->dst.type) < 32 ||
|
2023-03-10 16:11:56 +02:00
|
|
|
!this->dst.is_contiguous();
|
2012-06-04 08:59:00 -07:00
|
|
|
}
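/* Minimal sketch (assumed helper, not part of this file) of the kind of
 * question dead code elimination asks with is_partial_write(): a write only
 * screens off earlier values of a VGRF when it rewrites the whole register.
 */
static bool
fully_defines_vgrf(const fs_inst *inst, unsigned vgrf)
{
   return inst->dst.file == VGRF &&
          inst->dst.nr == vgrf &&
          !inst->is_partial_write();
}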
|
|
|
|
|
|
2015-07-21 17:28:39 +03:00
|
|
|
unsigned
|
|
|
|
|
fs_inst::components_read(unsigned i) const
|
|
|
|
|
{
|
2016-08-12 18:33:58 -07:00
|
|
|
/* Return zero if the source is not present. */
|
|
|
|
|
if (src[i].file == BAD_FILE)
|
|
|
|
|
return 0;
|
|
|
|
|
|
2015-07-21 17:28:39 +03:00
|
|
|
switch (opcode) {
|
2024-04-11 01:10:51 -07:00
|
|
|
case BRW_OPCODE_PLN:
|
|
|
|
|
return i == 0 ? 1 : 2;
|
2015-07-21 17:28:39 +03:00
|
|
|
|
|
|
|
|
case FS_OPCODE_PIXEL_X:
|
|
|
|
|
case FS_OPCODE_PIXEL_Y:
|
2020-10-29 15:10:59 +02:00
|
|
|
assert(i < 2);
|
|
|
|
|
if (i == 0)
|
|
|
|
|
return 2;
|
|
|
|
|
else
|
|
|
|
|
return 1;
|
2015-07-21 17:28:39 +03:00
|
|
|
|
2015-07-27 16:14:36 +03:00
|
|
|
case FS_OPCODE_FB_WRITE_LOGICAL:
|
2015-10-20 14:29:37 -07:00
|
|
|
assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
|
2015-07-27 16:14:36 +03:00
|
|
|
/* First/second FB write color. */
|
|
|
|
|
if (i < 2)
|
2015-10-24 14:55:57 -07:00
|
|
|
return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
|
2015-07-27 16:14:36 +03:00
|
|
|
else
|
|
|
|
|
return 1;
|
|
|
|
|
|
i965/fs: Define logical texture sampling opcodes.
Each logical variant is largely equivalent to the original opcode but
instead of taking a single payload source it expects the arguments
separately as individual sources, like:
tex_logical dst, coordinates, shadow_c, lod, lod2,
sample_index, mcs, sampler, offset,
num_coordinate_components, num_grad_components
This patch defines the opcodes and usual instruction boilerplate,
including a placeholder lowering function provided mostly as
documentation for their source registers.
Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2015-07-21 18:42:27 +03:00
|
|
|
case SHADER_OPCODE_TEX_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TXD_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TXF_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TXL_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TXS_LOGICAL:
|
2018-10-31 09:52:33 -05:00
|
|
|
case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
|
2015-07-21 18:42:27 +03:00
|
|
|
case FS_OPCODE_TXB_LOGICAL:
|
2015-09-08 15:52:09 +01:00
|
|
|
case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
|
2020-07-07 23:54:00 -07:00
|
|
|
case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
|
2015-07-21 18:42:27 +03:00
|
|
|
case SHADER_OPCODE_TXF_MCS_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_LOD_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TG4_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
|
2023-02-16 20:30:30 -08:00
|
|
|
case SHADER_OPCODE_TG4_BIAS_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
|
2023-03-05 15:27:08 -08:00
|
|
|
case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
|
2016-05-20 00:37:37 -07:00
|
|
|
case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
|
2016-02-05 18:39:13 -08:00
|
|
|
assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM &&
|
2023-05-23 13:11:02 +03:00
|
|
|
src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM &&
|
|
|
|
|
src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
|
2015-07-21 18:42:27 +03:00
|
|
|
/* Texture coordinates. */
|
2016-02-05 18:39:13 -08:00
|
|
|
if (i == TEX_LOGICAL_SRC_COORDINATE)
|
|
|
|
|
return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
|
2015-07-21 18:42:27 +03:00
|
|
|
/* Texture derivatives. */
|
2016-02-05 18:39:13 -08:00
|
|
|
else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) &&
|
|
|
|
|
opcode == SHADER_OPCODE_TXD_LOGICAL)
|
|
|
|
|
return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
|
2015-07-21 18:42:27 +03:00
|
|
|
/* Texture offset. */
|
2016-11-28 18:13:02 -08:00
|
|
|
else if (i == TEX_LOGICAL_SRC_TG4_OFFSET)
|
2015-07-21 18:42:27 +03:00
|
|
|
return 2;
|
2015-09-08 15:52:09 +01:00
|
|
|
/* MCS */
|
2020-07-07 23:54:00 -07:00
|
|
|
else if (i == TEX_LOGICAL_SRC_MCS) {
|
|
|
|
|
if (opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
|
|
|
|
|
return 2;
|
|
|
|
|
else if (opcode == SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL)
|
|
|
|
|
return 4;
|
|
|
|
|
else
|
|
|
|
|
return 1;
|
|
|
|
|
} else
|
2015-07-21 18:42:27 +03:00
|
|
|
return 1;
|
|
|
|
|
|
2015-07-21 18:45:32 +03:00
|
|
|
case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
|
2019-02-11 14:51:02 -06:00
|
|
|
assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM);
|
2015-07-21 18:45:32 +03:00
|
|
|
/* Surface coordinates. */
|
2019-02-11 14:51:02 -06:00
|
|
|
if (i == SURFACE_LOGICAL_SRC_ADDRESS)
|
|
|
|
|
return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
|
2015-07-21 18:45:32 +03:00
|
|
|
/* Surface operation source (ignored for reads). */
|
2019-02-11 14:51:02 -06:00
|
|
|
else if (i == SURFACE_LOGICAL_SRC_DATA)
|
2015-07-21 18:45:32 +03:00
|
|
|
return 0;
|
|
|
|
|
else
|
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
|
|
case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
|
2019-02-11 14:51:02 -06:00
|
|
|
assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
|
|
|
|
|
src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
|
2015-07-21 18:45:32 +03:00
|
|
|
/* Surface coordinates. */
|
2019-02-11 14:51:02 -06:00
|
|
|
if (i == SURFACE_LOGICAL_SRC_ADDRESS)
|
|
|
|
|
return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
|
2015-07-21 18:45:32 +03:00
|
|
|
/* Surface operation source. */
|
2019-02-11 14:51:02 -06:00
|
|
|
else if (i == SURFACE_LOGICAL_SRC_DATA)
|
|
|
|
|
return src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
|
2015-07-21 18:45:32 +03:00
|
|
|
else
|
|
|
|
|
return 1;
|
|
|
|
|
|
2018-11-14 17:13:57 -06:00
|
|
|
case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
|
2020-10-05 14:43:41 -07:00
|
|
|
case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
|
2023-01-10 15:09:12 -08:00
|
|
|
assert(src[A64_LOGICAL_ARG].file == IMM);
|
2018-11-14 17:13:57 -06:00
|
|
|
return 1;
|
|
|
|
|
|
2020-10-05 14:43:41 -07:00
|
|
|
case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
|
2023-01-10 15:09:12 -08:00
|
|
|
assert(src[A64_LOGICAL_ARG].file == IMM);
|
|
|
|
|
if (i == A64_LOGICAL_SRC) { /* data to write */
|
|
|
|
|
const unsigned comps = src[A64_LOGICAL_ARG].ud / exec_size;
|
2020-10-05 14:43:41 -07:00
|
|
|
assert(comps > 0);
|
|
|
|
|
return comps;
|
|
|
|
|
} else {
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
|
|
2020-10-29 14:20:39 -07:00
|
|
|
case SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
|
|
|
|
|
assert(src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
|
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
|
|
case SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL:
|
|
|
|
|
assert(src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
|
|
|
|
|
if (i == SURFACE_LOGICAL_SRC_DATA) {
|
|
|
|
|
const unsigned comps = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud / exec_size;
|
|
|
|
|
assert(comps > 0);
|
|
|
|
|
return comps;
|
|
|
|
|
} else {
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
|
|
2018-11-14 17:13:57 -06:00
|
|
|
case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
|
2023-01-10 15:09:12 -08:00
|
|
|
assert(src[A64_LOGICAL_ARG].file == IMM);
|
|
|
|
|
return i == A64_LOGICAL_SRC ? src[A64_LOGICAL_ARG].ud : 1;
|
2018-11-14 17:13:57 -06:00
|
|
|
|
2018-11-26 15:15:04 -06:00
|
|
|
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
|
2023-01-10 15:09:12 -08:00
|
|
|
assert(src[A64_LOGICAL_ARG].file == IMM);
|
|
|
|
|
return i == A64_LOGICAL_SRC ?
|
|
|
|
|
lsc_op_num_data_values(src[A64_LOGICAL_ARG].ud) : 1;
|
2018-11-26 15:15:04 -06:00
|
|
|
|
2017-07-01 08:19:17 +02:00
|
|
|
case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
|
2015-04-08 02:41:33 -07:00
|
|
|
case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
|
2017-07-01 08:19:17 +02:00
|
|
|
/* Scattered logical opcodes use the following params:
|
|
|
|
|
* src[0] Surface coordinates
|
|
|
|
|
* src[1] Surface operation source (ignored for reads)
|
|
|
|
|
* src[2] Surface
|
|
|
|
|
* src[3] IMM, always 1 dimension.
|
|
|
|
|
* src[4] IMM with the arg bitsize for the scattered read/write: 8, 16 or 32
|
|
|
|
|
*/
|
2019-02-11 14:51:02 -06:00
|
|
|
assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
|
|
|
|
|
src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
|
|
|
|
|
return i == SURFACE_LOGICAL_SRC_DATA ? 0 : 1;
|
2017-07-01 08:19:17 +02:00
|
|
|
|
2017-07-01 08:16:01 +02:00
|
|
|
case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
|
2015-04-08 02:41:33 -07:00
|
|
|
case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
|
2019-02-11 14:51:02 -06:00
|
|
|
assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
|
|
|
|
|
src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
|
2017-07-01 08:16:01 +02:00
|
|
|
return 1;
|
|
|
|
|
|
2015-07-21 18:45:32 +03:00
|
|
|
case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: {
|
2019-02-11 14:51:02 -06:00
|
|
|
assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
|
|
|
|
|
src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
|
|
|
|
|
const unsigned op = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
|
2015-07-21 18:45:32 +03:00
|
|
|
/* Surface coordinates. */
|
2019-02-11 14:51:02 -06:00
|
|
|
if (i == SURFACE_LOGICAL_SRC_ADDRESS)
|
|
|
|
|
return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
|
2015-07-21 18:45:32 +03:00
|
|
|
/* Surface operation source. */
|
2023-01-09 15:37:30 -08:00
|
|
|
else if (i == SURFACE_LOGICAL_SRC_DATA)
|
|
|
|
|
return lsc_op_num_data_values(op);
|
2023-05-31 12:15:02 -07:00
|
|
|
else
|
|
|
|
|
return 1;
|
2015-07-21 18:45:32 +03:00
|
|
|
}
|
2016-04-25 18:06:13 -07:00
|
|
|
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
|
|
|
|
|
return (i == 0 ? 2 : 1);
|
2015-07-21 18:45:32 +03:00
|
|
|
|
2022-07-12 15:32:01 -07:00
|
|
|
case SHADER_OPCODE_URB_WRITE_LOGICAL:
|
2022-09-28 16:38:35 -07:00
|
|
|
assert(src[URB_LOGICAL_SRC_COMPONENTS].file == IMM);
|
|
|
|
|
|
2022-07-12 15:32:01 -07:00
|
|
|
if (i == URB_LOGICAL_SRC_DATA)
|
2022-09-28 16:38:35 -07:00
|
|
|
return src[URB_LOGICAL_SRC_COMPONENTS].ud;
|
2022-07-12 15:32:01 -07:00
|
|
|
else
|
|
|
|
|
return 1;
|
|
|
|
|
|
2023-09-20 12:42:24 -07:00
|
|
|
case BRW_OPCODE_DPAS:
|
|
|
|
|
unreachable("Do not use components_read() for DPAS.");
|
|
|
|
|
|
2015-07-21 17:28:39 +03:00
|
|
|
default:
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
|
}
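/* Sketch (assumed helper, not from this file): components_read() is
 * per-source, so summing it over all sources gives the total number of
 * logical components an instruction consumes, e.g. when sizing a payload.
 */
static unsigned
total_components_read(const fs_inst *inst)
{
   unsigned total = 0;
   for (unsigned i = 0; i < inst->sources; i++)
      total += inst->components_read(i);
   return total;
}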
|
|
|
|
|
|
2016-09-07 17:00:58 -07:00
|
|
|
unsigned
|
2016-09-07 17:00:07 -07:00
|
|
|
fs_inst::size_read(int arg) const
|
i965/fs: Convert gen7 to using GRFs for texture messages.
Looking at Lightsmark's shaders, the way we used MRFs (or in gen7's
case, GRFs) was bad in a couple of ways. One was that it prevented
compute-to-MRF for the common case of a texcoord that gets used
exactly once, but where the texcoord setup all gets emitted before the
texture calls (such as when it's a bare fragment shader input, which
gets interpolated before processing main()). Another was that it
introduced a bunch of dependencies that constrained scheduling, and
forced waits for texture operations to be done before they are
required. For example, we can now move the compute-to-MRF
interpolation for the second texture send down after the first send.
The downside is that this generally prevents
remove_duplicate_mrf_writes() from doing anything, whereas previously
it avoided work for the case of sampling from the same texcoord twice.
However, I suspect that most of the win that originally justified that
code was in avoiding the WAR stall on the first send, which this patch
also avoids, rather than the small cost of the extra instruction. We
see instruction count regressions in shaders in unigine, yofrankie,
savage2, hon, and gstreamer.
Improves GLB2.7 performance by 0.633628% +/- 0.491809% (n=121/125, avg of
~66fps, outliers below 61 dropped).
Improves openarena performance by 1.01092% +/- 0.66897% (n=425).
No significant difference on Lightsmark (n=44).
v2: Squash in the fix for register unspilling for send-from-GRF, fixing a
segfault in lightsmark.
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Acked-by: Matt Turner <mattst88@gmail.com>
2013-10-09 17:17:59 -07:00
|
|
|
{
|
2015-06-18 11:53:08 -07:00
|
|
|
switch (opcode) {
|
2018-10-29 15:06:14 -05:00
|
|
|
case SHADER_OPCODE_SEND:
|
|
|
|
|
if (arg == 2) {
|
|
|
|
|
return mlen * REG_SIZE;
|
|
|
|
|
} else if (arg == 3) {
|
|
|
|
|
return ex_mlen * REG_SIZE;
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
|
2018-04-19 20:48:42 -07:00
|
|
|
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
|
|
|
|
|
case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
|
2015-06-18 11:53:08 -07:00
|
|
|
if (arg == 0)
|
2016-09-07 17:00:07 -07:00
|
|
|
return mlen * REG_SIZE;
|
2015-06-18 11:53:08 -07:00
|
|
|
break;
|
|
|
|
|
|
2024-04-11 01:10:51 -07:00
|
|
|
case BRW_OPCODE_PLN:
|
|
|
|
|
if (arg == 0)
|
2016-09-07 13:02:55 -07:00
|
|
|
return 16;
|
2015-06-18 17:48:27 -07:00
|
|
|
break;
|
|
|
|
|
|
2015-06-30 15:51:13 -07:00
|
|
|
case SHADER_OPCODE_LOAD_PAYLOAD:
|
|
|
|
|
if (arg < this->header_size)
|
2024-04-20 17:08:02 -07:00
|
|
|
return retype(src[arg], BRW_TYPE_UD).component_size(8);
|
2015-06-30 15:51:13 -07:00
|
|
|
break;
|
|
|
|
|
|
2015-09-15 14:01:17 -07:00
|
|
|
case SHADER_OPCODE_BARRIER:
|
2016-09-07 17:00:07 -07:00
|
|
|
return REG_SIZE;
|
2015-07-16 15:04:43 -07:00
|
|
|
|
2015-11-07 18:58:34 -08:00
|
|
|
case SHADER_OPCODE_MOV_INDIRECT:
|
|
|
|
|
if (arg == 0) {
|
|
|
|
|
assert(src[2].file == IMM);
|
2016-09-07 14:36:32 -07:00
|
|
|
return src[2].ud;
|
2015-11-07 18:58:34 -08:00
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
|
2023-09-20 12:42:24 -07:00
|
|
|
case BRW_OPCODE_DPAS:
|
|
|
|
|
switch (arg) {
|
|
|
|
|
case 0:
|
2024-04-20 17:08:02 -07:00
|
|
|
if (src[0].type == BRW_TYPE_HF) {
|
2023-09-20 12:42:24 -07:00
|
|
|
return rcount * REG_SIZE / 2;
|
|
|
|
|
} else {
|
|
|
|
|
return rcount * REG_SIZE;
|
|
|
|
|
}
|
|
|
|
|
case 1:
|
|
|
|
|
return sdepth * REG_SIZE;
|
|
|
|
|
case 2:
|
|
|
|
|
/* This is simpler than the formula described in the Bspec, but it
|
|
|
|
|
* covers all of the cases that we support on DG2.
|
|
|
|
|
*/
|
|
|
|
|
return rcount * REG_SIZE;
|
|
|
|
|
default:
|
|
|
|
|
unreachable("Invalid source number.");
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
|
2023-10-09 08:23:53 -07:00
|
|
|
default:
|
|
|
|
|
break;
|
2014-08-18 14:27:55 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
switch (src[arg].file) {
|
|
|
|
|
case UNIFORM:
|
|
|
|
|
case IMM:
|
2024-04-21 00:57:59 -07:00
|
|
|
return components_read(arg) * brw_type_size_bytes(src[arg].type);
|
2016-08-12 18:33:58 -07:00
|
|
|
case BAD_FILE:
|
2015-10-26 17:52:57 -07:00
|
|
|
case ARF:
|
|
|
|
|
case FIXED_GRF:
|
2015-10-26 17:09:25 -07:00
|
|
|
case VGRF:
|
2015-08-05 16:29:30 +03:00
|
|
|
case ATTR:
|
2016-09-07 17:00:07 -07:00
|
|
|
return components_read(arg) * src[arg].component_size(exec_size);
|
2013-10-09 17:17:59 -07:00
|
|
|
}
|
2015-10-26 06:58:56 -07:00
|
|
|
return 0;
|
2013-10-09 17:17:59 -07:00
|
|
|
}
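/* Rough sketch (an assumption, not the real regs_read() helper): size_read()
 * returns bytes, so dividing by REG_SIZE and rounding up approximates the
 * number of whole GRFs a source touches when it starts register-aligned.
 */
static unsigned
approx_regs_read(const fs_inst *inst, int arg)
{
   return DIV_ROUND_UP(inst->size_read(arg), REG_SIZE);
}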
|
|
|
|
|
|
2016-05-18 21:54:35 -07:00
|
|
|
namespace {
|
2019-09-24 17:06:12 -05:00
|
|
|
unsigned
|
2022-07-22 17:11:52 -07:00
|
|
|
predicate_width(const intel_device_info *devinfo, brw_predicate predicate)
|
2019-09-24 17:06:12 -05:00
|
|
|
{
|
2022-07-22 17:11:52 -07:00
|
|
|
if (devinfo->ver >= 20) {
|
|
|
|
|
return 1;
|
|
|
|
|
} else {
|
|
|
|
|
switch (predicate) {
|
|
|
|
|
case BRW_PREDICATE_NONE: return 1;
|
|
|
|
|
case BRW_PREDICATE_NORMAL: return 1;
|
|
|
|
|
case BRW_PREDICATE_ALIGN1_ANY2H: return 2;
|
|
|
|
|
case BRW_PREDICATE_ALIGN1_ALL2H: return 2;
|
|
|
|
|
case BRW_PREDICATE_ALIGN1_ANY4H: return 4;
|
|
|
|
|
case BRW_PREDICATE_ALIGN1_ALL4H: return 4;
|
|
|
|
|
case BRW_PREDICATE_ALIGN1_ANY8H: return 8;
|
|
|
|
|
case BRW_PREDICATE_ALIGN1_ALL8H: return 8;
|
|
|
|
|
case BRW_PREDICATE_ALIGN1_ANY16H: return 16;
|
|
|
|
|
case BRW_PREDICATE_ALIGN1_ALL16H: return 16;
|
|
|
|
|
case BRW_PREDICATE_ALIGN1_ANY32H: return 32;
|
|
|
|
|
case BRW_PREDICATE_ALIGN1_ALL32H: return 32;
|
|
|
|
|
default: unreachable("Unsupported predicate");
|
|
|
|
|
}
|
2019-09-24 17:06:12 -05:00
|
|
|
}
|
|
|
|
|
}
|
2016-05-18 21:54:35 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
unsigned
|
2021-04-05 13:19:39 -07:00
|
|
|
fs_inst::flags_read(const intel_device_info *devinfo) const
|
2013-10-20 11:32:01 -07:00
|
|
|
{
|
2022-07-22 17:11:52 -07:00
|
|
|
if (devinfo->ver < 20 && (predicate == BRW_PREDICATE_ALIGN1_ANYV ||
|
|
|
|
|
predicate == BRW_PREDICATE_ALIGN1_ALLV)) {
|
2016-05-18 21:54:35 -07:00
|
|
|
/* The vertical predication modes combine corresponding bits from
|
2024-02-17 22:15:44 -08:00
|
|
|
* f0.0 and f1.0 on Gfx7+.
|
2016-05-18 21:54:35 -07:00
|
|
|
*/
|
2024-02-17 22:15:44 -08:00
|
|
|
const unsigned shift = 4;
|
2024-01-04 22:29:54 -08:00
|
|
|
return brw_fs_flag_mask(this, 1) << shift | brw_fs_flag_mask(this, 1);
|
2016-05-18 21:54:35 -07:00
|
|
|
} else if (predicate) {
|
2024-01-04 22:29:54 -08:00
|
|
|
return brw_fs_flag_mask(this, predicate_width(devinfo, predicate));
|
2016-05-18 21:54:35 -07:00
|
|
|
} else {
|
2017-06-22 16:42:34 -07:00
|
|
|
unsigned mask = 0;
|
|
|
|
|
for (int i = 0; i < sources; i++) {
|
2024-01-04 22:29:54 -08:00
|
|
|
mask |= brw_fs_flag_mask(src[i], size_read(i));
|
2017-06-22 16:42:34 -07:00
|
|
|
}
|
|
|
|
|
return mask;
|
2016-05-18 21:54:35 -07:00
|
|
|
}
|
2013-10-20 11:32:01 -07:00
|
|
|
}
|
|
|
|
|
|
2016-05-18 21:54:35 -07:00
|
|
|
unsigned
|
intel/fs: sel.cond writes the flags on Gfx4 and Gfx5
On Gfx4 and Gfx5, sel.l (for min) and sel.ge (for max) are implemented
using a separate cmpn and sel instruction. This lowering occurs in
fs_visitor::lower_minmax which is called very, very late... a long, long
time after the first calls to opt_cmod_propagation. As a result,
conditional modifiers can be incorrectly propagated across sel.cond on
those platforms.
No tests were affected by this change, and I find that quite shocking.
After just changing flags_written(), all of the atan tests started
failing on ILK. That required the change in cmod_propagation (and the
addition of the prop_across_into_sel_gfx5 unit test).
Shader-db results for ILK and GM45 are below. I looked at a couple
before and after shaders... and every case that I looked at had
experienced incorrect cmod propagation. This affected a LOT of apps!
Euro Truck Simulator 2, The Talos Principle, Serious Sam 3, Sanctum 2,
Gang Beasts, and on and on... :(
I discovered this bug while working on a couple new optimization
passes. One of the passes attempts to remove condition modifiers that
are never used. The pass made no progress except on ILK and GM45.
After investigating a couple of the affected shaders, I noticed that
the code in those shaders looked wrong... investigation led to this
cause.
v2: Trivial changes in the unit tests.
v3: Fix type in comment in unit tests. Noticed by Jason and Priit.
v4: Tweak handling of BRW_OPCODE_SEL special case. Suggested by Jason.
Fixes: df1aec763eb ("i965/fs: Define methods to calculate the flag subset read or written by an fs_inst.")
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Tested-by: Dave Airlie <airlied@redhat.com>
Iron Lake
total instructions in shared programs: 8180493 -> 8181781 (0.02%)
instructions in affected programs: 541796 -> 543084 (0.24%)
helped: 28
HURT: 1158
helped stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1
helped stats (rel) min: 0.35% max: 0.86% x̄: 0.53% x̃: 0.50%
HURT stats (abs) min: 1 max: 3 x̄: 1.14 x̃: 1
HURT stats (rel) min: 0.12% max: 4.00% x̄: 0.37% x̃: 0.23%
95% mean confidence interval for instructions value: 1.06 1.11
95% mean confidence interval for instructions %-change: 0.31% 0.38%
Instructions are HURT.
total cycles in shared programs: 239420470 -> 239421690 (<.01%)
cycles in affected programs: 2925992 -> 2927212 (0.04%)
helped: 49
HURT: 157
helped stats (abs) min: 2 max: 284 x̄: 62.69 x̃: 70
helped stats (rel) min: 0.04% max: 6.20% x̄: 1.68% x̃: 1.96%
HURT stats (abs) min: 2 max: 48 x̄: 27.34 x̃: 24
HURT stats (rel) min: 0.02% max: 2.91% x̄: 0.31% x̃: 0.20%
95% mean confidence interval for cycles value: -0.80 12.64
95% mean confidence interval for cycles %-change: -0.31% <.01%
Inconclusive result (value mean confidence interval includes 0).
GM45
total instructions in shared programs: 4985517 -> 4986207 (0.01%)
instructions in affected programs: 306935 -> 307625 (0.22%)
helped: 14
HURT: 625
helped stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1
helped stats (rel) min: 0.35% max: 0.82% x̄: 0.52% x̃: 0.49%
HURT stats (abs) min: 1 max: 3 x̄: 1.13 x̃: 1
HURT stats (rel) min: 0.12% max: 3.90% x̄: 0.34% x̃: 0.22%
95% mean confidence interval for instructions value: 1.04 1.12
95% mean confidence interval for instructions %-change: 0.29% 0.36%
Instructions are HURT.
total cycles in shared programs: 153827268 -> 153828052 (<.01%)
cycles in affected programs: 1669290 -> 1670074 (0.05%)
helped: 24
HURT: 84
helped stats (abs) min: 2 max: 232 x̄: 64.33 x̃: 67
helped stats (rel) min: 0.04% max: 4.62% x̄: 1.60% x̃: 1.94%
HURT stats (abs) min: 2 max: 48 x̄: 27.71 x̃: 24
HURT stats (rel) min: 0.02% max: 2.66% x̄: 0.34% x̃: 0.14%
95% mean confidence interval for cycles value: -1.94 16.46
95% mean confidence interval for cycles %-change: -0.29% 0.11%
Inconclusive result (value mean confidence interval includes 0).
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12191>
2021-08-02 21:33:17 -07:00
|
|
|
fs_inst::flags_written(const intel_device_info *devinfo) const
|
2013-10-20 11:32:01 -07:00
|
|
|
{
|
2024-02-19 19:41:48 -08:00
|
|
|
if (conditional_mod && (opcode != BRW_OPCODE_SEL &&
|
|
|
|
|
opcode != BRW_OPCODE_CSEL &&
|
|
|
|
|
opcode != BRW_OPCODE_IF &&
|
|
|
|
|
opcode != BRW_OPCODE_WHILE)) {
|
2024-01-04 22:29:54 -08:00
|
|
|
return brw_fs_flag_mask(this, 1);
|
2024-02-27 02:02:24 -08:00
|
|
|
} else if (opcode == FS_OPCODE_LOAD_LIVE_CHANNELS) {
|
2024-01-04 22:29:54 -08:00
|
|
|
return brw_fs_flag_mask(this, 32);
|
2016-05-18 21:54:35 -07:00
|
|
|
} else {
|
2024-01-04 22:29:54 -08:00
|
|
|
return brw_fs_flag_mask(dst, size_written);
|
2016-05-18 21:54:35 -07:00
|
|
|
}
|
2013-10-20 11:32:01 -07:00
|
|
|
}
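/* Illustrative sketch (assumed, not from this file): a scheduler-style
 * dependency test built on the two masks above: two instructions conflict on
 * the flag register when one writes any flag bits the other reads or writes.
 */
static bool
flag_conflict(const intel_device_info *devinfo,
              const fs_inst *a, const fs_inst *b)
{
   return (a->flags_written(devinfo) &
           (b->flags_read(devinfo) | b->flags_written(devinfo))) != 0;
}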
|
|
|
|
|
|
2023-05-23 13:11:02 +03:00
|
|
|
bool
|
|
|
|
|
fs_inst::has_sampler_residency() const
|
|
|
|
|
{
|
|
|
|
|
switch (opcode) {
|
|
|
|
|
case SHADER_OPCODE_TEX_LOGICAL:
|
|
|
|
|
case FS_OPCODE_TXB_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TXL_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TXD_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TXF_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TXS_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TG4_LOGICAL:
|
2023-02-16 20:30:30 -08:00
|
|
|
case SHADER_OPCODE_TG4_BIAS_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
|
2023-03-05 15:27:08 -08:00
|
|
|
case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
|
2023-05-23 13:11:02 +03:00
|
|
|
assert(src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
|
|
|
|
|
return src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0;
|
|
|
|
|
default:
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
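/* Sketch (assumed caller, not from this file): sparse texturing reserves an
 * extra destination component for the residency word, so a caller sizing the
 * destination can add one when the instruction reports sampler residency.
 */
static unsigned
dest_components_with_residency(const fs_inst *inst, unsigned base_components)
{
   return base_components + (inst->has_sampler_residency() ? 1 : 0);
}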
|
|
|
|
|
|
2022-10-31 18:41:35 +01:00
|
|
|
fs_reg::fs_reg(enum brw_reg_file file, unsigned nr)
|
2010-08-15 18:58:58 -07:00
|
|
|
{
|
2010-09-03 13:21:51 -07:00
|
|
|
init();
|
2010-08-15 18:58:58 -07:00
|
|
|
this->file = file;
|
2015-10-26 04:35:14 -07:00
|
|
|
this->nr = nr;
|
2024-04-20 17:08:02 -07:00
|
|
|
this->type = BRW_TYPE_F;
|
i965/fs: Fix stride field for uniforms.
This fixes essentially the same problem as for immediates. Registers
of the UNIFORM file are typically accessed according to the formula:
read_uniform(r, channel_index, array_index) =
read_element(r, channel_index * 0 + array_index * 1)
Which matches the general direct addressing formula for stride=0:
read_direct(r, channel_index, array_index) =
read_element(r, channel_index * stride +
array_index * max{1, stride * width})
In either case if reladdr is present the access will be according to
the composition of two register regions, the first one determining the
per-channel array_index used for the second, like:
read_indirect(r, channel_index, array_index) =
read_direct(r, channel_index,
read(r.reladdr, channel_index, array_index))
where:
read(r, channel_index, array_index) = if r.reladdr == NULL
then read_direct(r, channel_index, array_index)
else read_indirect(r, channel_index, array_index)
In conclusion we can handle uniforms consistently with the other
register files if we set stride to zero. After lowering to a GRF
using VARYING_PULL_CONSTANT_LOAD in demote_pull_constant_loads() the
stride of the source is set to one again because the result of
VARYING_PULL_CONSTANT_LOAD is generally non-uniform.
Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
2015-07-13 15:29:39 +03:00
|
|
|
this->stride = (file == UNIFORM ? 0 : 1);
|
2010-08-15 18:58:58 -07:00
|
|
|
}
|
|
|
|
|
|
2022-10-31 18:41:35 +01:00
|
|
|
fs_reg::fs_reg(enum brw_reg_file file, unsigned nr, enum brw_reg_type type)
|
2010-10-15 12:04:52 -07:00
|
|
|
{
|
|
|
|
|
init();
|
|
|
|
|
this->file = file;
|
2015-10-26 04:35:14 -07:00
|
|
|
this->nr = nr;
|
2010-10-15 12:04:52 -07:00
|
|
|
this->type = type;
|
2015-07-13 15:29:39 +03:00
|
|
|
this->stride = (file == UNIFORM ? 0 : 1);
|
2010-10-15 12:04:52 -07:00
|
|
|
}
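/* Illustration (made-up register numbers) of the stride convention these
 * constructors establish and the commit message above explains: UNIFORM
 * registers get stride 0 so every channel reads the same element, while
 * other files default to a packed stride of 1.
 */
static void
stride_convention_example(void)
{
   fs_reg u(UNIFORM, 4, BRW_TYPE_F);   /* u.stride == 0 */
   fs_reg g(VGRF, 7, BRW_TYPE_F);      /* g.stride == 1 */
   assert(u.stride == 0 && g.stride == 1);
   (void) u; (void) g;
}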
|
|
|
|
|
|
2013-11-12 15:33:27 -08:00
|
|
|
/* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
|
2011-03-23 12:50:53 -07:00
|
|
|
* This brings in those uniform definitions.
|
|
|
|
|
*/
|
|
|
|
|
void
|
2011-07-25 18:13:04 -07:00
|
|
|
fs_visitor::import_uniforms(fs_visitor *v)
|
2011-03-23 12:50:53 -07:00
|
|
|
{
|
2014-03-11 14:35:27 -07:00
|
|
|
this->push_constant_loc = v->push_constant_loc;
|
|
|
|
|
this->uniforms = v->uniforms;
|
2011-03-23 12:50:53 -07:00
|
|
|
}
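/* Usage sketch (an assumption based on the comment above): the SIMD16
 * visitor reuses the push-constant layout already decided during the SIMD8
 * compile instead of recomputing it.
 */
static void
share_uniform_layout(fs_visitor *simd8_v, fs_visitor *simd16_v)
{
   simd16_v->import_uniforms(simd8_v);
}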
|
|
|
|
|
|
2016-07-12 03:57:25 -07:00
|
|
|
enum brw_barycentric_mode
|
2024-04-18 09:54:11 +03:00
|
|
|
brw_barycentric_mode(const struct brw_wm_prog_key *key,
|
|
|
|
|
nir_intrinsic_instr *intr)
|
2016-07-11 15:00:37 -07:00
|
|
|
{
|
2022-07-06 13:01:24 -07:00
|
|
|
const glsl_interp_mode mode =
|
|
|
|
|
(enum glsl_interp_mode) nir_intrinsic_interp_mode(intr);
|
|
|
|
|
|
2016-07-11 15:00:37 -07:00
|
|
|
/* Barycentric modes don't make sense for flat inputs. */
|
2016-07-07 02:02:38 -07:00
|
|
|
assert(mode != INTERP_MODE_FLAT);
|
2016-07-11 15:00:37 -07:00
|
|
|
|
2016-07-12 03:57:25 -07:00
|
|
|
unsigned bary;
|
2022-07-06 13:01:24 -07:00
|
|
|
switch (intr->intrinsic) {
|
2016-07-12 03:57:25 -07:00
|
|
|
case nir_intrinsic_load_barycentric_pixel:
|
|
|
|
|
case nir_intrinsic_load_barycentric_at_offset:
|
2024-04-18 09:54:11 +03:00
|
|
|
/* When per sample interpolation is dynamic, assume sample
|
|
|
|
|
* interpolation. We'll dynamically remap things so that the FS thread
|
|
|
|
|
* payload is not affected.
|
|
|
|
|
*/
|
|
|
|
|
bary = key->persample_interp == BRW_SOMETIMES ?
|
|
|
|
|
BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE :
|
|
|
|
|
BRW_BARYCENTRIC_PERSPECTIVE_PIXEL;
|
2016-07-12 03:57:25 -07:00
|
|
|
break;
|
|
|
|
|
case nir_intrinsic_load_barycentric_centroid:
|
|
|
|
|
bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID;
|
|
|
|
|
break;
|
|
|
|
|
case nir_intrinsic_load_barycentric_sample:
|
|
|
|
|
case nir_intrinsic_load_barycentric_at_sample:
|
|
|
|
|
bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE;
|
|
|
|
|
break;
|
|
|
|
|
default:
|
|
|
|
|
unreachable("invalid intrinsic");
|
2016-07-11 15:00:37 -07:00
|
|
|
}
|
|
|
|
|
|
2016-07-07 02:02:38 -07:00
|
|
|
if (mode == INTERP_MODE_NOPERSPECTIVE)
|
2016-07-11 15:00:37 -07:00
|
|
|
bary += 3;
|
|
|
|
|
|
|
|
|
|
return (enum brw_barycentric_mode) bary;
|
|
|
|
|
}
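/* Sketch (assumed caller, not from this file): accumulate the modes returned
 * above into a bitfield of barycentric modes used by a fragment shader, one
 * bit per enum brw_barycentric_mode value.
 */
static unsigned
add_barycentric_mode(const struct brw_wm_prog_key *key,
                     nir_intrinsic_instr *intr, unsigned modes)
{
   return modes | (1u << brw_barycentric_mode(key, intr));
}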
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Turn one of the two CENTROID barycentric modes into PIXEL mode.
|
|
|
|
|
*/
|
|
|
|
|
static enum brw_barycentric_mode
|
|
|
|
|
centroid_to_pixel(enum brw_barycentric_mode bary)
|
|
|
|
|
{
|
|
|
|
|
assert(bary == BRW_BARYCENTRIC_PERSPECTIVE_CENTROID ||
|
|
|
|
|
bary == BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID);
|
|
|
|
|
return (enum brw_barycentric_mode) ((unsigned) bary - 1);
|
|
|
|
|
}
|
|
|
|
|
|
intel/compiler: Use an existing URB write to end TCS threads when viable
VS, TCS, TES, and GS threads must end with a URB write message with the
EOT (end of thread) bit set. For VS and TES, we shadow output variables
with temporaries and perform all stores at the end of the shader, giving
us an existing message to do the EOT.
In tessellation control shaders, we don't defer output stores until the
end of the thread like we do for vertex or evaluation shaders. We just
process store_output and store_per_vertex_output intrinsics where they
occur, which may be in control flow. So we can't guarantee that there's
a URB write at the end of the shader.
Traditionally, we've just emitted a separate URB write to finish TCS
threads, doing a writemasked write to an single patch header DWord.
On Broadwell, we need to set a "TR DS Cache Disable" bit, so this is
a convenient spot to do so. But on other platforms, there's no such
field, and this write is purely wasteful.
Instead of emitting a separate write, we can just look for an existing
URB write at the end of the program and tag that with EOT, if possible.
We already had code to do this for geometry shaders, so just lift it
into a helper function and reuse it.
No changes in shader-db.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17944>
2022-08-03 20:54:52 -07:00
|
|
|
/**
|
|
|
|
|
* Walk backwards from the end of the program looking for a URB write that
|
|
|
|
|
* isn't in control flow, and mark it with EOT.
|
|
|
|
|
*
|
|
|
|
|
* Return true if successful or false if a separate EOT write is needed.
|
|
|
|
|
*/
|
|
|
|
|
bool
|
|
|
|
|
fs_visitor::mark_last_urb_write_with_eot()
|
|
|
|
|
{
|
|
|
|
|
foreach_in_list_reverse(fs_inst, prev, &this->instructions) {
|
|
|
|
|
if (prev->opcode == SHADER_OPCODE_URB_WRITE_LOGICAL) {
|
|
|
|
|
prev->eot = true;
|
|
|
|
|
|
|
|
|
|
/* Delete now dead instructions. */
|
|
|
|
|
foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
|
|
|
|
|
if (dead == prev)
|
|
|
|
|
break;
|
|
|
|
|
dead->remove();
|
|
|
|
|
}
|
|
|
|
|
return true;
|
|
|
|
|
} else if (prev->is_control_flow() || prev->has_side_effects()) {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
}
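/* Sketch of the intended calling pattern (an assumption, mirroring the GS
 * path below): first try to tag an existing URB write with EOT and only fall
 * back to emitting a dedicated end-of-thread write when that fails.
 */
static bool
try_reuse_urb_write_for_eot(fs_visitor &v)
{
   if (v.mark_last_urb_write_with_eot())
      return true;
   /* Caller must emit a separate SHADER_OPCODE_URB_WRITE_LOGICAL with
    * inst->eot set, as emit_gs_thread_end() does below.
    */
   return false;
}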
|
|
|
|
|
|
2015-03-11 23:14:31 -07:00
|
|
|
void
|
|
|
|
|
fs_visitor::emit_gs_thread_end()
|
|
|
|
|
{
|
|
|
|
|
assert(stage == MESA_SHADER_GEOMETRY);
|
|
|
|
|
|
2016-09-08 23:48:51 -07:00
|
|
|
struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
|
2015-03-11 23:14:31 -07:00
|
|
|
|
|
|
|
|
if (gs_compile->control_data_header_size_bits > 0) {
|
|
|
|
|
emit_gs_control_data_bits(this->final_gs_vertex_count);
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-21 10:12:09 -08:00
|
|
|
const fs_builder abld = fs_builder(this).at_end().annotate("thread end");
|
2015-03-11 23:14:31 -07:00
|
|
|
fs_inst *inst;
|
|
|
|
|
|
|
|
|
|
if (gs_prog_data->static_vertex_count != -1) {
|
2022-08-03 20:54:52 -07:00
|
|
|
/* Try and tag the last URB write with EOT instead of emitting a whole
|
|
|
|
|
* separate write just to finish the thread.
|
|
|
|
|
*/
|
|
|
|
|
if (mark_last_urb_write_with_eot())
|
|
|
|
|
return;
|
2015-03-11 23:14:31 -07:00
|
|
|
|
2022-07-12 15:32:01 -07:00
|
|
|
fs_reg srcs[URB_LOGICAL_NUM_SRCS];
|
2022-08-22 22:23:17 -07:00
|
|
|
srcs[URB_LOGICAL_SRC_HANDLE] = gs_payload().urb_handles;
|
2022-09-28 16:38:35 -07:00
|
|
|
srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(0);
|
2022-07-12 15:32:01 -07:00
|
|
|
inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
|
|
|
|
|
srcs, ARRAY_SIZE(srcs));
|
2015-03-11 23:14:31 -07:00
|
|
|
} else {
|
2022-07-12 15:32:01 -07:00
|
|
|
fs_reg srcs[URB_LOGICAL_NUM_SRCS];
|
2022-08-22 22:23:17 -07:00
|
|
|
srcs[URB_LOGICAL_SRC_HANDLE] = gs_payload().urb_handles;
|
2022-07-12 15:32:01 -07:00
|
|
|
srcs[URB_LOGICAL_SRC_DATA] = this->final_gs_vertex_count;
|
2022-09-28 16:38:35 -07:00
|
|
|
srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(1);
|
2022-07-12 15:32:01 -07:00
|
|
|
inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
|
|
|
|
|
srcs, ARRAY_SIZE(srcs));
|
2015-03-11 23:14:31 -07:00
|
|
|
}
|
|
|
|
|
inst->eot = true;
|
|
|
|
|
inst->offset = 0;
|
|
|
|
|
}
|
|
|
|
|
|
2022-08-03 12:15:21 -07:00
|
|
|
static unsigned
|
|
|
|
|
round_components_to_whole_registers(const intel_device_info *devinfo,
|
|
|
|
|
unsigned c)
|
|
|
|
|
{
|
|
|
|
|
return DIV_ROUND_UP(c, 8 * reg_unit(devinfo)) * reg_unit(devinfo);
|
|
|
|
|
}
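static unsigned
push_length_example(const intel_device_info *devinfo)
{
   /* Worked example (assuming reg_unit(devinfo) == 1, i.e. 8 dwords per
    * 32-byte GRF): 10 push-constant components round up to
    * DIV_ROUND_UP(10, 8) * 1 = 2 whole registers.
    */
   return round_components_to_whole_registers(devinfo, 10);
}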
|
|
|
|
|
|
2010-08-26 16:39:41 -07:00
|
|
|
void
|
|
|
|
|
fs_visitor::assign_curb_setup()
|
|
|
|
|
{
|
2022-08-03 12:15:21 -07:00
|
|
|
unsigned uniform_push_length =
|
|
|
|
|
round_components_to_whole_registers(devinfo, prog_data->nr_params);
|
2016-11-29 02:47:15 -08:00
|
|
|
|
|
|
|
|
unsigned ubo_push_length = 0;
|
2016-11-29 05:20:20 -08:00
|
|
|
unsigned ubo_push_start[4];
|
2016-11-29 02:47:15 -08:00
|
|
|
for (int i = 0; i < 4; i++) {
|
2016-11-29 05:20:20 -08:00
|
|
|
ubo_push_start[i] = 8 * (ubo_push_length + uniform_push_length);
|
2024-02-19 23:07:04 -08:00
|
|
|
ubo_push_length += prog_data->ubo_ranges[i].length;
|
2022-08-03 12:15:21 -07:00
|
|
|
|
|
|
|
|
assert(ubo_push_start[i] % (8 * reg_unit(devinfo)) == 0);
|
|
|
|
|
assert(ubo_push_length % (1 * reg_unit(devinfo)) == 0);
|
2016-11-29 02:47:15 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
prog_data->curb_read_length = uniform_push_length + ubo_push_length;
|
2014-02-19 15:27:01 +01:00
|
|
|
|
2020-04-03 20:20:53 -05:00
|
|
|
uint64_t used = 0;
|
2021-10-04 13:58:07 +03:00
|
|
|
bool is_compute = gl_shader_stage_is_compute(stage);
|
2020-04-03 20:20:53 -05:00
|
|
|
|
2021-10-04 13:58:07 +03:00
|
|
|
if (is_compute && brw_cs_prog_data(prog_data)->uses_inline_data) {
|
2020-06-16 23:06:56 -05:00
|
|
|
/* With COMPUTE_WALKER, we can push up to one register worth of data via
|
|
|
|
|
* the inline data parameter in the COMPUTE_WALKER command itself.
|
|
|
|
|
*
|
|
|
|
|
* TODO: Support inline data and push at the same time.
|
|
|
|
|
*/
|
|
|
|
|
assert(devinfo->verx10 >= 125);
|
2023-01-31 16:01:26 -08:00
|
|
|
assert(uniform_push_length <= reg_unit(devinfo));
|
2021-10-04 13:58:07 +03:00
|
|
|
} else if (is_compute && devinfo->verx10 >= 125) {
|
2022-07-15 13:08:23 +03:00
|
|
|
assert(devinfo->has_lsc);
|
2023-11-21 09:47:18 -08:00
|
|
|
fs_builder ubld = fs_builder(this, 1).exec_all().at(
|
2020-05-04 16:17:58 -05:00
|
|
|
cfg->first_block(), cfg->first_block()->start());
|
|
|
|
|
|
2022-07-15 13:08:23 +03:00
|
|
|
/* The base offset for our push data is passed in as R0.0[31:6]. We have
|
|
|
|
|
* to mask off the bottom 6 bits.
|
2020-05-04 16:17:58 -05:00
|
|
|
*/
|
2024-04-12 17:43:22 -07:00
|
|
|
fs_reg base_addr =
|
|
|
|
|
ubld.AND(retype(brw_vec1_grf(0, 0), BRW_TYPE_UD),
|
|
|
|
|
brw_imm_ud(INTEL_MASK(31, 6)));
|
2020-05-04 16:17:58 -05:00
|
|
|
|
2021-03-29 15:46:12 -07:00
|
|
|
/* On Gfx12-HP we load constants at the start of the program using A32
|
2020-05-04 16:17:58 -05:00
|
|
|
* stateless messages.
|
|
|
|
|
*/
|
|
|
|
|
for (unsigned i = 0; i < uniform_push_length;) {
|
2022-07-15 13:08:23 +03:00
|
|
|
/* Limit ourselves to LSC HW limit of 8 GRFs (256bytes D32V64). */
|
|
|
|
|
unsigned num_regs = MIN2(uniform_push_length - i, 8);
|
2020-05-04 16:17:58 -05:00
|
|
|
assert(num_regs > 0);
|
|
|
|
|
num_regs = 1 << util_logbase2(num_regs);
|
|
|
|
|
|
2024-01-26 12:25:41 -08:00
|
|
|
/* This pass occurs after all of the optimization passes, so don't
|
|
|
|
|
* emit an 'ADD addr, base_addr, 0' instruction.
|
|
|
|
|
*/
|
2024-04-12 17:43:22 -07:00
|
|
|
fs_reg addr = i == 0 ? base_addr :
|
|
|
|
|
ubld.ADD(base_addr, brw_imm_ud(i * REG_SIZE));
|
2020-05-04 16:17:58 -05:00
|
|
|
|
|
|
|
|
fs_reg srcs[4] = {
|
|
|
|
|
brw_imm_ud(0), /* desc */
|
|
|
|
|
brw_imm_ud(0), /* ex_desc */
|
2022-07-15 13:08:23 +03:00
|
|
|
addr, /* payload */
|
|
|
|
|
fs_reg(), /* payload2 */
|
2020-05-04 16:17:58 -05:00
|
|
|
};
|
|
|
|
|
|
2022-08-19 12:40:20 -07:00
|
|
|
fs_reg dest = retype(brw_vec8_grf(payload().num_regs + i, 0),
|
2024-04-20 17:08:02 -07:00
|
|
|
BRW_TYPE_UD);
|
2022-07-15 13:08:23 +03:00
|
|
|
fs_inst *send = ubld.emit(SHADER_OPCODE_SEND, dest, srcs, 4);
|
|
|
|
|
|
|
|
|
|
send->sfid = GFX12_SFID_UGM;
|
|
|
|
|
send->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
|
|
|
|
|
LSC_ADDR_SURFTYPE_FLAT,
|
|
|
|
|
LSC_ADDR_SIZE_A32,
|
|
|
|
|
LSC_DATA_SIZE_D32,
|
|
|
|
|
num_regs * 8 /* num_channels */,
|
|
|
|
|
true /* transpose */,
|
2022-09-29 12:38:19 -07:00
|
|
|
LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
|
2022-07-15 13:08:23 +03:00
|
|
|
send->header_size = 0;
|
2022-09-28 16:17:02 -07:00
|
|
|
send->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, 1);
|
2022-07-15 13:08:23 +03:00
|
|
|
send->size_written =
|
2022-09-28 16:17:02 -07:00
|
|
|
lsc_msg_dest_len(devinfo, LSC_DATA_SIZE_D32, num_regs * 8) * REG_SIZE;
|
2020-05-04 16:17:58 -05:00
|
|
|
send->send_is_volatile = true;
|
|
|
|
|
|
|
|
|
|
i += num_regs;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
|
|
|
|
}
|
|
|
|
|
|
2010-08-26 16:39:41 -07:00
|
|
|
/* Map the offsets in the UNIFORM file to fixed HW regs. */
|
2014-09-01 13:35:04 -07:00
|
|
|
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
2014-03-17 10:39:43 -07:00
|
|
|
for (unsigned int i = 0; i < inst->sources; i++) {
|
2010-08-26 16:39:41 -07:00
|
|
|
if (inst->src[i].file == UNIFORM) {
|
2016-09-01 12:42:20 -07:00
|
|
|
int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4;
|
2014-03-11 14:35:27 -07:00
|
|
|
int constant_nr;
|
2016-11-29 05:20:20 -08:00
|
|
|
if (inst->src[i].nr >= UBO_START) {
|
|
|
|
|
/* constant_nr is in 32-bit units, the rest are in bytes */
|
|
|
|
|
constant_nr = ubo_push_start[inst->src[i].nr - UBO_START] +
|
|
|
|
|
inst->src[i].offset / 4;
|
|
|
|
|
} else if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
|
2014-03-11 14:35:27 -07:00
|
|
|
constant_nr = push_constant_loc[uniform_nr];
|
|
|
|
|
} else {
|
|
|
|
|
/* Section 5.11 of the OpenGL 4.1 spec says:
|
|
|
|
|
* "Out-of-bounds reads return undefined values, which include
|
|
|
|
|
* values from other variables of the active program or zero."
|
|
|
|
|
* Just return the first push constant.
|
|
|
|
|
*/
|
|
|
|
|
constant_nr = 0;
|
|
|
|
|
}
|
|
|
|
|
|
2020-04-03 20:20:53 -05:00
|
|
|
assert(constant_nr / 8 < 64);
|
|
|
|
|
used |= BITFIELD64_BIT(constant_nr / 8);
|
|
|
|
|
|
2022-08-19 12:40:20 -07:00
|
|
|
struct brw_reg brw_reg = brw_vec1_grf(payload().num_regs +
|
2010-08-27 14:15:42 -07:00
|
|
|
constant_nr / 8,
|
|
|
|
|
constant_nr % 8);
|
2015-10-24 15:29:03 -07:00
|
|
|
brw_reg.abs = inst->src[i].abs;
|
|
|
|
|
brw_reg.negate = inst->src[i].negate;
|
2010-08-26 16:39:41 -07:00
|
|
|
|
2015-07-13 15:29:39 +03:00
|
|
|
assert(inst->src[i].stride == 0);
|
2015-10-24 15:29:03 -07:00
|
|
|
inst->src[i] = byte_offset(
|
2013-12-08 04:57:08 +01:00
|
|
|
retype(brw_reg, inst->src[i].type),
|
2016-09-01 15:11:21 -07:00
|
|
|
inst->src[i].offset % 4);
|
2010-08-26 16:39:41 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
2014-10-03 19:05:32 -07:00
|
|
|
|
2024-02-19 23:07:04 -08:00
|
|
|
uint64_t want_zero = used & prog_data->zero_push_reg;
|
2020-04-03 20:20:53 -05:00
|
|
|
if (want_zero) {
|
2023-11-21 09:47:18 -08:00
|
|
|
fs_builder ubld = fs_builder(this, 8).exec_all().at(
|
2020-04-03 20:20:53 -05:00
|
|
|
cfg->first_block(), cfg->first_block()->start());
|
|
|
|
|
|
|
|
|
|
/* push_reg_mask_param is in 32-bit units */
|
2024-02-19 23:07:04 -08:00
|
|
|
unsigned mask_param = prog_data->push_reg_mask_param;
|
2022-08-19 12:40:20 -07:00
|
|
|
struct brw_reg mask = brw_vec1_grf(payload().num_regs + mask_param / 8,
|
|
|
|
|
mask_param % 8);
|
2020-04-03 20:20:53 -05:00
|
|
|
|
|
|
|
|
fs_reg b32;
|
|
|
|
|
for (unsigned i = 0; i < 64; i++) {
|
|
|
|
|
if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) {
|
2024-04-20 17:08:02 -07:00
|
|
|
fs_reg shifted = ubld.vgrf(BRW_TYPE_W, 2);
|
2020-04-03 20:20:53 -05:00
|
|
|
ubld.SHL(horiz_offset(shifted, 8),
|
2024-04-20 17:08:02 -07:00
|
|
|
byte_offset(retype(mask, BRW_TYPE_W), i / 8),
|
2020-04-03 20:20:53 -05:00
|
|
|
brw_imm_v(0x01234567));
|
|
|
|
|
ubld.SHL(shifted, horiz_offset(shifted, 8), brw_imm_w(8));
|
|
|
|
|
|
|
|
|
|
fs_builder ubld16 = ubld.group(16, 0);
|
2024-04-20 17:08:02 -07:00
|
|
|
b32 = ubld16.vgrf(BRW_TYPE_D);
|
2020-04-03 20:20:53 -05:00
|
|
|
ubld16.group(16, 0).ASR(b32, shifted, brw_imm_w(15));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (want_zero & BITFIELD64_BIT(i)) {
|
|
|
|
|
assert(i < prog_data->curb_read_length);
|
|
|
|
|
struct brw_reg push_reg =
|
2024-04-20 17:30:23 -07:00
|
|
|
retype(brw_vec8_grf(payload().num_regs + i, 0), BRW_TYPE_D);
|
2020-04-03 20:20:53 -05:00
|
|
|
|
|
|
|
|
ubld.AND(push_reg, push_reg, component(b32, i % 16));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
|
|
|
|
}
|
|
|
|
|
|
2014-10-03 19:05:32 -07:00
|
|
|
/* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
|
2022-08-19 12:40:20 -07:00
|
|
|
this->first_non_payload_grf = payload().num_regs + prog_data->curb_read_length;
|
2010-08-26 16:39:41 -07:00
|
|
|
}
|
|
|
|
|
|
2018-12-11 18:45:43 +01:00
|
|
|
/*
|
|
|
|
|
* Build up an array of indices into the urb_setup array that
|
|
|
|
|
* references the active entries of the urb_setup array.
|
|
|
|
|
* Used to accelerate walking the active entries of the urb_setup array
|
|
|
|
|
* on each upload.
|
|
|
|
|
*/
|
|
|
|
|
void
|
|
|
|
|
brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data)
|
|
|
|
|
{
|
2021-10-29 12:56:22 -07:00
|
|
|
/* TODO(mesh): Review usage of this in the context of Mesh, we may want to
|
|
|
|
|
* skip per-primitive attributes here.
|
|
|
|
|
*/
|
|
|
|
|
|
2018-12-11 18:45:43 +01:00
|
|
|
/* Make sure uint8_t is sufficient */
|
|
|
|
|
STATIC_ASSERT(VARYING_SLOT_MAX <= 0xff);
|
|
|
|
|
uint8_t index = 0;
|
|
|
|
|
for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) {
|
|
|
|
|
if (wm_prog_data->urb_setup[attr] >= 0) {
|
|
|
|
|
wm_prog_data->urb_setup_attribs[index++] = attr;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
wm_prog_data->urb_setup_attribs_count = index;
|
|
|
|
|
}
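/* Usage sketch (assumed consumer, not from this file): the acceleration
 * array built above lets an uploader walk only the active urb_setup slots
 * instead of scanning all VARYING_SLOT_MAX entries.
 */
static void
walk_active_urb_slots(const struct brw_wm_prog_data *wm_prog_data,
                      void (*cb)(uint8_t attr, int slot, void *data),
                      void *data)
{
   for (uint8_t i = 0; i < wm_prog_data->urb_setup_attribs_count; i++) {
      const uint8_t attr = wm_prog_data->urb_setup_attribs[i];
      cb(attr, wm_prog_data->urb_setup[attr], data);
   }
}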
|
|
|
|
|
|
2019-07-18 09:15:15 -05:00
|
|
|
static void
|
2021-04-05 13:19:39 -07:00
|
|
|
calculate_urb_setup(const struct intel_device_info *devinfo,
|
2019-07-18 09:15:15 -05:00
|
|
|
const struct brw_wm_prog_key *key,
|
|
|
|
|
struct brw_wm_prog_data *prog_data,
|
2021-05-18 11:05:33 -07:00
|
|
|
const nir_shader *nir,
|
|
|
|
|
const struct brw_mue_map *mue_map)
|
2010-08-16 21:53:02 -07:00
|
|
|
{
|
2022-12-21 15:40:07 +01:00
|
|
|
memset(prog_data->urb_setup, -1, sizeof(prog_data->urb_setup));
|
|
|
|
|
memset(prog_data->urb_setup_channel, 0, sizeof(prog_data->urb_setup_channel));
|
2010-08-16 21:53:02 -07:00
|
|
|
|
2022-12-21 15:40:07 +01:00
|
|
|
int urb_next = 0; /* in vec4s */
|
2021-05-18 10:17:43 -07:00
|
|
|
|
|
|
|
|
const uint64_t inputs_read =
|
|
|
|
|
nir->info.inputs_read & ~nir->info.per_primitive_inputs;
|
|
|
|
|
|
2010-08-16 21:53:02 -07:00
|
|
|
/* Figure out where each of the incoming setup attributes lands. */
|
2023-09-04 22:31:17 -07:00
|
|
|
if (key->mesh_input != BRW_NEVER) {
|
2022-01-27 00:50:52 -08:00
|
|
|
/* Per-Primitive Attributes are laid out by Hardware before the regular
|
|
|
|
|
* attributes, so order them like this to make it easier later to map the setup
|
|
|
|
|
* into real HW registers.
|
|
|
|
|
*/
|
|
|
|
|
if (nir->info.per_primitive_inputs) {
|
2022-02-25 16:35:26 +01:00
|
|
|
uint64_t per_prim_inputs_read =
|
|
|
|
|
nir->info.inputs_read & nir->info.per_primitive_inputs;
|
|
|
|
|
|
2022-04-12 15:06:16 +02:00
|
|
|
/* In Mesh, PRIMITIVE_SHADING_RATE, VIEWPORT and LAYER slots
|
|
|
|
|
* are always at the beginning, because they come from MUE
|
|
|
|
|
* Primitive Header, not Per-Primitive Attributes.
|
2022-02-25 16:35:26 +01:00
|
|
|
*/
|
|
|
|
|
const uint64_t primitive_header_bits = VARYING_BIT_VIEWPORT |
|
2022-04-12 15:06:16 +02:00
|
|
|
VARYING_BIT_LAYER |
|
|
|
|
|
VARYING_BIT_PRIMITIVE_SHADING_RATE;
|
2022-02-25 16:35:26 +01:00
|
|
|
|
2023-09-04 22:31:17 -07:00
|
|
|
if (mue_map) {
|
|
|
|
|
unsigned per_prim_start_dw = mue_map->per_primitive_start_dw;
|
|
|
|
|
unsigned per_prim_size_dw = mue_map->per_primitive_pitch_dw;
|
2022-04-12 15:06:16 +02:00
|
|
|
|
2023-09-04 22:31:17 -07:00
|
|
|
bool reads_header = (per_prim_inputs_read & primitive_header_bits) != 0;
|
2022-02-25 16:35:26 +01:00
|
|
|
|
2023-09-04 22:31:17 -07:00
|
|
|
if (reads_header || mue_map->user_data_in_primitive_header) {
|
|
|
|
|
/* Primitive Shading Rate, Layer and Viewport live in the same
|
|
|
|
|
* 4-dwords slot (psr is dword 0, layer is dword 1, and viewport
|
|
|
|
|
* is dword 2).
|
|
|
|
|
*/
|
|
|
|
|
if (per_prim_inputs_read & VARYING_BIT_PRIMITIVE_SHADING_RATE)
|
|
|
|
|
prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = 0;
|
2022-02-25 16:35:26 +01:00
|
|
|
|
2023-09-04 22:31:17 -07:00
|
|
|
if (per_prim_inputs_read & VARYING_BIT_LAYER)
|
|
|
|
|
prog_data->urb_setup[VARYING_SLOT_LAYER] = 0;
|
2022-02-25 16:35:26 +01:00
|
|
|
|
2023-09-04 22:31:17 -07:00
|
|
|
if (per_prim_inputs_read & VARYING_BIT_VIEWPORT)
|
|
|
|
|
prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = 0;
|
2022-12-21 15:40:07 +01:00
|
|
|
|
2023-09-04 22:31:17 -07:00
|
|
|
per_prim_inputs_read &= ~primitive_header_bits;
|
|
|
|
|
} else {
|
|
|
|
|
/* If the FS doesn't need the primitive header, then it won't be made
|
|
|
|
|
* available through SBE_MESH, so we have to skip them when
|
|
|
|
|
* calculating the offset from the start of the per-primitive data.
|
|
|
|
|
*/
|
|
|
|
|
per_prim_start_dw += mue_map->per_primitive_header_size_dw;
|
|
|
|
|
per_prim_size_dw -= mue_map->per_primitive_header_size_dw;
|
|
|
|
|
}
|
2022-12-21 15:40:07 +01:00
|
|
|
|
2023-09-04 22:31:17 -07:00
|
|
|
u_foreach_bit64(i, per_prim_inputs_read) {
|
|
|
|
|
int start = mue_map->start_dw[i];
|
2022-12-21 15:40:07 +01:00
|
|
|
|
2023-09-04 22:31:17 -07:00
|
|
|
assert(start >= 0);
|
|
|
|
|
assert(mue_map->len_dw[i] > 0);
|
|
|
|
|
|
|
|
|
|
assert(unsigned(start) >= per_prim_start_dw);
|
|
|
|
|
unsigned pos_dw = unsigned(start) - per_prim_start_dw;
|
|
|
|
|
|
|
|
|
|
prog_data->urb_setup[i] = urb_next + pos_dw / 4;
|
|
|
|
|
prog_data->urb_setup_channel[i] = pos_dw % 4;
|
|
|
|
|
}
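/* Worked example with illustrative numbers: if mue_map->start_dw[i] == 20
 * and per_prim_start_dw == 8, then pos_dw == 12, so the attribute is read
 * from vec4 slot urb_next + 3, channel 0.
 */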
|
|
|
|
|
|
|
|
|
|
urb_next = per_prim_size_dw / 4;
|
|
|
|
|
} else {
|
|
|
|
|
/* With no MUE map, we never read the primitive header, and
|
|
|
|
|
* per-primitive attributes won't be packed either, so just lay
|
|
|
|
|
* them out in varying order.
|
|
|
|
|
*/
|
|
|
|
|
per_prim_inputs_read &= ~primitive_header_bits;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
|
|
|
|
|
if (per_prim_inputs_read & BITFIELD64_BIT(i)) {
|
|
|
|
|
prog_data->urb_setup[i] = urb_next++;
|
|
|
|
|
}
|
|
|
|
|
}
|
2022-01-27 00:50:52 -08:00
|
|
|
|
2023-09-04 22:31:17 -07:00
|
|
|
/* The actual setup attributes later must be aligned to a full GRF. */
|
|
|
|
|
urb_next = ALIGN(urb_next, 2);
|
|
|
|
|
}
|
2022-01-27 00:50:52 -08:00
|
|
|
|
|
|
|
|
prog_data->num_per_primitive_inputs = urb_next;
|
|
|
|
|
}
|
|
|
|
|
|
2021-12-09 16:50:18 +01:00
|
|
|
const uint64_t clip_dist_bits = VARYING_BIT_CLIP_DIST0 |
|
|
|
|
|
VARYING_BIT_CLIP_DIST1;
|
|
|
|
|
|
2022-01-27 00:48:19 -08:00
|
|
|
uint64_t unique_fs_attrs = inputs_read & BRW_FS_VARYING_INPUT_MASK;
|
|
|
|
|
|
2021-12-09 16:50:18 +01:00
|
|
|
if (inputs_read & clip_dist_bits) {
|
2023-09-04 22:31:17 -07:00
|
|
|
assert(!mue_map || mue_map->per_vertex_header_size_dw > 8);
|
2021-12-09 16:50:18 +01:00
|
|
|
unique_fs_attrs &= ~clip_dist_bits;
|
|
|
|
|
}
|
|
|
|
|
|
2023-09-04 22:31:17 -07:00
|
|
|
if (mue_map) {
|
|
|
|
|
unsigned per_vertex_start_dw = mue_map->per_vertex_start_dw;
|
|
|
|
|
unsigned per_vertex_size_dw = mue_map->per_vertex_pitch_dw;
|
2022-12-21 15:40:07 +01:00
|
|
|
|
2023-09-04 22:31:17 -07:00
|
|
|
/* Per-Vertex header is available to fragment shader only if there's
|
|
|
|
|
* user data there.
|
|
|
|
|
*/
|
|
|
|
|
if (!mue_map->user_data_in_vertex_header) {
|
|
|
|
|
per_vertex_start_dw += 8;
|
|
|
|
|
per_vertex_size_dw -= 8;
|
|
|
|
|
}
|
2022-12-21 15:40:07 +01:00
|
|
|
|
2023-09-04 22:31:17 -07:00
|
|
|
/* In Mesh, CLIP_DIST slots are always at the beginning, because
|
|
|
|
|
* they come from MUE Vertex Header, not Per-Vertex Attributes.
|
|
|
|
|
*/
|
|
|
|
|
if (inputs_read & clip_dist_bits) {
|
|
|
|
|
prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next;
|
|
|
|
|
prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next + 1;
|
|
|
|
|
} else if (mue_map && mue_map->per_vertex_header_size_dw > 8) {
|
|
|
|
|
/* Clip distances are in MUE, but we are not reading them in FS. */
|
|
|
|
|
per_vertex_start_dw += 8;
|
|
|
|
|
per_vertex_size_dw -= 8;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Per-Vertex attributes are laid out in order. Because we always link
|
|
|
|
|
* Mesh and Fragment shaders, the slots that are written and read by
|
|
|
|
|
* each of them will match. */
|
|
|
|
|
u_foreach_bit64(i, unique_fs_attrs) {
|
|
|
|
|
int start = mue_map->start_dw[i];
|
2021-12-09 16:50:18 +01:00
|
|
|
|
2023-09-04 22:31:17 -07:00
|
|
|
assert(start >= 0);
|
|
|
|
|
assert(mue_map->len_dw[i] > 0);
|
2022-12-21 15:40:07 +01:00
|
|
|
|
2023-09-04 22:31:17 -07:00
|
|
|
assert(unsigned(start) >= per_vertex_start_dw);
|
|
|
|
|
unsigned pos_dw = unsigned(start) - per_vertex_start_dw;
|
2022-12-21 15:40:07 +01:00
|
|
|
|
2023-09-04 22:31:17 -07:00
|
|
|
prog_data->urb_setup[i] = urb_next + pos_dw / 4;
|
|
|
|
|
prog_data->urb_setup_channel[i] = pos_dw % 4;
|
|
|
|
|
}
|
2022-12-21 15:40:07 +01:00
|
|
|
|
2023-09-04 22:31:17 -07:00
|
|
|
urb_next += per_vertex_size_dw / 4;
|
|
|
|
|
} else {
|
|
|
|
|
/* If we don't have an MUE map, just lay down the inputs the FS reads
|
|
|
|
|
* in varying order, as we do for the legacy pipeline.
|
|
|
|
|
*/
|
|
|
|
|
if (inputs_read & clip_dist_bits) {
|
|
|
|
|
prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next++;
|
|
|
|
|
prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next++;
|
|
|
|
|
}
|
2022-12-21 15:40:07 +01:00
|
|
|
|
2023-09-04 22:31:17 -07:00
|
|
|
for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
|
|
|
|
|
if (unique_fs_attrs & BITFIELD64_BIT(i))
|
|
|
|
|
prog_data->urb_setup[i] = urb_next++;
|
|
|
|
|
}
|
2022-01-27 00:48:19 -08:00
|
|
|
}
|
2024-02-17 22:43:47 -08:00
|
|
|
} else {
|
2023-09-04 22:31:17 -07:00
|
|
|
assert(!nir->info.per_primitive_inputs);
|
|
|
|
|
|
2021-12-15 00:35:40 -08:00
|
|
|
uint64_t vue_header_bits =
|
|
|
|
|
VARYING_BIT_PSIZ | VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT;
|
|
|
|
|
|
|
|
|
|
uint64_t unique_fs_attrs = inputs_read & BRW_FS_VARYING_INPUT_MASK;
|
|
|
|
|
|
|
|
|
|
/* VUE header fields all live in the same URB slot, so we pass them
|
|
|
|
|
* as a single FS input attribute. We want to only count them once.
|
|
|
|
|
*/
|
|
|
|
|
if (inputs_read & vue_header_bits) {
|
|
|
|
|
unique_fs_attrs &= ~vue_header_bits;
|
|
|
|
|
unique_fs_attrs |= VARYING_BIT_PSIZ;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (util_bitcount64(unique_fs_attrs) <= 16) {
|
2013-09-03 12:15:53 -07:00
|
|
|
/* The SF/SBE pipeline stage can do arbitrary rearrangement of the
|
|
|
|
|
* first 16 varying inputs, so we can put them wherever we want.
|
|
|
|
|
* Just put them in order.
|
|
|
|
|
*
|
|
|
|
|
* This is useful because it means that (a) inputs not used by the
|
|
|
|
|
* fragment shader won't take up valuable register space, and (b) we
|
|
|
|
|
* won't have to recompile the fragment shader if it gets paired with
|
|
|
|
|
* a different vertex (or geometry) shader.
|
2021-12-15 00:35:40 -08:00
|
|
|
*
|
|
|
|
|
* VUE header fields share the same FS input attribute.
|
2013-09-03 12:15:53 -07:00
|
|
|
*/
|
2021-12-15 00:35:40 -08:00
|
|
|
if (inputs_read & vue_header_bits) {
|
|
|
|
|
if (inputs_read & VARYING_BIT_PSIZ)
|
|
|
|
|
prog_data->urb_setup[VARYING_SLOT_PSIZ] = urb_next;
|
|
|
|
|
if (inputs_read & VARYING_BIT_LAYER)
|
|
|
|
|
prog_data->urb_setup[VARYING_SLOT_LAYER] = urb_next;
|
|
|
|
|
if (inputs_read & VARYING_BIT_VIEWPORT)
|
|
|
|
|
prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = urb_next;
|
|
|
|
|
|
|
|
|
|
urb_next++;
|
|
|
|
|
}
|
|
|
|
|
|
2013-09-03 12:15:53 -07:00
|
|
|
for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
|
2021-12-15 00:35:40 -08:00
|
|
|
if (inputs_read & BRW_FS_VARYING_INPUT_MASK & ~vue_header_bits &
|
2013-09-03 12:15:53 -07:00
|
|
|
BITFIELD64_BIT(i)) {
|
2014-05-14 00:17:03 -07:00
|
|
|
prog_data->urb_setup[i] = urb_next++;
|
2013-09-03 12:15:53 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
/* We have enough input varyings that the SF/SBE pipeline stage can't
|
|
|
|
|
* arbitrarily rearrange them to suit our whim; we have to put them
|
|
|
|
|
* in an order that matches the output of the previous pipeline stage
|
|
|
|
|
* (geometry or vertex shader).
|
|
|
|
|
*/
|
2021-04-30 01:00:51 -07:00
|
|
|
|
|
|
|
|
/* Re-compute the VUE map here in the case that the one coming from
|
|
|
|
|
* geometry has more than one position slot (used for Primitive
|
|
|
|
|
* Replication).
|
|
|
|
|
*/
|
2024-02-01 15:39:52 -08:00
|
|
|
struct intel_vue_map prev_stage_vue_map;
|
2015-04-17 12:52:00 -07:00
|
|
|
brw_compute_vue_map(devinfo, &prev_stage_vue_map,
|
i965: Don't re-layout varyings for separate shader programs.
Previously, our VUE map code always assigned slots to varyings
sequentially, in one contiguous block.
This was a bad fit for separate shaders - the GS input layout depended
or the VS output layout, so if we swapped out vertex shaders, we might
have to recompile the GS on the fly - which rather defeats the point of
using separate shader objects. (Tessellation would suffer from this
as well - we could have to recompile the HS, DS, and GS.)
Instead, this patch makes the VUE map for separate shaders use a fixed
layout, based on the input/output variable's location field. (This is
either specified by layout(location = ...) or assigned by the linker.)
Corresponding inputs/outputs will match up by location; if there's a
mismatch, we're allowed to have undefined behavior.
This may be less efficient - depending what locations were chosen, we
may have empty padding slots in the VUE. But applications presumably
use small consecutive integers for locations, so it hopefully won't be
much worse in practice.
3% of Dota 2 Reborn shaders are hurt, but only by 2 instructions.
This seems like a small price to pay for avoiding recompiles.
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz>
2015-09-09 16:21:56 -07:00
|
|
|
key->input_slots_valid,
|
2018-09-21 16:07:38 -07:00
|
|
|
nir->info.separate_shader, 1);
|
i965: skip reading unused slots at the beginning of the URB for the FS
We can start reading the URB at the first offset that contains varyings
that are actually read in the URB. We still need to make sure that we
read at least one varying to honor hardware requirements.
This helps alleviate a problem introduced with 99df02ca26f61 for
separate shader objects: without separate shader objects we assign
locations sequentially, however, since that commit we have changed the
method for SSO so that the VUE slot assigned depends on the number of
builtin slots plus the location assigned to the varying. This fixed
layout is intended to help SSO programs by avoiding on-the-fly recompiles
when swapping out shaders, however, it also means that if a varying uses
a large location number close to the maximum allowed by the SF/FS units
(31), then the offset introduced by the number of builtin slots can push
the location outside the range and trigger an assertion.
This problem is affecting at least the following CTS tests for
enhanced layouts:
KHR-GL45.enhanced_layouts.varying_array_components
KHR-GL45.enhanced_layouts.varying_array_locations
KHR-GL45.enhanced_layouts.varying_components
KHR-GL45.enhanced_layouts.varying_locations
which use SSO and the location layout qualifier to select such
location numbers explicitly.
This change helps these tests because for SSO we always have to include
things such as VARYING_SLOT_CLIP_DIST{0,1} even if the fragment shader is
very unlikely to read them, so by doing this we free builtin slots from
the fixed VUE layout and we avoid the tests to crash in this scenario.
Of course, this is not a proper fix, we'd still run into problems if someone
tries to use an explicit max location and read gl_ViewportIndex, gl_LayerID or
gl_CullDistance in the FS, but that would be a much less common bug and we
can probably wait to see if anyone actually runs into that situation in a real
world scenario before making the decision that more aggressive changes are
required to support this without reverting 99df02ca26f61.
v2:
- Add a debug message when we skip clip distances (Ilia)
- we also need to account for this when we compute the urb setup
for the fragment shader stage, so add a compiler util to compute
the first slot that we need to read from the URB instead of
replicating the logic in both places.
v3:
- Make the util more generic so it can account for all unused slots
at the beginning of the URB, that will make it more useful (Ken).
- Drop the debug message, it was not what Ilia was asking for.
Suggested-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2017-09-20 09:22:51 +02:00
|
|
|
|
2015-06-17 13:06:18 -07:00
|
|
|
int first_slot =
|
2021-05-18 10:17:43 -07:00
|
|
|
brw_compute_first_urb_slot_required(inputs_read,
|
2017-09-20 09:22:51 +02:00
|
|
|
&prev_stage_vue_map);
|
2015-06-17 13:06:18 -07:00
|
|
|
|
2013-09-03 12:15:53 -07:00
|
|
|
assert(prev_stage_vue_map.num_slots <= first_slot + 32);
|
|
|
|
|
for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
|
|
|
|
|
slot++) {
|
|
|
|
|
int varying = prev_stage_vue_map.slot_to_varying[slot];
|
2015-10-26 01:03:12 -07:00
|
|
|
if (varying != BRW_VARYING_SLOT_PAD &&
|
2021-05-18 10:17:43 -07:00
|
|
|
(inputs_read & BRW_FS_VARYING_INPUT_MASK &
|
2013-09-03 12:15:53 -07:00
|
|
|
BITFIELD64_BIT(varying))) {
|
2014-05-14 00:17:03 -07:00
|
|
|
prog_data->urb_setup[varying] = slot - first_slot;
|
2013-09-03 12:15:53 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
urb_next = prev_stage_vue_map.num_slots - first_slot;
|
2010-10-01 12:15:48 -07:00
|
|
|
}
|
|
|
|
|
}
|
2010-08-16 21:53:02 -07:00
|
|
|
|
2021-05-18 10:17:43 -07:00
|
|
|
prog_data->num_varying_inputs = urb_next - prog_data->num_per_primitive_inputs;
|
|
|
|
|
prog_data->inputs = inputs_read;
|
2018-12-11 18:45:43 +01:00
|
|
|
|
|
|
|
|
brw_compute_urb_setup_index(prog_data);
|
2010-10-01 12:15:48 -07:00
|
|
|
}
|
2010-08-16 21:53:02 -07:00
|
|
|
|
2010-10-01 12:15:48 -07:00
|
|
|
void
|
|
|
|
|
fs_visitor::assign_urb_setup()
|
|
|
|
|
{
|
2014-08-29 12:50:46 -07:00
|
|
|
assert(stage == MESA_SHADER_FRAGMENT);
|
2016-09-08 23:48:51 -07:00
|
|
|
struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
|
2014-08-29 12:50:46 -07:00
|
|
|
|
2022-08-19 12:40:20 -07:00
|
|
|
int urb_start = payload().num_regs + prog_data->base.curb_read_length;
|
2010-08-16 21:53:02 -07:00
|
|
|
|
2010-10-01 12:15:48 -07:00
|
|
|
/* Offset all the urb_setup[] indices by the actual position of the
|
|
|
|
|
* setup regs, now that the location of the constants has been chosen.
|
2010-08-16 21:53:02 -07:00
|
|
|
*/
|
2014-09-01 13:35:04 -07:00
|
|
|
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
2016-04-25 18:33:22 -07:00
|
|
|
for (int i = 0; i < inst->sources; i++) {
|
|
|
|
|
if (inst->src[i].file == ATTR) {
|
2022-06-22 16:22:40 -07:00
|
|
|
/* ATTR fs_reg::nr in the FS is in units of logical scalar
|
2023-12-01 16:23:11 -08:00
|
|
|
* inputs each of which consumes 16B on Gfx4-Gfx12. In
|
|
|
|
|
* single polygon mode this leads to the following layout
|
|
|
|
|
* of the vertex setup plane parameters in the ATTR
|
|
|
|
|
* register file:
|
2022-06-22 16:22:40 -07:00
|
|
|
*
|
|
|
|
|
* fs_reg::nr Input Comp0 Comp1 Comp2 Comp3
|
|
|
|
|
* 0 Attr0.x a1-a0 a2-a0 N/A a0
|
|
|
|
|
* 1 Attr0.y a1-a0 a2-a0 N/A a0
|
|
|
|
|
* 2 Attr0.z a1-a0 a2-a0 N/A a0
|
|
|
|
|
* 3 Attr0.w a1-a0 a2-a0 N/A a0
|
|
|
|
|
* 4 Attr1.x a1-a0 a2-a0 N/A a0
|
|
|
|
|
* ...
|
|
|
|
|
*
|
|
|
|
|
* In multipolygon mode that no longer works since
|
|
|
|
|
* different channels may be processing polygons with
|
|
|
|
|
* different plane parameters, so each parameter above is
|
|
|
|
|
* represented as a dispatch_width-wide vector:
|
|
|
|
|
*
|
|
|
|
|
* fs_reg::nr fs_reg::offset Input Comp0 ... CompN
|
|
|
|
|
* 0 0 Attr0.x a1[0]-a0[0] ... a1[N]-a0[N]
|
|
|
|
|
* 0 4 * dispatch_width Attr0.x a2[0]-a0[0] ... a2[N]-a0[N]
|
|
|
|
|
* 0 8 * dispatch_width Attr0.x N/A ... N/A
|
|
|
|
|
* 0 12 * dispatch_width Attr0.x a0[0] ... a0[N]
|
|
|
|
|
* 1 0 Attr0.y a1[0]-a0[0] ... a1[N]-a0[N]
|
|
|
|
|
* ...
|
|
|
|
|
*
|
|
|
|
|
* Note that many of the components on a single row above
|
|
|
|
|
* are likely to be replicated multiple times (if, say, a
|
|
|
|
|
* single SIMD thread is only processing 2 different
|
|
|
|
|
* polygons), so plane parameters aren't actually stored
|
|
|
|
|
* in GRF memory with that layout to avoid wasting space.
|
|
|
|
|
* Instead we compose ATTR register regions with a 2D
|
|
|
|
|
* region that walks through the parameters of each
|
|
|
|
|
* polygon with the correct stride, reading the parameter
|
|
|
|
|
* corresponding to each channel directly from the PS
|
|
|
|
|
* thread payload.
|
|
|
|
|
*
|
|
|
|
|
* The latter layout corresponds to a param_width equal to
|
|
|
|
|
* dispatch_width, while the former (scalar parameter)
|
|
|
|
|
* layout has a param_width of 1.
|
2023-12-01 16:23:11 -08:00
|
|
|
*
|
|
|
|
|
* Gfx20+ represent plane parameters in a format similar
|
|
|
|
|
* to the above, except the parameters are packed in 12B
|
|
|
|
|
* and ordered like "a0, a1-a0, a2-a0" instead of the
|
|
|
|
|
* above vec4 representation with a missing component.
|
2016-04-25 18:33:22 -07:00
|
|
|
*/
|
2022-06-22 16:22:40 -07:00
|
|
|
const unsigned param_width = (max_polygons > 1 ? dispatch_width : 1);
|
|
|
|
|
|
|
|
|
|
/* Size of a single scalar component of a plane parameter
|
|
|
|
|
* in bytes.
|
|
|
|
|
*/
|
|
|
|
|
const unsigned chan_sz = 4;
|
2023-12-01 16:23:11 -08:00
|
|
|
struct brw_reg reg;
|
|
|
|
|
assert(max_polygons > 0);
|
2022-06-22 16:22:40 -07:00
|
|
|
|
2023-12-01 17:01:36 -08:00
|
|
|
/* Calculate the base register on the thread payload of
|
|
|
|
|
* either the block of vertex setup data or the block of
|
|
|
|
|
* per-primitive constant data depending on whether we're
|
|
|
|
|
* accessing a primitive or vertex input. Also calculate
|
|
|
|
|
* the index of the input within that block.
|
|
|
|
|
*/
|
|
|
|
|
const bool per_prim = inst->src[i].nr < prog_data->num_per_primitive_inputs;
|
|
|
|
|
const unsigned base = urb_start +
|
|
|
|
|
(per_prim ? 0 :
|
|
|
|
|
ALIGN(prog_data->num_per_primitive_inputs / 2,
|
|
|
|
|
reg_unit(devinfo)) * max_polygons);
|
|
|
|
|
const unsigned idx = per_prim ? inst->src[i].nr :
|
|
|
|
|
inst->src[i].nr - prog_data->num_per_primitive_inputs;
|
|
|
|
|
|
2022-06-22 16:22:40 -07:00
|
|
|
/* Translate the offset within the param_width-wide
|
2023-12-01 16:23:11 -08:00
|
|
|
* representation described above into an offset and a
|
|
|
|
|
* grf, which contains the plane parameters for the first
|
|
|
|
|
* polygon processed by the thread.
|
2022-06-22 16:22:40 -07:00
|
|
|
*/
|
2023-12-01 17:01:36 -08:00
|
|
|
if (devinfo->ver >= 20 && !per_prim) {
|
2023-12-01 16:23:11 -08:00
|
|
|
/* Gfx20+ is able to pack 5 logical input components
|
2023-12-01 17:01:36 -08:00
|
|
|
* per 64B register for vertex setup data.
|
2023-12-01 16:23:11 -08:00
|
|
|
*/
|
2023-12-01 17:01:36 -08:00
|
|
|
const unsigned grf = base + idx / 5 * 2 * max_polygons;
|
2023-12-01 16:23:11 -08:00
|
|
|
assert(inst->src[i].offset / param_width < 12);
|
2023-12-01 17:01:36 -08:00
|
|
|
const unsigned delta = idx % 5 * 12 +
|
2023-12-01 16:23:11 -08:00
|
|
|
inst->src[i].offset / (param_width * chan_sz) * chan_sz +
|
|
|
|
|
inst->src[i].offset % chan_sz;
|
|
|
|
|
reg = byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
|
|
|
|
|
delta);
|
|
|
|
|
} else {
|
2023-12-01 17:01:36 -08:00
|
|
|
/* Earlier platforms and the per-primitive block pack 2 logical
|
|
|
|
|
* input components per 32B register.
|
2023-12-01 16:23:11 -08:00
|
|
|
*/
|
2023-12-01 17:01:36 -08:00
|
|
|
const unsigned grf = base + idx / 2 * max_polygons;
|
2023-12-01 16:23:11 -08:00
|
|
|
assert(inst->src[i].offset / param_width < REG_SIZE / 2);
|
2023-12-01 17:01:36 -08:00
|
|
|
const unsigned delta = (idx % 2) * (REG_SIZE / 2) +
|
2023-12-01 16:23:11 -08:00
|
|
|
inst->src[i].offset / (param_width * chan_sz) * chan_sz +
|
|
|
|
|
inst->src[i].offset % chan_sz;
|
|
|
|
|
reg = byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
|
|
|
|
|
delta);
|
|
|
|
|
}
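/* Worked example with illustrative numbers (integer division): for
 * base == 32, max_polygons == 1 and idx == 7, the Gfx20+ path gives
 * grf == 32 + 7 / 5 * 2 == 34 with the parameter packed 7 % 5 * 12 == 24
 * bytes into the register, while the older layout gives
 * grf == 32 + 7 / 2 == 35 with the parameter (7 % 2) * 16 == 16 bytes in.
 */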
|
2022-06-22 16:22:40 -07:00
|
|
|
|
|
|
|
|
if (max_polygons > 1) {
|
2023-12-01 22:18:52 -08:00
|
|
|
assert(devinfo->ver >= 12);
|
2022-06-22 16:22:40 -07:00
|
|
|
/* Misaligned channel strides that would lead to
|
|
|
|
|
* cross-channel access in the representation above are
|
|
|
|
|
* disallowed.
|
|
|
|
|
*/
|
2024-04-21 00:57:59 -07:00
|
|
|
assert(inst->src[i].stride * brw_type_size_bytes(inst->src[i].type) == chan_sz);
|
2023-12-01 22:18:52 -08:00
|
|
|
|
|
|
|
|
/* Number of channels processing the same polygon. */
|
|
|
|
|
const unsigned poly_width = dispatch_width / max_polygons;
|
|
|
|
|
assert(dispatch_width % max_polygons == 0);
|
|
|
|
|
|
2022-06-22 16:22:40 -07:00
|
|
|
/* Accessing a subset of channels of a parameter vector
|
|
|
|
|
* starting from "chan" is necessary to handle
|
|
|
|
|
* SIMD-lowered instructions though.
|
|
|
|
|
*/
|
|
|
|
|
const unsigned chan = inst->src[i].offset %
|
|
|
|
|
(param_width * chan_sz) / chan_sz;
|
|
|
|
|
assert(chan < dispatch_width);
|
2023-12-01 22:18:52 -08:00
|
|
|
assert(chan % poly_width == 0);
|
|
|
|
|
const unsigned reg_size = reg_unit(devinfo) * REG_SIZE;
|
|
|
|
|
reg = byte_offset(reg, chan / poly_width * reg_size);
|
2022-06-22 16:22:40 -07:00
|
|
|
|
2023-12-01 22:18:52 -08:00
|
|
|
if (inst->exec_size > poly_width) {
|
2022-06-22 16:22:40 -07:00
|
|
|
/* Accessing the parameters for multiple polygons.
|
|
|
|
|
* Corresponding parameters for different polygons
|
2023-12-01 22:18:52 -08:00
|
|
|
* are stored a GRF apart on the thread payload, so
|
2022-06-22 16:22:40 -07:00
|
|
|
* use that as vertical stride.
|
|
|
|
|
*/
|
2024-04-21 00:57:59 -07:00
|
|
|
const unsigned vstride = reg_size / brw_type_size_bytes(inst->src[i].type);
|
2022-06-22 16:22:40 -07:00
|
|
|
assert(vstride <= 32);
|
2023-12-01 22:18:52 -08:00
|
|
|
assert(chan % poly_width == 0);
|
|
|
|
|
reg = stride(reg, vstride, poly_width, 0);
|
2022-06-22 16:22:40 -07:00
|
|
|
} else {
|
|
|
|
|
/* Accessing one parameter for a single polygon --
|
|
|
|
|
* Translate to a scalar region.
|
|
|
|
|
*/
|
2023-12-01 22:18:52 -08:00
|
|
|
assert(chan % poly_width + inst->exec_size <= poly_width);
|
|
|
|
|
reg = stride(reg, 0, 1, 0);
|
2022-06-22 16:22:40 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
const unsigned width = inst->src[i].stride == 0 ?
|
|
|
|
|
1 : MIN2(inst->exec_size, 8);
|
|
|
|
|
reg = stride(reg, width * inst->src[i].stride,
|
|
|
|
|
width, inst->src[i].stride);
|
|
|
|
|
}
|
|
|
|
|
|
2016-04-25 18:33:22 -07:00
|
|
|
reg.abs = inst->src[i].abs;
|
|
|
|
|
reg.negate = inst->src[i].negate;
|
|
|
|
|
inst->src[i] = reg;
|
|
|
|
|
}
|
2011-01-12 12:52:16 -08:00
|
|
|
}
|
2010-08-16 21:53:02 -07:00
|
|
|
}
|
|
|
|
|
|
2022-06-22 16:22:40 -07:00
|
|
|
/* Each attribute is 4 setup channels, each of which is half a reg,
|
|
|
|
|
* but they may be replicated multiple times for multipolygon
|
|
|
|
|
* dispatch.
|
|
|
|
|
*/
|
|
|
|
|
this->first_non_payload_grf += prog_data->num_varying_inputs * 2 * max_polygons;
|
2021-05-18 10:17:43 -07:00
|
|
|
|
|
|
|
|
/* Unlike regular attributes, per-primitive attributes have all 4 channels
|
|
|
|
|
* in the same slot, so each GRF can store two slots.
|
|
|
|
|
*/
|
|
|
|
|
assert(prog_data->num_per_primitive_inputs % 2 == 0);
|
2022-06-22 16:22:40 -07:00
|
|
|
this->first_non_payload_grf += prog_data->num_per_primitive_inputs / 2 * max_polygons;
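/* For example, 4 per-primitive slots with max_polygons == 1 add
 * 4 / 2 * 1 == 2 payload GRFs here, on top of the
 * num_varying_inputs * 2 * max_polygons GRFs accounted for above.
 */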
|
2010-08-16 21:53:02 -07:00
|
|
|
}
|
|
|
|
|
|
2015-03-11 23:14:31 -07:00
|
|
|
void
|
|
|
|
|
fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst)
|
|
|
|
|
{
|
|
|
|
|
for (int i = 0; i < inst->sources; i++) {
|
|
|
|
|
if (inst->src[i].file == ATTR) {
|
2022-09-12 16:49:11 -07:00
|
|
|
assert(inst->src[i].nr == 0);
|
2022-08-19 12:40:20 -07:00
|
|
|
int grf = payload().num_regs +
|
2015-03-11 23:14:31 -07:00
|
|
|
prog_data->curb_read_length +
|
2016-09-01 12:42:20 -07:00
|
|
|
inst->src[i].offset / REG_SIZE;
|
2015-03-11 23:14:31 -07:00
|
|
|
|
2016-03-23 12:20:05 +01:00
|
|
|
/* As explained at brw_reg_from_fs_reg, From the Haswell PRM:
|
|
|
|
|
*
|
|
|
|
|
* VertStride must be used to cross GRF register boundaries. This
|
|
|
|
|
* rule implies that elements within a 'Width' cannot cross GRF
|
|
|
|
|
* boundaries.
|
|
|
|
|
*
|
|
|
|
|
* So, for registers that are large enough, we have to split the exec
|
|
|
|
|
* size in two and trust the compression state to sort it out.
|
|
|
|
|
*/
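/* Worked example with illustrative numbers: a SIMD16 access to a stride-1
 * 32-bit ATTR source spans 16 * 1 * 4 == 64 bytes, i.e. two 32-byte
 * registers, so exec_size below is halved to 8 and the region relies on
 * compression to cover both halves.
 */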
|
|
|
|
|
unsigned total_size = inst->exec_size *
|
|
|
|
|
inst->src[i].stride *
|
2024-04-21 00:57:59 -07:00
|
|
|
brw_type_size_bytes(inst->src[i].type);
|
2016-03-23 12:20:05 +01:00
|
|
|
|
|
|
|
|
assert(total_size <= 2 * REG_SIZE);
|
|
|
|
|
const unsigned exec_size =
|
|
|
|
|
(total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2;
|
|
|
|
|
|
|
|
|
|
unsigned width = inst->src[i].stride == 0 ? 1 : exec_size;
|
2015-10-24 15:29:03 -07:00
|
|
|
struct brw_reg reg =
|
2015-03-11 23:14:31 -07:00
|
|
|
stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
|
2016-09-01 15:11:21 -07:00
|
|
|
inst->src[i].offset % REG_SIZE),
|
2016-03-23 12:20:05 +01:00
|
|
|
exec_size * inst->src[i].stride,
|
2015-11-11 22:37:53 -08:00
|
|
|
width, inst->src[i].stride);
|
2015-10-24 15:29:03 -07:00
|
|
|
reg.abs = inst->src[i].abs;
|
|
|
|
|
reg.negate = inst->src[i].negate;
|
|
|
|
|
|
|
|
|
|
inst->src[i] = reg;
|
2015-03-11 23:14:31 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2014-10-27 22:42:50 -07:00
|
|
|
void
|
|
|
|
|
fs_visitor::assign_vs_urb_setup()
|
|
|
|
|
{
|
2016-09-08 23:48:51 -07:00
|
|
|
struct brw_vs_prog_data *vs_prog_data = brw_vs_prog_data(prog_data);
|
2014-10-27 22:42:50 -07:00
|
|
|
|
|
|
|
|
assert(stage == MESA_SHADER_VERTEX);
|
|
|
|
|
|
|
|
|
|
/* Each attribute is 4 regs. */
|
2016-04-04 12:47:57 +02:00
|
|
|
this->first_non_payload_grf += 4 * vs_prog_data->nr_attribute_slots;
|
2014-10-27 22:42:50 -07:00
|
|
|
|
|
|
|
|
assert(vs_prog_data->base.urb_read_length <= 15);
|
|
|
|
|
|
|
|
|
|
/* Rewrite all ATTR file references to the hw grf that they land in. */
|
|
|
|
|
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
2015-03-11 23:14:31 -07:00
|
|
|
convert_attr_sources_to_hw_regs(inst);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2015-11-14 17:40:43 -08:00
|
|
|
void
|
intel/compiler: Implement TCS 8_PATCH mode and INTEL_DEBUG=tcs8
Our tessellation control shaders can be dispatched in several modes.
- SINGLE_PATCH (Gen7+) processes a single patch per thread, with each
channel corresponding to a different patch vertex. PATCHLIST_N will
launch (N / 8) threads. If N is less than 8, some channels will be
disabled, leaving some untapped hardware capabilities. Conditionals
based on gl_InvocationID are non-uniform, which means that they'll
often have to execute both paths. However, if there are fewer than
8 vertices, all invocations will happen within a single thread, so
barriers can become no-ops, which is nice. We also burn a maximum
of 4 registers for ICP handles, so we can compile without regard for
the value of N. It also works in all cases.
- DUAL_PATCH mode processes up to two patches at a time, where the first
four channels come from patch 1, and the second group of four come
from patch 2. This tries to provide better EU utilization for small
patches (N <= 4). It cannot be used in all cases.
- 8_PATCH mode processes 8 patches at a time, with a thread launched per
vertex in the patch. Each channel corresponds to the same vertex, but
in each of the 8 patches. This utilizes all channels even for small
patches. It also makes conditions on gl_InvocationID uniform, leading
to proper jumps. Barriers, unfortunately, become real. Worse, for
PATCHLIST_N, the thread payload burns N registers for ICP handles.
This can burn up to 32 registers, or 1/4 of our register file, for
URB handles. For Vulkan (and DX), we know the number of vertices at
compile time, so we can limit the amount of waste. In GL, the patch
dimension is dynamic state, so we either would have to waste all 32
(not reasonable) or guess (badly) and recompile. This is unfortunate.
Because we can only spawn 16 thread instances, we can only use this
mode for PATCHLIST_16 and smaller. The rest must use SINGLE_PATCH.
This patch implements the new 8_PATCH TCS mode, but leaves us using
SINGLE_PATCH by default. A new INTEL_DEBUG=tcs8 flag will switch to
using 8_PATCH mode for testing and benchmarking purposes. We may
want to consider using 8_PATCH mode in Vulkan in some cases.
The data I've seen shows that 8_PATCH mode can be more efficient in
some cases, but SINGLE_PATCH mode (the one we use today) is faster
in other cases. Ultimately, the TES matters much more than the TCS
for performance, so the decision may not matter much.
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2019-05-03 14:57:54 -07:00
|
|
|
fs_visitor::assign_tcs_urb_setup()
|
2015-11-14 17:40:43 -08:00
|
|
|
{
|
|
|
|
|
assert(stage == MESA_SHADER_TESS_CTRL);
|
|
|
|
|
|
|
|
|
|
/* Rewrite all ATTR file references to HW_REGs. */
|
|
|
|
|
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
|
|
|
|
convert_attr_sources_to_hw_regs(inst);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2015-11-10 14:35:27 -08:00
|
|
|
void
|
|
|
|
|
fs_visitor::assign_tes_urb_setup()
|
|
|
|
|
{
|
|
|
|
|
assert(stage == MESA_SHADER_TESS_EVAL);
|
|
|
|
|
|
2016-09-08 23:48:51 -07:00
|
|
|
struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
|
2015-11-10 14:35:27 -08:00
|
|
|
|
|
|
|
|
first_non_payload_grf += 8 * vue_prog_data->urb_read_length;
|
|
|
|
|
|
|
|
|
|
/* Rewrite all ATTR file references to HW_REGs. */
|
|
|
|
|
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
|
|
|
|
convert_attr_sources_to_hw_regs(inst);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2015-03-11 23:14:31 -07:00
|
|
|
void
|
|
|
|
|
fs_visitor::assign_gs_urb_setup()
|
|
|
|
|
{
|
|
|
|
|
assert(stage == MESA_SHADER_GEOMETRY);
|
|
|
|
|
|
2016-09-08 23:48:51 -07:00
|
|
|
struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
|
2015-03-11 23:14:31 -07:00
|
|
|
|
|
|
|
|
first_non_payload_grf +=
|
2017-05-08 09:20:21 -07:00
|
|
|
8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in;
|
2015-03-11 23:14:31 -07:00
|
|
|
|
|
|
|
|
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
2015-10-26 17:52:57 -07:00
|
|
|
/* Rewrite all ATTR file references to GRFs. */
|
2015-03-11 23:14:31 -07:00
|
|
|
convert_attr_sources_to_hw_regs(inst);
|
2014-10-27 22:42:50 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2022-08-30 00:47:32 -07:00
|
|
|
int
|
|
|
|
|
brw_get_subgroup_id_param_index(const intel_device_info *devinfo,
|
|
|
|
|
const brw_stage_prog_data *prog_data)
|
2017-09-29 12:22:48 -07:00
|
|
|
{
|
|
|
|
|
if (prog_data->nr_params == 0)
|
|
|
|
|
return -1;
|
|
|
|
|
|
2021-03-29 13:43:47 -07:00
|
|
|
if (devinfo->verx10 >= 125)
|
2020-06-16 23:06:25 -05:00
|
|
|
return -1;
|
|
|
|
|
|
2017-09-29 12:22:48 -07:00
|
|
|
/* The local thread id is always the last parameter in the list */
|
|
|
|
|
uint32_t last_param = prog_data->param[prog_data->nr_params - 1];
|
2017-08-24 11:40:31 -07:00
|
|
|
if (last_param == BRW_PARAM_BUILTIN_SUBGROUP_ID)
|
2017-09-29 12:22:48 -07:00
|
|
|
return prog_data->nr_params - 1;
|
|
|
|
|
|
|
|
|
|
return -1;
|
|
|
|
|
}
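/* A minimal caller sketch (assumed driver-side usage, not code from this
 * file; param_data and subgroup_id are hypothetical names): a driver filling
 * the push constant buffer can patch the subgroup id into the reported slot:
 *
 *    const int idx = brw_get_subgroup_id_param_index(devinfo, prog_data);
 *    if (idx >= 0)
 *       param_data[idx] = subgroup_id;
 */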
|
|
|
|
|
|
2015-08-18 17:04:53 -07:00
|
|
|
/**
|
|
|
|
|
* Assign UNIFORM file registers to either push constants or pull constants.
|
2012-11-08 16:06:24 -08:00
|
|
|
*
|
2015-08-18 17:04:53 -07:00
|
|
|
* We allow a fragment shader to have more than the specified minimum
|
|
|
|
|
* maximum number of fragment shader uniform components (64). If
|
|
|
|
|
* there are too many of these, they'd fill up all of the register space.
|
|
|
|
|
* So, this will push some of them out to the pull constant buffer and
|
2015-12-08 17:34:38 -08:00
|
|
|
* update the program to load them.
|
2012-11-08 16:06:24 -08:00
|
|
|
*/
|
|
|
|
|
void
|
2015-08-18 17:04:53 -07:00
|
|
|
fs_visitor::assign_constant_locations()
|
2012-11-08 16:06:24 -08:00
|
|
|
{
|
2016-02-22 10:42:07 -08:00
|
|
|
/* Only the first compile gets to decide on locations. */
|
2021-12-03 21:34:06 -06:00
|
|
|
if (push_constant_loc)
|
2014-03-07 16:10:50 -08:00
|
|
|
return;
|
2019-10-31 15:57:52 -05:00
|
|
|
|
2021-12-03 21:34:06 -06:00
|
|
|
push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
|
|
|
|
|
for (unsigned u = 0; u < uniforms; u++)
|
|
|
|
|
push_constant_loc[u] = u;
|
2014-03-11 14:35:27 -07:00
|
|
|
|
2016-11-29 05:20:20 -08:00
|
|
|
/* Now that we know how many regular uniforms we'll push, reduce the
|
|
|
|
|
* UBO push ranges so we don't exceed the 3DSTATE_CONSTANT limits.
|
2021-07-28 13:51:38 +10:00
|
|
|
*
|
|
|
|
|
* If changing this value, note the limitation about total_regs in
|
|
|
|
|
* brw_curbe.c/crocus_state.c
|
|
|
|
|
*/
|
2024-02-17 22:43:47 -08:00
|
|
|
const unsigned max_push_length = 64;
|
2022-08-03 12:15:21 -07:00
|
|
|
unsigned push_length =
|
|
|
|
|
round_components_to_whole_registers(devinfo, prog_data->nr_params);
|
2016-11-29 05:20:20 -08:00
|
|
|
for (int i = 0; i < 4; i++) {
|
|
|
|
|
struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
|
|
|
|
|
|
2021-07-28 13:51:38 +10:00
|
|
|
if (push_length + range->length > max_push_length)
|
|
|
|
|
range->length = max_push_length - push_length;
|
2016-11-29 05:20:20 -08:00
|
|
|
|
|
|
|
|
push_length += range->length;
|
2022-08-03 12:15:21 -07:00
|
|
|
|
|
|
|
|
assert(push_length % (1 * reg_unit(devinfo)) == 0);
|
|
|
|
|
|
2016-11-29 05:20:20 -08:00
|
|
|
}
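/* Worked example with illustrative numbers: if the regular uniforms round up
 * to 8 registers and the four UBO ranges request 20, 30, 20 and 0 registers,
 * the loop keeps 20 and 30, trims the third range to 64 - 58 == 6 registers,
 * and leaves the last at 0, so push_length never exceeds max_push_length.
 */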
|
2021-07-28 13:51:38 +10:00
|
|
|
assert(push_length <= max_push_length);
|
i965/fs: Split pull parameter decision making from mechanical demoting.
move_uniform_array_access_to_pull_constants() and setup_pull_constants()
both have two parts:
1. Decide which UNIFORM registers to demote to pull constants, and
assign locations.
2. Mechanically rewrite the instruction stream to pull the uniform
value into a temporary VGRF and use that, eliminating the UNIFORM
file access.
In order to support pull constants in SIMD16 mode, we will need to make
decisions exactly once, but rewrite both instruction streams.
Separating these two tasks will make this easier.
This patch introduces a new helper, demote_pull_constants(), which
takes care of rewriting the instruction stream, in both cases.
For the moment, a single invocation of demote_pull_constants can't
safely handle both reladdr and non-reladdr tasks, since the two callers
still use different names for uniforms due to remove_dead_constants()
remapping of things. So, we get an ugly boolean parameter saying
which to do. This will go away.
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Eric Anholt <eric@anholt.net>
2014-03-10 13:14:03 -07:00
|
|
|
}
|
|
|
|
|
|
2017-06-02 09:54:31 -07:00
|
|
|
bool
|
|
|
|
|
fs_visitor::get_pull_locs(const fs_reg &src,
|
|
|
|
|
unsigned *out_surf_index,
|
|
|
|
|
unsigned *out_pull_index)
|
|
|
|
|
{
|
|
|
|
|
assert(src.file == UNIFORM);
|
|
|
|
|
|
2021-12-03 21:34:06 -06:00
|
|
|
if (src.nr < UBO_START)
|
|
|
|
|
return false;
|
2016-11-29 05:20:20 -08:00
|
|
|
|
2021-12-03 21:34:06 -06:00
|
|
|
const struct brw_ubo_range *range =
|
|
|
|
|
&prog_data->ubo_ranges[src.nr - UBO_START];
|
2019-09-09 22:21:17 -07:00
|
|
|
|
2021-12-03 21:34:06 -06:00
|
|
|
/* If this access is in our (reduced) range, use the push data. */
|
|
|
|
|
if (src.offset / 32 < range->length)
|
|
|
|
|
return false;
|
2017-06-02 09:54:31 -07:00
|
|
|
|
2021-12-03 22:20:30 -06:00
|
|
|
*out_surf_index = range->block;
|
2021-12-03 21:34:06 -06:00
|
|
|
*out_pull_index = (32 * range->start + src.offset) / 4;
|
2019-09-09 22:21:17 -07:00
|
|
|
|
2021-12-03 21:34:06 -06:00
|
|
|
prog_data->has_ubo_pull = true;
|
2017-06-02 09:54:31 -07:00
|
|
|
|
2021-12-03 21:34:06 -06:00
|
|
|
return true;
|
2017-06-02 09:54:31 -07:00
|
|
|
}
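/* Worked example with illustrative numbers: for a UBO range with block == 5,
 * start == 2 and length == 3, an access at src.offset == 128 lies past the
 * three pushed registers (128 / 32 == 4), so it is pulled from surface 5 at
 * dword index (32 * 2 + 128) / 4 == 48.
 */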
|
|
|
|
|
|
2014-07-07 15:27:17 -07:00
|
|
|
/**
|
|
|
|
|
* Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
|
|
|
|
|
* instructions to FS_OPCODE_REP_FB_WRITE.
|
|
|
|
|
*/
|
|
|
|
|
void
|
2014-09-26 14:47:03 -07:00
|
|
|
fs_visitor::emit_repclear_shader()
|
2014-07-07 15:27:17 -07:00
|
|
|
{
|
2014-08-19 13:57:11 -07:00
|
|
|
brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
|
2022-11-23 01:47:55 -08:00
|
|
|
fs_inst *write = NULL;
|
2014-07-07 15:27:17 -07:00
|
|
|
|
2022-07-19 16:46:50 -07:00
|
|
|
assert(devinfo->ver < 20);
|
2022-11-23 00:38:02 -08:00
|
|
|
assert(uniforms == 0);
|
2022-11-23 00:55:19 -08:00
|
|
|
assume(key->nr_color_regions > 0);
|
2016-04-04 14:38:42 -07:00
|
|
|
|
2024-04-20 17:08:02 -07:00
|
|
|
fs_reg color_output = retype(brw_vec4_grf(127, 0), BRW_TYPE_UD);
|
|
|
|
|
fs_reg header = retype(brw_vec8_grf(125, 0), BRW_TYPE_UD);
|
2022-11-23 01:47:55 -08:00
|
|
|
|
|
|
|
|
/* We pass the clear color as a flat input. Copy it to the output. */
|
|
|
|
|
fs_reg color_input =
|
2024-04-20 17:08:02 -07:00
|
|
|
brw_reg(BRW_GENERAL_REGISTER_FILE, 2, 3, 0, 0, BRW_TYPE_UD,
|
2022-11-23 00:38:02 -08:00
|
|
|
BRW_VERTICAL_STRIDE_8, BRW_WIDTH_2, BRW_HORIZONTAL_STRIDE_4,
|
|
|
|
|
BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
|
|
|
|
|
|
2023-11-21 10:12:09 -08:00
|
|
|
const fs_builder bld = fs_builder(this).at_end();
|
2022-11-23 01:47:55 -08:00
|
|
|
bld.exec_all().group(4, 0).MOV(color_output, color_input);
|
2018-05-17 08:46:03 -07:00
|
|
|
|
2022-11-23 00:55:19 -08:00
|
|
|
if (key->nr_color_regions > 1) {
|
2022-11-23 01:47:55 -08:00
|
|
|
/* Copy g0..g1 as the message header */
|
2018-05-17 08:46:03 -07:00
|
|
|
bld.exec_all().group(16, 0)
|
2024-04-20 17:08:02 -07:00
|
|
|
.MOV(header, retype(brw_vec8_grf(0, 0), BRW_TYPE_UD));
|
2022-11-23 00:55:19 -08:00
|
|
|
}
|
2018-05-17 08:46:03 -07:00
|
|
|
|
2022-11-23 00:55:19 -08:00
|
|
|
for (int i = 0; i < key->nr_color_regions; ++i) {
|
|
|
|
|
if (i > 0)
|
|
|
|
|
bld.exec_all().group(1, 0).MOV(component(header, 2), brw_imm_ud(i));
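/* The MOV above patches only dword 2 of the copied header, which holds the
 * render target index in the FB write message header, so each iteration
 * after the first retargets the same header at the next render target.
 */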
|
2018-05-17 08:46:03 -07:00
|
|
|
|
2024-02-17 22:43:47 -08:00
|
|
|
write = bld.emit(SHADER_OPCODE_SEND);
|
|
|
|
|
write->resize_sources(3);
|
|
|
|
|
write->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
|
|
|
|
|
write->src[0] = brw_imm_ud(0);
|
|
|
|
|
write->src[1] = brw_imm_ud(0);
|
|
|
|
|
write->src[2] = i == 0 ? color_output : header;
|
|
|
|
|
write->check_tdr = true;
|
|
|
|
|
write->send_has_side_effects = true;
|
|
|
|
|
write->desc = brw_fb_write_desc(devinfo, i,
|
|
|
|
|
BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED,
|
|
|
|
|
i == key->nr_color_regions - 1, false);
|
2022-11-23 01:47:55 -08:00
|
|
|
|
2022-11-23 00:55:19 -08:00
|
|
|
/* We can use a headerless message for the first render target */
|
|
|
|
|
write->header_size = i == 0 ? 0 : 2;
|
|
|
|
|
write->mlen = 1 + write->header_size;
|
2014-09-26 14:47:03 -07:00
|
|
|
}
|
|
|
|
|
write->eot = true;
|
2017-01-13 14:01:45 -08:00
|
|
|
write->last_rt = true;
|
2014-07-07 15:27:17 -07:00
|
|
|
|
2014-09-26 14:47:03 -07:00
|
|
|
calculate_cfg();
|
2014-07-07 15:27:17 -07:00
|
|
|
|
2022-11-23 00:38:02 -08:00
|
|
|
this->first_non_payload_grf = payload().num_regs;
|
2018-11-09 14:13:37 -08:00
|
|
|
|
2024-01-04 16:28:40 -08:00
|
|
|
brw_fs_lower_scoreboard(*this);
|
2014-07-07 15:27:17 -07:00
|
|
|
}
|
|
|
|
|
|
2020-01-23 12:50:50 -08:00
|
|
|
/**
|
|
|
|
|
* Get the mask of SIMD channels enabled during dispatch and not yet disabled
|
|
|
|
|
* by discard. Due to the layout of the sample mask in the fragment shader
|
|
|
|
|
* thread payload, \p bld is required to have a dispatch_width() not greater
|
|
|
|
|
* than 16 for fragment shaders.
|
|
|
|
|
*/
|
2022-06-27 12:24:58 -07:00
|
|
|
fs_reg
|
|
|
|
|
brw_sample_mask_reg(const fs_builder &bld)
|
2020-01-23 12:50:50 -08:00
|
|
|
{
|
2023-12-05 17:16:34 -08:00
|
|
|
const fs_visitor &s = *bld.shader;
|
2020-01-23 12:50:50 -08:00
|
|
|
|
2023-12-05 17:16:34 -08:00
|
|
|
if (s.stage != MESA_SHADER_FRAGMENT) {
|
2020-01-23 12:50:50 -08:00
|
|
|
return brw_imm_ud(0xffffffff);
|
2022-08-16 17:40:31 -07:00
|
|
|
} else if (s.devinfo->ver >= 20 ||
|
|
|
|
|
brw_wm_prog_data(s.prog_data)->uses_kill) {
|
2023-12-05 17:16:34 -08:00
|
|
|
return brw_flag_subreg(sample_mask_flag_subreg(s) + bld.group() / 16);
|
2020-01-23 12:50:50 -08:00
|
|
|
} else {
|
2024-02-17 22:43:47 -08:00
|
|
|
assert(bld.dispatch_width() <= 16);
|
2022-06-11 17:36:09 -07:00
|
|
|
assert(s.devinfo->ver < 20);
|
2020-01-23 12:50:50 -08:00
|
|
|
return retype(brw_vec1_grf((bld.group() >= 16 ? 2 : 1), 7),
|
2024-04-20 17:08:02 -07:00
|
|
|
BRW_TYPE_UW);
|
2020-01-23 12:50:50 -08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2019-08-25 23:59:25 -07:00
|
|
|
uint32_t
|
|
|
|
|
brw_fb_write_msg_control(const fs_inst *inst,
|
|
|
|
|
const struct brw_wm_prog_data *prog_data)
|
|
|
|
|
{
|
|
|
|
|
uint32_t mctl;
|
|
|
|
|
|
2024-02-19 19:41:48 -08:00
|
|
|
if (prog_data->dual_src_blend) {
|
2022-07-22 17:33:12 -07:00
|
|
|
assert(inst->exec_size < 32);
|
2019-08-25 23:59:25 -07:00
|
|
|
|
|
|
|
|
if (inst->group % 16 == 0)
|
|
|
|
|
mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
|
|
|
|
|
else if (inst->group % 16 == 8)
|
|
|
|
|
mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
|
|
|
|
|
else
|
|
|
|
|
unreachable("Invalid dual-source FB write instruction group");
|
|
|
|
|
} else {
|
|
|
|
|
assert(inst->group == 0 || (inst->group == 16 && inst->exec_size == 16));
|
|
|
|
|
|
|
|
|
|
if (inst->exec_size == 16)
|
|
|
|
|
mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
|
|
|
|
|
else if (inst->exec_size == 8)
|
|
|
|
|
mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
|
2022-07-22 17:33:12 -07:00
|
|
|
else if (inst->exec_size == 32)
|
|
|
|
|
mctl = XE2_DATAPORT_RENDER_TARGET_WRITE_SIMD32_SINGLE_SOURCE;
|
2019-08-25 23:59:25 -07:00
|
|
|
else
|
|
|
|
|
unreachable("Invalid FB write execution size");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return mctl;
|
|
|
|
|
}
|
|
|
|
|
|
2022-06-27 12:24:58 -07:00
|
|
|
/**
|
|
|
|
|
* Predicate the specified instruction on the sample mask.
|
|
|
|
|
*/
|
|
|
|
|
void
|
|
|
|
|
brw_emit_predicate_on_sample_mask(const fs_builder &bld, fs_inst *inst)
|
|
|
|
|
{
|
|
|
|
|
assert(bld.shader->stage == MESA_SHADER_FRAGMENT &&
|
|
|
|
|
bld.group() == inst->group &&
|
|
|
|
|
bld.dispatch_width() == inst->exec_size);
|
2015-07-13 17:59:34 +03:00
|
|
|
|
2023-12-05 17:16:34 -08:00
|
|
|
const fs_visitor &s = *bld.shader;
|
2022-06-27 12:24:58 -07:00
|
|
|
const fs_reg sample_mask = brw_sample_mask_reg(bld);
|
2023-12-05 17:16:34 -08:00
|
|
|
const unsigned subreg = sample_mask_flag_subreg(s);
|
2015-07-13 17:59:34 +03:00
|
|
|
|
2022-08-16 17:40:31 -07:00
|
|
|
if (s.devinfo->ver >= 20 || brw_wm_prog_data(s.prog_data)->uses_kill) {
|
2022-06-27 12:24:58 -07:00
|
|
|
assert(sample_mask.file == ARF &&
|
|
|
|
|
sample_mask.nr == brw_flag_subreg(subreg).nr &&
|
|
|
|
|
sample_mask.subnr == brw_flag_subreg(
|
|
|
|
|
subreg + inst->group / 16).subnr);
|
|
|
|
|
} else {
|
|
|
|
|
bld.group(1, 0).exec_all()
|
|
|
|
|
.MOV(brw_flag_subreg(subreg + inst->group / 16), sample_mask);
|
2015-07-13 17:59:34 +03:00
|
|
|
}
|
|
|
|
|
|
2022-06-27 12:24:58 -07:00
|
|
|
if (inst->predicate) {
|
|
|
|
|
assert(inst->predicate == BRW_PREDICATE_NORMAL);
|
|
|
|
|
assert(!inst->predicate_inverse);
|
|
|
|
|
assert(inst->flag_subreg == 0);
|
2022-07-22 17:11:52 -07:00
|
|
|
assert(s.devinfo->ver < 20);
|
2022-06-27 12:24:58 -07:00
|
|
|
/* Combine the sample mask with the existing predicate by using a
|
|
|
|
|
* vertical predication mode.
|
2015-11-16 17:23:01 -08:00
|
|
|
*/
|
2022-06-27 12:24:58 -07:00
|
|
|
inst->predicate = BRW_PREDICATE_ALIGN1_ALLV;
|
2015-07-13 17:59:34 +03:00
|
|
|
} else {
|
2022-06-27 12:24:58 -07:00
|
|
|
inst->flag_subreg = subreg;
|
|
|
|
|
inst->predicate = BRW_PREDICATE_NORMAL;
|
|
|
|
|
inst->predicate_inverse = false;
|
2015-07-13 17:59:34 +03:00
|
|
|
}
|
2015-07-27 16:14:36 +03:00
|
|
|
}
|
|
|
|
|
|
2013-08-04 23:34:01 -07:00
|
|
|
void
|
2023-06-05 23:31:17 -07:00
|
|
|
fs_visitor::dump_instructions_to_file(FILE *file) const
|
2014-05-29 13:08:59 -07:00
|
|
|
{
|
2024-03-14 16:28:56 +02:00
|
|
|
if (cfg && grf_used == 0) {
|
2024-05-08 13:38:39 -07:00
|
|
|
const register_pressure *rp =
|
|
|
|
|
INTEL_DEBUG(DEBUG_REG_PRESSURE) ? &regpressure_analysis.require() : NULL;
|
|
|
|
|
|
2016-03-13 16:35:49 -07:00
|
|
|
unsigned ip = 0, max_pressure = 0;
|
intel/fs: print indentation for control flow
INTEL_DEBUG=optimizer output changes from :
{ 10} 40: cmp.nz.f0.0(8) null:F, vgrf3470:F, 0f
{ 10} 41: (+f0.0) if(8) (null):UD,
{ 11} 42: txf_logical(8) vgrf3473:UD, vgrf250:D(null):UD, 0d(null):UD(null):UD(null):UD(null):UD, 31u, 0u(null):UD(null):UD(null):UD, 3d, 0d
{ 12} 43: and(8) vgrf262:UD, vgrf3473:UD, 2u
{ 11} 44: cmp.nz.f0.0(8) null:D, vgrf262:D, 0d
{ 10} 45: (+f0.0) if(8) (null):UD,
{ 11} 46: mov(8) vgrf270:D, -1082130432d
{ 12} 47: mov(8) vgrf271:D, 1082130432d
{ 14} 48: mov(8) vgrf274+0.0:D, 0d
{ 14} 49: mov(8) vgrf274+1.0:D, 0d
to :
{ 10} 40: cmp.nz.f0.0(8) null:F, vgrf3470:F, 0f
{ 10} 41: (+f0.0) if(8) (null):UD,
{ 11} 42: txf_logical(8) vgrf3473:UD, vgrf250:D(null):UD, 0d(null):UD(null):UD(null):UD(null):UD, 31u, 0u(null):UD(null):UD(null):UD, 3d, 0d
{ 12} 43: and(8) vgrf262:UD, vgrf3473:UD, 2u
{ 11} 44: cmp.nz.f0.0(8) null:D, vgrf262:D, 0d
{ 10} 45: (+f0.0) if(8) (null):UD,
{ 11} 46: mov(8) vgrf270:D, -1082130432d
{ 12} 47: mov(8) vgrf271:D, 1082130432d
{ 14} 48: mov(8) vgrf274+0.0:D, 0d
{ 14} 49: mov(8) vgrf274+1.0:D, 0d
Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23477>
2023-06-09 11:48:26 +03:00
|
|
|
unsigned cf_count = 0;
|
2024-02-19 22:37:47 -08:00
|
|
|
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
2023-06-09 11:48:26 +03:00
|
|
|
if (inst->is_control_flow_end())
|
|
|
|
|
cf_count -= 1;
|
|
|
|
|
|
2024-05-08 13:38:39 -07:00
|
|
|
if (rp) {
|
|
|
|
|
max_pressure = MAX2(max_pressure, rp->regs_live_at_ip[ip]);
|
|
|
|
|
fprintf(file, "{%3d} ", rp->regs_live_at_ip[ip]);
|
|
|
|
|
}
|
|
|
|
|
|
2023-06-09 11:48:26 +03:00
|
|
|
for (unsigned i = 0; i < cf_count; i++)
|
|
|
|
|
fprintf(file, " ");
|
2015-02-13 10:46:32 -08:00
|
|
|
dump_instruction(inst, file);
|
|
|
|
|
ip++;
|
2023-06-09 11:48:26 +03:00
|
|
|
|
|
|
|
|
if (inst->is_control_flow_begin())
|
|
|
|
|
cf_count += 1;
|
2015-02-13 10:46:32 -08:00
|
|
|
}
|
2024-05-08 13:38:39 -07:00
|
|
|
if (rp)
|
|
|
|
|
fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
|
2024-03-14 16:28:56 +02:00
|
|
|
} else if (cfg && exec_list_is_empty(&instructions)) {
|
|
|
|
|
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
|
|
|
|
dump_instruction(inst, file);
|
|
|
|
|
}
|
2015-02-13 10:46:32 -08:00
|
|
|
} else {
|
2024-02-19 22:37:47 -08:00
|
|
|
foreach_in_list(fs_inst, inst, &instructions) {
|
2015-02-13 10:46:32 -08:00
|
|
|
dump_instruction(inst, file);
|
|
|
|
|
}
|
2013-08-04 23:34:01 -07:00
|
|
|
}
|
2014-05-29 11:45:15 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
2024-02-19 22:37:47 -08:00
|
|
|
fs_visitor::dump_instructions(const char *name) const
|
2012-10-30 15:35:44 -07:00
|
|
|
{
|
2024-02-19 22:37:47 -08:00
|
|
|
FILE *file = stderr;
|
|
|
|
|
if (name && __normal_user()) {
|
|
|
|
|
file = fopen(name, "w");
|
|
|
|
|
if (!file)
|
|
|
|
|
file = stderr;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
dump_instructions_to_file(file);
|
2013-04-29 14:21:14 -07:00
|
|
|
|
2024-02-19 22:37:47 -08:00
|
|
|
if (file != stderr) {
|
|
|
|
|
fclose(file);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2024-02-28 13:59:35 -08:00
|
|
|
static const char *
|
|
|
|
|
brw_instruction_name(const struct brw_isa_info *isa, enum opcode op)
|
|
|
|
|
{
|
|
|
|
|
const struct intel_device_info *devinfo = isa->devinfo;
|
|
|
|
|
|
|
|
|
|
switch (op) {
|
|
|
|
|
case 0 ... NUM_BRW_OPCODES - 1:
|
|
|
|
|
/* The DO instruction doesn't exist on Gfx9+, but we use it to mark the
|
|
|
|
|
* start of a loop in the IR.
|
|
|
|
|
*/
|
|
|
|
|
if (op == BRW_OPCODE_DO)
|
|
|
|
|
return "do";
|
|
|
|
|
|
|
|
|
|
/* DPAS instructions may transiently exist on platforms that do not
|
|
|
|
|
* support DPAS. They will eventually be lowered, but in the meantime it
|
|
|
|
|
* must be possible to query the instruction name.
|
|
|
|
|
*/
|
|
|
|
|
if (devinfo->verx10 < 125 && op == BRW_OPCODE_DPAS)
|
|
|
|
|
return "dpas";
|
|
|
|
|
|
|
|
|
|
assert(brw_opcode_desc(isa, op)->name);
|
|
|
|
|
return brw_opcode_desc(isa, op)->name;
|
|
|
|
|
case FS_OPCODE_FB_WRITE_LOGICAL:
|
|
|
|
|
return "fb_write_logical";
|
|
|
|
|
case FS_OPCODE_FB_READ_LOGICAL:
|
|
|
|
|
return "fb_read_logical";
|
|
|
|
|
|
|
|
|
|
case SHADER_OPCODE_RCP:
|
|
|
|
|
return "rcp";
|
|
|
|
|
case SHADER_OPCODE_RSQ:
|
|
|
|
|
return "rsq";
|
|
|
|
|
case SHADER_OPCODE_SQRT:
|
|
|
|
|
return "sqrt";
|
|
|
|
|
case SHADER_OPCODE_EXP2:
|
|
|
|
|
return "exp2";
|
|
|
|
|
case SHADER_OPCODE_LOG2:
|
|
|
|
|
return "log2";
|
|
|
|
|
case SHADER_OPCODE_POW:
|
|
|
|
|
return "pow";
|
|
|
|
|
case SHADER_OPCODE_INT_QUOTIENT:
|
|
|
|
|
return "int_quot";
|
|
|
|
|
case SHADER_OPCODE_INT_REMAINDER:
|
|
|
|
|
return "int_rem";
|
|
|
|
|
case SHADER_OPCODE_SIN:
|
|
|
|
|
return "sin";
|
|
|
|
|
case SHADER_OPCODE_COS:
|
|
|
|
|
return "cos";
|
|
|
|
|
|
|
|
|
|
case SHADER_OPCODE_SEND:
|
|
|
|
|
return "send";
|
|
|
|
|
|
|
|
|
|
case SHADER_OPCODE_UNDEF:
|
|
|
|
|
return "undef";
|
|
|
|
|
|
|
|
|
|
case SHADER_OPCODE_TEX_LOGICAL:
|
|
|
|
|
return "tex_logical";
|
|
|
|
|
case SHADER_OPCODE_TXD_LOGICAL:
|
|
|
|
|
return "txd_logical";
|
|
|
|
|
case SHADER_OPCODE_TXF_LOGICAL:
|
|
|
|
|
return "txf_logical";
|
|
|
|
|
case SHADER_OPCODE_TXL_LOGICAL:
|
|
|
|
|
return "txl_logical";
|
|
|
|
|
case SHADER_OPCODE_TXS_LOGICAL:
|
|
|
|
|
return "txs_logical";
|
|
|
|
|
case FS_OPCODE_TXB_LOGICAL:
|
|
|
|
|
return "txb_logical";
|
|
|
|
|
case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
|
|
|
|
|
return "txf_cms_w_logical";
|
|
|
|
|
case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
|
|
|
|
|
return "txf_cms_w_gfx12_logical";
|
|
|
|
|
case SHADER_OPCODE_TXF_MCS_LOGICAL:
|
|
|
|
|
return "txf_mcs_logical";
|
|
|
|
|
case SHADER_OPCODE_LOD_LOGICAL:
|
|
|
|
|
return "lod_logical";
|
|
|
|
|
case SHADER_OPCODE_TG4_LOGICAL:
|
|
|
|
|
return "tg4_logical";
|
|
|
|
|
case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
|
|
|
|
|
return "tg4_offset_logical";
|
|
|
|
|
case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
|
|
|
|
|
return "tg4_offset_lod_logical";
|
|
|
|
|
case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
|
|
|
|
|
return "tg4_offset_bias_logical";
|
|
|
|
|
case SHADER_OPCODE_TG4_BIAS_LOGICAL:
|
|
|
|
|
return "tg4_b_logical";
|
|
|
|
|
case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
|
|
|
|
|
return "tg4_l_logical";
|
|
|
|
|
case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
|
|
|
|
|
return "tg4_i_logical";
|
|
|
|
|
case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
|
|
|
|
|
return "sampleinfo_logical";
|
|
|
|
|
|
|
|
|
|
case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
|
|
|
|
|
return "image_size_logical";
|
|
|
|
|
|
|
|
|
|
case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
|
|
|
|
|
return "untyped_atomic_logical";
|
|
|
|
|
case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
|
|
|
|
|
return "untyped_surface_read_logical";
|
|
|
|
|
case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
|
|
|
|
|
return "untyped_surface_write_logical";
|
|
|
|
|
case SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
|
|
|
|
|
return "unaligned_oword_block_read_logical";
|
|
|
|
|
case SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL:
|
|
|
|
|
return "oword_block_write_logical";
|
|
|
|
|
case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
|
|
|
|
|
return "a64_untyped_read_logical";
|
|
|
|
|
case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
|
|
|
|
|
return "a64_oword_block_read_logical";
|
|
|
|
|
case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
|
|
|
|
|
return "a64_unaligned_oword_block_read_logical";
|
|
|
|
|
case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
|
|
|
|
|
return "a64_oword_block_write_logical";
|
|
|
|
|
case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
|
|
|
|
|
return "a64_untyped_write_logical";
|
|
|
|
|
case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
|
|
|
|
|
return "a64_byte_scattered_read_logical";
|
|
|
|
|
case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
|
|
|
|
|
return "a64_byte_scattered_write_logical";
|
|
|
|
|
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
|
|
|
|
|
return "a64_untyped_atomic_logical";
|
|
|
|
|
case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
|
|
|
|
|
return "typed_atomic_logical";
|
|
|
|
|
case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
|
|
|
|
|
return "typed_surface_read_logical";
|
|
|
|
|
case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
|
|
|
|
|
return "typed_surface_write_logical";
|
|
|
|
|
case SHADER_OPCODE_MEMORY_FENCE:
|
|
|
|
|
return "memory_fence";
|
|
|
|
|
case FS_OPCODE_SCHEDULING_FENCE:
|
|
|
|
|
return "scheduling_fence";
|
|
|
|
|
case SHADER_OPCODE_INTERLOCK:
|
|
|
|
|
/* For an interlock we actually issue a memory fence via sendc. */
|
|
|
|
|
return "interlock";
|
|
|
|
|
|
|
|
|
|
case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
|
|
|
|
|
return "byte_scattered_read_logical";
|
|
|
|
|
case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
|
|
|
|
|
return "byte_scattered_write_logical";
|
|
|
|
|
case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
|
|
|
|
|
return "dword_scattered_read_logical";
|
|
|
|
|
case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
|
|
|
|
|
return "dword_scattered_write_logical";
|
|
|
|
|
|
|
|
|
|
case SHADER_OPCODE_LOAD_PAYLOAD:
|
|
|
|
|
return "load_payload";
|
|
|
|
|
case FS_OPCODE_PACK:
|
|
|
|
|
return "pack";
|
|
|
|
|
|
|
|
|
|
case SHADER_OPCODE_SCRATCH_HEADER:
|
|
|
|
|
return "scratch_header";
|
|
|
|
|
|
|
|
|
|
case SHADER_OPCODE_URB_WRITE_LOGICAL:
|
|
|
|
|
return "urb_write_logical";
|
|
|
|
|
case SHADER_OPCODE_URB_READ_LOGICAL:
|
|
|
|
|
return "urb_read_logical";
|
|
|
|
|
|
|
|
|
|
case SHADER_OPCODE_FIND_LIVE_CHANNEL:
|
|
|
|
|
return "find_live_channel";
|
|
|
|
|
case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL:
|
|
|
|
|
return "find_last_live_channel";
|
|
|
|
|
case SHADER_OPCODE_LOAD_LIVE_CHANNELS:
|
|
|
|
|
return "load_live_channels";
|
|
|
|
|
case FS_OPCODE_LOAD_LIVE_CHANNELS:
|
|
|
|
|
return "fs_load_live_channels";
|
|
|
|
|
|
|
|
|
|
case SHADER_OPCODE_BROADCAST:
|
|
|
|
|
return "broadcast";
|
|
|
|
|
case SHADER_OPCODE_SHUFFLE:
|
|
|
|
|
return "shuffle";
|
|
|
|
|
case SHADER_OPCODE_SEL_EXEC:
|
|
|
|
|
return "sel_exec";
|
|
|
|
|
case SHADER_OPCODE_QUAD_SWIZZLE:
|
|
|
|
|
return "quad_swizzle";
|
|
|
|
|
case SHADER_OPCODE_CLUSTER_BROADCAST:
|
|
|
|
|
return "cluster_broadcast";
|
|
|
|
|
|
|
|
|
|
case SHADER_OPCODE_GET_BUFFER_SIZE:
|
|
|
|
|
return "get_buffer_size";
|
|
|
|
|
|
|
|
|
|
case FS_OPCODE_DDX_COARSE:
|
|
|
|
|
return "ddx_coarse";
|
|
|
|
|
case FS_OPCODE_DDX_FINE:
|
|
|
|
|
return "ddx_fine";
|
|
|
|
|
case FS_OPCODE_DDY_COARSE:
|
|
|
|
|
return "ddy_coarse";
|
|
|
|
|
case FS_OPCODE_DDY_FINE:
|
|
|
|
|
return "ddy_fine";
|
|
|
|
|
|
|
|
|
|
case FS_OPCODE_PIXEL_X:
|
|
|
|
|
return "pixel_x";
|
|
|
|
|
case FS_OPCODE_PIXEL_Y:
|
|
|
|
|
return "pixel_y";
|
|
|
|
|
|
|
|
|
|
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
|
|
|
|
|
return "uniform_pull_const";
|
|
|
|
|
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
|
|
|
|
|
return "varying_pull_const_logical";
|
|
|
|
|
|
|
|
|
|
case FS_OPCODE_PACK_HALF_2x16_SPLIT:
|
|
|
|
|
return "pack_half_2x16_split";
|
|
|
|
|
|
|
|
|
|
case SHADER_OPCODE_HALT_TARGET:
|
|
|
|
|
return "halt_target";
|
|
|
|
|
|
|
|
|
|
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
|
|
|
|
|
return "interp_sample";
|
|
|
|
|
case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
|
|
|
|
|
return "interp_shared_offset";
|
|
|
|
|
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
|
|
|
|
|
return "interp_per_slot_offset";
|
|
|
|
|
|
|
|
|
|
case SHADER_OPCODE_BARRIER:
|
|
|
|
|
return "barrier";
|
|
|
|
|
case SHADER_OPCODE_MULH:
|
|
|
|
|
return "mulh";
|
|
|
|
|
case SHADER_OPCODE_ISUB_SAT:
|
|
|
|
|
return "isub_sat";
|
|
|
|
|
case SHADER_OPCODE_USUB_SAT:
|
|
|
|
|
return "usub_sat";
|
|
|
|
|
case SHADER_OPCODE_MOV_INDIRECT:
|
|
|
|
|
return "mov_indirect";
|
|
|
|
|
case SHADER_OPCODE_MOV_RELOC_IMM:
|
|
|
|
|
return "mov_reloc_imm";
|
|
|
|
|
|
|
|
|
|
case RT_OPCODE_TRACE_RAY_LOGICAL:
|
|
|
|
|
return "rt_trace_ray_logical";
|
|
|
|
|
|
|
|
|
|
case SHADER_OPCODE_RND_MODE:
|
|
|
|
|
return "rnd_mode";
|
|
|
|
|
case SHADER_OPCODE_FLOAT_CONTROL_MODE:
|
|
|
|
|
return "float_control_mode";
|
|
|
|
|
case SHADER_OPCODE_BTD_SPAWN_LOGICAL:
|
|
|
|
|
return "btd_spawn_logical";
|
|
|
|
|
case SHADER_OPCODE_BTD_RETIRE_LOGICAL:
|
|
|
|
|
return "btd_retire_logical";
|
2024-05-28 16:43:43 +03:00
|
|
|
case SHADER_OPCODE_READ_ARCH_REG:
|
|
|
|
|
return "read_arch_reg";
|
2024-02-24 01:24:03 -08:00
|
|
|
case SHADER_OPCODE_LOAD_SUBGROUP_INVOCATION:
|
|
|
|
|
return "load_subgroup_invocation";
|
2024-02-28 13:59:35 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
unreachable("not reached");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
2024-02-19 22:37:47 -08:00
|
|
|
void
|
|
|
|
|
fs_visitor::dump_instruction_to_file(const fs_inst *inst, FILE *file) const
|
|
|
|
|
{
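   /* Prints a single instruction in the textual form used by the optimizer
    * dumps: predicate, opcode name, modifiers, execution size, message
    * lengths, destination, sources, and trailing flags such as NoMask.
    */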
|
2012-12-06 10:36:11 -08:00
|
|
|
if (inst->predicate) {
|
2017-12-12 12:05:02 -08:00
|
|
|
fprintf(file, "(%cf%d.%d) ",
|
|
|
|
|
inst->predicate_inverse ? '-' : '+',
|
|
|
|
|
inst->flag_subreg / 2,
|
|
|
|
|
inst->flag_subreg % 2);
|
2012-12-06 10:36:11 -08:00
|
|
|
}
|
|
|
|
|
|
2022-06-29 14:13:31 -07:00
|
|
|
fprintf(file, "%s", brw_instruction_name(&compiler->isa, inst->opcode));
|
2012-10-30 15:35:44 -07:00
|
|
|
if (inst->saturate)
|
2014-05-29 11:45:15 -07:00
|
|
|
fprintf(file, ".sat");
|
2012-12-06 10:36:11 -08:00
|
|
|
if (inst->conditional_mod) {
|
2014-05-29 11:45:15 -07:00
|
|
|
fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
|
2012-12-06 10:36:11 -08:00
|
|
|
if (!inst->predicate &&
|
2024-02-15 13:19:08 -08:00
|
|
|
(inst->opcode != BRW_OPCODE_SEL &&
|
|
|
|
|
inst->opcode != BRW_OPCODE_CSEL &&
|
|
|
|
|
inst->opcode != BRW_OPCODE_IF &&
|
|
|
|
|
inst->opcode != BRW_OPCODE_WHILE)) {
|
2017-12-12 12:05:02 -08:00
|
|
|
fprintf(file, ".f%d.%d", inst->flag_subreg / 2,
|
|
|
|
|
inst->flag_subreg % 2);
|
2012-12-06 10:36:11 -08:00
|
|
|
}
|
|
|
|
|
}
|
2014-09-16 18:02:52 -07:00
|
|
|
fprintf(file, "(%d) ", inst->exec_size);
|
2012-10-30 15:35:44 -07:00
|
|
|
|
2015-06-02 20:40:54 -07:00
|
|
|
if (inst->mlen) {
|
|
|
|
|
fprintf(file, "(mlen: %d) ", inst->mlen);
|
|
|
|
|
}
|
2012-12-06 10:36:11 -08:00
|
|
|
|
2018-10-29 15:06:14 -05:00
|
|
|
if (inst->ex_mlen) {
|
|
|
|
|
fprintf(file, "(ex_mlen: %d) ", inst->ex_mlen);
|
|
|
|
|
}
|
|
|
|
|
|
2016-06-26 00:39:32 -07:00
|
|
|
if (inst->eot) {
|
|
|
|
|
fprintf(file, "(EOT) ");
|
|
|
|
|
}
|
|
|
|
|
|
2012-10-30 15:35:44 -07:00
|
|
|
switch (inst->dst.file) {
|
2015-10-26 17:09:25 -07:00
|
|
|
case VGRF:
|
2024-05-08 13:45:34 -07:00
|
|
|
fprintf(file, "v%d", inst->dst.nr);
|
2012-10-30 15:35:44 -07:00
|
|
|
break;
|
2015-10-26 17:52:57 -07:00
|
|
|
case FIXED_GRF:
|
|
|
|
|
fprintf(file, "g%d", inst->dst.nr);
|
2024-01-30 19:32:41 -08:00
|
|
|
if (inst->dst.subnr != 0)
|
2024-04-21 00:57:59 -07:00
|
|
|
fprintf(file, ".%d", inst->dst.subnr / brw_type_size_bytes(inst->dst.type));
|
2015-10-26 17:52:57 -07:00
|
|
|
break;
|
2012-10-30 15:35:44 -07:00
|
|
|
case BAD_FILE:
|
2014-05-29 11:45:15 -07:00
|
|
|
fprintf(file, "(null)");
|
2012-10-30 15:35:44 -07:00
|
|
|
break;
|
|
|
|
|
case UNIFORM:
|
2016-09-01 20:31:47 -07:00
|
|
|
fprintf(file, "***u%d***", inst->dst.nr);
|
2012-10-30 15:35:44 -07:00
|
|
|
break;
|
2014-10-20 23:16:48 -07:00
|
|
|
case ATTR:
|
2016-09-01 20:31:47 -07:00
|
|
|
fprintf(file, "***attr%d***", inst->dst.nr);
|
2014-10-20 23:16:48 -07:00
|
|
|
break;
|
2015-10-26 17:52:57 -07:00
|
|
|
case ARF:
|
2023-08-09 14:03:57 -07:00
|
|
|
switch (inst->dst.nr & 0xF0) {
|
2015-10-26 17:52:57 -07:00
|
|
|
case BRW_ARF_NULL:
|
|
|
|
|
fprintf(file, "null");
|
|
|
|
|
break;
|
|
|
|
|
case BRW_ARF_ADDRESS:
|
|
|
|
|
fprintf(file, "a0.%d", inst->dst.subnr);
|
|
|
|
|
break;
|
|
|
|
|
case BRW_ARF_ACCUMULATOR:
|
2023-08-09 14:03:57 -07:00
|
|
|
if (inst->dst.subnr == 0)
|
|
|
|
|
fprintf(file, "acc%d", inst->dst.nr & 0x0F);
|
|
|
|
|
else
|
|
|
|
|
fprintf(file, "acc%d.%d", inst->dst.nr & 0x0F, inst->dst.subnr);
|
|
|
|
|
|
2015-10-26 17:52:57 -07:00
|
|
|
break;
|
|
|
|
|
case BRW_ARF_FLAG:
|
|
|
|
|
fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
|
|
|
|
|
break;
|
|
|
|
|
default:
|
|
|
|
|
fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
|
|
|
|
|
break;
|
2013-11-25 15:37:18 -08:00
|
|
|
}
|
2013-10-08 23:30:08 -07:00
|
|
|
break;
|
2015-10-26 06:58:56 -07:00
|
|
|
case IMM:
|
|
|
|
|
unreachable("not reached");
|
2012-10-30 15:35:44 -07:00
|
|
|
}
|
2016-09-01 20:31:47 -07:00
|
|
|
|
|
|
|
|
if (inst->dst.offset ||
|
|
|
|
|
(inst->dst.file == VGRF &&
|
|
|
|
|
alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written)) {
|
|
|
|
|
const unsigned reg_size = (inst->dst.file == UNIFORM ? 4 : REG_SIZE);
|
|
|
|
|
fprintf(file, "+%d.%d", inst->dst.offset / reg_size,
|
|
|
|
|
inst->dst.offset % reg_size);
|
|
|
|
|
}
|
|
|
|
|
|
2015-08-05 09:41:18 -07:00
|
|
|
if (inst->dst.stride != 1)
|
|
|
|
|
fprintf(file, "<%u>", inst->dst.stride);
|
intel/brw: Fix commas when dumping instructions
Some commas were being skipped (according to history, as an attempt
to elide BAD_FILE registers), but we still print them, so be consistent. Also
for instructions without any sources, the trailing comma was always
being printed. Fix that too.
Example of instruction output before the change
halt_target(8) (null):UD,
send(8) (mlen: 1) (EOT) (null):UD, 0u, 0u, g126:UD(null):UD NoMask
and after it
halt_target(8) (null):UD
send(8) (mlen: 1) (EOT) (null):UD, 0u, 0u, g126:UD, (null):UD NoMask
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29114>
2024-05-08 13:51:37 -07:00
|
|
|
fprintf(file, ":%s", brw_reg_type_to_letters(inst->dst.type));
|
2012-10-30 15:35:44 -07:00
|
|
|
|
2014-09-16 15:56:47 -07:00
|
|
|
for (int i = 0; i < inst->sources; i++) {
|
2024-05-08 13:51:37 -07:00
|
|
|
fprintf(file, ", ");
|
|
|
|
|
|
2012-10-30 15:35:44 -07:00
|
|
|
if (inst->src[i].negate)
|
2014-05-29 11:45:15 -07:00
|
|
|
fprintf(file, "-");
|
2012-10-30 15:35:44 -07:00
|
|
|
if (inst->src[i].abs)
|
2014-05-29 11:45:15 -07:00
|
|
|
fprintf(file, "|");
|
2012-10-30 15:35:44 -07:00
|
|
|
switch (inst->src[i].file) {
|
2015-10-26 17:09:25 -07:00
|
|
|
case VGRF:
|
2024-05-08 13:45:34 -07:00
|
|
|
fprintf(file, "v%d", inst->src[i].nr);
|
2012-10-30 15:35:44 -07:00
|
|
|
break;
|
2015-10-26 17:52:57 -07:00
|
|
|
case FIXED_GRF:
|
|
|
|
|
fprintf(file, "g%d", inst->src[i].nr);
|
|
|
|
|
break;
|
2014-10-20 23:16:48 -07:00
|
|
|
case ATTR:
|
2016-09-01 20:31:47 -07:00
|
|
|
fprintf(file, "attr%d", inst->src[i].nr);
|
2014-10-20 23:16:48 -07:00
|
|
|
break;
|
2012-10-30 15:35:44 -07:00
|
|
|
case UNIFORM:
|
2016-09-01 20:31:47 -07:00
|
|
|
fprintf(file, "u%d", inst->src[i].nr);
|
2012-10-30 15:35:44 -07:00
|
|
|
break;
|
|
|
|
|
case BAD_FILE:
|
2014-05-29 11:45:15 -07:00
|
|
|
fprintf(file, "(null)");
|
2012-10-30 15:35:44 -07:00
|
|
|
break;
|
2013-02-15 19:55:46 -08:00
|
|
|
case IMM:
|
|
|
|
|
switch (inst->src[i].type) {
|
2024-04-20 17:08:02 -07:00
|
|
|
case BRW_TYPE_HF:
|
2020-07-29 16:38:40 -07:00
|
|
|
fprintf(file, "%-ghf", _mesa_half_to_float(inst->src[i].ud & 0xffff));
|
|
|
|
|
break;
|
2024-04-20 17:08:02 -07:00
|
|
|
case BRW_TYPE_F:
|
2016-02-11 19:03:56 +13:00
|
|
|
fprintf(file, "%-gf", inst->src[i].f);
|
2013-02-15 19:55:46 -08:00
|
|
|
break;
|
2024-04-20 17:08:02 -07:00
|
|
|
case BRW_TYPE_DF:
|
2015-08-03 15:00:51 -07:00
|
|
|
fprintf(file, "%fdf", inst->src[i].df);
|
|
|
|
|
break;
|
2024-04-20 17:08:02 -07:00
|
|
|
case BRW_TYPE_W:
|
2024-04-01 16:43:54 -07:00
|
|
|
fprintf(file, "%dw", (int)(int16_t)inst->src[i].d);
|
|
|
|
|
break;
|
2024-04-20 17:08:02 -07:00
|
|
|
case BRW_TYPE_D:
|
2015-10-24 14:55:57 -07:00
|
|
|
fprintf(file, "%dd", inst->src[i].d);
|
2013-02-15 19:55:46 -08:00
|
|
|
break;
|
2024-04-20 17:08:02 -07:00
|
|
|
case BRW_TYPE_UW:
|
2024-04-01 16:43:54 -07:00
|
|
|
fprintf(file, "%duw", inst->src[i].ud & 0xffff);
|
|
|
|
|
break;
|
2024-04-20 17:08:02 -07:00
|
|
|
case BRW_TYPE_UD:
|
2015-10-24 14:55:57 -07:00
|
|
|
fprintf(file, "%uu", inst->src[i].ud);
|
2013-02-15 19:55:46 -08:00
|
|
|
break;
|
2024-04-20 17:08:02 -07:00
|
|
|
case BRW_TYPE_Q:
|
2018-09-06 11:15:55 -07:00
|
|
|
fprintf(file, "%" PRId64 "q", inst->src[i].d64);
|
|
|
|
|
break;
|
2024-04-20 17:08:02 -07:00
|
|
|
case BRW_TYPE_UQ:
|
2018-09-06 11:15:55 -07:00
|
|
|
fprintf(file, "%" PRIu64 "uq", inst->src[i].u64);
|
|
|
|
|
break;
|
2024-04-20 17:08:02 -07:00
|
|
|
case BRW_TYPE_VF:
|
2014-12-31 16:54:44 -08:00
|
|
|
fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
|
2015-10-24 14:55:57 -07:00
|
|
|
brw_vf_to_float((inst->src[i].ud >> 0) & 0xff),
|
|
|
|
|
brw_vf_to_float((inst->src[i].ud >> 8) & 0xff),
|
|
|
|
|
brw_vf_to_float((inst->src[i].ud >> 16) & 0xff),
|
|
|
|
|
brw_vf_to_float((inst->src[i].ud >> 24) & 0xff));
|
2014-03-08 17:25:34 -08:00
|
|
|
break;
|
2024-04-20 17:08:02 -07:00
|
|
|
case BRW_TYPE_V:
|
|
|
|
|
case BRW_TYPE_UV:
|
2018-11-05 09:52:09 -08:00
|
|
|
fprintf(file, "%08x%s", inst->src[i].ud,
|
2024-04-20 17:08:02 -07:00
|
|
|
inst->src[i].type == BRW_TYPE_V ? "V" : "UV");
|
2018-11-05 09:52:09 -08:00
|
|
|
break;
|
2013-02-15 19:55:46 -08:00
|
|
|
default:
|
2014-05-29 11:45:15 -07:00
|
|
|
fprintf(file, "???");
|
2013-02-15 19:55:46 -08:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
break;
|
2015-10-26 17:52:57 -07:00
|
|
|
case ARF:
|
2023-08-09 14:03:57 -07:00
|
|
|
switch (inst->src[i].nr & 0xF0) {
|
2015-10-26 17:52:57 -07:00
|
|
|
case BRW_ARF_NULL:
|
|
|
|
|
fprintf(file, "null");
|
|
|
|
|
break;
|
|
|
|
|
case BRW_ARF_ADDRESS:
|
|
|
|
|
fprintf(file, "a0.%d", inst->src[i].subnr);
|
|
|
|
|
break;
|
|
|
|
|
case BRW_ARF_ACCUMULATOR:
|
2023-08-09 14:03:57 -07:00
|
|
|
if (inst->src[i].subnr == 0)
|
|
|
|
|
fprintf(file, "acc%d", inst->src[i].nr & 0x0F);
|
|
|
|
|
else
|
|
|
|
|
fprintf(file, "acc%d.%d", inst->src[i].nr & 0x0F, inst->src[i].subnr);
|
|
|
|
|
|
2015-10-26 17:52:57 -07:00
|
|
|
break;
|
|
|
|
|
case BRW_ARF_FLAG:
|
|
|
|
|
fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
|
|
|
|
|
break;
|
|
|
|
|
default:
|
|
|
|
|
fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
|
|
|
|
|
break;
|
2013-11-25 15:37:18 -08:00
|
|
|
}
|
2013-10-08 23:30:08 -07:00
|
|
|
break;
|
2012-10-30 15:35:44 -07:00
|
|
|
}
|
2016-09-01 20:31:47 -07:00
|
|
|
|
2024-01-30 19:32:41 -08:00
|
|
|
if (inst->src[i].file == FIXED_GRF && inst->src[i].subnr != 0) {
|
|
|
|
|
assert(inst->src[i].offset == 0);
|
|
|
|
|
|
2024-04-21 00:57:59 -07:00
|
|
|
fprintf(file, ".%d", inst->src[i].subnr / brw_type_size_bytes(inst->src[i].type));
|
2024-01-30 19:32:41 -08:00
|
|
|
} else if (inst->src[i].offset ||
|
2016-09-01 20:31:47 -07:00
|
|
|
(inst->src[i].file == VGRF &&
|
|
|
|
|
alloc.sizes[inst->src[i].nr] * REG_SIZE != inst->size_read(i))) {
|
|
|
|
|
const unsigned reg_size = (inst->src[i].file == UNIFORM ? 4 : REG_SIZE);
|
|
|
|
|
fprintf(file, "+%d.%d", inst->src[i].offset / reg_size,
|
|
|
|
|
inst->src[i].offset % reg_size);
|
|
|
|
|
}
|
|
|
|
|
|
2012-10-30 15:35:44 -07:00
|
|
|
if (inst->src[i].abs)
|
2014-05-29 11:45:15 -07:00
|
|
|
fprintf(file, "|");
|
2012-10-30 15:35:44 -07:00
|
|
|
|
2013-12-02 13:10:29 -08:00
|
|
|
if (inst->src[i].file != IMM) {
|
2015-08-05 09:41:18 -07:00
|
|
|
unsigned stride;
|
|
|
|
|
if (inst->src[i].file == ARF || inst->src[i].file == FIXED_GRF) {
|
|
|
|
|
unsigned hstride = inst->src[i].hstride;
|
|
|
|
|
stride = (hstride == 0 ? 0 : (1 << (hstride - 1)));
|
|
|
|
|
} else {
|
|
|
|
|
stride = inst->src[i].stride;
|
|
|
|
|
}
|
|
|
|
|
if (stride != 1)
|
|
|
|
|
fprintf(file, "<%u>", stride);
|
|
|
|
|
|
2017-07-26 17:31:36 -07:00
|
|
|
fprintf(file, ":%s", brw_reg_type_to_letters(inst->src[i].type));
|
2013-12-02 13:10:29 -08:00
|
|
|
}
|
2012-10-30 15:35:44 -07:00
|
|
|
}
|
|
|
|
|
|
2014-05-29 11:45:15 -07:00
|
|
|
fprintf(file, " ");
|
2012-10-30 15:35:44 -07:00
|
|
|
|
2015-11-09 23:55:58 -08:00
|
|
|
if (inst->force_writemask_all)
|
|
|
|
|
fprintf(file, "NoMask ");
|
|
|
|
|
|
2016-05-20 16:14:13 -07:00
|
|
|
if (inst->exec_size != dispatch_width)
|
|
|
|
|
fprintf(file, "group%d ", inst->group);
|
2012-10-30 15:35:44 -07:00
|
|
|
|
2014-05-29 11:45:15 -07:00
|
|
|
fprintf(file, "\n");
|
2012-10-30 15:35:44 -07:00
|
|
|
}
|
|
|
|
|
|
2016-03-13 16:35:49 -07:00
|
|
|
brw::register_pressure::register_pressure(const fs_visitor *v)
|
2013-08-04 23:27:14 -07:00
|
|
|
{
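   /* For each instruction ip, accumulate how many GRFs worth of values are
    * live at that point: allocated VGRFs whose live range covers ip, plus any
    * payload registers still in use there.
    */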
|
2016-03-13 16:35:49 -07:00
|
|
|
const fs_live_variables &live = v->live_analysis.require();
|
2016-03-13 16:37:03 -07:00
|
|
|
const unsigned num_instructions = v->cfg->num_blocks ?
|
|
|
|
|
v->cfg->blocks[v->cfg->num_blocks - 1]->end_ip + 1 : 0;
|
2013-08-04 23:27:14 -07:00
|
|
|
|
2016-03-13 16:35:49 -07:00
|
|
|
regs_live_at_ip = new unsigned[num_instructions]();
|
2013-08-04 23:27:14 -07:00
|
|
|
|
2016-03-13 16:35:49 -07:00
|
|
|
for (unsigned reg = 0; reg < v->alloc.count; reg++) {
|
2016-03-13 16:25:57 -07:00
|
|
|
for (int ip = live.vgrf_start[reg]; ip <= live.vgrf_end[reg]; ip++)
|
2016-03-13 16:35:49 -07:00
|
|
|
regs_live_at_ip[ip] += v->alloc.sizes[reg];
|
2013-08-04 23:27:14 -07:00
|
|
|
}
|
2023-08-15 01:15:17 -07:00
|
|
|
|
|
|
|
|
const unsigned payload_count = v->first_non_payload_grf;
|
|
|
|
|
|
|
|
|
|
int *payload_last_use_ip = new int[payload_count];
|
|
|
|
|
v->calculate_payload_ranges(payload_count, payload_last_use_ip);
|
|
|
|
|
|
|
|
|
|
for (unsigned reg = 0; reg < payload_count; reg++) {
|
|
|
|
|
for (int ip = 0; ip < payload_last_use_ip[reg]; ip++)
|
|
|
|
|
++regs_live_at_ip[ip];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
delete[] payload_last_use_ip;
|
2013-08-04 23:27:14 -07:00
|
|
|
}
|
|
|
|
|
|
2016-03-13 16:35:49 -07:00
|
|
|
brw::register_pressure::~register_pressure()
|
|
|
|
|
{
|
|
|
|
|
delete[] regs_live_at_ip;
|
|
|
|
|
}
|
|
|
|
|
|
2016-03-12 18:50:24 -08:00
|
|
|
void
|
|
|
|
|
fs_visitor::invalidate_analysis(brw::analysis_dependency_class c)
|
|
|
|
|
{
|
2016-03-13 16:25:57 -07:00
|
|
|
live_analysis.invalidate(c);
|
2016-03-13 16:35:49 -07:00
|
|
|
regpressure_analysis.invalidate(c);
|
2024-02-19 22:25:16 -08:00
|
|
|
idom_analysis.invalidate(c);
|
2016-03-12 18:50:24 -08:00
|
|
|
}
|
|
|
|
|
|
2023-08-06 15:46:12 +03:00
|
|
|
void
|
2023-08-14 16:59:17 -07:00
|
|
|
fs_visitor::debug_optimizer(const nir_shader *nir,
|
|
|
|
|
const char *pass_name,
|
2023-08-06 15:46:12 +03:00
|
|
|
int iteration, int pass_num) const
|
|
|
|
|
{
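   /* Dumps the current IR to a file named
    * <stage><width>-<shader name>-<iteration>-<pass_num>-<pass_name> under the
    * directory given by INTEL_SHADER_OPTIMIZER_PATH (default "./"), so the
    * effect of each optimization pass can be inspected and diffed.
    */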
|
2023-08-14 16:59:17 -07:00
|
|
|
if (!brw_should_print_shader(nir, DEBUG_OPTIMIZER))
|
2023-08-06 15:46:12 +03:00
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
char *filename;
|
2023-08-07 17:06:49 +03:00
|
|
|
int ret = asprintf(&filename, "%s/%s%d-%s-%02d-%02d-%s",
|
|
|
|
|
debug_get_option("INTEL_SHADER_OPTIMIZER_PATH", "./"),
|
2023-09-24 21:38:47 -07:00
|
|
|
_mesa_shader_stage_to_abbrev(stage), dispatch_width, nir->info.name,
|
2023-08-06 15:46:12 +03:00
|
|
|
iteration, pass_num, pass_name);
|
|
|
|
|
if (ret == -1)
|
|
|
|
|
return;
|
|
|
|
|
dump_instructions(filename);
|
|
|
|
|
free(filename);
|
|
|
|
|
}
|
|
|
|
|
|
2023-02-03 17:02:28 +01:00
|
|
|
uint32_t
|
|
|
|
|
fs_visitor::compute_max_register_pressure()
|
|
|
|
|
{
|
|
|
|
|
const register_pressure &rp = regpressure_analysis.require();
|
|
|
|
|
uint32_t ip = 0, max_pressure = 0;
|
2024-02-28 15:19:10 -08:00
|
|
|
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
2023-02-03 17:02:28 +01:00
|
|
|
max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
|
|
|
|
|
ip++;
|
|
|
|
|
}
|
|
|
|
|
return max_pressure;
|
|
|
|
|
}
|
|
|
|
|
|
2023-08-23 02:19:06 -07:00
|
|
|
static fs_inst **
|
|
|
|
|
save_instruction_order(const struct cfg_t *cfg)
|
|
|
|
|
{
|
|
|
|
|
/* Before we schedule anything, stash off the instruction order as an array
|
|
|
|
|
* of fs_inst *. This way, we can reset it between scheduling passes to
|
|
|
|
|
* prevent dependencies between the different scheduling modes.
|
|
|
|
|
*/
|
|
|
|
|
int num_insts = cfg->last_block()->end_ip + 1;
|
|
|
|
|
fs_inst **inst_arr = new fs_inst * [num_insts];
|
|
|
|
|
|
|
|
|
|
int ip = 0;
|
|
|
|
|
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
|
|
|
|
assert(ip >= block->start_ip && ip <= block->end_ip);
|
|
|
|
|
inst_arr[ip++] = inst;
|
|
|
|
|
}
|
|
|
|
|
assert(ip == num_insts);
|
|
|
|
|
|
|
|
|
|
return inst_arr;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void
|
|
|
|
|
restore_instruction_order(struct cfg_t *cfg, fs_inst **inst_arr)
|
|
|
|
|
{
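   /* Rebuild every basic block's instruction list from the saved array,
    * walking the blocks in order and re-inserting each instruction at its
    * original ip.
    */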
|
2024-01-03 16:31:23 +10:00
|
|
|
ASSERTED int num_insts = cfg->last_block()->end_ip + 1;
|
2023-08-23 02:19:06 -07:00
|
|
|
|
|
|
|
|
int ip = 0;
|
|
|
|
|
foreach_block (block, cfg) {
|
|
|
|
|
block->instructions.make_empty();
|
|
|
|
|
|
|
|
|
|
assert(ip == block->start_ip);
|
|
|
|
|
for (; ip <= block->end_ip; ip++)
|
|
|
|
|
block->instructions.push_tail(inst_arr[ip]);
|
|
|
|
|
}
|
|
|
|
|
assert(ip == num_insts);
|
|
|
|
|
}
|
|
|
|
|
|
2024-02-28 13:59:35 -08:00
|
|
|
/* Per-thread scratch space is a power-of-two multiple of 1KB. */
|
|
|
|
|
static inline unsigned
|
|
|
|
|
brw_get_scratch_size(int size)
|
|
|
|
|
{
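   /* Round up to the next power of two, with a 1KB floor: e.g. sizes of
    * 1..1024 map to 1024, 1025..2048 map to 2048, and 5000 maps to 8192.
    */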
|
|
|
|
|
return MAX2(1024, util_next_power_of_two(size));
|
|
|
|
|
}
|
|
|
|
|
|
2014-11-13 16:28:19 -08:00
|
|
|
void
|
2020-05-19 14:37:44 -07:00
|
|
|
fs_visitor::allocate_registers(bool allow_spilling)
|
2014-11-13 16:28:19 -08:00
|
|
|
{
|
2019-05-09 14:44:16 -05:00
|
|
|
bool allocated;
|
2014-11-13 16:28:19 -08:00
|
|
|
|
2014-12-19 12:55:13 -08:00
|
|
|
static const enum instruction_scheduler_mode pre_modes[] = {
|
2014-11-13 16:28:19 -08:00
|
|
|
SCHEDULE_PRE,
|
|
|
|
|
SCHEDULE_PRE_NON_LIFO,
|
2021-11-09 22:55:49 -06:00
|
|
|
SCHEDULE_NONE,
|
2014-11-13 16:28:19 -08:00
|
|
|
SCHEDULE_PRE_LIFO,
|
|
|
|
|
};
|
|
|
|
|
|
2016-10-17 14:12:28 -07:00
|
|
|
static const char *scheduler_mode_name[] = {
|
2023-08-14 19:35:32 -07:00
|
|
|
[SCHEDULE_PRE] = "top-down",
|
|
|
|
|
[SCHEDULE_PRE_NON_LIFO] = "non-lifo",
|
|
|
|
|
[SCHEDULE_PRE_LIFO] = "lifo",
|
|
|
|
|
[SCHEDULE_POST] = "post",
|
|
|
|
|
[SCHEDULE_NONE] = "none",
|
2016-10-17 14:12:28 -07:00
|
|
|
};
|
|
|
|
|
|
intel/fs: Pick the lowest register pressure schedule when spilling
We try various pre-RA scheduler modes and see if any of them allow
us to register allocate without spilling. If all of them spill,
however, we left it on the last mode: LIFO. This is unfortunately
sometimes significantly worse than other modes (such as "none").
This patch makes us instead select the pre-RA scheduling mode that
gives the lowest register pressure estimate, if none of them manage
to avoid spilling. The hope is that this scheduling will spill the
least out of all of them.
fossil-db stats (on Alchemist) speak for themselves:
Totals:
Instrs: 197297092 -> 195326552 (-1.00%); split: -1.02%, +0.03%
Cycles: 14291286956 -> 14303502596 (+0.09%); split: -0.55%, +0.64%
Spill count: 190886 -> 129204 (-32.31%); split: -33.01%, +0.70%
Fill count: 361408 -> 225038 (-37.73%); split: -39.17%, +1.43%
Scratch Memory Size: 12935168 -> 10868736 (-15.98%); split: -16.08%, +0.10%
Totals from 1791 (0.27% of 668386) affected shaders:
Instrs: 7628929 -> 5658389 (-25.83%); split: -26.50%, +0.67%
Cycles: 719326691 -> 731542331 (+1.70%); split: -10.95%, +12.65%
Spill count: 110627 -> 48945 (-55.76%); split: -56.96%, +1.20%
Fill count: 221560 -> 85190 (-61.55%); split: -63.89%, +2.34%
Scratch Memory Size: 4471808 -> 2405376 (-46.21%); split: -46.51%, +0.30%
Improves performance when using XeSS in Cyberpunk 2077 by 90% on A770.
Improves performance of Borderlands 3 by 1.54% on A770.
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24707>
2023-08-14 19:32:25 -07:00
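To make the strategy above easier to follow in isolation, here is a minimal, self-contained sketch (not Mesa code; the struct and function names are invented for illustration) of the selection rule: take the first scheduling mode that register allocates without spilling, otherwise fall back to the mode with the lowest estimated register pressure.
#include <cstdint>
#include <vector>

struct mode_result {
   const char *name;          /* scheduling mode name, e.g. "lifo" or "none" */
   bool        allocated;     /* register allocation succeeded without spilling */
   uint32_t    max_pressure;  /* estimated peak register pressure for this mode */
};

/* Returns the first mode that allocated; if none did, the lowest-pressure one. */
static const mode_result *
pick_schedule(const std::vector<mode_result> &modes)
{
   const mode_result *best = nullptr;
   for (const mode_result &m : modes) {
      if (m.allocated)
         return &m;
      if (best == nullptr || m.max_pressure < best->max_pressure)
         best = &m;
   }
   return best;
}
With results such as { "lifo", false, 240 } and { "none", false, 180 }, it returns the "none" entry, which is the fallback behaviour the commit message describes.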
|
|
|
uint32_t best_register_pressure = UINT32_MAX;
|
|
|
|
|
enum instruction_scheduler_mode best_sched = SCHEDULE_NONE;
|
|
|
|
|
|
2024-01-03 14:40:37 -08:00
|
|
|
brw_fs_opt_compact_virtual_grfs(*this);
|
2023-03-17 09:42:31 +02:00
|
|
|
|
2023-02-03 17:02:28 +01:00
|
|
|
if (needs_register_pressure)
|
|
|
|
|
shader_stats.max_register_pressure = compute_max_register_pressure();
|
|
|
|
|
|
2023-08-15 01:15:26 -07:00
|
|
|
debug_optimizer(nir, "pre_register_allocate", 90, 90);
|
2023-08-06 15:46:12 +03:00
|
|
|
|
2021-10-13 11:21:41 +02:00
|
|
|
bool spill_all = allow_spilling && INTEL_DEBUG(DEBUG_SPILL_FS);
|
2016-05-16 14:30:25 -07:00
|
|
|
|
2021-11-09 19:03:19 -06:00
|
|
|
/* Before we schedule anything, stash off the instruction order as an array
|
|
|
|
|
* of fs_inst *. This way, we can reset it between scheduling passes to
|
|
|
|
|
* prevent dependencies between the different scheduling modes.
|
|
|
|
|
*/
|
2023-08-23 02:19:06 -07:00
|
|
|
fs_inst **orig_order = save_instruction_order(cfg);
|
2023-08-14 19:32:25 -07:00
|
|
|
fs_inst **best_pressure_order = NULL;
|
2021-11-09 19:03:19 -06:00
|
|
|
|
2023-10-20 10:32:54 -07:00
|
|
|
void *scheduler_ctx = ralloc_context(NULL);
|
2024-02-28 13:39:45 -08:00
|
|
|
instruction_scheduler *sched = prepare_scheduler(scheduler_ctx);
|
2023-10-20 10:32:54 -07:00
|
|
|
|
2014-11-13 16:28:19 -08:00
|
|
|
/* Try each scheduling heuristic to see if it can successfully register
|
|
|
|
|
* allocate without spilling. They should be ordered by decreasing
|
|
|
|
|
* performance but increasing likelihood of allocating.
|
|
|
|
|
*/
|
|
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
|
2023-08-14 19:35:32 -07:00
|
|
|
enum instruction_scheduler_mode sched_mode = pre_modes[i];
|
|
|
|
|
|
2023-10-20 10:32:54 -07:00
|
|
|
schedule_instructions_pre_ra(sched, sched_mode);
|
2023-08-14 19:35:32 -07:00
|
|
|
this->shader_stats.scheduler_mode = scheduler_mode_name[sched_mode];
|
2014-11-13 16:28:19 -08:00
|
|
|
|
2023-08-15 01:15:26 -07:00
|
|
|
debug_optimizer(nir, shader_stats.scheduler_mode, 95, i);
|
|
|
|
|
|
2014-11-13 16:28:19 -08:00
|
|
|
if (0) {
|
|
|
|
|
assign_regs_trivial();
|
2019-05-09 14:44:16 -05:00
|
|
|
allocated = true;
|
|
|
|
|
break;
|
2014-11-13 16:28:19 -08:00
|
|
|
}
|
2019-05-09 14:44:16 -05:00
|
|
|
|
|
|
|
|
/* We should only spill registers on the last scheduling. */
|
|
|
|
|
assert(!spilled_any_registers);
|
|
|
|
|
|
2023-08-14 19:32:25 -07:00
|
|
|
allocated = assign_regs(false, spill_all);
|
2019-05-09 14:44:16 -05:00
|
|
|
if (allocated)
|
2014-11-13 16:28:19 -08:00
|
|
|
break;
|
2023-08-14 19:32:25 -07:00
|
|
|
|
|
|
|
|
/* Save the maximum register pressure */
|
|
|
|
|
uint32_t this_pressure = compute_max_register_pressure();
|
|
|
|
|
|
|
|
|
|
if (0) {
|
|
|
|
|
fprintf(stderr, "Scheduler mode \"%s\" spilled, max pressure = %u\n",
|
|
|
|
|
scheduler_mode_name[sched_mode], this_pressure);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (this_pressure < best_register_pressure) {
|
|
|
|
|
best_register_pressure = this_pressure;
|
|
|
|
|
best_sched = sched_mode;
|
|
|
|
|
delete[] best_pressure_order;
|
|
|
|
|
best_pressure_order = save_instruction_order(cfg);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Reset back to the original order before trying the next mode */
|
|
|
|
|
restore_instruction_order(cfg, orig_order);
|
|
|
|
|
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
|
|
|
|
}
|
|
|
|
|
|
2023-10-20 10:32:54 -07:00
|
|
|
ralloc_free(scheduler_ctx);
|
|
|
|
|
|
2023-08-14 19:32:25 -07:00
|
|
|
if (!allocated) {
|
|
|
|
|
if (0) {
|
|
|
|
|
fprintf(stderr, "Spilling - using lowest-pressure mode \"%s\"\n",
|
|
|
|
|
scheduler_mode_name[best_sched]);
|
|
|
|
|
}
|
|
|
|
|
restore_instruction_order(cfg, best_pressure_order);
|
|
|
|
|
shader_stats.scheduler_mode = scheduler_mode_name[best_sched];
|
|
|
|
|
|
|
|
|
|
allocated = assign_regs(allow_spilling, spill_all);
|
2014-11-13 16:28:19 -08:00
|
|
|
}
|
|
|
|
|
|
2023-08-23 02:19:06 -07:00
|
|
|
delete[] orig_order;
|
2023-08-14 19:32:25 -07:00
|
|
|
delete[] best_pressure_order;
|
2023-08-23 02:19:06 -07:00
|
|
|
|
2019-05-09 14:44:16 -05:00
|
|
|
if (!allocated) {
|
2020-05-19 14:37:44 -07:00
|
|
|
fail("Failure to register allocate. Reduce number of "
|
|
|
|
|
"live scalar values to avoid this.");
|
2019-05-09 14:44:16 -05:00
|
|
|
} else if (spilled_any_registers) {
|
2021-07-29 14:27:57 -07:00
|
|
|
brw_shader_perf_log(compiler, log_data,
|
|
|
|
|
"%s shader triggered register spilling. "
|
|
|
|
|
"Try reducing the number of live scalar "
|
|
|
|
|
"values to improve performance.\n",
|
2023-09-24 21:38:47 -07:00
|
|
|
_mesa_shader_stage_to_string(stage));
|
2014-11-13 16:28:19 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (failed)
|
|
|
|
|
return;
|
|
|
|
|
|
2024-03-14 16:28:56 +02:00
|
|
|
debug_optimizer(nir, "post_ra_alloc", 96, 0);
|
|
|
|
|
|
2024-01-03 13:49:58 -08:00
|
|
|
brw_fs_opt_bank_conflicts(*this);
|
2017-06-15 15:23:57 -07:00
|
|
|
|
2024-03-14 16:28:56 +02:00
|
|
|
debug_optimizer(nir, "bank_conflict", 96, 1);
|
|
|
|
|
|
2023-10-20 10:32:54 -07:00
|
|
|
schedule_instructions_post_ra();
|
2014-11-13 16:28:19 -08:00
|
|
|
|
2024-03-14 16:28:56 +02:00
|
|
|
debug_optimizer(nir, "post_ra_alloc_scheduling", 96, 2);
|
|
|
|
|
|
2024-04-04 16:03:34 -07:00
|
|
|
/* Lowering VGRF to FIXED_GRF is currently done as a separate pass instead
|
|
|
|
|
* of part of assign_regs since both bank conflicts optimization and post
|
|
|
|
|
* RA scheduling take advantage of distinguishing references to registers
|
|
|
|
|
* that were allocated from references that were already fixed.
|
|
|
|
|
*
|
|
|
|
|
* TODO: Change the passes above, then move this lowering to be part of
|
|
|
|
|
* assign_regs.
|
|
|
|
|
*/
|
|
|
|
|
brw_fs_lower_vgrfs_to_fixed_grfs(*this);
|
|
|
|
|
|
|
|
|
|
debug_optimizer(nir, "lowered_vgrfs_to_fixed_grfs", 96, 3);
|
|
|
|
|
|
2016-06-09 16:56:31 -07:00
|
|
|
if (last_scratch > 0) {
|
2019-06-19 12:47:19 +01:00
|
|
|
ASSERTED unsigned max_scratch_size = 2 * 1024 * 1024;
|
2016-06-09 16:56:31 -07:00
|
|
|
|
2022-02-28 15:13:07 +02:00
|
|
|
/* Take the max of any previously compiled variant of the shader. In the
|
|
|
|
|
* case of bindless shaders with return parts, this will also take the
|
|
|
|
|
* max of all parts.
|
|
|
|
|
*/
|
|
|
|
|
prog_data->total_scratch = MAX2(brw_get_scratch_size(last_scratch),
|
|
|
|
|
prog_data->total_scratch);
|
2016-06-09 18:13:26 -07:00
|
|
|
|
|
|
|
|
/* We currently only support up to 2MB of scratch space. If we
|
|
|
|
|
* need to support more eventually, the documentation suggests
|
|
|
|
|
* that we could allocate a larger buffer, and partition it out
|
|
|
|
|
* ourselves. We'd just have to undo the hardware's address
|
|
|
|
|
* calculation by subtracting (FFTID * Per Thread Scratch Space)
|
|
|
|
|
* and then add FFTID * (Larger Per Thread Scratch Space).
|
|
|
|
|
*
|
|
|
|
|
* See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
|
|
|
|
|
* Thread Group Tracking > Local Memory/Scratch Space.
|
|
|
|
|
*/
|
2016-06-13 23:09:31 -07:00
|
|
|
assert(prog_data->total_scratch < max_scratch_size);
|
2016-06-09 16:56:31 -07:00
|
|
|
}
|
2018-11-09 14:13:37 -08:00
|
|
|
|
2024-01-04 16:28:40 -08:00
|
|
|
brw_fs_lower_scoreboard(*this);
|
2014-11-13 16:28:19 -08:00
|
|
|
}
|
|
|
|
|
|
2014-10-27 22:42:50 -07:00
|
|
|
bool
|
2017-09-28 16:25:31 -07:00
|
|
|
fs_visitor::run_vs()
|
2014-10-27 22:42:50 -07:00
|
|
|
{
|
|
|
|
|
assert(stage == MESA_SHADER_VERTEX);
|
|
|
|
|
|
2022-08-01 16:42:57 -07:00
|
|
|
payload_ = new vs_thread_payload(*this);
|
2014-10-27 22:42:50 -07:00
|
|
|
|
2023-12-05 15:11:09 -08:00
|
|
|
nir_to_brw(this);
|
2015-03-09 01:58:59 -07:00
|
|
|
|
2014-10-27 22:42:50 -07:00
|
|
|
if (failed)
|
|
|
|
|
return false;
|
|
|
|
|
|
2015-06-26 15:05:13 -07:00
|
|
|
emit_urb_writes();
|
2014-10-27 22:42:50 -07:00
|
|
|
|
2015-02-13 10:34:39 -08:00
|
|
|
calculate_cfg();
|
|
|
|
|
|
2024-01-04 16:42:50 -08:00
|
|
|
brw_fs_optimize(*this);
|
2014-10-27 22:42:50 -07:00
|
|
|
|
|
|
|
|
assign_curb_setup();
|
|
|
|
|
assign_vs_urb_setup();
|
|
|
|
|
|
2024-01-04 16:11:22 -08:00
|
|
|
brw_fs_lower_3src_null_dest(*this);
|
2024-01-04 16:18:04 -08:00
|
|
|
brw_fs_workaround_memory_fence_before_eot(*this);
|
2024-01-04 16:24:21 -08:00
|
|
|
brw_fs_workaround_emit_dummy_mov_instruction(*this);
|
2022-12-06 18:11:10 +02:00
|
|
|
|
2020-05-19 14:37:44 -07:00
|
|
|
allocate_registers(true /* allow_spilling */);
|
2015-11-10 14:35:27 -08:00
|
|
|
|
|
|
|
|
return !failed;
|
|
|
|
|
}
|
|
|
|
|
|
2019-05-03 14:20:00 -07:00
|
|
|
void
|
|
|
|
|
fs_visitor::set_tcs_invocation_id()
|
2015-11-14 17:40:43 -08:00
|
|
|
{
|
2016-09-08 23:48:51 -07:00
|
|
|
struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
|
intel/compiler: Implement TCS 8_PATCH mode and INTEL_DEBUG=tcs8
Our tessellation control shaders can be dispatched in several modes.
- SINGLE_PATCH (Gen7+) processes a single patch per thread, with each
channel corresponding to a different patch vertex. PATCHLIST_N will
launch (N / 8) threads. If N is less than 8, some channels will be
disabled, leaving some untapped hardware capabilities. Conditionals
based on gl_InvocationID are non-uniform, which means that they'll
often have to execute both paths. However, if there are fewer than
8 vertices, all invocations will happen within a single thread, so
barriers can become no-ops, which is nice. We also burn a maximum
of 4 registers for ICP handles, so we can compile without regard for
the value of N. It also works in all cases.
- DUAL_PATCH mode processes up to two patches at a time, where the first
four channels come from patch 1, and the second group of four come
from patch 2. This tries to provide better EU utilization for small
patches (N <= 4). It cannot be used in all cases.
- 8_PATCH mode processes 8 patches at a time, with a thread launched per
vertex in the patch. Each channel corresponds to the same vertex, but
in each of the 8 patches. This utilizes all channels even for small
patches. It also makes conditions on gl_InvocationID uniform, leading
to proper jumps. Barriers, unfortunately, become real. Worse, for
PATCHLIST_N, the thread payload burns N registers for ICP handles.
This can burn up to 32 registers, or 1/4 of our register file, for
URB handles. For Vulkan (and DX), we know the number of vertices at
compile time, so we can limit the amount of waste. In GL, the patch
dimension is dynamic state, so we either would have to waste all 32
(not reasonable) or guess (badly) and recompile. This is unfortunate.
Because we can only spawn 16 thread instances, we can only use this
mode for PATCHLIST_16 and smaller. The rest must use SINGLE_PATCH.
This patch implements the new 8_PATCH TCS mode, but leaves us using
SINGLE_PATCH by default. A new INTEL_DEBUG=tcs8 flag will switch to
using 8_PATCH mode for testing and benchmarking purposes. We may
want to consider using 8_PATCH mode in Vulkan in some cases.
The data I've seen shows that 8_PATCH mode can be more efficient in
some cases, but SINGLE_PATCH mode (the one we use today) is faster
in other cases. Ultimately, the TES matters much more than the TCS
for performance, so the decision may not matter much.
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2019-05-03 14:57:54 -07:00
|
|
|
struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
|
2023-11-21 10:12:09 -08:00
|
|
|
const fs_builder bld = fs_builder(this).at_end();
|
2015-11-14 17:40:43 -08:00
|
|
|
|
2019-05-03 14:24:49 -07:00
|
|
|
const unsigned instance_id_mask =
|
2022-08-09 14:02:16 -07:00
|
|
|
(devinfo->verx10 >= 125) ? INTEL_MASK(7, 0) :
|
|
|
|
|
(devinfo->ver >= 11) ? INTEL_MASK(22, 16) :
|
|
|
|
|
INTEL_MASK(23, 17);
|
2019-05-03 14:24:49 -07:00
|
|
|
const unsigned instance_id_shift =
|
2022-08-09 14:02:16 -07:00
|
|
|
(devinfo->verx10 >= 125) ? 0 : (devinfo->ver >= 11) ? 16 : 17;
|
2015-11-14 17:40:43 -08:00
|
|
|
|
2020-07-13 22:24:19 -07:00
|
|
|
/* Get instance number from g0.2 bits:
|
|
|
|
|
* * 7:0 on DG2+
|
|
|
|
|
* * 22:16 on gfx11+
|
|
|
|
|
* * 23:17 otherwise
|
|
|
|
|
*/
|
2024-04-12 17:43:22 -07:00
|
|
|
fs_reg t =
|
|
|
|
|
bld.AND(fs_reg(retype(brw_vec1_grf(0, 2), BRW_TYPE_UD)),
|
|
|
|
|
brw_imm_ud(instance_id_mask));
|
2019-05-03 14:57:54 -07:00
|
|
|
|
2024-02-01 13:58:36 -08:00
|
|
|
if (vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH) {
|
2019-05-03 14:57:54 -07:00
|
|
|
/* gl_InvocationID is just the thread number */
|
2024-04-12 17:43:22 -07:00
|
|
|
invocation_id = bld.SHR(t, brw_imm_ud(instance_id_shift));
|
2019-05-03 14:57:54 -07:00
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
2024-02-01 13:58:36 -08:00
|
|
|
assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH);
|
2019-05-03 14:57:54 -07:00
|
|
|
|
2024-04-20 17:08:02 -07:00
|
|
|
fs_reg channels_uw = bld.vgrf(BRW_TYPE_UW);
|
|
|
|
|
fs_reg channels_ud = bld.vgrf(BRW_TYPE_UD);
|
2015-11-14 17:40:43 -08:00
|
|
|
bld.MOV(channels_uw, fs_reg(brw_imm_uv(0x76543210)));
|
|
|
|
|
bld.MOV(channels_ud, channels_uw);
|
|
|
|
|
|
|
|
|
|
if (tcs_prog_data->instances == 1) {
|
|
|
|
|
invocation_id = channels_ud;
|
|
|
|
|
} else {
|
2024-04-12 17:43:22 -07:00
|
|
|
/* invocation_id = 8 * instance_id + <76543210> */
|
|
|
|
|
invocation_id =
|
|
|
|
|
bld.ADD(bld.SHR(t, brw_imm_ud(instance_id_shift - 3)), channels_ud);
|
2015-11-14 17:40:43 -08:00
|
|
|
}
|
2019-05-03 14:20:00 -07:00
|
|
|
}
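Restated as scalar arithmetic, the multi-instance SINGLE_PATCH path above computes, per lane, 8 * instance_id plus the lane index taken from the <76543210> immediate. A hedged stand-alone sketch of that math (not driver code), assuming the bits of t below instance_id_shift are zero, which is what the SHR by (instance_id_shift - 3) relies on:
/* Scalar sketch of the computation above (illustrative only).  t is the
 * masked instance field from the thread payload; lane is 0..7, drawn from
 * the <76543210> vector immediate.
 */
static unsigned
tcs_single_patch_invocation_id(unsigned t, unsigned instance_id_shift,
                               unsigned lane)
{
   const unsigned instance_id = t >> instance_id_shift;
   /* Equals (t >> (instance_id_shift - 3)) + lane, which is what the
    * SHR/ADD pair above evaluates for each channel.
    */
   return 8 * instance_id + lane;
}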
|
|
|
|
|
|
intel/compiler: Use an existing URB write to end TCS threads when viable
VS, TCS, TES, and GS threads must end with a URB write message with the
EOT (end of thread) bit set. For VS and TES, we shadow output variables
with temporaries and perform all stores at the end of the shader, giving
us an existing message to do the EOT.
In tessellation control shaders, we don't defer output stores until the
end of the thread like we do for vertex or evaluation shaders. We just
process store_output and store_per_vertex_output intrinsics where they
occur, which may be in control flow. So we can't guarantee that there's
a URB write at the end of the shader.
Traditionally, we've just emitted a separate URB write to finish TCS
threads, doing a writemasked write to a single patch header DWord.
On Broadwell, we need to set a "TR DS Cache Disable" bit, so this is
a convenient spot to do so. But on other platforms, there's no such
field, and this write is purely wasteful.
Instead of emitting a separate write, we can just look for an existing
URB write at the end of the program and tag that with EOT, if possible.
We already had code to do this for geometry shaders, so just lift it
into a helper function and reuse it.
No changes in shader-db.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17944>
2022-08-03 20:54:52 -07:00
|
|
|
void
|
|
|
|
|
fs_visitor::emit_tcs_thread_end()
|
|
|
|
|
{
|
|
|
|
|
/* Try and tag the last URB write with EOT instead of emitting a whole
|
|
|
|
|
* separate write just to finish the thread. There isn't guaranteed to
|
|
|
|
|
* be one, so this may not succeed.
|
|
|
|
|
*/
|
2024-02-17 22:43:47 -08:00
|
|
|
if (mark_last_urb_write_with_eot())
|
2022-08-03 20:54:52 -07:00
|
|
|
return;
|
|
|
|
|
|
2023-11-21 10:12:09 -08:00
|
|
|
const fs_builder bld = fs_builder(this).at_end();
|
2023-11-21 09:47:18 -08:00
|
|
|
|
2022-08-03 20:54:52 -07:00
|
|
|
/* Emit a URB write to end the thread. On Broadwell, we use this to write
|
|
|
|
|
* zero to the "TR DS Cache Disable" bit (we haven't implemented a fancy
|
|
|
|
|
* algorithm to set it optimally). On other platforms, we simply write
|
|
|
|
|
* zero to a reserved/MBZ patch header DWord which has no consequence.
|
|
|
|
|
*/
|
|
|
|
|
fs_reg srcs[URB_LOGICAL_NUM_SRCS];
|
|
|
|
|
srcs[URB_LOGICAL_SRC_HANDLE] = tcs_payload().patch_urb_output;
|
|
|
|
|
srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(WRITEMASK_X << 16);
|
|
|
|
|
srcs[URB_LOGICAL_SRC_DATA] = brw_imm_ud(0);
|
2022-09-28 16:38:35 -07:00
|
|
|
srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(1);
|
2022-08-03 20:54:52 -07:00
|
|
|
fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
|
|
|
|
|
reg_undef, srcs, ARRAY_SIZE(srcs));
|
|
|
|
|
inst->eot = true;
|
|
|
|
|
}
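The strategy from the commit message above amounts to this: reuse the program's final URB write as the thread terminator when possible, and only otherwise fall back to the dummy patch-header write emitted here. A simplified stand-alone illustration of the reuse check follows; the struct and the allows_eot flag are assumptions for the sketch, not the actual mark_last_urb_write_with_eot() implementation.
/* Simplified illustration only (not the real helper).  If the last
 * instruction of the program is a URB write that may end the thread, tag it
 * with EOT instead of emitting an extra write.
 */
#include <vector>

struct sketch_inst {
   bool is_urb_write;   /* e.g. SHADER_OPCODE_URB_WRITE_LOGICAL */
   bool allows_eot;     /* placeholder for the real "can carry EOT" checks */
   bool eot;
};

static bool
tag_last_urb_write_with_eot(std::vector<sketch_inst> &program)
{
   if (program.empty())
      return false;

   sketch_inst &last = program.back();
   if (!last.is_urb_write || !last.allows_eot)
      return false;

   last.eot = true;   /* reuse this write to terminate the thread */
   return true;
}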
|
|
|
|
|
|
2019-05-03 14:20:00 -07:00
|
|
|
bool
|
2019-05-03 14:57:54 -07:00
|
|
|
fs_visitor::run_tcs()
|
2019-05-03 14:20:00 -07:00
|
|
|
{
|
|
|
|
|
assert(stage == MESA_SHADER_TESS_CTRL);
|
|
|
|
|
|
2019-05-03 14:57:54 -07:00
|
|
|
struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
|
2023-11-21 10:12:09 -08:00
|
|
|
const fs_builder bld = fs_builder(this).at_end();
|
2019-05-03 14:57:54 -07:00
|
|
|
|
2024-02-01 13:58:36 -08:00
|
|
|
assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH ||
|
|
|
|
|
vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH);
|
2019-05-03 14:57:54 -07:00
|
|
|
|
2022-08-19 14:41:52 -07:00
|
|
|
payload_ = new tcs_thread_payload(*this);
|
2019-05-03 14:20:00 -07:00
|
|
|
|
|
|
|
|
/* Initialize gl_InvocationID */
|
|
|
|
|
set_tcs_invocation_id();
|
2015-11-14 17:40:43 -08:00
|
|
|
|
2019-05-03 14:28:51 -07:00
|
|
|
const bool fix_dispatch_mask =
|
2024-02-01 13:58:36 -08:00
|
|
|
vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH &&
|
2019-05-03 14:28:51 -07:00
|
|
|
(nir->info.tess.tcs_vertices_out % 8) != 0;
|
|
|
|
|
|
2015-11-14 17:40:43 -08:00
|
|
|
/* Fix the dispatch mask */
|
2019-05-03 14:28:51 -07:00
|
|
|
if (fix_dispatch_mask) {
|
2015-11-14 17:40:43 -08:00
|
|
|
bld.CMP(bld.null_reg_ud(), invocation_id,
|
2017-05-08 09:20:21 -07:00
|
|
|
brw_imm_ud(nir->info.tess.tcs_vertices_out), BRW_CONDITIONAL_L);
|
2015-11-14 17:40:43 -08:00
|
|
|
bld.IF(BRW_PREDICATE_NORMAL);
|
|
|
|
|
}
|
|
|
|
|
|
2023-12-05 15:11:09 -08:00
|
|
|
nir_to_brw(this);
|
2015-11-14 17:40:43 -08:00
|
|
|
|
2019-05-03 14:28:51 -07:00
|
|
|
if (fix_dispatch_mask) {
|
2015-11-14 17:40:43 -08:00
|
|
|
bld.emit(BRW_OPCODE_ENDIF);
|
|
|
|
|
}
|
|
|
|
|
|
2022-08-03 20:54:52 -07:00
|
|
|
emit_tcs_thread_end();
|
2015-11-14 17:40:43 -08:00
|
|
|
|
|
|
|
|
if (failed)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
calculate_cfg();
|
|
|
|
|
|
2024-01-04 16:42:50 -08:00
|
|
|
brw_fs_optimize(*this);
|
2015-11-14 17:40:43 -08:00
|
|
|
|
|
|
|
|
assign_curb_setup();
|
2019-05-03 14:57:54 -07:00
|
|
|
assign_tcs_urb_setup();
|
2015-11-14 17:40:43 -08:00
|
|
|
|
2024-01-04 16:11:22 -08:00
|
|
|
brw_fs_lower_3src_null_dest(*this);
|
2024-01-04 16:18:04 -08:00
|
|
|
brw_fs_workaround_memory_fence_before_eot(*this);
|
2024-01-04 16:24:21 -08:00
|
|
|
brw_fs_workaround_emit_dummy_mov_instruction(*this);
|
2022-12-06 18:11:10 +02:00
|
|
|
|
2020-05-19 14:37:44 -07:00
|
|
|
allocate_registers(true /* allow_spilling */);
|
2015-11-14 17:40:43 -08:00
|
|
|
|
|
|
|
|
return !failed;
|
|
|
|
|
}
|
|
|
|
|
|
2015-11-10 14:35:27 -08:00
|
|
|
bool
|
|
|
|
|
fs_visitor::run_tes()
|
|
|
|
|
{
|
|
|
|
|
assert(stage == MESA_SHADER_TESS_EVAL);
|
|
|
|
|
|
2022-09-07 14:11:05 -07:00
|
|
|
payload_ = new tes_thread_payload(*this);
|
2015-11-10 14:35:27 -08:00
|
|
|
|
2023-12-05 15:11:09 -08:00
|
|
|
nir_to_brw(this);
|
2015-11-10 14:35:27 -08:00
|
|
|
|
|
|
|
|
if (failed)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
emit_urb_writes();
|
|
|
|
|
|
|
|
|
|
calculate_cfg();
|
|
|
|
|
|
2024-01-04 16:42:50 -08:00
|
|
|
brw_fs_optimize(*this);
|
2015-11-10 14:35:27 -08:00
|
|
|
|
|
|
|
|
assign_curb_setup();
|
|
|
|
|
assign_tes_urb_setup();
|
|
|
|
|
|
2024-01-04 16:11:22 -08:00
|
|
|
brw_fs_lower_3src_null_dest(*this);
|
2024-01-04 16:18:04 -08:00
|
|
|
brw_fs_workaround_memory_fence_before_eot(*this);
|
2024-01-04 16:24:21 -08:00
|
|
|
brw_fs_workaround_emit_dummy_mov_instruction(*this);
|
2022-12-06 18:11:10 +02:00
|
|
|
|
2020-05-19 14:37:44 -07:00
|
|
|
allocate_registers(true /* allow_spilling */);
|
2014-10-27 22:42:50 -07:00
|
|
|
|
|
|
|
|
return !failed;
|
|
|
|
|
}
|
|
|
|
|
|
2015-03-11 23:14:31 -07:00
|
|
|
bool
|
|
|
|
|
fs_visitor::run_gs()
|
|
|
|
|
{
|
|
|
|
|
assert(stage == MESA_SHADER_GEOMETRY);
|
|
|
|
|
|
2022-08-22 22:23:17 -07:00
|
|
|
payload_ = new gs_thread_payload(*this);
|
2015-03-11 23:14:31 -07:00
|
|
|
|
2024-04-11 00:09:22 -07:00
|
|
|
const fs_builder bld = fs_builder(this).at_end();
|
|
|
|
|
|
2024-04-20 17:08:02 -07:00
|
|
|
this->final_gs_vertex_count = bld.vgrf(BRW_TYPE_UD);
|
2015-03-11 23:14:31 -07:00
|
|
|
|
|
|
|
|
if (gs_compile->control_data_header_size_bits > 0) {
|
|
|
|
|
/* Create a VGRF to store accumulated control data bits. */
|
2024-04-20 17:08:02 -07:00
|
|
|
this->control_data_bits = bld.vgrf(BRW_TYPE_UD);
|
2015-03-11 23:14:31 -07:00
|
|
|
|
|
|
|
|
/* If we're outputting more than 32 control data bits, then EmitVertex()
|
|
|
|
|
* will set control_data_bits to 0 after emitting the first vertex.
|
|
|
|
|
* Otherwise, we need to initialize it to 0 here.
|
|
|
|
|
*/
|
|
|
|
|
if (gs_compile->control_data_header_size_bits <= 32) {
|
|
|
|
|
const fs_builder abld = bld.annotate("initialize control data bits");
|
2015-11-02 11:26:16 -08:00
|
|
|
abld.MOV(this->control_data_bits, brw_imm_ud(0u));
|
2015-03-11 23:14:31 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-12-05 15:11:09 -08:00
|
|
|
nir_to_brw(this);
|
2015-03-11 23:14:31 -07:00
|
|
|
|
|
|
|
|
emit_gs_thread_end();
|
|
|
|
|
|
|
|
|
|
if (failed)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
calculate_cfg();
|
|
|
|
|
|
2024-01-04 16:42:50 -08:00
|
|
|
brw_fs_optimize(*this);
|
2015-03-11 23:14:31 -07:00
|
|
|
|
|
|
|
|
assign_curb_setup();
|
|
|
|
|
assign_gs_urb_setup();
|
|
|
|
|
|
2024-01-04 16:11:22 -08:00
|
|
|
brw_fs_lower_3src_null_dest(*this);
|
2024-01-04 16:18:04 -08:00
|
|
|
brw_fs_workaround_memory_fence_before_eot(*this);
|
2024-01-04 16:24:21 -08:00
|
|
|
brw_fs_workaround_emit_dummy_mov_instruction(*this);
|
2022-12-06 18:11:10 +02:00
|
|
|
|
2020-05-19 14:37:44 -07:00
|
|
|
allocate_registers(true /* allow_spilling */);
|
2015-03-11 23:14:31 -07:00
|
|
|
|
|
|
|
|
return !failed;
|
|
|
|
|
}
|
|
|
|
|
|
2017-10-25 16:50:11 +03:00
|
|
|
/* From the SKL PRM, Volume 16, Workarounds:
|
|
|
|
|
*
|
|
|
|
|
* 0877 3D Pixel Shader Hang possible when pixel shader dispatched with
|
|
|
|
|
* only header phases (R0-R2)
|
|
|
|
|
*
|
|
|
|
|
* WA: Enable a non-header phase (e.g. push constant) when dispatch would
|
|
|
|
|
* have been header only.
|
|
|
|
|
*
|
|
|
|
|
* Instead of enabling push constants one can alternatively enable one of the
|
|
|
|
|
* inputs. Here one simply chooses "layer" which shouldn't impose much
|
|
|
|
|
* overhead.
|
|
|
|
|
*/
|
|
|
|
|
static void
|
2021-03-29 15:40:04 -07:00
|
|
|
gfx9_ps_header_only_workaround(struct brw_wm_prog_data *wm_prog_data)
|
2017-10-25 16:50:11 +03:00
|
|
|
{
|
|
|
|
|
if (wm_prog_data->num_varying_inputs)
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
if (wm_prog_data->base.curb_read_length)
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
wm_prog_data->urb_setup[VARYING_SLOT_LAYER] = 0;
|
|
|
|
|
wm_prog_data->num_varying_inputs = 1;
|
2018-12-11 18:45:43 +01:00
|
|
|
|
|
|
|
|
brw_compute_urb_setup_index(wm_prog_data);
|
2017-10-25 16:50:11 +03:00
|
|
|
}
|
|
|
|
|
|
2011-03-11 19:19:01 -08:00
|
|
|
bool
|
2016-05-16 14:30:25 -07:00
|
|
|
fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
|
2010-08-26 12:12:00 -07:00
|
|
|
{
|
2016-09-08 23:48:51 -07:00
|
|
|
struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
|
2014-10-27 23:36:31 -07:00
|
|
|
brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
|
2023-11-21 10:12:09 -08:00
|
|
|
const fs_builder bld = fs_builder(this).at_end();
|
2014-10-27 23:36:31 -07:00
|
|
|
|
|
|
|
|
assert(stage == MESA_SHADER_FRAGMENT);
|
|
|
|
|
|
2024-02-27 12:23:52 -08:00
|
|
|
payload_ = new fs_thread_payload(*this, source_depth_to_render_target);
|
2010-08-26 12:12:00 -07:00
|
|
|
|
2024-01-05 11:09:19 -08:00
|
|
|
if (nir->info.ray_queries > 0)
|
|
|
|
|
limit_dispatch_width(16, "SIMD32 not supported with ray queries.\n");
|
|
|
|
|
|
intel/compiler: Delete unused emit_dummy_fs()
This code is compiled out, but has been left in place in case we wanted
to use it for debugging something. In the olden days, we'd use it for
platform enabling. I can't think of the last time we did that, though.
I also used to use it for debugging. If something was misrendering, I'd
iterate through shaders 0..N, replacing them with "draw hot pink" until
whatever shader was drawing the bad stuff was brightly illuminated.
Once it was identified, I'd start investigating that shader.
These days, we have frameretrace and renderdoc which are like, actual
tools that let you highlight draws and replace shaders. So we don't
need to resort iterative driver hacks anymore. Again, I can't think of
the last time I actually did that.
So, this code is basically just dead. And it's using legacy MRF paths,
which we could update...or we could just delete it.
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20172>
2022-11-23 01:04:00 -08:00
|
|
|
if (do_rep_send) {
|
2015-06-19 17:25:28 -07:00
|
|
|
assert(dispatch_width == 16);
|
2014-09-26 14:47:03 -07:00
|
|
|
emit_repclear_shader();
|
2010-08-15 18:58:58 -07:00
|
|
|
} else {
|
2017-05-08 09:20:21 -07:00
|
|
|
if (nir->info.inputs_read > 0 ||
|
2021-01-19 17:14:28 -08:00
|
|
|
BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) ||
|
2017-05-08 09:20:21 -07:00
|
|
|
(nir->info.outputs_read > 0 && !wm_key->coherent_fb_fetch)) {
|
2024-02-17 22:43:47 -08:00
|
|
|
emit_interpolation_setup();
|
2013-10-19 21:27:37 -07:00
|
|
|
}
|
2010-08-16 21:53:02 -07:00
|
|
|
|
2012-12-06 12:15:13 -08:00
|
|
|
/* We handle discards by keeping track of the still-live pixels in f0.1.
|
|
|
|
|
* Initialize it with the dispatched pixels.
|
|
|
|
|
*/
|
2022-08-16 17:40:31 -07:00
|
|
|
if (devinfo->ver >= 20 || wm_prog_data->uses_kill) {
|
2020-01-04 16:16:24 -08:00
|
|
|
const unsigned lower_width = MIN2(dispatch_width, 16);
|
|
|
|
|
for (unsigned i = 0; i < dispatch_width / lower_width; i++) {
|
2022-06-11 17:36:09 -07:00
|
|
|
/* According to the "PS Thread Payload for Normal
|
|
|
|
|
* Dispatch" pages on the BSpec, the dispatch mask is
|
|
|
|
|
* stored in R0.15/R1.15 on gfx20+ and in R1.7/R2.7 on
|
|
|
|
|
* gfx6+.
|
|
|
|
|
*/
|
2020-01-04 16:16:24 -08:00
|
|
|
const fs_reg dispatch_mask =
|
2022-06-11 17:36:09 -07:00
|
|
|
devinfo->ver >= 20 ? xe2_vec1_grf(i, 15) :
|
2024-02-17 22:43:47 -08:00
|
|
|
brw_vec1_grf(i + 1, 7);
|
2020-01-04 16:16:24 -08:00
|
|
|
bld.exec_all().group(1, 0)
|
2022-06-27 12:24:58 -07:00
|
|
|
.MOV(brw_sample_mask_reg(bld.group(lower_width, i)),
|
2024-04-20 17:08:02 -07:00
|
|
|
retype(dispatch_mask, BRW_TYPE_UW));
|
2020-01-04 16:16:24 -08:00
|
|
|
}
|
2012-12-06 12:15:13 -08:00
|
|
|
}
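As a cross-check of the payload offsets quoted in the comment above, this small sketch (invented names, not driver code) returns the GRF and subregister that hold the dispatch mask for SIMD16 half i:
/* Sketch only: dispatch mask location per SIMD16 half, per the comment above
 * (R0.15/R1.15 on gfx20+, R1.7/R2.7 on earlier gfx6+ platforms).
 */
struct grf_location { unsigned nr; unsigned subnr; };

static grf_location
dispatch_mask_location(unsigned gfx_ver, unsigned half /* 0 or 1 */)
{
   if (gfx_ver >= 20)
      return { half, 15 };       /* matches xe2_vec1_grf(i, 15) above */
   else
      return { half + 1, 7 };    /* matches brw_vec1_grf(i + 1, 7) above */
}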
|
|
|
|
|
|
2020-04-29 13:48:58 -07:00
|
|
|
if (nir->info.writes_memory)
|
|
|
|
|
wm_prog_data->has_side_effects = true;
|
|
|
|
|
|
2023-12-05 15:11:09 -08:00
|
|
|
nir_to_brw(this);
|
2015-05-20 10:03:50 -07:00
|
|
|
|
2011-06-10 16:00:03 -07:00
|
|
|
if (failed)
|
|
|
|
|
return false;
|
2010-08-15 18:58:58 -07:00
|
|
|
|
2011-03-11 19:19:01 -08:00
|
|
|
emit_fb_writes();
|
2010-10-13 20:17:15 -07:00
|
|
|
|
2015-02-13 10:34:39 -08:00
|
|
|
calculate_cfg();
|
|
|
|
|
|
2024-01-04 16:42:50 -08:00
|
|
|
brw_fs_optimize(*this);
|
2013-02-15 19:26:48 -08:00
|
|
|
|
2011-03-11 19:19:01 -08:00
|
|
|
assign_curb_setup();
|
2017-10-25 16:50:11 +03:00
|
|
|
|
2021-12-21 12:16:21 +01:00
|
|
|
if (devinfo->ver == 9)
|
2021-03-29 15:40:04 -07:00
|
|
|
gfx9_ps_header_only_workaround(wm_prog_data);
|
2017-10-25 16:50:11 +03:00
|
|
|
|
2011-03-11 19:19:01 -08:00
|
|
|
assign_urb_setup();
|
2011-01-18 22:03:34 -08:00
|
|
|
|
2024-01-04 16:11:22 -08:00
|
|
|
brw_fs_lower_3src_null_dest(*this);
|
2024-01-04 16:18:04 -08:00
|
|
|
brw_fs_workaround_memory_fence_before_eot(*this);
|
2024-01-04 16:24:21 -08:00
|
|
|
brw_fs_workaround_emit_dummy_mov_instruction(*this);
|
2022-12-06 18:11:10 +02:00
|
|
|
|
2020-05-19 14:37:44 -07:00
|
|
|
allocate_registers(allow_spilling);
|
2014-05-13 20:51:32 -07:00
|
|
|
}
|
|
|
|
|
|
2011-03-11 19:19:01 -08:00
|
|
|
return !failed;
|
|
|
|
|
}
|
2010-08-26 12:12:00 -07:00
|
|
|
|
2014-08-30 19:57:39 -07:00
|
|
|
bool
|
2020-05-19 14:37:44 -07:00
|
|
|
fs_visitor::run_cs(bool allow_spilling)
|
2014-08-30 19:57:39 -07:00
|
|
|
{
|
2021-10-18 15:24:23 +03:00
|
|
|
assert(gl_shader_stage_is_compute(stage));
|
2023-11-21 10:12:09 -08:00
|
|
|
const fs_builder bld = fs_builder(this).at_end();
|
2014-08-30 19:57:39 -07:00
|
|
|
|
2022-08-22 21:47:02 -07:00
|
|
|
payload_ = new cs_thread_payload(*this);
|
2014-08-30 19:57:39 -07:00
|
|
|
|
2021-09-22 15:06:58 +03:00
|
|
|
if (devinfo->platform == INTEL_PLATFORM_HSW && prog_data->total_shared > 0) {
|
2016-02-20 01:22:08 -08:00
|
|
|
/* Move SLM index from g0.0[27:24] to sr0.1[11:8] */
|
|
|
|
|
const fs_builder abld = bld.exec_all().group(1, 0);
|
2024-04-20 17:08:02 -07:00
|
|
|
abld.MOV(retype(brw_sr0_reg(1), BRW_TYPE_UW),
|
|
|
|
|
suboffset(retype(brw_vec1_grf(0, 0), BRW_TYPE_UW), 1));
|
2016-02-20 01:22:08 -08:00
|
|
|
}
|
|
|
|
|
|
2023-12-05 15:11:09 -08:00
|
|
|
nir_to_brw(this);
|
2014-08-30 19:57:39 -07:00
|
|
|
|
|
|
|
|
if (failed)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
emit_cs_terminate();
|
|
|
|
|
|
|
|
|
|
calculate_cfg();
|
|
|
|
|
|
2024-01-04 16:42:50 -08:00
|
|
|
brw_fs_optimize(*this);
|
2014-08-30 19:57:39 -07:00
|
|
|
|
|
|
|
|
assign_curb_setup();
|
|
|
|
|
|
2024-01-04 16:11:22 -08:00
|
|
|
brw_fs_lower_3src_null_dest(*this);
|
2024-01-04 16:18:04 -08:00
|
|
|
brw_fs_workaround_memory_fence_before_eot(*this);
|
2024-01-04 16:24:21 -08:00
|
|
|
brw_fs_workaround_emit_dummy_mov_instruction(*this);
|
2022-12-06 18:11:10 +02:00
|
|
|
|
2020-05-19 14:37:44 -07:00
|
|
|
allocate_registers(allow_spilling);
|
2014-08-30 19:57:39 -07:00
|
|
|
|
|
|
|
|
return !failed;
|
|
|
|
|
}
|
|
|
|
|
|
2020-10-21 14:46:50 -05:00
|
|
|
bool
|
|
|
|
|
fs_visitor::run_bs(bool allow_spilling)
|
|
|
|
|
{
|
|
|
|
|
assert(stage >= MESA_SHADER_RAYGEN && stage <= MESA_SHADER_CALLABLE);
|
|
|
|
|
|
2022-09-11 00:57:26 -07:00
|
|
|
payload_ = new bs_thread_payload(*this);
|
2020-10-21 14:46:50 -05:00
|
|
|
|
2023-12-05 15:11:09 -08:00
|
|
|
nir_to_brw(this);
|
2020-10-21 14:46:50 -05:00
|
|
|
|
|
|
|
|
if (failed)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
/* TODO(RT): Perhaps rename this? */
|
|
|
|
|
emit_cs_terminate();
|
|
|
|
|
|
|
|
|
|
calculate_cfg();
|
|
|
|
|
|
2024-01-04 16:42:50 -08:00
|
|
|
brw_fs_optimize(*this);
|
2020-10-21 14:46:50 -05:00
|
|
|
|
|
|
|
|
assign_curb_setup();
|
|
|
|
|
|
2024-01-04 16:11:22 -08:00
|
|
|
brw_fs_lower_3src_null_dest(*this);
|
2024-01-04 16:18:04 -08:00
|
|
|
brw_fs_workaround_memory_fence_before_eot(*this);
|
2024-01-04 16:24:21 -08:00
|
|
|
brw_fs_workaround_emit_dummy_mov_instruction(*this);
|
2022-12-06 18:11:10 +02:00
|
|
|
|
2020-10-21 14:46:50 -05:00
|
|
|
allocate_registers(allow_spilling);
|
|
|
|
|
|
|
|
|
|
return !failed;
|
|
|
|
|
}
|
|
|
|
|
|
2021-10-29 12:27:45 -07:00
|
|
|
bool
|
|
|
|
|
fs_visitor::run_task(bool allow_spilling)
|
|
|
|
|
{
|
|
|
|
|
assert(stage == MESA_SHADER_TASK);
|
|
|
|
|
|
2022-08-21 23:05:08 -07:00
|
|
|
payload_ = new task_mesh_thread_payload(*this);
|
2021-10-29 12:27:45 -07:00
|
|
|
|
2023-12-05 15:11:09 -08:00
|
|
|
nir_to_brw(this);
|
2021-10-29 12:27:45 -07:00
|
|
|
|
|
|
|
|
if (failed)
|
|
|
|
|
return false;
|
|
|
|
|
|
2022-05-18 17:05:53 +02:00
|
|
|
emit_urb_fence();
|
|
|
|
|
|
2021-10-29 12:27:45 -07:00
|
|
|
emit_cs_terminate();
|
|
|
|
|
|
|
|
|
|
calculate_cfg();
|
|
|
|
|
|
2024-01-04 16:42:50 -08:00
|
|
|
brw_fs_optimize(*this);
|
2021-10-29 12:27:45 -07:00
|
|
|
|
|
|
|
|
assign_curb_setup();
|
|
|
|
|
|
2024-01-04 16:11:22 -08:00
|
|
|
brw_fs_lower_3src_null_dest(*this);
|
2024-01-04 16:18:04 -08:00
|
|
|
brw_fs_workaround_memory_fence_before_eot(*this);
|
2024-01-04 16:24:21 -08:00
|
|
|
brw_fs_workaround_emit_dummy_mov_instruction(*this);
|
2022-12-06 18:11:10 +02:00
|
|
|
|
2021-10-29 12:27:45 -07:00
|
|
|
allocate_registers(allow_spilling);
|
|
|
|
|
|
|
|
|
|
return !failed;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool
|
|
|
|
|
fs_visitor::run_mesh(bool allow_spilling)
|
|
|
|
|
{
|
|
|
|
|
assert(stage == MESA_SHADER_MESH);
|
|
|
|
|
|
2022-08-21 23:05:08 -07:00
|
|
|
payload_ = new task_mesh_thread_payload(*this);
|
2021-10-29 12:27:45 -07:00
|
|
|
|
2023-12-05 15:11:09 -08:00
|
|
|
nir_to_brw(this);
|
2021-10-29 12:27:45 -07:00
|
|
|
|
|
|
|
|
if (failed)
|
|
|
|
|
return false;
|
|
|
|
|
|
2022-05-18 17:05:53 +02:00
|
|
|
emit_urb_fence();
|
|
|
|
|
|
2021-10-29 12:27:45 -07:00
|
|
|
emit_cs_terminate();
|
|
|
|
|
|
|
|
|
|
calculate_cfg();
|
|
|
|
|
|
2024-01-04 16:42:50 -08:00
|
|
|
brw_fs_optimize(*this);
|
2021-10-29 12:27:45 -07:00
|
|
|
|
|
|
|
|
assign_curb_setup();
|
|
|
|
|
|
2024-01-04 16:11:22 -08:00
|
|
|
brw_fs_lower_3src_null_dest(*this);
|
2024-01-04 16:18:04 -08:00
|
|
|
brw_fs_workaround_memory_fence_before_eot(*this);
|
2024-01-04 16:24:21 -08:00
|
|
|
brw_fs_workaround_emit_dummy_mov_instruction(*this);
|
2022-12-06 18:11:10 +02:00
|
|
|
|
2021-10-29 12:27:45 -07:00
|
|
|
allocate_registers(allow_spilling);
|
|
|
|
|
|
|
|
|
|
return !failed;
|
|
|
|
|
}
|
|
|
|
|
|
2019-04-11 14:12:58 -05:00
|
|
|
static bool
|
2023-08-12 16:17:15 -04:00
|
|
|
is_used_in_not_interp_frag_coord(nir_def *def)
|
2019-04-11 14:12:58 -05:00
|
|
|
{
|
2023-04-06 13:19:31 -04:00
|
|
|
nir_foreach_use_including_if(src, def) {
|
2023-08-14 09:58:47 -04:00
|
|
|
if (nir_src_is_if(src))
|
2023-04-06 13:19:31 -04:00
|
|
|
return true;
|
|
|
|
|
|
2023-08-14 09:58:47 -04:00
|
|
|
if (nir_src_parent_instr(src)->type != nir_instr_type_intrinsic)
|
2019-04-11 14:12:58 -05:00
|
|
|
return true;
|
|
|
|
|
|
2023-08-14 09:58:47 -04:00
|
|
|
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(nir_src_parent_instr(src));
|
2019-07-18 09:59:44 -05:00
|
|
|
if (intrin->intrinsic != nir_intrinsic_load_frag_coord)
|
2019-04-11 14:12:58 -05:00
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2015-10-08 16:01:44 -07:00
|
|
|
/**
|
|
|
|
|
* Return a bitfield where bit n is set if barycentric interpolation mode n
|
2016-07-11 16:24:12 -07:00
|
|
|
* (see enum brw_barycentric_mode) is needed by the fragment shader.
|
2016-07-12 03:57:25 -07:00
|
|
|
*
|
|
|
|
|
* We examine the load_barycentric intrinsics rather than looking at input
|
|
|
|
|
* variables so that we catch interpolateAtCentroid() messages too, which
|
|
|
|
|
* also need the BRW_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set up.
|
2015-10-08 16:01:44 -07:00
|
|
|
*/
|
|
|
|
|
static unsigned
|
2021-04-05 13:19:39 -07:00
|
|
|
brw_compute_barycentric_interp_modes(const struct intel_device_info *devinfo,
|
2024-04-18 09:54:11 +03:00
|
|
|
const struct brw_wm_prog_key *key,
|
2015-10-08 16:01:44 -07:00
|
|
|
const nir_shader *shader)
|
|
|
|
|
{
|
|
|
|
|
unsigned barycentric_interp_modes = 0;
|
|
|
|
|
|
2023-06-28 19:40:56 +08:00
|
|
|
nir_foreach_function_impl(impl, shader) {
|
|
|
|
|
nir_foreach_block(block, impl) {
|
2016-07-12 03:57:25 -07:00
|
|
|
nir_foreach_instr(instr, block) {
|
|
|
|
|
if (instr->type != nir_instr_type_intrinsic)
|
|
|
|
|
continue;
|
2016-07-11 15:00:37 -07:00
|
|
|
|
2016-07-12 03:57:25 -07:00
|
|
|
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
2019-04-11 14:12:58 -05:00
|
|
|
switch (intrin->intrinsic) {
|
|
|
|
|
case nir_intrinsic_load_barycentric_pixel:
|
|
|
|
|
case nir_intrinsic_load_barycentric_centroid:
|
|
|
|
|
case nir_intrinsic_load_barycentric_sample:
|
2022-03-16 19:26:54 -07:00
|
|
|
case nir_intrinsic_load_barycentric_at_sample:
|
|
|
|
|
case nir_intrinsic_load_barycentric_at_offset:
|
2019-04-11 14:12:58 -05:00
|
|
|
break;
|
|
|
|
|
default:
|
2016-07-12 03:57:25 -07:00
|
|
|
continue;
|
2019-04-11 14:12:58 -05:00
|
|
|
}
|
2016-07-12 03:57:25 -07:00
|
|
|
|
|
|
|
|
/* Ignore WPOS; it doesn't require interpolation. */
|
2023-08-14 11:56:00 -05:00
|
|
|
if (!is_used_in_not_interp_frag_coord(&intrin->def))
|
2016-07-12 03:57:25 -07:00
|
|
|
continue;
|
2016-07-11 15:00:37 -07:00
|
|
|
|
2016-07-12 03:57:25 -07:00
|
|
|
nir_intrinsic_op bary_op = intrin->intrinsic;
|
|
|
|
|
enum brw_barycentric_mode bary =
|
2024-04-18 09:54:11 +03:00
|
|
|
brw_barycentric_mode(key, intrin);
|
2016-07-11 15:00:37 -07:00
|
|
|
|
2016-07-12 03:57:25 -07:00
|
|
|
barycentric_interp_modes |= 1 << bary;
|
|
|
|
|
|
|
|
|
|
if (devinfo->needs_unlit_centroid_workaround &&
|
|
|
|
|
bary_op == nir_intrinsic_load_barycentric_centroid)
|
|
|
|
|
barycentric_interp_modes |= 1 << centroid_to_pixel(bary);
|
|
|
|
|
}
|
|
|
|
|
}
|
2015-10-08 16:01:44 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return barycentric_interp_modes;
|
|
|
|
|
}
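For instance, a fragment shader whose only barycentric loads are load_barycentric_pixel and load_barycentric_centroid (both perspective) ends up with two bits set, and on platforms needing the unlit-centroid workaround the centroid use also forces the matching pixel bit. A minimal sketch with stand-in bit positions (the real enum brw_barycentric_mode values may differ):
/* Sketch with stand-in bit positions; only the one-bit-per-mode contract of
 * the function above is being illustrated.
 */
enum sketch_barycentric_mode {
   SKETCH_PERSPECTIVE_PIXEL    = 0,   /* stand-in values, not the real enum */
   SKETCH_PERSPECTIVE_CENTROID = 1,
};

static unsigned
sketch_interp_modes(bool needs_unlit_centroid_workaround)
{
   unsigned modes = 0;
   modes |= 1u << SKETCH_PERSPECTIVE_PIXEL;      /* load_barycentric_pixel */
   modes |= 1u << SKETCH_PERSPECTIVE_CENTROID;   /* interpolateAtCentroid() */
   if (needs_unlit_centroid_workaround)
      modes |= 1u << SKETCH_PERSPECTIVE_PIXEL;   /* centroid_to_pixel() case */
   return modes;
}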
|
|
|
|
|
|
2016-04-05 18:19:34 -07:00
|
|
|
static void
|
|
|
|
|
brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data,
|
2016-07-07 00:47:18 -07:00
|
|
|
const nir_shader *shader)
|
2016-04-05 18:19:34 -07:00
|
|
|
{
|
|
|
|
|
prog_data->flat_inputs = 0;
|
|
|
|
|
|
2020-07-18 18:24:25 -05:00
|
|
|
nir_foreach_shader_in_variable(var, shader) {
|
2022-02-24 17:09:25 +01:00
|
|
|
/* flat shading */
|
|
|
|
|
if (var->data.interpolation != INTERP_MODE_FLAT)
|
|
|
|
|
continue;
|
|
|
|
|
|
2022-02-24 17:06:33 +01:00
|
|
|
if (var->data.per_primitive)
|
|
|
|
|
continue;
|
|
|
|
|
|
2018-07-31 05:31:47 -07:00
|
|
|
unsigned slots = glsl_count_attribute_slots(var->type, false);
|
|
|
|
|
for (unsigned s = 0; s < slots; s++) {
|
|
|
|
|
int input_index = prog_data->urb_setup[var->data.location + s];
|
2016-04-05 18:19:34 -07:00
|
|
|
|
2022-02-24 17:09:25 +01:00
|
|
|
if (input_index >= 0)
|
2018-07-31 05:31:47 -07:00
|
|
|
prog_data->flat_inputs |= 1 << input_index;
|
|
|
|
|
}
|
2016-04-05 18:19:34 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2015-10-08 16:01:44 -07:00
|
|
|
static uint8_t
|
|
|
|
|
computed_depth_mode(const nir_shader *shader)
|
|
|
|
|
{
|
2017-05-08 09:20:21 -07:00
|
|
|
if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
|
|
|
|
|
switch (shader->info.fs.depth_layout) {
|
2015-10-08 16:01:44 -07:00
|
|
|
case FRAG_DEPTH_LAYOUT_NONE:
|
|
|
|
|
case FRAG_DEPTH_LAYOUT_ANY:
|
|
|
|
|
return BRW_PSCDEPTH_ON;
|
|
|
|
|
case FRAG_DEPTH_LAYOUT_GREATER:
|
|
|
|
|
return BRW_PSCDEPTH_ON_GE;
|
|
|
|
|
case FRAG_DEPTH_LAYOUT_LESS:
|
|
|
|
|
return BRW_PSCDEPTH_ON_LE;
|
|
|
|
|
case FRAG_DEPTH_LAYOUT_UNCHANGED:
|
2024-01-08 12:17:54 +02:00
|
|
|
/* We initially set this to OFF, but having the shader write the
|
|
|
|
|
* depth means we allocate register space in the SEND message. The
|
|
|
|
|
* difference between the SEND register count and the OFF state
|
|
|
|
|
* programming makes the HW hang.
|
|
|
|
|
*
|
|
|
|
|
* Removing the depth writes also leads to test failures. So use
|
|
|
|
|
* LesserThanOrEqual, which fits writing the same value
|
|
|
|
|
* (unchanged/equal).
|
|
|
|
|
*
|
|
|
|
|
*/
|
|
|
|
|
return BRW_PSCDEPTH_ON_LE;
|
2015-10-08 16:01:44 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return BRW_PSCDEPTH_OFF;
|
|
|
|
|
}
|
|
|
|
|
|
i965: Move load_interpolated_input/barycentric_* intrinsics to the top.
Currently, i965 interpolates all FS inputs at the top of the program.
This has advantages and disadvantages, but I'd like to keep that policy
while reworking this code. We can consider changing it independently.
The next patch will make the compiler generate PLN instructions "on the
fly", when it encounters an input load intrinsic, rather than doing it
for all inputs at the start of the program.
To emulate this behavior, we introduce an ugly pass to move all NIR
load_interpolated_input and payload-based (not interpolator message)
load_barycentric_* intrinsics to the shader's start block.
This helps avoid regressions in shader-db for cases such as:
if (...) {
...load some input...
} else {
...load that same input...
}
which CSE can't handle, because there's no dominance relationship
between the two loads. Because the start block dominates all others,
we can CSE all inputs and emit PLNs exactly once, as we did before.
Ideally, global value numbering would eliminate these redundant loads,
while not forcing them all the way to the start block. When that lands,
we should consider dropping this hacky pass.
Again, this pass currently does nothing, as i965 doesn't generate these
intrinsics yet. But it will shortly, and I figured I'd separate this
code as it's relatively self-contained.
v2: Dramatically simplify pass - instead of creating new instructions,
just remove/re-insert their list nodes (suggested by Jason Ekstrand).
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chris Forbes <chrisforbes@google.com> [v1]
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2016-07-17 18:44:58 -07:00
|
|
|
/**
|
|
|
|
|
* Move load_interpolated_input with simple (payload-based) barycentric modes
|
|
|
|
|
* to the top of the program so we don't emit multiple PLNs for the same input.
|
|
|
|
|
*
|
|
|
|
|
* This works around CSE not being able to handle non-dominating cases
|
|
|
|
|
* such as:
|
|
|
|
|
*
|
|
|
|
|
* if (...) {
|
|
|
|
|
* interpolate input
|
|
|
|
|
* } else {
|
|
|
|
|
* interpolate the same exact input
|
|
|
|
|
* }
|
|
|
|
|
*
|
|
|
|
|
* This should be replaced by global value numbering someday.
|
|
|
|
|
*/
|
2019-07-18 09:23:23 -05:00
|
|
|
bool
|
|
|
|
|
brw_nir_move_interpolation_to_top(nir_shader *nir)
|
2016-07-17 18:44:58 -07:00
|
|
|
{
|
2017-03-09 11:05:08 -08:00
|
|
|
bool progress = false;
|
|
|
|
|
|
2023-06-28 19:40:56 +08:00
|
|
|
nir_foreach_function_impl(impl, nir) {
|
|
|
|
|
nir_block *top = nir_start_block(impl);
|
2023-03-20 20:57:47 -07:00
|
|
|
nir_cursor cursor = nir_before_instr(nir_block_first_instr(top));
|
2023-03-20 20:57:47 -07:00
|
|
|
bool impl_progress = false;
|
2016-07-17 18:44:58 -07:00
|
|
|
|
2023-03-20 20:57:47 -07:00
|
|
|
for (nir_block *block = nir_block_cf_tree_next(top);
|
|
|
|
|
block != NULL;
|
|
|
|
|
block = nir_block_cf_tree_next(block)) {
|
2016-07-17 18:44:58 -07:00
|
|
|
|
2016-07-26 13:19:46 -07:00
|
|
|
nir_foreach_instr_safe(instr, block) {
|
2016-07-17 18:44:58 -07:00
|
|
|
if (instr->type != nir_instr_type_intrinsic)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
2016-07-26 13:19:46 -07:00
|
|
|
if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)
|
2016-07-17 18:44:58 -07:00
|
|
|
continue;
|
2016-07-26 13:19:46 -07:00
|
|
|
nir_intrinsic_instr *bary_intrinsic =
|
|
|
|
|
nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
|
|
|
|
|
nir_intrinsic_op op = bary_intrinsic->intrinsic;
|
2016-07-17 18:44:58 -07:00
|
|
|
|
2016-07-26 13:19:46 -07:00
|
|
|
/* Leave interpolateAtSample/Offset() where they are. */
|
|
|
|
|
if (op == nir_intrinsic_load_barycentric_at_sample ||
|
|
|
|
|
op == nir_intrinsic_load_barycentric_at_offset)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
nir_instr *move[3] = {
|
|
|
|
|
&bary_intrinsic->instr,
|
|
|
|
|
intrin->src[1].ssa->parent_instr,
|
|
|
|
|
instr
|
|
|
|
|
};
|
|
|
|
|
|
2016-08-01 10:35:06 +10:00
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(move); i++) {
|
2016-07-26 13:19:46 -07:00
|
|
|
if (move[i]->block != top) {
|
2023-03-20 20:57:47 -07:00
|
|
|
nir_instr_move(cursor, move[i]);
|
2023-03-20 20:57:47 -07:00
|
|
|
impl_progress = true;
|
2016-07-26 13:19:46 -07:00
|
|
|
}
|
|
|
|
|
}
|
2016-07-17 18:44:58 -07:00
|
|
|
}
|
|
|
|
|
}
|
2023-03-20 20:57:47 -07:00
|
|
|
|
|
|
|
|
progress = progress || impl_progress;
|
|
|
|
|
|
2024-06-16 16:32:01 -04:00
|
|
|
nir_metadata_preserve(impl, impl_progress ? nir_metadata_control_flow
|
|
|
|
|
: nir_metadata_all);
|
2016-07-17 18:44:58 -07:00
|
|
|
}
|
2017-03-09 11:05:08 -08:00
|
|
|
|
|
|
|
|
return progress;
|
2016-07-17 18:44:58 -07:00
|
|
|
}
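/* Minimal usage sketch (no new code here, just restating how the pass above
 * is wired in): brw_compile_fs() below runs it as an ordinary NIR pass before
 * postprocessing, e.g.
 *
 *    NIR_PASS(_, nir, brw_nir_move_interpolation_to_top);
 *
 * so that by the time the backend visits a load_interpolated_input, the
 * intrinsic and its barycentric source already live in the start block.
 */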
|
|
|
|
|
|
2021-05-17 15:25:26 -07:00
|
|
|
static void
|
2023-07-01 23:36:19 +03:00
|
|
|
brw_nir_populate_wm_prog_data(nir_shader *shader,
|
2021-04-05 13:19:39 -07:00
|
|
|
const struct intel_device_info *devinfo,
|
2019-07-18 09:23:47 -05:00
|
|
|
const struct brw_wm_prog_key *key,
|
2021-05-18 11:05:33 -07:00
|
|
|
struct brw_wm_prog_data *prog_data,
|
|
|
|
|
const struct brw_mue_map *mue_map)
|
2019-07-18 09:23:47 -05:00
|
|
|
{
|
2024-02-08 16:03:55 +01:00
|
|
|
prog_data->uses_kill = shader->info.fs.uses_discard;
|
2019-07-18 09:23:47 -05:00
|
|
|
prog_data->uses_omask = !key->ignore_sample_mask_out &&
|
|
|
|
|
(shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK));
|
2022-06-28 17:44:13 -07:00
|
|
|
prog_data->max_polygons = 1;
|
2019-07-18 09:23:47 -05:00
|
|
|
prog_data->computed_depth_mode = computed_depth_mode(shader);
|
|
|
|
|
prog_data->computed_stencil =
|
|
|
|
|
shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
|
|
|
|
|
|
2021-11-19 16:32:24 -06:00
|
|
|
prog_data->sample_shading =
|
|
|
|
|
shader->info.fs.uses_sample_shading ||
|
|
|
|
|
shader->info.outputs_read;
|
|
|
|
|
|
2021-11-19 13:44:35 -06:00
|
|
|
assert(key->multisample_fbo != BRW_NEVER ||
|
|
|
|
|
key->persample_interp == BRW_NEVER);
|
2021-11-19 16:34:19 -06:00
|
|
|
|
|
|
|
|
prog_data->persample_dispatch = key->persample_interp;
|
2021-11-19 13:44:35 -06:00
|
|
|
if (prog_data->sample_shading)
|
2021-11-19 16:32:24 -06:00
|
|
|
prog_data->persample_dispatch = BRW_ALWAYS;
|
2019-07-18 09:23:47 -05:00
|
|
|
|
2021-11-19 13:44:35 -06:00
|
|
|
/* We can only persample dispatch if we have a multisample FBO */
|
|
|
|
|
prog_data->persample_dispatch = MIN2(prog_data->persample_dispatch,
|
|
|
|
|
key->multisample_fbo);
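/* Illustrative note (assuming the tristate enum orders
 * BRW_NEVER < BRW_SOMETIMES < BRW_ALWAYS): MIN2(BRW_ALWAYS, BRW_SOMETIMES)
 * yields BRW_SOMETIMES, so a shader that always wants per-sample shading but
 * whose FBO is only sometimes multisampled is dispatched per-sample only when
 * the FBO actually is multisampled.
 */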
|
|
|
|
|
|
2022-03-09 15:31:34 +02:00
|
|
|
/* Currently only the Vulkan API allows alpha_to_coverage to be dynamic. If
|
|
|
|
|
* persample_dispatch & multisample_fbo are not dynamic, Anv should be able
|
|
|
|
|
* to definitively tell whether alpha_to_coverage is on or off.
|
|
|
|
|
*/
|
|
|
|
|
prog_data->alpha_to_coverage = key->alpha_to_coverage;
|
|
|
|
|
|
2024-02-17 22:43:47 -08:00
|
|
|
prog_data->uses_sample_mask =
|
|
|
|
|
BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN);
|
2019-10-24 17:31:18 -05:00
|
|
|
|
2024-02-17 22:43:47 -08:00
|
|
|
/* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
|
|
|
|
|
*
|
|
|
|
|
* "MSDISPMODE_PERSAMPLE is required in order to select
|
|
|
|
|
* POSOFFSET_SAMPLE"
|
|
|
|
|
*
|
|
|
|
|
* So we can only really get sample positions if we are doing real
|
|
|
|
|
* per-sample dispatch. If we need gl_SamplePosition and we don't have
|
|
|
|
|
* persample dispatch, we hard-code it to 0.5.
|
|
|
|
|
*/
|
|
|
|
|
prog_data->uses_pos_offset =
|
|
|
|
|
prog_data->persample_dispatch != BRW_NEVER &&
|
|
|
|
|
(BITSET_TEST(shader->info.system_values_read,
|
|
|
|
|
SYSTEM_VALUE_SAMPLE_POS) ||
|
|
|
|
|
BITSET_TEST(shader->info.system_values_read,
|
|
|
|
|
SYSTEM_VALUE_SAMPLE_POS_OR_CENTER));
|
2019-10-24 17:31:18 -05:00
|
|
|
|
2019-07-18 09:23:47 -05:00
|
|
|
prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
|
|
|
|
|
prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage;
|
|
|
|
|
prog_data->inner_coverage = shader->info.fs.inner_coverage;
|
|
|
|
|
|
|
|
|
|
prog_data->barycentric_interp_modes =
|
2024-04-18 09:54:11 +03:00
|
|
|
brw_compute_barycentric_interp_modes(devinfo, key, shader);
|
2022-03-16 19:26:54 -07:00
|
|
|
|
|
|
|
|
/* From the BDW PRM documentation for 3DSTATE_WM:
|
|
|
|
|
*
|
|
|
|
|
* "MSDISPMODE_PERSAMPLE is required in order to select Perspective
|
|
|
|
|
    * Sample or Non-perspective Sample barycentric coordinates."
|
|
|
|
|
*
|
|
|
|
|
    * So clean up any potentially set sample barycentric mode when not in per
|
|
|
|
|
* sample dispatch.
|
|
|
|
|
*/
|
|
|
|
|
if (prog_data->persample_dispatch == BRW_NEVER) {
|
|
|
|
|
prog_data->barycentric_interp_modes &=
|
|
|
|
|
~BITFIELD_BIT(BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE);
|
|
|
|
|
}
|
|
|
|
|
|
2022-07-07 12:24:38 +03:00
|
|
|
prog_data->uses_nonperspective_interp_modes |=
|
|
|
|
|
(prog_data->barycentric_interp_modes &
|
|
|
|
|
BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) != 0;
|
2019-07-18 09:23:47 -05:00
|
|
|
|
2023-05-30 18:10:53 +02:00
|
|
|
/* The current VK_EXT_graphics_pipeline_library specification requires
|
|
|
|
|
    * coarse to be specified at compile time. But per sample interpolation can be
|
|
|
|
|
* dynamic. So we should never be in a situation where coarse &
|
|
|
|
|
* persample_interp are both respectively true & BRW_ALWAYS.
|
|
|
|
|
*
|
|
|
|
|
    * Coarse will be dynamically turned off when persample_interp is active.
|
|
|
|
|
*/
|
|
|
|
|
assert(!key->coarse_pixel || key->persample_interp != BRW_ALWAYS);
|
|
|
|
|
|
2021-11-19 16:32:24 -06:00
|
|
|
prog_data->coarse_pixel_dispatch =
|
|
|
|
|
brw_sometimes_invert(prog_data->persample_dispatch);
|
|
|
|
|
if (!key->coarse_pixel ||
|
|
|
|
|
prog_data->uses_omask ||
|
|
|
|
|
prog_data->sample_shading ||
|
|
|
|
|
prog_data->uses_sample_mask ||
|
|
|
|
|
(prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) ||
|
|
|
|
|
prog_data->computed_stencil) {
|
|
|
|
|
prog_data->coarse_pixel_dispatch = BRW_NEVER;
|
|
|
|
|
}
|
2020-10-22 13:23:06 +03:00
|
|
|
|
2023-07-01 23:36:19 +03:00
|
|
|
/* ICL PRMs, Volume 9: Render Engine, Shared Functions Pixel Interpolater,
|
|
|
|
|
* Message Descriptor :
|
|
|
|
|
*
|
|
|
|
|
* "Message Type. Specifies the type of message being sent when
|
|
|
|
|
* pixel-rate evaluation is requested :
|
|
|
|
|
*
|
|
|
|
|
* Format = U2
|
|
|
|
|
* 0: Per Message Offset (eval_snapped with immediate offset)
|
|
|
|
|
* 1: Sample Position Offset (eval_sindex)
|
|
|
|
|
* 2: Centroid Position Offset (eval_centroid)
|
|
|
|
|
* 3: Per Slot Offset (eval_snapped with register offset)
|
|
|
|
|
*
|
|
|
|
|
* Message Type. Specifies the type of message being sent when
|
|
|
|
|
* coarse-rate evaluation is requested :
|
|
|
|
|
*
|
|
|
|
|
* Format = U2
|
|
|
|
|
* 0: Coarse to Pixel Mapping Message (internal message)
|
|
|
|
|
* 1: Reserved
|
|
|
|
|
* 2: Coarse Centroid Position (eval_centroid)
|
|
|
|
|
* 3: Per Slot Coarse Pixel Offset (eval_snapped with register offset)"
|
|
|
|
|
*
|
|
|
|
|
* The Sample Position Offset is marked as reserved for coarse rate
|
|
|
|
|
* evaluation and leads to hangs if we try to use it. So disable coarse
|
|
|
|
|
* pixel shading if we have any intrinsic that will result in a pixel
|
|
|
|
|
* interpolater message at sample.
|
|
|
|
|
*/
|
2023-11-01 15:15:43 -07:00
|
|
|
if (intel_nir_pulls_at_sample(shader))
|
2023-07-01 23:36:19 +03:00
|
|
|
prog_data->coarse_pixel_dispatch = BRW_NEVER;
|
|
|
|
|
|
2019-06-07 18:17:36 -05:00
|
|
|
/* We choose to always enable VMask prior to XeHP, as it would cause
|
|
|
|
|
* us to lose out on the eliminate_find_live_channel() optimization.
|
|
|
|
|
*/
|
|
|
|
|
prog_data->uses_vmask = devinfo->verx10 < 125 ||
|
|
|
|
|
shader->info.fs.needs_quad_helper_invocations ||
|
2023-11-02 15:38:46 +01:00
|
|
|
shader->info.uses_wide_subgroup_intrinsics ||
|
2021-11-19 16:32:24 -06:00
|
|
|
prog_data->coarse_pixel_dispatch != BRW_NEVER;
|
2019-06-07 18:17:36 -05:00
|
|
|
|
2020-10-29 15:10:59 +02:00
|
|
|
prog_data->uses_src_w =
|
|
|
|
|
BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD);
|
|
|
|
|
prog_data->uses_src_depth =
|
|
|
|
|
BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) &&
|
2021-11-19 16:32:24 -06:00
|
|
|
prog_data->coarse_pixel_dispatch != BRW_ALWAYS;
|
2020-10-29 15:10:59 +02:00
|
|
|
prog_data->uses_depth_w_coefficients =
|
|
|
|
|
BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) &&
|
2021-11-19 16:32:24 -06:00
|
|
|
prog_data->coarse_pixel_dispatch != BRW_NEVER;
|
2020-10-29 15:10:59 +02:00
|
|
|
|
2021-05-18 11:05:33 -07:00
|
|
|
calculate_urb_setup(devinfo, key, prog_data, shader, mue_map);
|
2019-07-18 09:23:47 -05:00
|
|
|
brw_compute_flat_inputs(prog_data, shader);
|
|
|
|
|
}
|
|
|
|
|
|
2012-11-20 16:21:27 -08:00
|
|
|
const unsigned *
|
2021-03-22 22:13:09 -07:00
|
|
|
brw_compile_fs(const struct brw_compiler *compiler,
|
|
|
|
|
struct brw_compile_fs_params *params)
|
2011-03-11 19:19:01 -08:00
|
|
|
{
|
2023-07-14 02:10:20 +03:00
|
|
|
struct nir_shader *nir = params->base.nir;
|
2021-03-22 22:13:09 -07:00
|
|
|
const struct brw_wm_prog_key *key = params->key;
|
|
|
|
|
struct brw_wm_prog_data *prog_data = params->prog_data;
|
|
|
|
|
bool allow_spilling = params->allow_spilling;
|
2021-03-23 11:38:28 -07:00
|
|
|
const bool debug_enabled =
|
2023-07-14 02:10:20 +03:00
|
|
|
brw_should_print_shader(nir, params->base.debug_flag ?
|
|
|
|
|
params->base.debug_flag : DEBUG_WM);
|
2021-03-22 22:13:09 -07:00
|
|
|
|
2020-11-10 13:11:31 -09:00
|
|
|
prog_data->base.stage = MESA_SHADER_FRAGMENT;
|
2021-10-26 16:39:08 +03:00
|
|
|
prog_data->base.ray_queries = nir->info.ray_queries;
|
2022-02-28 15:13:07 +02:00
|
|
|
prog_data->base.total_scratch = 0;
|
2020-11-10 13:11:31 -09:00
|
|
|
|
2021-04-05 13:19:39 -07:00
|
|
|
const struct intel_device_info *devinfo = compiler->devinfo;
|
2024-02-15 02:03:38 -08:00
|
|
|
const unsigned max_subgroup_size = 32;
|
2019-02-22 10:48:39 -06:00
|
|
|
|
2023-05-17 17:09:06 +02:00
|
|
|
brw_nir_apply_key(nir, compiler, &key->base, max_subgroup_size);
|
2020-09-04 18:43:35 +02:00
|
|
|
brw_nir_lower_fs_inputs(nir, devinfo, key);
|
|
|
|
|
brw_nir_lower_fs_outputs(nir);
|
i965: Move Gen4-5 interpolation stuff to brw_wm_prog_data.
This fixes glxgears rendering, which had surprisingly been broken since
late October! Specifically, commit 91d61fbf7cb61a44adcaae51ee08ad0dd6b.
glxgears uses glShadeModel(GL_FLAT) when drawing the main portion of the
gears, then uses glShadeModel(GL_SMOOTH) for drawing the Gouraud-shaded
inner portion of the gears. This results in the same fragment program
having two different state-dependent interpolation maps: one where
gl_Color is flat, and another where it's smooth.
The problem is that there's only one gen4_fragment_program, so it can't
store both. Each FS compile would trash the last one. But, the FS
compiles are cached, so the first one would store FLAT, and the second
would see a matching program in the cache and never bother to compile
one with SMOOTH. (Clearing the program cache on every draw made it
render correctly.)
Instead, move it to brw_wm_prog_data, where we can keep a copy for
every specialization of the program. The only downside is bloating
the structure a bit, but we can tighten that up a bit if we need to.
This also lets us kill gen4_fragment_program entirely!
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Timothy Arceri <timothy.arceri@collabora.com>
2017-01-13 14:29:52 -08:00
|
|
|
|
2019-09-27 16:28:11 -07:00
|
|
|
/* From the SKL PRM, Volume 7, "Alpha Coverage":
|
|
|
|
|
* "If Pixel Shader outputs oMask, AlphaToCoverage is disabled in
|
|
|
|
|
* hardware, regardless of the state setting for this feature."
|
|
|
|
|
*/
|
2024-02-15 02:03:38 -08:00
|
|
|
if (key->alpha_to_coverage != BRW_NEVER) {
|
2019-09-27 16:28:11 -07:00
|
|
|
/* Run constant fold optimization in order to get the correct source
|
|
|
|
|
* offset to determine render target 0 store instruction in
|
|
|
|
|
* emit_alpha_to_coverage pass.
|
|
|
|
|
*/
|
2023-01-11 11:15:27 -08:00
|
|
|
NIR_PASS(_, nir, nir_opt_constant_folding);
|
|
|
|
|
NIR_PASS(_, nir, brw_nir_lower_alpha_to_coverage, key, prog_data);
|
2019-09-27 16:28:11 -07:00
|
|
|
}
|
|
|
|
|
|
2023-01-11 11:15:27 -08:00
|
|
|
NIR_PASS(_, nir, brw_nir_move_interpolation_to_top);
|
2023-05-17 16:44:17 +02:00
|
|
|
brw_postprocess_nir(nir, compiler, debug_enabled,
|
2022-06-21 18:06:04 -07:00
|
|
|
key->base.robust_flags);
|
2015-11-11 10:04:43 -08:00
|
|
|
|
2021-05-18 11:05:33 -07:00
|
|
|
brw_nir_populate_wm_prog_data(nir, compiler->devinfo, key, prog_data,
|
|
|
|
|
params->mue_map);
|
2019-07-18 09:15:15 -05:00
|
|
|
|
2022-06-22 16:32:57 -07:00
|
|
|
std::unique_ptr<fs_visitor> v8, v16, v32, vmulti;
|
|
|
|
|
cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL,
|
|
|
|
|
*multi_cfg = NULL;
|
2020-04-02 17:30:06 -07:00
|
|
|
float throughput = 0;
|
2020-05-19 14:37:44 -07:00
|
|
|
bool has_spilled = false;
|
2016-04-28 12:40:14 -07:00
|
|
|
|
2023-12-01 20:32:08 -08:00
|
|
|
if (devinfo->ver < 20) {
|
|
|
|
|
v8 = std::make_unique<fs_visitor>(compiler, ¶ms->base, key,
|
|
|
|
|
prog_data, nir, 8, 1,
|
|
|
|
|
params->base.stats != NULL,
|
|
|
|
|
debug_enabled);
|
|
|
|
|
if (!v8->run_fs(allow_spilling, false /* do_rep_send */)) {
|
|
|
|
|
params->base.error_str = ralloc_strdup(params->base.mem_ctx,
|
|
|
|
|
v8->fail_msg);
|
|
|
|
|
return NULL;
|
|
|
|
|
} else if (INTEL_SIMD(FS, 8)) {
|
|
|
|
|
simd8_cfg = v8->cfg;
|
|
|
|
|
|
|
|
|
|
assert(v8->payload().num_regs % reg_unit(devinfo) == 0);
|
|
|
|
|
prog_data->base.dispatch_grf_start_reg = v8->payload().num_regs / reg_unit(devinfo);
|
|
|
|
|
|
|
|
|
|
const performance &perf = v8->performance_analysis.require();
|
|
|
|
|
throughput = MAX2(throughput, perf.throughput);
|
|
|
|
|
has_spilled = v8->spilled_any_registers;
|
|
|
|
|
allow_spilling = false;
|
|
|
|
|
}
|
2016-04-28 12:40:14 -07:00
|
|
|
}
|
|
|
|
|
|
2023-12-01 20:17:54 -08:00
|
|
|
if (key->coarse_pixel && devinfo->ver < 20) {
|
2020-10-29 15:17:16 +02:00
|
|
|
if (prog_data->dual_src_blend) {
|
|
|
|
|
v8->limit_dispatch_width(8, "SIMD16 coarse pixel shading cannot"
|
|
|
|
|
" use SIMD8 messages.\n");
|
|
|
|
|
}
|
|
|
|
|
v8->limit_dispatch_width(16, "SIMD32 not supported with coarse"
|
|
|
|
|
" pixel shading.\n");
|
|
|
|
|
}
|
|
|
|
|
|
2020-05-19 14:37:44 -07:00
|
|
|
if (!has_spilled &&
|
2023-12-01 20:32:08 -08:00
|
|
|
(!v8 || v8->max_dispatch_width >= 16) &&
|
2023-01-21 12:49:44 +01:00
|
|
|
(INTEL_SIMD(FS, 16) || params->use_rep_send)) {
|
2016-04-28 12:40:14 -07:00
|
|
|
/* Try a SIMD16 compile */
|
2022-06-22 16:31:00 -07:00
|
|
|
v16 = std::make_unique<fs_visitor>(compiler, ¶ms->base, key,
|
|
|
|
|
prog_data, nir, 16, 1,
|
2023-07-14 02:10:20 +03:00
|
|
|
params->base.stats != NULL,
|
2022-11-08 14:14:37 -08:00
|
|
|
debug_enabled);
|
2023-12-01 20:32:08 -08:00
|
|
|
if (v8)
|
|
|
|
|
v16->import_uniforms(v8.get());
|
2021-03-22 22:13:09 -07:00
|
|
|
if (!v16->run_fs(allow_spilling, params->use_rep_send)) {
|
2023-07-14 02:10:20 +03:00
|
|
|
brw_shader_perf_log(compiler, params->base.log_data,
|
2021-10-03 15:58:36 +03:00
|
|
|
"SIMD16 shader failed to compile: %s\n",
|
2021-07-29 14:27:57 -07:00
|
|
|
v16->fail_msg);
|
2016-04-28 12:40:14 -07:00
|
|
|
} else {
|
2020-04-02 17:16:45 -07:00
|
|
|
simd16_cfg = v16->cfg;
|
2022-07-19 16:44:26 -07:00
|
|
|
|
|
|
|
|
assert(v16->payload().num_regs % reg_unit(devinfo) == 0);
|
|
|
|
|
prog_data->dispatch_grf_start_reg_16 = v16->payload().num_regs / reg_unit(devinfo);
|
|
|
|
|
|
2020-04-02 17:30:06 -07:00
|
|
|
const performance &perf = v16->performance_analysis.require();
|
|
|
|
|
throughput = MAX2(throughput, perf.throughput);
|
2020-05-19 14:37:44 -07:00
|
|
|
has_spilled = v16->spilled_any_registers;
|
|
|
|
|
allow_spilling = false;
|
2012-07-12 12:48:58 -07:00
|
|
|
}
|
2011-03-11 19:19:01 -08:00
|
|
|
}
|
|
|
|
|
|
2020-05-31 09:27:28 -07:00
|
|
|
const bool simd16_failed = v16 && !simd16_cfg;
|
|
|
|
|
|
2016-04-26 19:45:41 -07:00
|
|
|
/* Currently, the compiler only supports SIMD32 on SNB+ */
|
2020-05-19 14:37:44 -07:00
|
|
|
if (!has_spilled &&
|
2023-12-01 20:32:08 -08:00
|
|
|
(!v8 || v8->max_dispatch_width >= 32) &&
|
|
|
|
|
(!v16 || v16->max_dispatch_width >= 32) && !params->use_rep_send &&
|
2024-02-15 02:03:38 -08:00
|
|
|
!simd16_failed &&
|
2023-01-21 12:49:44 +01:00
|
|
|
INTEL_SIMD(FS, 32)) {
|
2016-04-26 19:45:41 -07:00
|
|
|
/* Try a SIMD32 compile */
|
2022-06-22 16:31:00 -07:00
|
|
|
v32 = std::make_unique<fs_visitor>(compiler, ¶ms->base, key,
|
|
|
|
|
prog_data, nir, 32, 1,
|
2023-07-14 02:10:20 +03:00
|
|
|
params->base.stats != NULL,
|
2022-11-08 14:14:37 -08:00
|
|
|
debug_enabled);
|
2023-12-01 20:32:08 -08:00
|
|
|
if (v8)
|
|
|
|
|
v32->import_uniforms(v8.get());
|
|
|
|
|
else if (v16)
|
|
|
|
|
v32->import_uniforms(v16.get());
|
|
|
|
|
|
2020-04-02 17:16:45 -07:00
|
|
|
if (!v32->run_fs(allow_spilling, false)) {
|
2023-07-14 02:10:20 +03:00
|
|
|
brw_shader_perf_log(compiler, params->base.log_data,
|
2021-10-03 15:58:36 +03:00
|
|
|
"SIMD32 shader failed to compile: %s\n",
|
2021-07-29 14:27:57 -07:00
|
|
|
v32->fail_msg);
|
2016-04-26 19:45:41 -07:00
|
|
|
} else {
|
2020-04-02 17:30:06 -07:00
|
|
|
const performance &perf = v32->performance_analysis.require();
|
|
|
|
|
|
2023-08-10 14:12:24 -04:00
|
|
|
if (!INTEL_DEBUG(DEBUG_DO32) && throughput >= perf.throughput) {
|
2023-07-14 02:10:20 +03:00
|
|
|
brw_shader_perf_log(compiler, params->base.log_data,
|
2021-07-29 14:27:57 -07:00
|
|
|
"SIMD32 shader inefficient\n");
|
2020-04-02 17:30:06 -07:00
|
|
|
} else {
|
|
|
|
|
simd32_cfg = v32->cfg;
|
2022-07-19 16:44:26 -07:00
|
|
|
|
|
|
|
|
assert(v32->payload().num_regs % reg_unit(devinfo) == 0);
|
|
|
|
|
prog_data->dispatch_grf_start_reg_32 = v32->payload().num_regs / reg_unit(devinfo);
|
|
|
|
|
|
2020-04-02 17:30:06 -07:00
|
|
|
throughput = MAX2(throughput, perf.throughput);
|
|
|
|
|
}
|
2016-04-26 19:45:41 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2022-06-22 16:32:57 -07:00
|
|
|
if (devinfo->ver >= 12 && !has_spilled &&
|
2023-11-30 01:36:47 -08:00
|
|
|
params->max_polygons >= 2 && !key->coarse_pixel) {
|
|
|
|
|
fs_visitor *vbase = v8 ? v8.get() : v16 ? v16.get() : v32.get();
|
|
|
|
|
assert(vbase);
|
|
|
|
|
|
|
|
|
|
if (devinfo->ver >= 20 &&
|
|
|
|
|
params->max_polygons >= 4 &&
|
|
|
|
|
vbase->max_dispatch_width >= 32 &&
|
|
|
|
|
4 * prog_data->num_varying_inputs <= MAX_VARYING &&
|
|
|
|
|
INTEL_SIMD(FS, 4X8)) {
|
|
|
|
|
/* Try a quad-SIMD8 compile */
|
|
|
|
|
vmulti = std::make_unique<fs_visitor>(compiler, ¶ms->base, key,
|
|
|
|
|
prog_data, nir, 32, 4,
|
|
|
|
|
params->base.stats != NULL,
|
|
|
|
|
debug_enabled);
|
|
|
|
|
vmulti->import_uniforms(vbase);
|
|
|
|
|
if (!vmulti->run_fs(false, params->use_rep_send)) {
|
|
|
|
|
brw_shader_perf_log(compiler, params->base.log_data,
|
|
|
|
|
"Quad-SIMD8 shader failed to compile: %s\n",
|
|
|
|
|
vmulti->fail_msg);
|
|
|
|
|
} else {
|
|
|
|
|
multi_cfg = vmulti->cfg;
|
|
|
|
|
assert(!vmulti->spilled_any_registers);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!multi_cfg && devinfo->ver >= 20 &&
|
|
|
|
|
vbase->max_dispatch_width >= 32 &&
|
|
|
|
|
2 * prog_data->num_varying_inputs <= MAX_VARYING &&
|
|
|
|
|
INTEL_SIMD(FS, 2X16)) {
|
|
|
|
|
/* Try a dual-SIMD16 compile */
|
|
|
|
|
vmulti = std::make_unique<fs_visitor>(compiler, ¶ms->base, key,
|
|
|
|
|
prog_data, nir, 32, 2,
|
|
|
|
|
params->base.stats != NULL,
|
|
|
|
|
debug_enabled);
|
|
|
|
|
vmulti->import_uniforms(vbase);
|
|
|
|
|
if (!vmulti->run_fs(false, params->use_rep_send)) {
|
|
|
|
|
brw_shader_perf_log(compiler, params->base.log_data,
|
|
|
|
|
"Dual-SIMD16 shader failed to compile: %s\n",
|
|
|
|
|
vmulti->fail_msg);
|
|
|
|
|
} else {
|
|
|
|
|
multi_cfg = vmulti->cfg;
|
|
|
|
|
assert(!vmulti->spilled_any_registers);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!multi_cfg && vbase->max_dispatch_width >= 16 &&
|
|
|
|
|
2 * prog_data->num_varying_inputs <= MAX_VARYING &&
|
|
|
|
|
INTEL_SIMD(FS, 2X8)) {
|
|
|
|
|
/* Try a dual-SIMD8 compile */
|
|
|
|
|
vmulti = std::make_unique<fs_visitor>(compiler, ¶ms->base, key,
|
|
|
|
|
prog_data, nir, 16, 2,
|
|
|
|
|
params->base.stats != NULL,
|
|
|
|
|
debug_enabled);
|
|
|
|
|
vmulti->import_uniforms(vbase);
|
|
|
|
|
if (!vmulti->run_fs(allow_spilling, params->use_rep_send)) {
|
|
|
|
|
brw_shader_perf_log(compiler, params->base.log_data,
|
|
|
|
|
"Dual-SIMD8 shader failed to compile: %s\n",
|
|
|
|
|
vmulti->fail_msg);
|
|
|
|
|
} else {
|
|
|
|
|
multi_cfg = vmulti->cfg;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (multi_cfg) {
|
|
|
|
|
assert(vmulti->payload().num_regs % reg_unit(devinfo) == 0);
|
|
|
|
|
prog_data->base.dispatch_grf_start_reg = vmulti->payload().num_regs / reg_unit(devinfo);
|
2022-06-22 16:32:57 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2016-04-28 12:40:14 -07:00
|
|
|
/* When the caller requests a repclear shader, they want SIMD16-only */
|
2021-03-22 22:13:09 -07:00
|
|
|
if (params->use_rep_send)
|
2016-04-28 12:40:14 -07:00
|
|
|
simd8_cfg = NULL;
|
|
|
|
|
|
2023-07-14 02:10:20 +03:00
|
|
|
fs_generator g(compiler, ¶ms->base, &prog_data->base,
|
2024-02-27 12:23:52 -08:00
|
|
|
MESA_SHADER_FRAGMENT);
|
2014-10-27 19:40:47 -07:00
|
|
|
|
2021-03-23 11:12:40 -07:00
|
|
|
if (unlikely(debug_enabled)) {
|
2023-07-14 02:10:20 +03:00
|
|
|
g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
|
|
|
|
|
"%s fragment shader %s",
|
2020-09-04 18:43:35 +02:00
|
|
|
nir->info.label ?
|
|
|
|
|
nir->info.label : "unnamed",
|
|
|
|
|
nir->info.name));
|
2014-10-27 19:40:47 -07:00
|
|
|
}
|
|
|
|
|
|
2023-07-14 02:10:20 +03:00
|
|
|
struct brw_compile_stats *stats = params->base.stats;
|
2023-03-19 15:03:33 +02:00
|
|
|
uint32_t max_dispatch_width = 0;
|
2021-03-22 22:13:09 -07:00
|
|
|
|
2022-06-22 16:32:57 -07:00
|
|
|
if (multi_cfg) {
|
|
|
|
|
prog_data->dispatch_multi = vmulti->dispatch_width;
|
|
|
|
|
prog_data->max_polygons = vmulti->max_polygons;
|
|
|
|
|
g.generate_code(multi_cfg, vmulti->dispatch_width, vmulti->shader_stats,
|
|
|
|
|
vmulti->performance_analysis.require(),
|
|
|
|
|
stats, vmulti->max_polygons);
|
|
|
|
|
stats = stats ? stats + 1 : NULL;
|
|
|
|
|
max_dispatch_width = vmulti->dispatch_width;
|
|
|
|
|
|
|
|
|
|
} else if (simd8_cfg) {
|
2016-04-28 15:37:39 -07:00
|
|
|
prog_data->dispatch_8 = true;
|
2020-03-26 16:27:32 -07:00
|
|
|
g.generate_code(simd8_cfg, 8, v8->shader_stats,
|
2023-12-07 19:47:55 -08:00
|
|
|
v8->performance_analysis.require(), stats, 1);
|
2019-04-23 23:19:56 -05:00
|
|
|
stats = stats ? stats + 1 : NULL;
|
2023-03-19 15:03:33 +02:00
|
|
|
max_dispatch_width = 8;
|
2018-05-17 23:49:29 -07:00
|
|
|
}
|
2016-04-28 15:37:39 -07:00
|
|
|
|
2018-05-17 23:49:29 -07:00
|
|
|
if (simd16_cfg) {
|
2016-04-28 15:37:39 -07:00
|
|
|
prog_data->dispatch_16 = true;
|
2020-03-26 16:27:32 -07:00
|
|
|
prog_data->prog_offset_16 = g.generate_code(
|
|
|
|
|
simd16_cfg, 16, v16->shader_stats,
|
2023-12-07 19:47:55 -08:00
|
|
|
v16->performance_analysis.require(), stats, 1);
|
2019-04-23 23:19:56 -05:00
|
|
|
stats = stats ? stats + 1 : NULL;
|
2023-03-19 15:03:33 +02:00
|
|
|
max_dispatch_width = 16;
|
2016-04-28 12:40:14 -07:00
|
|
|
}
|
|
|
|
|
|
2016-04-26 19:45:41 -07:00
|
|
|
if (simd32_cfg) {
|
|
|
|
|
prog_data->dispatch_32 = true;
|
2020-03-26 16:27:32 -07:00
|
|
|
prog_data->prog_offset_32 = g.generate_code(
|
|
|
|
|
simd32_cfg, 32, v32->shader_stats,
|
2023-12-07 19:47:55 -08:00
|
|
|
v32->performance_analysis.require(), stats, 1);
|
2019-04-23 23:19:56 -05:00
|
|
|
stats = stats ? stats + 1 : NULL;
|
2023-03-19 15:03:33 +02:00
|
|
|
max_dispatch_width = 32;
|
2016-04-26 19:45:41 -07:00
|
|
|
}
|
|
|
|
|
|
2023-07-14 02:10:20 +03:00
|
|
|
for (struct brw_compile_stats *s = params->base.stats; s != NULL && s != stats; s++)
|
2023-03-19 15:03:33 +02:00
|
|
|
s->max_dispatch_width = max_dispatch_width;
|
|
|
|
|
|
2020-09-04 18:43:35 +02:00
|
|
|
g.add_const_data(nir->constant_data, nir->constant_data_size);
|
2018-02-26 16:34:55 -08:00
|
|
|
return g.get_assembly();
|
2010-08-26 12:12:00 -07:00
|
|
|
}
|
2011-05-16 15:10:26 -07:00
|
|
|
|
2020-03-20 21:02:06 -07:00
|
|
|
unsigned
|
|
|
|
|
brw_cs_push_const_total_size(const struct brw_cs_prog_data *cs_prog_data,
|
|
|
|
|
unsigned threads)
|
|
|
|
|
{
|
|
|
|
|
assert(cs_prog_data->push.per_thread.size % REG_SIZE == 0);
|
|
|
|
|
assert(cs_prog_data->push.cross_thread.size % REG_SIZE == 0);
|
|
|
|
|
return cs_prog_data->push.per_thread.size * threads +
|
|
|
|
|
cs_prog_data->push.cross_thread.size;
|
|
|
|
|
}
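/* Worked example with made-up numbers: if per_thread.size is 64 bytes (two
 * 32-byte registers), cross_thread.size is 96 bytes and the dispatch uses 4
 * threads, the total push constant allocation is 64 * 4 + 96 = 352 bytes.
 * Both sizes are asserted above to be multiples of REG_SIZE.
 */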
|
|
|
|
|
|
2016-05-22 21:46:28 -07:00
|
|
|
static void
|
|
|
|
|
fill_push_const_block_info(struct brw_push_const_block *block, unsigned dwords)
|
|
|
|
|
{
|
|
|
|
|
block->dwords = dwords;
|
|
|
|
|
block->regs = DIV_ROUND_UP(dwords, 8);
|
|
|
|
|
block->size = block->regs * 32;
|
|
|
|
|
}
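/* Example (hypothetical value): dwords = 10 gives regs = DIV_ROUND_UP(10, 8)
 * = 2 and size = 2 * 32 = 64 bytes, i.e. the block is padded up to whole
 * 8-dword registers.
 */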
|
|
|
|
|
|
|
|
|
|
static void
|
2021-04-05 13:19:39 -07:00
|
|
|
cs_fill_push_const_info(const struct intel_device_info *devinfo,
|
2016-05-22 21:46:28 -07:00
|
|
|
struct brw_cs_prog_data *cs_prog_data)
|
|
|
|
|
{
|
2016-09-08 23:48:51 -07:00
|
|
|
const struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
|
2022-08-30 00:47:32 -07:00
|
|
|
int subgroup_id_index = brw_get_subgroup_id_param_index(devinfo, prog_data);
|
2016-05-22 21:46:28 -07:00
|
|
|
|
|
|
|
|
/* The thread ID should be stored in the last param dword */
|
2017-08-24 11:40:31 -07:00
|
|
|
assert(subgroup_id_index == -1 ||
|
|
|
|
|
subgroup_id_index == (int)prog_data->nr_params - 1);
|
2016-05-22 21:46:28 -07:00
|
|
|
|
|
|
|
|
unsigned cross_thread_dwords, per_thread_dwords;
|
2024-02-17 22:43:47 -08:00
|
|
|
if (subgroup_id_index >= 0) {
|
2016-05-22 21:46:28 -07:00
|
|
|
/* Fill all but the last register with cross-thread payload */
|
2017-08-24 11:40:31 -07:00
|
|
|
cross_thread_dwords = 8 * (subgroup_id_index / 8);
|
2016-05-22 21:46:28 -07:00
|
|
|
per_thread_dwords = prog_data->nr_params - cross_thread_dwords;
|
|
|
|
|
assert(per_thread_dwords > 0 && per_thread_dwords <= 8);
|
|
|
|
|
} else {
|
|
|
|
|
/* Fill all data using cross-thread payload */
|
|
|
|
|
cross_thread_dwords = prog_data->nr_params;
|
|
|
|
|
per_thread_dwords = 0u;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fill_push_const_block_info(&cs_prog_data->push.cross_thread, cross_thread_dwords);
|
|
|
|
|
fill_push_const_block_info(&cs_prog_data->push.per_thread, per_thread_dwords);
|
|
|
|
|
|
|
|
|
|
assert(cs_prog_data->push.cross_thread.dwords % 8 == 0 ||
|
|
|
|
|
cs_prog_data->push.per_thread.size == 0);
|
|
|
|
|
assert(cs_prog_data->push.cross_thread.dwords +
|
|
|
|
|
cs_prog_data->push.per_thread.dwords ==
|
|
|
|
|
prog_data->nr_params);
|
|
|
|
|
}
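/* Worked example with made-up numbers: for nr_params = 10 and
 * subgroup_id_index = 9 (the last param, as asserted above),
 * cross_thread_dwords = 8 * (9 / 8) = 8 and per_thread_dwords = 10 - 8 = 2,
 * so the first full register is shared across all threads and the remaining
 * two dwords (including the subgroup ID) are uploaded per thread.
 */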
|
|
|
|
|
|
2020-04-28 13:09:27 -07:00
|
|
|
static bool
|
2020-07-29 17:50:03 -07:00
|
|
|
filter_simd(const nir_instr *instr, const void * /* options */)
|
2020-04-28 13:09:27 -07:00
|
|
|
{
|
|
|
|
|
if (instr->type != nir_instr_type_intrinsic)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
switch (nir_instr_as_intrinsic(instr)->intrinsic) {
|
|
|
|
|
case nir_intrinsic_load_simd_width_intel:
|
|
|
|
|
case nir_intrinsic_load_subgroup_id:
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
default:
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-08-12 16:17:15 -04:00
|
|
|
static nir_def *
|
2020-04-28 13:09:27 -07:00
|
|
|
lower_simd(nir_builder *b, nir_instr *instr, void *options)
|
|
|
|
|
{
|
|
|
|
|
uintptr_t simd_width = (uintptr_t)options;
|
|
|
|
|
|
|
|
|
|
switch (nir_instr_as_intrinsic(instr)->intrinsic) {
|
|
|
|
|
case nir_intrinsic_load_simd_width_intel:
|
|
|
|
|
return nir_imm_int(b, simd_width);
|
|
|
|
|
|
|
|
|
|
case nir_intrinsic_load_subgroup_id:
|
|
|
|
|
/* If the whole workgroup fits in one thread, we can lower subgroup_id
|
|
|
|
|
* to a constant zero.
|
|
|
|
|
*/
|
2021-05-05 12:24:44 -07:00
|
|
|
if (!b->shader->info.workgroup_size_variable) {
|
|
|
|
|
unsigned local_workgroup_size = b->shader->info.workgroup_size[0] *
|
|
|
|
|
b->shader->info.workgroup_size[1] *
|
|
|
|
|
b->shader->info.workgroup_size[2];
|
2020-04-28 13:09:27 -07:00
|
|
|
if (local_workgroup_size <= simd_width)
|
|
|
|
|
return nir_imm_int(b, 0);
|
|
|
|
|
}
|
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
|
|
default:
|
|
|
|
|
return NULL;
|
|
|
|
|
}
|
|
|
|
|
}
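/* Example of the effect (hypothetical shader): for a compute shader with a
 * fixed 8x4x1 workgroup (32 invocations) lowered at dispatch_width = 32,
 * load_simd_width_intel becomes the constant 32 and load_subgroup_id becomes
 * the constant 0, since the whole workgroup fits in a single thread. With a
 * variable workgroup size, load_subgroup_id is left for later lowering.
 */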
|
|
|
|
|
|
2022-07-18 18:35:34 +02:00
|
|
|
bool
|
2020-04-28 13:09:27 -07:00
|
|
|
brw_nir_lower_simd(nir_shader *nir, unsigned dispatch_width)
|
|
|
|
|
{
|
2022-07-18 18:35:34 +02:00
|
|
|
return nir_shader_lower_instructions(nir, filter_simd, lower_simd,
|
2020-04-28 13:09:27 -07:00
|
|
|
(void *)(uintptr_t)dispatch_width);
|
|
|
|
|
}
|
|
|
|
|
|
2015-09-04 16:35:34 -07:00
|
|
|
const unsigned *
|
2021-03-23 21:01:21 -07:00
|
|
|
brw_compile_cs(const struct brw_compiler *compiler,
|
|
|
|
|
struct brw_compile_cs_params *params)
|
2015-09-04 16:35:34 -07:00
|
|
|
{
|
2023-07-14 02:10:20 +03:00
|
|
|
const nir_shader *nir = params->base.nir;
|
2021-03-23 21:01:21 -07:00
|
|
|
const struct brw_cs_prog_key *key = params->key;
|
|
|
|
|
struct brw_cs_prog_data *prog_data = params->prog_data;
|
|
|
|
|
|
2021-03-29 16:14:03 -07:00
|
|
|
const bool debug_enabled =
|
2023-07-14 02:10:20 +03:00
|
|
|
brw_should_print_shader(nir, params->base.debug_flag ?
|
|
|
|
|
params->base.debug_flag : DEBUG_CS);
|
2021-03-23 11:12:40 -07:00
|
|
|
|
2020-11-10 13:11:31 -09:00
|
|
|
prog_data->base.stage = MESA_SHADER_COMPUTE;
|
2021-04-08 12:30:14 +02:00
|
|
|
prog_data->base.total_shared = nir->info.shared_size;
|
2021-10-26 16:39:08 +03:00
|
|
|
prog_data->base.ray_queries = nir->info.ray_queries;
|
2022-02-28 15:13:07 +02:00
|
|
|
prog_data->base.total_scratch = 0;
|
2018-11-12 06:29:51 -08:00
|
|
|
|
2021-10-07 00:23:07 -07:00
|
|
|
if (!nir->info.workgroup_size_variable) {
|
2021-05-05 12:24:44 -07:00
|
|
|
prog_data->local_size[0] = nir->info.workgroup_size[0];
|
|
|
|
|
prog_data->local_size[1] = nir->info.workgroup_size[1];
|
|
|
|
|
prog_data->local_size[2] = nir->info.workgroup_size[2];
|
2020-09-14 13:44:42 -05:00
|
|
|
}
|
|
|
|
|
|
2022-11-08 01:47:50 -08:00
|
|
|
brw_simd_selection_state simd_state{
|
|
|
|
|
.devinfo = compiler->devinfo,
|
|
|
|
|
.prog_data = prog_data,
|
|
|
|
|
.required_width = brw_required_dispatch_width(&nir->info),
|
|
|
|
|
};
|
2020-09-14 13:44:42 -05:00
|
|
|
|
2022-11-08 14:14:37 -08:00
|
|
|
std::unique_ptr<fs_visitor> v[3];
|
2019-07-09 14:28:18 -05:00
|
|
|
|
2021-10-07 00:23:07 -07:00
|
|
|
for (unsigned simd = 0; simd < 3; simd++) {
|
2022-11-08 01:47:50 -08:00
|
|
|
if (!brw_simd_should_compile(simd_state, simd))
|
2021-10-07 00:23:07 -07:00
|
|
|
continue;
|
2020-05-19 10:08:12 -07:00
|
|
|
|
2021-10-07 00:23:07 -07:00
|
|
|
const unsigned dispatch_width = 8u << simd;
|
2017-08-21 21:27:19 -07:00
|
|
|
|
2023-07-14 02:10:20 +03:00
|
|
|
nir_shader *shader = nir_shader_clone(params->base.mem_ctx, nir);
|
2021-10-07 00:23:07 -07:00
|
|
|
brw_nir_apply_key(shader, compiler, &key->base,
|
2023-05-17 17:09:06 +02:00
|
|
|
dispatch_width);
|
2017-08-21 19:30:24 -07:00
|
|
|
|
2022-07-18 18:35:34 +02:00
|
|
|
NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width);
|
2021-10-07 00:23:07 -07:00
|
|
|
|
|
|
|
|
/* Clean up after the local index and ID calculations. */
|
2022-07-18 18:35:34 +02:00
|
|
|
NIR_PASS(_, shader, nir_opt_constant_folding);
|
|
|
|
|
NIR_PASS(_, shader, nir_opt_dce);
|
2021-10-07 00:23:07 -07:00
|
|
|
|
2023-05-17 16:44:17 +02:00
|
|
|
brw_postprocess_nir(shader, compiler, debug_enabled,
|
2022-06-21 18:06:04 -07:00
|
|
|
key->base.robust_flags);
|
2021-10-07 00:23:07 -07:00
|
|
|
|
2023-07-14 02:10:20 +03:00
|
|
|
v[simd] = std::make_unique<fs_visitor>(compiler, ¶ms->base,
|
|
|
|
|
&key->base,
|
|
|
|
|
&prog_data->base,
|
|
|
|
|
shader, dispatch_width,
|
|
|
|
|
params->base.stats != NULL,
|
2022-11-08 14:14:37 -08:00
|
|
|
debug_enabled);
|
2021-10-07 00:23:07 -07:00
|
|
|
|
2022-11-08 03:38:18 -08:00
|
|
|
const int first = brw_simd_first_compiled(simd_state);
|
|
|
|
|
if (first >= 0)
|
2022-11-08 14:14:37 -08:00
|
|
|
v[simd]->import_uniforms(v[first].get());
|
2015-09-04 16:35:34 -07:00
|
|
|
|
2022-11-08 03:38:18 -08:00
|
|
|
const bool allow_spilling = first < 0 || nir->info.workgroup_size_variable;
|
2021-10-07 00:23:07 -07:00
|
|
|
|
|
|
|
|
if (v[simd]->run_cs(allow_spilling)) {
|
2016-05-22 21:46:28 -07:00
|
|
|
cs_fill_push_const_info(compiler->devinfo, prog_data);
|
2021-10-07 00:23:07 -07:00
|
|
|
|
2022-11-08 01:47:50 -08:00
|
|
|
brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
|
2021-10-07 00:23:07 -07:00
|
|
|
} else {
|
2023-07-14 02:10:20 +03:00
|
|
|
simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg);
|
2021-10-07 00:23:07 -07:00
|
|
|
if (simd > 0) {
|
2023-07-14 02:10:20 +03:00
|
|
|
brw_shader_perf_log(compiler, params->base.log_data,
|
2021-10-07 00:23:07 -07:00
|
|
|
"SIMD%u shader failed to compile: %s\n",
|
|
|
|
|
dispatch_width, v[simd]->fail_msg);
|
|
|
|
|
}
|
2016-05-16 18:25:22 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2022-11-08 01:47:50 -08:00
|
|
|
const int selected_simd = brw_simd_select(simd_state);
|
2021-10-07 00:23:07 -07:00
|
|
|
if (selected_simd < 0) {
|
2023-07-14 02:10:20 +03:00
|
|
|
params->base.error_str =
|
|
|
|
|
ralloc_asprintf(params->base.mem_ctx,
|
2023-09-21 13:35:42 -07:00
|
|
|
"Can't compile shader: "
|
|
|
|
|
"SIMD8 '%s', SIMD16 '%s' and SIMD32 '%s'.\n",
|
2023-07-14 02:10:20 +03:00
|
|
|
simd_state.error[0], simd_state.error[1],
|
|
|
|
|
simd_state.error[2]);
|
2020-05-19 10:08:12 -07:00
|
|
|
return NULL;
|
|
|
|
|
}
|
|
|
|
|
|
2021-10-07 00:23:07 -07:00
|
|
|
assert(selected_simd < 3);
|
|
|
|
|
|
|
|
|
|
if (!nir->info.workgroup_size_variable)
|
|
|
|
|
prog_data->prog_mask = 1 << selected_simd;
|
2020-05-19 10:08:12 -07:00
|
|
|
|
2023-07-14 02:10:20 +03:00
|
|
|
fs_generator g(compiler, ¶ms->base, &prog_data->base,
|
2024-02-27 12:23:52 -08:00
|
|
|
MESA_SHADER_COMPUTE);
|
2021-03-23 11:12:40 -07:00
|
|
|
if (unlikely(debug_enabled)) {
|
2023-07-14 02:10:20 +03:00
|
|
|
char *name = ralloc_asprintf(params->base.mem_ctx,
|
|
|
|
|
"%s compute shader %s",
|
2020-09-04 18:43:35 +02:00
|
|
|
nir->info.label ?
|
|
|
|
|
nir->info.label : "unnamed",
|
|
|
|
|
nir->info.name);
|
2020-05-21 01:56:54 -07:00
|
|
|
g.enable_debug(name);
|
|
|
|
|
}
|
|
|
|
|
|
2023-03-19 15:03:33 +02:00
|
|
|
uint32_t max_dispatch_width = 8u << (util_last_bit(prog_data->prog_mask) - 1);
|
|
|
|
|
|
2023-07-14 02:10:20 +03:00
|
|
|
struct brw_compile_stats *stats = params->base.stats;
|
2021-10-07 00:23:07 -07:00
|
|
|
for (unsigned simd = 0; simd < 3; simd++) {
|
|
|
|
|
if (prog_data->prog_mask & (1u << simd)) {
|
|
|
|
|
assert(v[simd]);
|
|
|
|
|
prog_data->prog_offset[simd] =
|
|
|
|
|
g.generate_code(v[simd]->cfg, 8u << simd, v[simd]->shader_stats,
|
|
|
|
|
v[simd]->performance_analysis.require(), stats);
|
2023-03-19 15:03:33 +02:00
|
|
|
if (stats)
|
|
|
|
|
stats->max_dispatch_width = max_dispatch_width;
|
2020-05-21 01:56:54 -07:00
|
|
|
stats = stats ? stats + 1 : NULL;
|
2023-03-19 15:03:33 +02:00
|
|
|
max_dispatch_width = 8u << simd;
|
2020-05-21 01:56:54 -07:00
|
|
|
}
|
2015-09-04 16:35:34 -07:00
|
|
|
}
|
|
|
|
|
|
2020-09-04 18:43:35 +02:00
|
|
|
g.add_const_data(nir->constant_data, nir->constant_data_size);
|
2020-08-07 22:26:07 -05:00
|
|
|
|
2022-11-08 14:14:37 -08:00
|
|
|
return g.get_assembly();
|
2015-09-04 16:35:34 -07:00
|
|
|
}
|
2016-09-15 21:43:18 -07:00
|
|
|
|
2024-02-01 16:02:50 -08:00
|
|
|
struct intel_cs_dispatch_info
|
2021-04-28 10:54:53 -07:00
|
|
|
brw_cs_get_dispatch_info(const struct intel_device_info *devinfo,
|
|
|
|
|
const struct brw_cs_prog_data *prog_data,
|
|
|
|
|
const unsigned *override_local_size)
|
|
|
|
|
{
|
2024-02-01 16:02:50 -08:00
|
|
|
struct intel_cs_dispatch_info info = {};
|
2021-04-28 10:54:53 -07:00
|
|
|
|
|
|
|
|
const unsigned *sizes =
|
|
|
|
|
override_local_size ? override_local_size :
|
|
|
|
|
prog_data->local_size;
|
|
|
|
|
|
2022-11-08 01:24:36 -08:00
|
|
|
const int simd = brw_simd_select_for_workgroup_size(devinfo, prog_data, sizes);
|
2021-10-11 07:49:40 -07:00
|
|
|
assert(simd >= 0 && simd < 3);
|
|
|
|
|
|
2021-04-28 10:54:53 -07:00
|
|
|
info.group_size = sizes[0] * sizes[1] * sizes[2];
|
2021-10-11 07:49:40 -07:00
|
|
|
info.simd_size = 8u << simd;
|
2021-04-28 10:54:53 -07:00
|
|
|
info.threads = DIV_ROUND_UP(info.group_size, info.simd_size);
|
|
|
|
|
|
|
|
|
|
const uint32_t remainder = info.group_size & (info.simd_size - 1);
|
|
|
|
|
if (remainder > 0)
|
|
|
|
|
info.right_mask = ~0u >> (32 - remainder);
|
|
|
|
|
else
|
|
|
|
|
info.right_mask = ~0u >> (32 - info.simd_size);
|
|
|
|
|
|
|
|
|
|
return info;
|
|
|
|
|
}
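/* Worked example (made-up sizes): a 5x4x1 workgroup with a selected SIMD16
 * program gives group_size = 20, threads = DIV_ROUND_UP(20, 16) = 2 and
 * remainder = 20 & 15 = 4, so right_mask = ~0u >> (32 - 4) = 0xf, enabling
 * only the four live channels of the final, partially filled thread.
 */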
|
|
|
|
|
|
2020-09-04 12:40:06 -05:00
|
|
|
static uint8_t
|
2023-07-14 02:10:20 +03:00
|
|
|
compile_single_bs(const struct brw_compiler *compiler,
|
|
|
|
|
struct brw_compile_bs_params *params,
|
2020-09-04 12:40:06 -05:00
|
|
|
const struct brw_bs_prog_key *key,
|
|
|
|
|
struct brw_bs_prog_data *prog_data,
|
|
|
|
|
nir_shader *shader,
|
|
|
|
|
fs_generator *g,
|
|
|
|
|
struct brw_compile_stats *stats,
|
2023-07-14 02:10:20 +03:00
|
|
|
int *prog_offset)
|
2020-10-21 14:46:50 -05:00
|
|
|
{
|
2023-06-20 14:42:02 -07:00
|
|
|
const bool debug_enabled = brw_should_print_shader(shader, DEBUG_RT);
|
2021-03-23 11:12:40 -07:00
|
|
|
|
2020-10-21 14:46:50 -05:00
|
|
|
prog_data->base.stage = shader->info.stage;
|
2020-09-04 12:40:06 -05:00
|
|
|
prog_data->max_stack_size = MAX2(prog_data->max_stack_size,
|
|
|
|
|
shader->scratch_size);
|
2020-10-21 14:46:50 -05:00
|
|
|
|
|
|
|
|
const unsigned max_dispatch_width = 16;
|
2023-05-17 17:09:06 +02:00
|
|
|
brw_nir_apply_key(shader, compiler, &key->base, max_dispatch_width);
|
2023-05-17 16:44:17 +02:00
|
|
|
brw_postprocess_nir(shader, compiler, debug_enabled,
|
2022-06-21 18:06:04 -07:00
|
|
|
key->base.robust_flags);
|
2020-10-21 14:46:50 -05:00
|
|
|
|
2022-11-07 16:21:17 -08:00
|
|
|
brw_simd_selection_state simd_state{
|
|
|
|
|
.devinfo = compiler->devinfo,
|
|
|
|
|
.prog_data = prog_data,
|
2020-10-21 14:46:50 -05:00
|
|
|
|
2022-11-07 16:21:17 -08:00
|
|
|
/* Since divergence is a lot more likely in RT than compute, it makes
|
2023-01-31 16:02:47 -08:00
|
|
|
* sense to limit ourselves to the smallest available SIMD for now.
|
2022-11-07 16:21:17 -08:00
|
|
|
*/
|
2023-01-31 16:02:47 -08:00
|
|
|
.required_width = compiler->devinfo->ver >= 20 ? 16u : 8u,
|
2022-11-07 16:21:17 -08:00
|
|
|
};
|
2020-10-21 14:46:50 -05:00
|
|
|
|
2022-11-07 16:21:17 -08:00
|
|
|
std::unique_ptr<fs_visitor> v[2];
|
|
|
|
|
|
|
|
|
|
for (unsigned simd = 0; simd < ARRAY_SIZE(v); simd++) {
|
|
|
|
|
if (!brw_simd_should_compile(simd_state, simd))
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
const unsigned dispatch_width = 8u << simd;
|
|
|
|
|
|
2023-12-01 20:35:31 -08:00
|
|
|
if (dispatch_width == 8 && compiler->devinfo->ver >= 20)
|
|
|
|
|
continue;
|
|
|
|
|
|
2023-07-14 02:10:20 +03:00
|
|
|
v[simd] = std::make_unique<fs_visitor>(compiler, ¶ms->base,
|
|
|
|
|
&key->base,
|
2022-11-07 16:21:17 -08:00
|
|
|
&prog_data->base, shader,
|
2023-02-03 17:02:28 +01:00
|
|
|
dispatch_width,
|
|
|
|
|
stats != NULL,
|
|
|
|
|
debug_enabled);
|
2022-11-07 16:21:17 -08:00
|
|
|
|
|
|
|
|
const bool allow_spilling = !brw_simd_any_compiled(simd_state);
|
|
|
|
|
if (v[simd]->run_bs(allow_spilling)) {
|
|
|
|
|
brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
|
2020-10-21 14:46:50 -05:00
|
|
|
} else {
|
2023-07-14 02:10:20 +03:00
|
|
|
simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx,
|
|
|
|
|
v[simd]->fail_msg);
|
2022-11-07 16:21:17 -08:00
|
|
|
if (simd > 0) {
|
2023-07-14 02:10:20 +03:00
|
|
|
brw_shader_perf_log(compiler, params->base.log_data,
|
2022-11-07 16:21:17 -08:00
|
|
|
"SIMD%u shader failed to compile: %s",
|
|
|
|
|
dispatch_width, v[simd]->fail_msg);
|
|
|
|
|
}
|
2020-10-21 14:46:50 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2022-11-07 16:21:17 -08:00
|
|
|
const int selected_simd = brw_simd_select(simd_state);
|
|
|
|
|
if (selected_simd < 0) {
|
2023-07-14 02:10:20 +03:00
|
|
|
params->base.error_str =
|
|
|
|
|
ralloc_asprintf(params->base.mem_ctx,
|
2023-09-21 13:35:42 -07:00
|
|
|
"Can't compile shader: "
|
|
|
|
|
"SIMD8 '%s' and SIMD16 '%s'.\n",
|
2023-07-14 02:10:20 +03:00
|
|
|
simd_state.error[0], simd_state.error[1]);
|
2022-11-07 16:21:17 -08:00
|
|
|
return 0;
|
2020-10-21 14:46:50 -05:00
|
|
|
}
|
|
|
|
|
|
2022-11-07 16:21:17 -08:00
|
|
|
assert(selected_simd < int(ARRAY_SIZE(v)));
|
|
|
|
|
fs_visitor *selected = v[selected_simd].get();
|
|
|
|
|
assert(selected);
|
|
|
|
|
|
|
|
|
|
const unsigned dispatch_width = selected->dispatch_width;
|
2020-10-21 14:46:50 -05:00
|
|
|
|
2022-11-07 16:21:17 -08:00
|
|
|
int offset = g->generate_code(selected->cfg, dispatch_width, selected->shader_stats,
|
|
|
|
|
selected->performance_analysis.require(), stats);
|
2020-09-04 12:40:06 -05:00
|
|
|
if (prog_offset)
|
|
|
|
|
*prog_offset = offset;
|
|
|
|
|
else
|
|
|
|
|
assert(offset == 0);
|
|
|
|
|
|
2022-11-07 16:21:17 -08:00
|
|
|
return dispatch_width;
|
2020-09-04 12:40:06 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
uint64_t
|
|
|
|
|
brw_bsr(const struct intel_device_info *devinfo,
|
|
|
|
|
uint32_t offset, uint8_t simd_size, uint8_t local_arg_offset)
|
|
|
|
|
{
|
|
|
|
|
assert(offset % 64 == 0);
|
|
|
|
|
assert(simd_size == 8 || simd_size == 16);
|
|
|
|
|
assert(local_arg_offset % 8 == 0);
|
|
|
|
|
|
|
|
|
|
return offset |
|
2022-01-31 12:43:04 +00:00
|
|
|
SET_BITS(simd_size == 8, 4, 4) |
|
2020-09-04 12:40:06 -05:00
|
|
|
SET_BITS(local_arg_offset / 8, 2, 0);
|
|
|
|
|
}
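/* Packing example (hypothetical arguments, and assuming SET_BITS(v, high, low)
 * places v in the inclusive bit range [high:low]): brw_bsr(devinfo, 0x100, 8,
 * 16) would yield 0x100 | (1 << 4) | (16 / 8) = 0x112, i.e. the 64-byte
 * aligned offset with the SIMD8 bit set and a local argument offset of two
 * 8-byte units.
 */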
|
|
|
|
|
|
|
|
|
|
const unsigned *
|
2021-03-23 21:21:40 -07:00
|
|
|
brw_compile_bs(const struct brw_compiler *compiler,
|
|
|
|
|
struct brw_compile_bs_params *params)
|
2020-09-04 12:40:06 -05:00
|
|
|
{
|
2023-07-14 02:10:20 +03:00
|
|
|
nir_shader *shader = params->base.nir;
|
2021-03-23 21:21:40 -07:00
|
|
|
struct brw_bs_prog_data *prog_data = params->prog_data;
|
|
|
|
|
unsigned num_resume_shaders = params->num_resume_shaders;
|
|
|
|
|
nir_shader **resume_shaders = params->resume_shaders;
|
2023-06-20 14:42:02 -07:00
|
|
|
const bool debug_enabled = brw_should_print_shader(shader, DEBUG_RT);
|
2020-09-04 12:40:06 -05:00
|
|
|
|
|
|
|
|
prog_data->base.stage = shader->info.stage;
|
2021-10-26 16:39:08 +03:00
|
|
|
prog_data->base.ray_queries = shader->info.ray_queries;
|
2022-02-28 15:13:07 +02:00
|
|
|
prog_data->base.total_scratch = 0;
|
|
|
|
|
|
2020-09-04 12:40:06 -05:00
|
|
|
prog_data->max_stack_size = 0;
|
2021-10-13 13:05:59 +00:00
|
|
|
prog_data->num_resume_shaders = num_resume_shaders;
|
2020-09-04 12:40:06 -05:00
|
|
|
|
2023-07-14 02:10:20 +03:00
|
|
|
fs_generator g(compiler, ¶ms->base, &prog_data->base,
|
2024-02-27 12:23:52 -08:00
|
|
|
shader->info.stage);
|
2021-03-23 11:12:40 -07:00
|
|
|
if (unlikely(debug_enabled)) {
|
2023-07-14 02:10:20 +03:00
|
|
|
char *name = ralloc_asprintf(params->base.mem_ctx,
|
|
|
|
|
"%s %s shader %s",
|
2020-10-21 14:46:50 -05:00
|
|
|
shader->info.label ?
|
|
|
|
|
shader->info.label : "unnamed",
|
|
|
|
|
gl_shader_stage_name(shader->info.stage),
|
|
|
|
|
shader->info.name);
|
|
|
|
|
g.enable_debug(name);
|
|
|
|
|
}
|
|
|
|
|
|
2020-09-04 12:40:06 -05:00
|
|
|
prog_data->simd_size =
|
2023-07-14 02:10:20 +03:00
|
|
|
compile_single_bs(compiler, params, params->key, prog_data,
|
|
|
|
|
shader, &g, params->base.stats, NULL);
|
2020-09-04 12:40:06 -05:00
|
|
|
if (prog_data->simd_size == 0)
|
|
|
|
|
return NULL;
|
2020-10-21 14:46:50 -05:00
|
|
|
|
2023-07-14 02:10:20 +03:00
|
|
|
uint64_t *resume_sbt = ralloc_array(params->base.mem_ctx,
|
|
|
|
|
uint64_t, num_resume_shaders);
|
2020-09-04 12:40:06 -05:00
|
|
|
for (unsigned i = 0; i < num_resume_shaders; i++) {
|
2021-10-13 11:21:41 +02:00
|
|
|
if (INTEL_DEBUG(DEBUG_RT)) {
|
2023-07-14 02:10:20 +03:00
|
|
|
char *name = ralloc_asprintf(params->base.mem_ctx,
|
|
|
|
|
"%s %s resume(%u) shader %s",
|
2020-09-04 12:40:06 -05:00
|
|
|
shader->info.label ?
|
|
|
|
|
shader->info.label : "unnamed",
|
|
|
|
|
gl_shader_stage_name(shader->info.stage),
|
|
|
|
|
i, shader->info.name);
|
|
|
|
|
g.enable_debug(name);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* TODO: Figure out shader stats etc. for resume shaders */
|
|
|
|
|
int offset = 0;
|
|
|
|
|
uint8_t simd_size =
|
2023-07-14 02:10:20 +03:00
|
|
|
compile_single_bs(compiler, params, params->key,
|
|
|
|
|
prog_data, resume_shaders[i], &g, NULL, &offset);
|
2020-09-04 12:40:06 -05:00
|
|
|
if (simd_size == 0)
|
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
|
|
assert(offset > 0);
|
|
|
|
|
resume_sbt[i] = brw_bsr(compiler->devinfo, offset, simd_size, 0);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
   /* We only have one constant data section, so we want to make sure every
|
|
|
|
|
    * resume shader's constant data matches the main shader's.
|
|
|
|
|
*/
|
|
|
|
|
for (unsigned i = 0; i < num_resume_shaders; i++) {
|
|
|
|
|
assert(resume_shaders[i]->constant_data_size ==
|
|
|
|
|
shader->constant_data_size);
|
|
|
|
|
assert(memcmp(resume_shaders[i]->constant_data,
|
|
|
|
|
shader->constant_data,
|
|
|
|
|
shader->constant_data_size) == 0);
|
|
|
|
|
}
|
2020-10-21 14:46:50 -05:00
|
|
|
|
|
|
|
|
g.add_const_data(shader->constant_data, shader->constant_data_size);
|
2020-09-04 12:40:06 -05:00
|
|
|
g.add_resume_sbt(num_resume_shaders, resume_sbt);
|
2020-10-21 14:46:50 -05:00
|
|
|
|
|
|
|
|
return g.get_assembly();
|
|
|
|
|
}
|
|
|
|
|
|
2020-01-14 12:22:47 -08:00
|
|
|
unsigned
|
|
|
|
|
fs_visitor::workgroup_size() const
|
|
|
|
|
{
|
2021-05-18 10:01:49 -07:00
|
|
|
assert(gl_shader_stage_uses_workgroup(stage));
|
2020-01-14 12:22:47 -08:00
|
|
|
const struct brw_cs_prog_data *cs = brw_cs_prog_data(prog_data);
|
|
|
|
|
return cs->local_size[0] * cs->local_size[1] * cs->local_size[2];
|
|
|
|
|
}
|
2023-06-20 14:42:02 -07:00
|
|
|
|
|
|
|
|
bool brw_should_print_shader(const nir_shader *shader, uint64_t debug_flag)
|
|
|
|
|
{
|
|
|
|
|
return INTEL_DEBUG(debug_flag) && (!shader->info.internal || NIR_DEBUG(PRINT_INTERNAL));
|
2023-06-21 07:51:00 -07:00
|
|
|
}
|
2023-11-21 07:49:02 -08:00
|
|
|
|
|
|
|
|
namespace brw {
|
|
|
|
|
fs_reg
|
|
|
|
|
fetch_payload_reg(const brw::fs_builder &bld, uint8_t regs[2],
|
2022-08-03 16:47:52 -07:00
|
|
|
brw_reg_type type, unsigned n)
|
2023-11-21 07:49:02 -08:00
|
|
|
{
|
|
|
|
|
if (!regs[0])
|
|
|
|
|
return fs_reg();
|
|
|
|
|
|
|
|
|
|
if (bld.dispatch_width() > 16) {
|
2022-08-03 16:47:52 -07:00
|
|
|
const fs_reg tmp = bld.vgrf(type, n);
|
2023-11-21 07:49:02 -08:00
|
|
|
const brw::fs_builder hbld = bld.exec_all().group(16, 0);
|
|
|
|
|
const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
|
2022-08-03 16:47:52 -07:00
|
|
|
fs_reg *const components = new fs_reg[m * n];
|
2023-11-21 07:49:02 -08:00
|
|
|
|
2022-08-03 16:47:52 -07:00
|
|
|
for (unsigned c = 0; c < n; c++) {
|
|
|
|
|
for (unsigned g = 0; g < m; g++)
|
|
|
|
|
components[c * m + g] =
|
|
|
|
|
offset(retype(brw_vec8_grf(regs[g], 0), type), hbld, c);
|
|
|
|
|
}
|
2023-11-21 07:49:02 -08:00
|
|
|
|
2022-08-03 16:47:52 -07:00
|
|
|
hbld.LOAD_PAYLOAD(tmp, components, m * n, 0);
|
2023-11-21 07:49:02 -08:00
|
|
|
|
2022-08-03 16:47:52 -07:00
|
|
|
delete[] components;
|
2023-11-21 07:49:02 -08:00
|
|
|
return tmp;
|
|
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
return fs_reg(retype(brw_vec8_grf(regs[0], 0), type));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fs_reg
|
|
|
|
|
fetch_barycentric_reg(const brw::fs_builder &bld, uint8_t regs[2])
|
|
|
|
|
{
|
|
|
|
|
if (!regs[0])
|
|
|
|
|
return fs_reg();
|
2023-12-01 21:51:19 -08:00
|
|
|
else if (bld.shader->devinfo->ver >= 20)
|
2024-04-20 17:08:02 -07:00
|
|
|
return fetch_payload_reg(bld, regs, BRW_TYPE_F, 2);
|
2023-11-21 07:49:02 -08:00
|
|
|
|
2024-04-20 17:08:02 -07:00
|
|
|
const fs_reg tmp = bld.vgrf(BRW_TYPE_F, 2);
|
2023-11-21 07:49:02 -08:00
|
|
|
const brw::fs_builder hbld = bld.exec_all().group(8, 0);
|
|
|
|
|
const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
|
|
|
|
|
fs_reg *const components = new fs_reg[2 * m];
|
|
|
|
|
|
|
|
|
|
for (unsigned c = 0; c < 2; c++) {
|
|
|
|
|
for (unsigned g = 0; g < m; g++)
|
|
|
|
|
components[c * m + g] = offset(brw_vec8_grf(regs[g / 2], 0),
|
|
|
|
|
hbld, c + 2 * (g % 2));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
hbld.LOAD_PAYLOAD(tmp, components, 2 * m, 0);
|
|
|
|
|
|
|
|
|
|
delete[] components;
|
|
|
|
|
return tmp;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
check_dynamic_msaa_flag(const fs_builder &bld,
|
|
|
|
|
const struct brw_wm_prog_data *wm_prog_data,
|
2024-02-01 13:17:42 -08:00
|
|
|
enum intel_msaa_flags flag)
|
2023-11-21 07:49:02 -08:00
|
|
|
{
|
|
|
|
|
fs_inst *inst = bld.AND(bld.null_reg_ud(),
|
|
|
|
|
dynamic_msaa_flags(wm_prog_data),
|
|
|
|
|
brw_imm_ud(flag));
|
|
|
|
|
inst->conditional_mod = BRW_CONDITIONAL_NZ;
|
|
|
|
|
}
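/* Usage sketch (hypothetical call site; the flag name and follow-up are only
 * illustrative, not taken from this file):
 *
 *    check_dynamic_msaa_flag(bld, wm_prog_data,
 *                            INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH);
 *    set_predicate(BRW_PREDICATE_NORMAL, bld.emit(...));
 *
 * The AND above targets the null register and only updates the NZ conditional
 * flag, so the caller is expected to predicate the immediately following
 * instruction on that flag.
 */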
|
|
|
|
|
}
|