/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs_generator.cpp
 *
 * This file supports generating code from the FS LIR to the actual
 * native instructions.
 */

#include "brw_eu.h"
#include "brw_disasm_info.h"
#include "brw_fs.h"
#include "brw_cfg.h"
#include "dev/intel_debug.h"
#include "util/mesa-sha1.h"
#include "util/half_float.h"

static uint32_t
brw_math_function(enum opcode op)
{
   switch (op) {
   case SHADER_OPCODE_RCP:
      return BRW_MATH_FUNCTION_INV;
   case SHADER_OPCODE_RSQ:
      return BRW_MATH_FUNCTION_RSQ;
   case SHADER_OPCODE_SQRT:
      return BRW_MATH_FUNCTION_SQRT;
   case SHADER_OPCODE_EXP2:
      return BRW_MATH_FUNCTION_EXP;
   case SHADER_OPCODE_LOG2:
      return BRW_MATH_FUNCTION_LOG;
   case SHADER_OPCODE_POW:
      return BRW_MATH_FUNCTION_POW;
   case SHADER_OPCODE_SIN:
      return BRW_MATH_FUNCTION_SIN;
   case SHADER_OPCODE_COS:
      return BRW_MATH_FUNCTION_COS;
   case SHADER_OPCODE_INT_QUOTIENT:
      return BRW_MATH_FUNCTION_INT_DIV_QUOTIENT;
   case SHADER_OPCODE_INT_REMAINDER:
      return BRW_MATH_FUNCTION_INT_DIV_REMAINDER;
   default:
      unreachable("not reached: unknown math function");
   }
}

static enum brw_reg_file
brw_file_from_reg(fs_reg *reg)
{
   switch (reg->file) {
   case ARF:
      return BRW_ARCHITECTURE_REGISTER_FILE;
   case FIXED_GRF:
   case VGRF:
      return BRW_GENERAL_REGISTER_FILE;
   case IMM:
      return BRW_IMMEDIATE_VALUE;
   case BAD_FILE:
   case ATTR:
   case UNIFORM:
      unreachable("not reached");
   }
   return BRW_ARCHITECTURE_REGISTER_FILE;
}

static struct brw_reg
brw_reg_from_fs_reg(const struct intel_device_info *devinfo, fs_inst *inst,
                    fs_reg *reg, bool compressed)
{
   struct brw_reg brw_reg;

   switch (reg->file) {
   case VGRF:
      if (reg->stride == 0) {
         brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->nr, 0);
      } else {
         /* From the Haswell PRM:
          *
          *  "VertStride must be used to cross GRF register boundaries. This
          *   rule implies that elements within a 'Width' cannot cross GRF
          *   boundaries."
          *
          * The maximum width value that could satisfy this restriction is:
          */
         const unsigned reg_width = REG_SIZE / (reg->stride * type_sz(reg->type));
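         /* For example, a stride-2 dword source gives 32 / (2 * 4) = 4,
          * i.e. at most four elements per row, assuming a 32-byte GRF.
          */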

         /* Because the hardware can only split source regions at a whole
          * multiple of width during decompression (i.e. vertically), clamp
          * the value obtained above to the physical execution size of a
          * single decompressed chunk of the instruction:
          */
         const unsigned phys_width = compressed ? inst->exec_size / 2 :
                                                  inst->exec_size;

         const unsigned max_hw_width = 16;

         /* XXX - The equation above is strictly speaking not correct on
          *       hardware that supports unbalanced GRF writes -- On Gfx9+
          *       each decompressed chunk of the instruction may have a
          *       different execution size when the number of components
          *       written to each destination GRF is not the same.
          */
         if (reg->stride > 4) {
            assert(reg != &inst->dst);
            assert(reg->stride * type_sz(reg->type) <= REG_SIZE);
            brw_reg = brw_vecn_reg(1, brw_file_from_reg(reg), reg->nr, 0);
            brw_reg = stride(brw_reg, reg->stride, 1, 0);
         } else {
            const unsigned width = MIN3(reg_width, phys_width, max_hw_width);
            brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0);
            brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride);
         }
      }

      brw_reg = retype(brw_reg, reg->type);
      brw_reg = byte_offset(brw_reg, reg->offset);
      brw_reg.abs = reg->abs;
      brw_reg.negate = reg->negate;
      break;
   case ARF:
   case FIXED_GRF:
   case IMM:
      assert(reg->offset == 0);
      brw_reg = reg->as_brw_reg();
      break;
   case BAD_FILE:
      /* Probably unused. */
      brw_reg = brw_null_reg();
      break;
   case ATTR:
   case UNIFORM:
      unreachable("not reached");
   }

   return brw_reg;
}

fs_generator::fs_generator(const struct brw_compiler *compiler,
                           const struct brw_compile_params *params,
                           struct brw_stage_prog_data *prog_data,
                           gl_shader_stage stage)

   : compiler(compiler), params(params),
     devinfo(compiler->devinfo),
     prog_data(prog_data), dispatch_width(0),
     debug_flag(false),
     shader_name(NULL), stage(stage), mem_ctx(params->mem_ctx)
{
   p = rzalloc(mem_ctx, struct brw_codegen);
   brw_init_codegen(&compiler->isa, p, mem_ctx);
}

fs_generator::~fs_generator()
{
}

class ip_record : public exec_node {
public:
   DECLARE_RALLOC_CXX_OPERATORS(ip_record)

   ip_record(int ip)
   {
      this->ip = ip;
   }

   int ip;
};

bool
fs_generator::patch_halt_jumps()
{
   if (this->discard_halt_patches.is_empty())
      return false;

   int scale = brw_jump_scale(p->devinfo);

   /* There is a somewhat strange undocumented requirement of using
    * HALT, according to the simulator.  If some channel has HALTed to
    * a particular UIP, then by the end of the program, every channel
    * must have HALTed to that UIP.  Furthermore, the tracking is a
    * stack, so you can't do the final halt of a UIP after starting
    * halting to a new UIP.
    *
    * Symptoms of not emitting this instruction on actual hardware
    * included GPU hangs and sparkly rendering on the piglit discard
    * tests.
    */
   brw_inst *last_halt = brw_HALT(p);
   brw_inst_set_uip(p->devinfo, last_halt, 1 * scale);
   brw_inst_set_jip(p->devinfo, last_halt, 1 * scale);

   int ip = p->nr_insn;

   foreach_in_list(ip_record, patch_ip, &discard_halt_patches) {
      brw_inst *patch = &p->store[patch_ip->ip];

      assert(brw_inst_opcode(p->isa, patch) == BRW_OPCODE_HALT);
      /* HALT takes a half-instruction distance from the pre-incremented IP. */
      brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale);
   }

   this->discard_halt_patches.make_empty();

   return true;
}

void
fs_generator::generate_send(fs_inst *inst,
                            struct brw_reg dst,
                            struct brw_reg desc,
                            struct brw_reg ex_desc,
                            struct brw_reg payload,
                            struct brw_reg payload2)
{
   const unsigned rlen = inst->dst.is_null() ? 0 : inst->size_written / REG_SIZE;

   uint32_t desc_imm = inst->desc |
      brw_message_desc(devinfo, inst->mlen, rlen, inst->header_size);

   uint32_t ex_desc_imm = inst->ex_desc |
      brw_message_ex_desc(devinfo, inst->ex_mlen);

   if (ex_desc.file != BRW_IMMEDIATE_VALUE || ex_desc.ud || ex_desc_imm ||
       inst->send_ex_desc_scratch) {
      /* If we have any sort of extended descriptor, then we need SENDS.  This
       * also covers the dual-payload case because ex_mlen goes in ex_desc.
       */
      brw_send_indirect_split_message(p, inst->sfid, dst, payload, payload2,
                                      desc, desc_imm, ex_desc, ex_desc_imm,
                                      inst->send_ex_desc_scratch,
                                      inst->send_ex_bso, inst->eot);
      if (inst->check_tdr)
         brw_inst_set_opcode(p->isa, brw_last_inst,
                             devinfo->ver >= 12 ? BRW_OPCODE_SENDC : BRW_OPCODE_SENDSC);
   } else {
      brw_send_indirect_message(p, inst->sfid, dst, payload, desc, desc_imm,
                                inst->eot);
      if (inst->check_tdr)
         brw_inst_set_opcode(p->isa, brw_last_inst, BRW_OPCODE_SENDC);
   }
}

void
fs_generator::generate_mov_indirect(fs_inst *inst,
                                    struct brw_reg dst,
                                    struct brw_reg reg,
                                    struct brw_reg indirect_byte_offset)
{
   assert(indirect_byte_offset.type == BRW_REGISTER_TYPE_UD);
   assert(indirect_byte_offset.file == BRW_GENERAL_REGISTER_FILE);
   assert(!reg.abs && !reg.negate);

   /* Gen12.5 adds the following region restriction:
    *
    *    "Vx1 and VxH indirect addressing for Float, Half-Float, Double-Float
    *    and Quad-Word data must not be used."
    *
    * We require the source and destination types to match so stomp to an
    * unsigned integer type.
    */
   assert(reg.type == dst.type);
   reg.type = dst.type = brw_reg_type_from_bit_size(type_sz(reg.type) * 8,
                                                    BRW_REGISTER_TYPE_UD);

   unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr;
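   /* e.g. a region starting at GRF 3, byte 4 yields 3 * 32 + 4 = 100,
    * assuming the usual 32-byte GRF size.
    */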

   if (indirect_byte_offset.file == BRW_IMMEDIATE_VALUE) {
      imm_byte_offset += indirect_byte_offset.ud;

      reg.nr = imm_byte_offset / REG_SIZE;
      reg.subnr = imm_byte_offset % REG_SIZE;
      if (type_sz(reg.type) > 4 && !devinfo->has_64bit_int) {
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                    subscript(reg, BRW_REGISTER_TYPE_D, 0));
         brw_set_default_swsb(p, tgl_swsb_null());
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                    subscript(reg, BRW_REGISTER_TYPE_D, 1));
      } else {
         brw_MOV(p, dst, reg);
      }
   } else {
      /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
      struct brw_reg addr = vec8(brw_address_reg(0));

      /* Whether we can use destination dependency control without running the
       * risk of a hang if an instruction gets shot down.
       */
      const bool use_dep_ctrl = !inst->predicate &&
                                inst->exec_size == dispatch_width;
      brw_inst *insn;

      /* The destination stride of an instruction (in bytes) must be greater
       * than or equal to the size of the rest of the instruction.  Since the
       * address register is of type UW, we can't use a D-type instruction.
       * In order to get around this, we retype to UW and use a stride.
       */
      indirect_byte_offset =
         retype(spread(indirect_byte_offset, 2), BRW_REGISTER_TYPE_UW);

      /* There are a number of reasons why we don't use the base offset here.
       * One reason is that the field is only 9 bits which means we can only
       * use it to access the first 16 GRFs.  Also, from the Haswell PRM
       * section "Register Region Restrictions":
       *
       *    "The lower bits of the AddressImmediate must not overflow to
       *    change the register address.  The lower 5 bits of Address
       *    Immediate when added to lower 5 bits of address register gives
       *    the sub-register offset. The upper bits of Address Immediate
       *    when added to upper bits of address register gives the register
       *    address. Any overflow from sub-register offset is dropped."
       *
       * Since the indirect may cause us to cross a register boundary, this
       * makes the base offset almost useless.  We could try and do something
       * clever where we use an actual base offset if base_offset % 32 == 0 but
       * that would mean we were generating different code depending on the
       * base offset.  Instead, for the sake of consistency, we'll just do the
       * add ourselves.  This restriction is only listed in the Haswell PRM
       * but empirical testing indicates that it applies on all older
       * generations and is lifted on Broadwell.
       *
       * In the end, while base_offset is nice to look at in the generated
       * code, using it saves us 0 instructions and would require quite a bit
       * of case-by-case work.  It's just not worth it.
       *
       * Due to a hardware bug some platforms (particularly Gfx11+) seem to
       * require the address components of all channels to be valid whether or
       * not they're active, which causes issues if we use VxH addressing
       * under non-uniform control-flow.  We can easily work around that by
       * initializing the whole address register with a pipelined NoMask MOV
       * instruction.
       */
      insn = brw_MOV(p, addr, brw_imm_uw(imm_byte_offset));
      brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
      brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
      if (devinfo->ver >= 12)
         brw_set_default_swsb(p, tgl_swsb_null());
      else
         brw_inst_set_no_dd_clear(devinfo, insn, use_dep_ctrl);

      insn = brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset));
      if (devinfo->ver >= 12)
         brw_set_default_swsb(p, tgl_swsb_regdist(1));
      else
         brw_inst_set_no_dd_check(devinfo, insn, use_dep_ctrl);

      if (type_sz(reg.type) > 4 &&
          (intel_device_info_is_9lp(devinfo) || !devinfo->has_64bit_int)) {
         /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
          *
          *    "When source or destination datatype is 64b or operation is
          *    integer DWord multiply, indirect addressing must not be used."
          *
          * We may also not support Q/UQ types.
          *
          * To work around both of these, we do two integer MOVs instead
          * of one 64-bit MOV.  Because no double value should ever cross
          * a register boundary, it's safe to use the immediate offset in
          * the indirect here to handle adding 4 bytes to the offset and
          * avoid the extra ADD to the register file.
          */
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                    retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D));
         brw_set_default_swsb(p, tgl_swsb_null());
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                    retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D));
      } else {
         struct brw_reg ind_src = brw_VxH_indirect(0, 0);

         brw_MOV(p, dst, retype(ind_src, reg.type));
      }
   }
}

void
fs_generator::generate_shuffle(fs_inst *inst,
                               struct brw_reg dst,
                               struct brw_reg src,
                               struct brw_reg idx)
{
   assert(src.file == BRW_GENERAL_REGISTER_FILE);
   assert(!src.abs && !src.negate);

   /* Ivy bridge has some strange behavior that makes this a real pain to
    * implement for 64-bit values so we just don't bother.
    */
   assert(devinfo->has_64bit_float || type_sz(src.type) <= 4);

   /* Gen12.5 adds the following region restriction:
    *
    *    "Vx1 and VxH indirect addressing for Float, Half-Float, Double-Float
    *    and Quad-Word data must not be used."
    *
    * We require the source and destination types to match so stomp to an
    * unsigned integer type.
    */
   assert(src.type == dst.type);
   src.type = dst.type = brw_reg_type_from_bit_size(type_sz(src.type) * 8,
                                                    BRW_REGISTER_TYPE_UD);

   /* Because we're using the address register, we're limited to 16-wide
    * by the address register file and 8-wide for 64-bit types.  We could try
    * and make this instruction splittable higher up in the compiler but that
    * gets weird because it reads all of the channels regardless of execution
    * size.  It's easier just to split it here.
    */
   const unsigned lower_width =
      element_sz(src) > 4 || element_sz(dst) > 4 ? 8 :
      MIN2(16, inst->exec_size);
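   /* e.g. a 64-bit shuffle in a SIMD16 program is emitted as two SIMD8
    * groups by the loop below.
    */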

   brw_set_default_exec_size(p, cvt(lower_width) - 1);
   for (unsigned group = 0; group < inst->exec_size; group += lower_width) {
      brw_set_default_group(p, group);

      if ((src.vstride == 0 && src.hstride == 0) ||
          idx.file == BRW_IMMEDIATE_VALUE) {
         /* Trivial, the source is already uniform or the index is a constant.
          * We will typically not get here if the optimizer is doing its job,
          * but asserting would be mean.
          */
         const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
         struct brw_reg group_src = stride(suboffset(src, i), 0, 1, 0);
         struct brw_reg group_dst = suboffset(dst, group << (dst.hstride - 1));
         brw_MOV(p, group_dst, group_src);
      } else {
         /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
         struct brw_reg addr = vec8(brw_address_reg(0));

         struct brw_reg group_idx = suboffset(idx, group);

         if (lower_width == 8 && group_idx.width == BRW_WIDTH_16) {
            /* Things get grumpy if the register is too wide. */
            group_idx.width--;
            group_idx.vstride--;
         }

         assert(type_sz(group_idx.type) <= 4);
         if (type_sz(group_idx.type) == 4) {
            /* The destination stride of an instruction (in bytes) must be
             * greater than or equal to the size of the rest of the
             * instruction.  Since the address register is of type UW, we
             * can't use a D-type instruction.  In order to get around this,
             * we retype to UW and use a stride.
             */
            group_idx = retype(spread(group_idx, 2), BRW_REGISTER_TYPE_W);
         }

         uint32_t src_start_offset = src.nr * REG_SIZE + src.subnr;

         /* From the Haswell PRM:
          *
          *    "When a sequence of NoDDChk and NoDDClr are used, the last
          *    instruction that completes the scoreboard clear must have a
          *    non-zero execution mask. This means, if any kind of predication
          *    can change the execution mask or channel enable of the last
          *    instruction, the optimization must be avoided. This is to
          *    avoid instructions being shot down the pipeline when no writes
          *    are required."
          *
          * Whenever predication is enabled or the instructions being emitted
          * aren't the full width, it's possible that it will be run with zero
          * channels enabled so we can't use dependency control without
          * running the risk of a hang if an instruction gets shot down.
          */
         const bool use_dep_ctrl = !inst->predicate &&
                                   lower_width == dispatch_width;
         brw_inst *insn;

         /* Due to a hardware bug some platforms (particularly Gfx11+) seem
          * to require the address components of all channels to be valid
          * whether or not they're active, which causes issues if we use VxH
          * addressing under non-uniform control-flow.  We can easily work
          * around that by initializing the whole address register with a
          * pipelined NoMask MOV instruction.
          */
         insn = brw_MOV(p, addr, brw_imm_uw(src_start_offset));
         brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
         brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
         if (devinfo->ver >= 12)
            brw_set_default_swsb(p, tgl_swsb_null());
         else
            brw_inst_set_no_dd_clear(devinfo, insn, use_dep_ctrl);

         /* Take into account the component size and horizontal stride. */
         assert(src.vstride == src.hstride + src.width);
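         /* e.g. for a tightly packed dword source this shifts the index left
          * by 2, turning a channel index into a byte offset.
          */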
         insn = brw_SHL(p, addr, group_idx,
                        brw_imm_uw(util_logbase2(type_sz(src.type)) +
                                   src.hstride - 1));
         if (devinfo->ver >= 12)
            brw_set_default_swsb(p, tgl_swsb_regdist(1));
         else
            brw_inst_set_no_dd_check(devinfo, insn, use_dep_ctrl);

         /* Add on the register start offset */
         brw_ADD(p, addr, addr, brw_imm_uw(src_start_offset));
         brw_MOV(p, suboffset(dst, group << (dst.hstride - 1)),
                 retype(brw_VxH_indirect(0, 0), src.type));
      }

      brw_set_default_swsb(p, tgl_swsb_null());
   }
}

void
fs_generator::generate_quad_swizzle(const fs_inst *inst,
                                    struct brw_reg dst, struct brw_reg src,
                                    unsigned swiz)
{
   /* Requires a quad. */
   assert(inst->exec_size >= 4);

   if (src.file == BRW_IMMEDIATE_VALUE ||
       has_scalar_region(src)) {
      /* The value is uniform across all channels */
      brw_MOV(p, dst, src);

   } else if (devinfo->ver < 11 && type_sz(src.type) == 4) {
      /* This only works on 8-wide 32-bit values */
      assert(inst->exec_size == 8);
      assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src.vstride == src.width + 1);
      brw_set_default_access_mode(p, BRW_ALIGN_16);
      struct brw_reg swiz_src = stride(src, 4, 4, 1);
      swiz_src.swizzle = swiz;
      brw_MOV(p, dst, swiz_src);

   } else {
      assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src.vstride == src.width + 1);
      const struct brw_reg src_0 = suboffset(src, BRW_GET_SWZ(swiz, 0));

      switch (swiz) {
      case BRW_SWIZZLE_XXXX:
      case BRW_SWIZZLE_YYYY:
      case BRW_SWIZZLE_ZZZZ:
      case BRW_SWIZZLE_WWWW:
         brw_MOV(p, dst, stride(src_0, 4, 4, 0));
         break;

      case BRW_SWIZZLE_XXZZ:
      case BRW_SWIZZLE_YYWW:
         brw_MOV(p, dst, stride(src_0, 2, 2, 0));
         break;

      case BRW_SWIZZLE_XYXY:
      case BRW_SWIZZLE_ZWZW:
         assert(inst->exec_size == 4);
         brw_MOV(p, dst, stride(src_0, 0, 2, 1));
         break;

      default:
         assert(inst->force_writemask_all);
         brw_set_default_exec_size(p, cvt(inst->exec_size / 4) - 1);

         for (unsigned c = 0; c < 4; c++) {
            brw_inst *insn = brw_MOV(
               p, stride(suboffset(dst, c),
                         4 * inst->dst.stride, 1, 4 * inst->dst.stride),
               stride(suboffset(src, BRW_GET_SWZ(swiz, c)), 4, 1, 0));

            if (devinfo->ver < 12) {
               brw_inst_set_no_dd_clear(devinfo, insn, c < 3);
               brw_inst_set_no_dd_check(devinfo, insn, c > 0);
            }

            brw_set_default_swsb(p, tgl_swsb_null());
         }

         break;
      }
   }
}

void
fs_generator::generate_barrier(fs_inst *, struct brw_reg src)
{
   brw_barrier(p, src);
   if (devinfo->ver >= 12) {
      brw_set_default_swsb(p, tgl_swsb_null());
      brw_SYNC(p, TGL_SYNC_BAR);
   } else {
      brw_WAIT(p);
   }
}

/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
 * looking like:
 *
 *  arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
 *
 * Ideally, we want to produce:
 *
 *            DDX                    DDY
 * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
 *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
 *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
 *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
 *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
 *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
 *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
 *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
 *
 * and add another set of two more subspans if in 16-pixel dispatch mode.
 *
 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
 * pair.  But the ideal approximation may impose a huge performance cost on
 * sample_d.  On at least Haswell, the sample_d instruction does some
 * optimizations if the same LOD is used for all pixels in the subspan.
 *
 * For DDY, we need to use ALIGN16 mode since it's capable of doing the
 * appropriate swizzling.
 */
void
fs_generator::generate_ddx(const fs_inst *inst,
                           struct brw_reg dst, struct brw_reg src)
{
   unsigned vstride, width;

   if (inst->opcode == FS_OPCODE_DDX_FINE) {
      /* produce accurate derivatives */
      vstride = BRW_VERTICAL_STRIDE_2;
      width = BRW_WIDTH_2;
   } else {
      /* replicate the derivative at the top-left pixel to other pixels */
      vstride = BRW_VERTICAL_STRIDE_4;
      width = BRW_WIDTH_4;
   }

   struct brw_reg src0 = byte_offset(src, type_sz(src.type));
   struct brw_reg src1 = src;

   src0.vstride = vstride;
   src0.width = width;
   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   src1.vstride = vstride;
   src1.width = width;
   src1.hstride = BRW_HORIZONTAL_STRIDE_0;
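   /* With these <2;2,0> (fine) or <4;4,0> (coarse) regions, src0 reads the
    * element one slot to the right of src1, so the ADD below computes the
    * right-minus-left difference for every channel of each group.
    */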

   brw_ADD(p, dst, src0, negate(src1));
}

/* The negate_value boolean is used to negate the derivative computation for
 * FBOs, since they place the origin at the upper left instead of the lower
 * left.
 */
void
fs_generator::generate_ddy(const fs_inst *inst,
                           struct brw_reg dst, struct brw_reg src)
{
   const uint32_t type_size = type_sz(src.type);

   if (inst->opcode == FS_OPCODE_DDY_FINE) {
      /* produce accurate derivatives.
       *
       * From the Broadwell PRM, Volume 7 (3D-Media-GPGPU)
       * "Register Region Restrictions", Section "1. Special Restrictions":
       *
       *    "In Align16 mode, the channel selects and channel enables apply to
       *     a pair of half-floats, because these parameters are defined for
       *     DWord elements ONLY.  This is applicable when both source and
       *     destination are half-floats."
       *
       * So for half-float operations we use the Gfx11+ Align1 path. CHV
       * inherits its FP16 hardware from SKL, so it is not affected.
       */
      if (devinfo->ver >= 11) {
         src = stride(src, 0, 2, 1);

         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_4);
         for (uint32_t g = 0; g < inst->exec_size; g += 4) {
            brw_set_default_group(p, inst->group + g);
            brw_ADD(p, byte_offset(dst, g * type_size),
                       negate(byte_offset(src, g * type_size)),
                       byte_offset(src, (g + 2) * type_size));
            brw_set_default_swsb(p, tgl_swsb_null());
         }
         brw_pop_insn_state(p);
      } else {
         struct brw_reg src0 = stride(src, 4, 4, 1);
         struct brw_reg src1 = stride(src, 4, 4, 1);
         src0.swizzle = BRW_SWIZZLE_XYXY;
         src1.swizzle = BRW_SWIZZLE_ZWZW;

         brw_push_insn_state(p);
         brw_set_default_access_mode(p, BRW_ALIGN_16);
         brw_ADD(p, dst, negate(src0), src1);
         brw_pop_insn_state(p);
      }
   } else {
      /* replicate the derivative at the top-left pixel to other pixels */
      struct brw_reg src0 = byte_offset(stride(src, 4, 4, 0), 0 * type_size);
      struct brw_reg src1 = byte_offset(stride(src, 4, 4, 0), 2 * type_size);

      brw_ADD(p, dst, negate(src0), src1);
   }
}

void
fs_generator::generate_halt(fs_inst *)
{
   /* This HALT will be patched up at FB write time to point UIP at the end of
    * the program, and at brw_uip_jip() JIP will be set to the end of the
    * current block (or the program).
    */
   this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));
   brw_HALT(p);
}

/* The A32 messages take a buffer base address in header.5:[31:0] (See
 * MH1_A32_PSM for typed messages or MH_A32_GO for byte/dword scattered
 * and OWord block messages in the SKL PRM Vol. 2d for more details.)
 * Unfortunately, there are a number of subtle differences:
 *
 * For the block read/write messages:
 *
 *   - We always stomp header.2 to fill in the actual scratch address (in
 *     units of OWORDs) so we don't care what's in there.
 *
 *   - They rely on per-thread scratch space value in header.3[3:0] to do
 *     bounds checking so that needs to be valid.  The upper bits of
 *     header.3 are ignored, though, so we can copy all of g0.3.
 *
 *   - They ignore header.5[9:0] and assume the address is 1KB aligned.
 *
 *
 * For the byte/dword scattered read/write messages:
 *
 *   - We want header.2 to be zero because that gets added to the per-channel
 *     offset in the non-header portion of the message.
 *
 *   - Contrary to what the docs claim, they don't do any bounds checking so
 *     the value of header.3[3:0] doesn't matter.
 *
 *   - They consider all of header.5 for the base address and header.5[9:0]
 *     are not ignored.  This means that we can't copy g0.5 verbatim because
 *     g0.5[9:0] contains the FFTID on most platforms.  Instead, we have to
 *     use an AND to mask off the bottom 10 bits.
 *
 *
 * For block messages, just copying g0 gives a valid header because all the
 * garbage gets ignored except for header.2 which we stomp as part of message
 * setup.  For byte/dword scattered messages, we can just zero out the header
 * and copy over the bits we need from g0.5.  This opcode, however, tries to
 * satisfy the requirements of both by starting with 0 and filling out the
 * information required by either set of opcodes.
 */
void
fs_generator::generate_scratch_header(fs_inst *inst, struct brw_reg dst)
{
   assert(inst->exec_size == 8 && inst->force_writemask_all);
   assert(dst.file == BRW_GENERAL_REGISTER_FILE);

   dst.type = BRW_REGISTER_TYPE_UD;

   brw_inst *insn = brw_MOV(p, dst, brw_imm_ud(0));
   if (devinfo->ver >= 12)
      brw_set_default_swsb(p, tgl_swsb_null());
   else
      brw_inst_set_no_dd_clear(p->devinfo, insn, true);

   /* Copy the per-thread scratch space size from g0.3[3:0] */
   brw_set_default_exec_size(p, BRW_EXECUTE_1);
   insn = brw_AND(p, suboffset(dst, 3),
                  retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
                  brw_imm_ud(INTEL_MASK(3, 0)));
   if (devinfo->ver < 12) {
      brw_inst_set_no_dd_clear(p->devinfo, insn, true);
      brw_inst_set_no_dd_check(p->devinfo, insn, true);
   }

   /* Copy the scratch base address from g0.5[31:10] */
   insn = brw_AND(p, suboffset(dst, 5),
                  retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
                  brw_imm_ud(INTEL_MASK(31, 10)));
   if (devinfo->ver < 12)
      brw_inst_set_no_dd_check(p->devinfo, insn, true);
}

void
fs_generator::enable_debug(const char *shader_name)
{
   debug_flag = true;
   this->shader_name = shader_name;
}

static gfx12_systolic_depth
translate_systolic_depth(unsigned d)
{
   /* Could also return (ffs(d) - 1) & 3. */
   switch (d) {
   case 2:  return BRW_SYSTOLIC_DEPTH_2;
   case 4:  return BRW_SYSTOLIC_DEPTH_4;
   case 8:  return BRW_SYSTOLIC_DEPTH_8;
   case 16: return BRW_SYSTOLIC_DEPTH_16;
   default: unreachable("Invalid systolic depth.");
   }
}

int
fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
                            struct shader_stats shader_stats,
                            const brw::performance &perf,
                            struct brw_compile_stats *stats,
                            unsigned max_polygons)
{
   /* align to 64 byte boundary. */
   brw_realign(p, 64);

   this->dispatch_width = dispatch_width;

   int start_offset = p->next_insn_offset;

   int loop_count = 0, send_count = 0, nop_count = 0, sync_nop_count = 0;
   bool is_accum_used = false;

   struct disasm_info *disasm_info = disasm_initialize(p->isa, cfg);

   foreach_block_and_inst (block, fs_inst, inst, cfg) {
      if (inst->opcode == SHADER_OPCODE_UNDEF)
         continue;

      struct brw_reg src[4], dst;
      unsigned int last_insn_offset = p->next_insn_offset;
      bool multiple_instructions_emitted = false;
      tgl_swsb swsb = inst->sched;

      /* From the Broadwell PRM, Volume 7, "3D-Media-GPGPU", in the
       * "Register Region Restrictions" section: for BDW, SKL:
       *
       *    "A POW/FDIV operation must not be followed by an instruction
       *     that requires two destination registers."
       *
       * The documentation is often lacking annotations for Atom parts,
       * and empirically this affects CHV as well.
       */
      if (devinfo->ver <= 9 &&
          p->nr_insn > 1 &&
          brw_inst_opcode(p->isa, brw_last_inst) == BRW_OPCODE_MATH &&
          brw_inst_math_function(devinfo, brw_last_inst) == BRW_MATH_FUNCTION_POW &&
          inst->dst.component_size(inst->exec_size) > REG_SIZE) {
         brw_NOP(p);
         last_insn_offset = p->next_insn_offset;

         /* In order to avoid spurious instruction count differences when the
          * instruction schedule changes, keep track of the number of inserted
          * NOPs.
          */
         nop_count++;
      }

      /* Wa_14010017096:
       *
       * Clear accumulator register before end of thread.
       */
      if (inst->eot && is_accum_used &&
          intel_needs_workaround(devinfo, 14010017096)) {
         brw_set_default_exec_size(p, BRW_EXECUTE_16);
         brw_set_default_group(p, 0);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
         brw_set_default_flag_reg(p, 0, 0);
         brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
         brw_MOV(p, brw_acc_reg(8), brw_imm_f(0.0f));
         last_insn_offset = p->next_insn_offset;
         swsb = tgl_swsb_dst_dep(swsb, 1);
      }
|
|
|
|
|
|
|
|
|
|
if (!is_accum_used && !inst->eot) {
|
|
|
|
|
is_accum_used = inst->writes_accumulator_implicitly(devinfo) ||
|
|
|
|
|
inst->dst.is_accumulator();
|
|
|
|
|
}
|
|
|
|
|
|
2023-11-02 13:57:20 +01:00
|
|
|
/* Wa_14013672992:
|
2021-05-24 23:21:10 -07:00
|
|
|
*
|
|
|
|
|
* Always use @1 SWSB for EOT.
|
|
|
|
|
*/
|
2023-11-02 13:57:20 +01:00
|
|
|
if (inst->eot && intel_needs_workaround(devinfo, 14013672992)) {
|
2021-05-24 23:21:10 -07:00
|
|
|
if (tgl_swsb_src_dep(swsb).mode) {
|
|
|
|
|
brw_set_default_exec_size(p, BRW_EXECUTE_1);
|
|
|
|
|
brw_set_default_mask_control(p, BRW_MASK_DISABLE);
|
|
|
|
|
brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
|
2023-10-11 11:04:12 +03:00
|
|
|
brw_set_default_flag_reg(p, 0, 0);
|
2021-05-24 23:21:10 -07:00
|
|
|
brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
|
|
|
|
|
brw_SYNC(p, TGL_SYNC_NOP);
|
|
|
|
|
last_insn_offset = p->next_insn_offset;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
swsb = tgl_swsb_dst_dep(swsb, 1);
|
|
|
|
|
}
|
|
|
|
|
|
2014-05-19 10:20:37 -07:00
|
|
|
if (unlikely(debug_flag))
|
2017-11-15 17:08:42 -08:00
|
|
|
disasm_annotate(disasm_info, inst, p->next_insn_offset);
|
2011-05-24 16:34:27 -07:00
|
|
|
|
2016-05-18 18:48:04 -07:00
|
|
|
/* If the instruction writes to more than one register, it needs to be
|
|
|
|
|
* explicitly marked as compressed on Gen <= 5. On Gen >= 6 the
|
|
|
|
|
* hardware figures out by itself what the right compression mode is,
|
|
|
|
|
* but we still need to know whether the instruction is compressed to
|
|
|
|
|
* set up the source register regions appropriately.
|
|
|
|
|
*
|
|
|
|
|
* XXX - This is wrong for instructions that write a single register but
|
|
|
|
|
       * read more than one, which should strictly speaking be treated as
|
|
|
|
|
* compressed. For instructions that don't write any registers it
|
|
|
|
|
* relies on the destination being a null register of the correct
|
|
|
|
|
* type and regioning so the instruction is considered compressed
|
|
|
|
|
* or not accordingly.
|
|
|
|
|
*/
|
2016-05-20 15:25:28 -07:00
|
|
|
const bool compressed =
|
|
|
|
|
inst->dst.component_size(inst->exec_size) > REG_SIZE;
|
2022-07-08 14:56:03 -07:00
|
|
|
|
2024-02-15 15:59:08 -08:00
|
|
|
if (devinfo->ver >= 20 && inst->group % 8 != 0) {
|
2022-07-08 14:56:03 -07:00
|
|
|
assert(inst->force_writemask_all);
|
|
|
|
|
assert(!inst->predicate && !inst->conditional_mod);
|
|
|
|
|
assert(!inst->writes_accumulator_implicitly(devinfo) &&
|
|
|
|
|
!inst->reads_accumulator_implicitly());
|
|
|
|
|
assert(inst->opcode != SHADER_OPCODE_SEL_EXEC);
|
|
|
|
|
brw_set_default_group(p, 0);
|
|
|
|
|
} else {
|
|
|
|
|
brw_set_default_group(p, inst->group);
|
|
|
|
|
}
|
2011-05-24 16:34:27 -07:00
|
|
|
|
2016-05-16 15:09:17 -07:00
|
|
|
for (unsigned int i = 0; i < inst->sources; i++) {
|
2016-07-18 07:27:56 +00:00
|
|
|
src[i] = brw_reg_from_fs_reg(devinfo, inst,
|
|
|
|
|
&inst->src[i], compressed);
|
2016-05-16 15:09:17 -07:00
|
|
|
/* The accumulator result appears to get used for the
|
|
|
|
|
* conditional modifier generation. When negating a UD
|
|
|
|
|
* value, there is a 33rd bit generated for the sign in the
|
|
|
|
|
* accumulator value, so now you can't check, for example,
|
|
|
|
|
* equality with a 32-bit value. See piglit fs-op-neg-uvec4.
|
|
|
|
|
*/
|
|
|
|
|
assert(!inst->conditional_mod ||
|
|
|
|
|
inst->src[i].type != BRW_REGISTER_TYPE_UD ||
|
|
|
|
|
!inst->src[i].negate);
|
|
|
|
|
}
|
2016-07-18 07:27:56 +00:00
|
|
|
dst = brw_reg_from_fs_reg(devinfo, inst,
|
|
|
|
|
&inst->dst, compressed);
|
2016-05-16 15:09:17 -07:00
|
|
|
|
2016-05-18 18:41:28 -07:00
|
|
|
brw_set_default_access_mode(p, BRW_ALIGN_1);
|
2016-05-16 15:09:17 -07:00
|
|
|
brw_set_default_predicate_control(p, inst->predicate);
|
|
|
|
|
brw_set_default_predicate_inverse(p, inst->predicate_inverse);
|
2021-03-29 15:40:04 -07:00
|
|
|
/* On gfx7 and above, hardware automatically adds the group onto the
|
2024-02-15 15:59:08 -08:00
|
|
|
* flag subregister number.
|
2018-05-17 20:51:24 -07:00
|
|
|
*/
|
2024-02-15 15:59:08 -08:00
|
|
|
const unsigned flag_subreg = inst->flag_subreg;
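      /* flag_subreg counts 16-bit flag subregisters across both flag
       * registers, so dividing by two selects f0 vs. f1 and the remainder
       * selects the 16-bit half within it.  (Comment added for clarity.)
       */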
|
2018-05-17 20:51:24 -07:00
|
|
|
brw_set_default_flag_reg(p, flag_subreg / 2, flag_subreg % 2);
|
2016-05-16 15:09:17 -07:00
|
|
|
brw_set_default_saturate(p, inst->saturate);
|
|
|
|
|
brw_set_default_mask_control(p, inst->force_writemask_all);
|
2022-03-07 16:28:54 -08:00
|
|
|
if (devinfo->ver >= 20 && inst->writes_accumulator) {
|
|
|
|
|
assert(inst->dst.is_accumulator() ||
|
|
|
|
|
inst->opcode == BRW_OPCODE_ADDC ||
|
|
|
|
|
inst->opcode == BRW_OPCODE_MACH ||
|
|
|
|
|
inst->opcode == BRW_OPCODE_SUBB);
|
|
|
|
|
} else {
|
|
|
|
|
brw_set_default_acc_write_control(p, inst->writes_accumulator);
|
|
|
|
|
}
|
2021-05-24 22:53:27 -07:00
|
|
|
brw_set_default_swsb(p, swsb);
|
2016-07-18 07:27:56 +00:00
|
|
|
|
|
|
|
|
unsigned exec_size = inst->exec_size;
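      /* cvt(exec_size) - 1 is the log2 of the execution size, which is
       * exactly the BRW_EXECUTE_* encoding (e.g. SIMD16 -> BRW_EXECUTE_16
       * == 4).  (Comment added for clarity.)
       */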
|
|
|
|
|
|
|
|
|
|
brw_set_default_exec_size(p, cvt(exec_size) - 1);
|
2016-05-16 15:09:17 -07:00
|
|
|
|
2016-06-20 12:13:14 +02:00
|
|
|
assert(inst->force_writemask_all || inst->exec_size >= 4);
|
2016-05-20 16:14:13 -07:00
|
|
|
assert(inst->force_writemask_all || inst->group % inst->exec_size == 0);
|
2022-08-03 16:51:43 -07:00
|
|
|
assert(inst->mlen <= BRW_MAX_MSG_LENGTH * reg_unit(devinfo));
|
2016-05-16 15:09:17 -07:00
|
|
|
|
2011-05-24 16:34:27 -07:00
|
|
|
switch (inst->opcode) {
|
2019-09-03 17:51:17 -07:00
|
|
|
case BRW_OPCODE_SYNC:
|
|
|
|
|
assert(src[0].file == BRW_IMMEDIATE_VALUE);
|
|
|
|
|
brw_SYNC(p, tgl_sync_function(src[0].ud));
|
2024-02-13 00:29:29 -08:00
|
|
|
|
|
|
|
|
if (tgl_sync_function(src[0].ud) == TGL_SYNC_NOP)
|
|
|
|
|
++sync_nop_count;
|
|
|
|
|
|
2019-09-03 17:51:17 -07:00
|
|
|
break;
|
2011-05-24 16:34:27 -07:00
|
|
|
case BRW_OPCODE_MOV:
|
|
|
|
|
brw_MOV(p, dst, src[0]);
|
|
|
|
|
break;
|
|
|
|
|
case BRW_OPCODE_ADD:
|
|
|
|
|
brw_ADD(p, dst, src[0], src[1]);
|
|
|
|
|
break;
|
|
|
|
|
case BRW_OPCODE_MUL:
|
|
|
|
|
brw_MUL(p, dst, src[0], src[1]);
|
|
|
|
|
break;
|
2013-12-17 16:39:16 +02:00
|
|
|
case BRW_OPCODE_AVG:
|
|
|
|
|
brw_AVG(p, dst, src[0], src[1]);
|
|
|
|
|
break;
|
2011-08-15 22:36:18 -07:00
|
|
|
case BRW_OPCODE_MACH:
|
|
|
|
|
brw_MACH(p, dst, src[0], src[1]);
|
|
|
|
|
break;
|
2011-05-24 16:34:27 -07:00
|
|
|
|
2021-02-23 18:46:53 -08:00
|
|
|
case BRW_OPCODE_DP4A:
|
|
|
|
|
assert(devinfo->ver >= 12);
|
|
|
|
|
brw_DP4A(p, dst, src[0], src[1], src[2]);
|
|
|
|
|
break;
|
|
|
|
|
|
2014-04-01 17:25:12 -07:00
|
|
|
case BRW_OPCODE_LINE:
|
|
|
|
|
brw_LINE(p, dst, src[0], src[1]);
|
|
|
|
|
break;
|
|
|
|
|
|
2023-09-20 12:42:24 -07:00
|
|
|
case BRW_OPCODE_DPAS:
|
|
|
|
|
assert(devinfo->verx10 >= 125);
|
|
|
|
|
brw_DPAS(p, translate_systolic_depth(inst->sdepth), inst->rcount,
|
|
|
|
|
dst, src[0], src[1], src[2]);
|
|
|
|
|
break;
|
|
|
|
|
|
2012-02-07 00:59:11 +01:00
|
|
|
case BRW_OPCODE_MAD:
|
2021-03-29 14:41:58 -07:00
|
|
|
if (devinfo->ver < 10)
|
2018-01-05 09:46:11 -08:00
|
|
|
brw_set_default_access_mode(p, BRW_ALIGN_16);
|
2016-05-17 19:51:50 -07:00
|
|
|
brw_MAD(p, dst, src[0], src[1], src[2]);
|
2012-02-07 00:59:11 +01:00
|
|
|
break;
|
|
|
|
|
|
2012-12-02 00:08:15 -08:00
|
|
|
case BRW_OPCODE_LRP:
|
2024-02-15 15:59:08 -08:00
|
|
|
assert(devinfo->ver <= 10);
|
2021-03-29 14:41:58 -07:00
|
|
|
if (devinfo->ver < 10)
|
2018-01-05 09:46:11 -08:00
|
|
|
brw_set_default_access_mode(p, BRW_ALIGN_16);
|
2016-05-17 19:51:50 -07:00
|
|
|
brw_LRP(p, dst, src[0], src[1], src[2]);
|
2012-12-02 00:08:15 -08:00
|
|
|
break;
|
|
|
|
|
|
2020-06-05 22:40:26 -07:00
|
|
|
case BRW_OPCODE_ADD3:
|
|
|
|
|
assert(devinfo->verx10 >= 125);
|
|
|
|
|
brw_ADD3(p, dst, src[0], src[1], src[2]);
|
|
|
|
|
break;
|
|
|
|
|
|
2011-05-24 16:34:27 -07:00
|
|
|
case BRW_OPCODE_FRC:
|
|
|
|
|
brw_FRC(p, dst, src[0]);
|
|
|
|
|
break;
|
|
|
|
|
case BRW_OPCODE_RNDD:
|
|
|
|
|
brw_RNDD(p, dst, src[0]);
|
|
|
|
|
break;
|
|
|
|
|
case BRW_OPCODE_RNDE:
|
|
|
|
|
brw_RNDE(p, dst, src[0]);
|
|
|
|
|
break;
|
|
|
|
|
case BRW_OPCODE_RNDZ:
|
|
|
|
|
brw_RNDZ(p, dst, src[0]);
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case BRW_OPCODE_AND:
|
|
|
|
|
brw_AND(p, dst, src[0], src[1]);
|
|
|
|
|
break;
|
|
|
|
|
case BRW_OPCODE_OR:
|
|
|
|
|
brw_OR(p, dst, src[0], src[1]);
|
|
|
|
|
break;
|
|
|
|
|
case BRW_OPCODE_XOR:
|
|
|
|
|
brw_XOR(p, dst, src[0], src[1]);
|
|
|
|
|
break;
|
|
|
|
|
case BRW_OPCODE_NOT:
|
|
|
|
|
brw_NOT(p, dst, src[0]);
|
|
|
|
|
break;
|
|
|
|
|
case BRW_OPCODE_ASR:
|
|
|
|
|
brw_ASR(p, dst, src[0], src[1]);
|
|
|
|
|
break;
|
|
|
|
|
case BRW_OPCODE_SHR:
|
|
|
|
|
brw_SHR(p, dst, src[0], src[1]);
|
|
|
|
|
break;
|
|
|
|
|
case BRW_OPCODE_SHL:
|
|
|
|
|
brw_SHL(p, dst, src[0], src[1]);
|
|
|
|
|
break;
|
2019-05-29 11:43:30 -07:00
|
|
|
case BRW_OPCODE_ROL:
|
2021-03-29 14:41:58 -07:00
|
|
|
assert(devinfo->ver >= 11);
|
2019-05-29 11:43:30 -07:00
|
|
|
assert(src[0].type == dst.type);
|
|
|
|
|
brw_ROL(p, dst, src[0], src[1]);
|
|
|
|
|
break;
|
|
|
|
|
case BRW_OPCODE_ROR:
|
2021-03-29 14:41:58 -07:00
|
|
|
assert(devinfo->ver >= 11);
|
2019-05-29 11:43:30 -07:00
|
|
|
assert(src[0].type == dst.type);
|
|
|
|
|
brw_ROR(p, dst, src[0], src[1]);
|
|
|
|
|
break;
|
2011-05-24 16:34:27 -07:00
|
|
|
case BRW_OPCODE_CMP:
|
2016-05-17 19:59:18 -07:00
|
|
|
brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
|
2011-05-24 16:34:27 -07:00
|
|
|
break;
|
2021-02-13 14:11:30 -08:00
|
|
|
case BRW_OPCODE_CMPN:
|
|
|
|
|
brw_CMPN(p, dst, inst->conditional_mod, src[0], src[1]);
|
|
|
|
|
break;
|
2011-05-24 16:34:27 -07:00
|
|
|
case BRW_OPCODE_SEL:
|
|
|
|
|
brw_SEL(p, dst, src[0], src[1]);
|
|
|
|
|
break;
|
2015-11-22 20:12:17 -08:00
|
|
|
case BRW_OPCODE_CSEL:
|
2021-03-29 14:41:58 -07:00
|
|
|
if (devinfo->ver < 10)
|
2015-11-22 20:12:17 -08:00
|
|
|
brw_set_default_access_mode(p, BRW_ALIGN_16);
|
|
|
|
|
brw_CSEL(p, dst, src[0], src[1], src[2]);
|
|
|
|
|
break;
|
2013-04-09 19:22:34 -07:00
|
|
|
case BRW_OPCODE_BFREV:
|
|
|
|
|
brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
|
2017-06-30 15:10:17 -07:00
|
|
|
retype(src[0], BRW_REGISTER_TYPE_UD));
|
2013-04-09 19:22:34 -07:00
|
|
|
break;
|
|
|
|
|
case BRW_OPCODE_FBH:
|
2017-06-30 15:10:17 -07:00
|
|
|
brw_FBH(p, retype(dst, src[0].type), src[0]);
|
2013-04-09 19:22:34 -07:00
|
|
|
break;
|
|
|
|
|
case BRW_OPCODE_FBL:
|
2017-06-30 15:10:17 -07:00
|
|
|
brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD),
|
|
|
|
|
retype(src[0], BRW_REGISTER_TYPE_UD));
|
2016-06-21 15:14:03 -07:00
|
|
|
break;
|
|
|
|
|
case BRW_OPCODE_LZD:
|
|
|
|
|
brw_LZD(p, dst, src[0]);
|
2013-04-09 19:22:34 -07:00
|
|
|
break;
|
|
|
|
|
case BRW_OPCODE_CBIT:
|
2017-06-30 15:10:17 -07:00
|
|
|
brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD),
|
|
|
|
|
retype(src[0], BRW_REGISTER_TYPE_UD));
|
2013-04-09 19:22:34 -07:00
|
|
|
break;
|
2013-09-19 13:01:08 -07:00
|
|
|
case BRW_OPCODE_ADDC:
|
|
|
|
|
brw_ADDC(p, dst, src[0], src[1]);
|
|
|
|
|
break;
|
|
|
|
|
case BRW_OPCODE_SUBB:
|
|
|
|
|
brw_SUBB(p, dst, src[0], src[1]);
|
|
|
|
|
break;
|
2014-03-28 15:28:32 +02:00
|
|
|
case BRW_OPCODE_MAC:
|
|
|
|
|
brw_MAC(p, dst, src[0], src[1]);
|
|
|
|
|
break;
|
2013-04-09 19:22:34 -07:00
|
|
|
|
|
|
|
|
case BRW_OPCODE_BFE:
|
2021-03-29 14:41:58 -07:00
|
|
|
if (devinfo->ver < 10)
|
2018-01-05 09:46:11 -08:00
|
|
|
brw_set_default_access_mode(p, BRW_ALIGN_16);
|
2016-05-17 19:51:50 -07:00
|
|
|
brw_BFE(p, dst, src[0], src[1], src[2]);
|
2013-04-09 19:22:34 -07:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case BRW_OPCODE_BFI1:
|
2016-05-17 20:02:29 -07:00
|
|
|
brw_BFI1(p, dst, src[0], src[1]);
|
2013-04-09 19:22:34 -07:00
|
|
|
break;
|
|
|
|
|
case BRW_OPCODE_BFI2:
|
2021-03-29 14:41:58 -07:00
|
|
|
if (devinfo->ver < 10)
|
2018-01-05 09:46:11 -08:00
|
|
|
brw_set_default_access_mode(p, BRW_ALIGN_16);
|
2016-05-17 20:02:29 -07:00
|
|
|
brw_BFI2(p, dst, src[0], src[1], src[2]);
|
2013-04-09 19:22:34 -07:00
|
|
|
break;
|
2011-05-24 16:34:27 -07:00
|
|
|
|
|
|
|
|
case BRW_OPCODE_IF:
|
2024-02-15 15:59:08 -08:00
|
|
|
brw_IF(p, brw_get_default_exec_size(p));
|
2011-05-24 16:34:27 -07:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case BRW_OPCODE_ELSE:
|
|
|
|
|
brw_ELSE(p);
|
|
|
|
|
break;
|
|
|
|
|
case BRW_OPCODE_ENDIF:
|
|
|
|
|
brw_ENDIF(p);
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case BRW_OPCODE_DO:
|
2018-05-29 14:50:46 -07:00
|
|
|
brw_DO(p, brw_get_default_exec_size(p));
|
2011-05-24 16:34:27 -07:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case BRW_OPCODE_BREAK:
|
2011-12-06 12:44:41 -08:00
|
|
|
brw_BREAK(p);
|
2011-05-24 16:34:27 -07:00
|
|
|
break;
|
|
|
|
|
case BRW_OPCODE_CONTINUE:
|
2014-08-04 14:26:26 -07:00
|
|
|
brw_CONT(p);
|
2011-05-24 16:34:27 -07:00
|
|
|
break;
|
|
|
|
|
|
2011-12-06 12:30:03 -08:00
|
|
|
case BRW_OPCODE_WHILE:
|
|
|
|
|
brw_WHILE(p);
|
2014-08-06 11:27:58 +03:00
|
|
|
loop_count++;
|
2011-05-24 16:34:27 -07:00
|
|
|
break;
|
|
|
|
|
|
2011-08-05 12:38:58 -07:00
|
|
|
case SHADER_OPCODE_RCP:
|
|
|
|
|
case SHADER_OPCODE_RSQ:
|
|
|
|
|
case SHADER_OPCODE_SQRT:
|
|
|
|
|
case SHADER_OPCODE_EXP2:
|
|
|
|
|
case SHADER_OPCODE_LOG2:
|
|
|
|
|
case SHADER_OPCODE_SIN:
|
|
|
|
|
case SHADER_OPCODE_COS:
|
2014-11-21 12:34:22 -08:00
|
|
|
assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
|
2024-02-15 15:59:08 -08:00
|
|
|
assert(inst->mlen == 0);
|
|
|
|
|
gfx6_math(p, dst, brw_math_function(inst->opcode),
|
|
|
|
|
src[0], brw_null_reg());
|
2011-08-18 11:55:42 -07:00
|
|
|
break;
|
2011-09-28 17:37:54 -07:00
|
|
|
case SHADER_OPCODE_INT_QUOTIENT:
|
|
|
|
|
case SHADER_OPCODE_INT_REMAINDER:
|
2011-08-18 11:55:42 -07:00
|
|
|
case SHADER_OPCODE_POW:
|
2018-11-19 14:54:43 -08:00
|
|
|
assert(devinfo->verx10 < 125);
|
2014-11-21 12:34:22 -08:00
|
|
|
assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
|
2024-02-15 15:59:08 -08:00
|
|
|
assert(inst->mlen == 0);
|
|
|
|
|
assert(inst->opcode == SHADER_OPCODE_POW || inst->exec_size == 8);
|
|
|
|
|
gfx6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
|
2011-05-24 16:34:27 -07:00
|
|
|
break;
|
2024-04-11 01:10:51 -07:00
|
|
|
case BRW_OPCODE_PLN:
|
|
|
|
|
/* PLN reads:
|
|
|
|
|
* / in SIMD16 \
|
|
|
|
|
* -----------------------------------
|
|
|
|
|
* | src1+0 | src1+1 | src1+2 | src1+3 |
|
|
|
|
|
* |-----------------------------------|
|
|
|
|
|
* |(x0, x1)|(y0, y1)|(x2, x3)|(y2, y3)|
|
|
|
|
|
* -----------------------------------
|
|
|
|
|
*/
|
|
|
|
|
brw_PLN(p, dst, src[0], src[1]);
|
2011-05-24 16:34:27 -07:00
|
|
|
break;
|
2015-04-14 13:17:38 -07:00
|
|
|
case FS_OPCODE_PIXEL_X:
|
|
|
|
|
assert(src[0].type == BRW_REGISTER_TYPE_UW);
|
2020-10-29 15:10:59 +02:00
|
|
|
assert(src[1].type == BRW_REGISTER_TYPE_UW);
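      /* The coordinate payload interleaves groups of four X words with
       * groups of four Y words, so a <8;4,1> UW region starting at the
       * right sub-register offset gathers only the X (here) or Y
       * components.  (Comment added for clarity.)
       */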
|
2015-04-14 13:17:38 -07:00
|
|
|
src[0].subnr = 0 * type_sz(src[0].type);
|
2020-10-29 15:10:59 +02:00
|
|
|
if (src[1].file == BRW_IMMEDIATE_VALUE) {
|
|
|
|
|
assert(src[1].ud == 0);
|
|
|
|
|
brw_MOV(p, dst, stride(src[0], 8, 4, 1));
|
|
|
|
|
} else {
|
|
|
|
|
/* Coarse pixel case */
|
|
|
|
|
brw_ADD(p, dst, stride(src[0], 8, 4, 1), src[1]);
|
|
|
|
|
}
|
2015-04-14 13:17:38 -07:00
|
|
|
break;
|
|
|
|
|
case FS_OPCODE_PIXEL_Y:
|
|
|
|
|
assert(src[0].type == BRW_REGISTER_TYPE_UW);
|
2020-10-29 15:10:59 +02:00
|
|
|
assert(src[1].type == BRW_REGISTER_TYPE_UW);
|
2015-04-14 13:17:38 -07:00
|
|
|
src[0].subnr = 4 * type_sz(src[0].type);
|
2020-10-29 15:10:59 +02:00
|
|
|
if (src[1].file == BRW_IMMEDIATE_VALUE) {
|
|
|
|
|
assert(src[1].ud == 0);
|
|
|
|
|
brw_MOV(p, dst, stride(src[0], 8, 4, 1));
|
|
|
|
|
} else {
|
|
|
|
|
/* Coarse pixel case */
|
|
|
|
|
brw_ADD(p, dst, stride(src[0], 8, 4, 1), src[1]);
|
|
|
|
|
}
|
2015-04-14 13:17:38 -07:00
|
|
|
break;
|
2018-10-29 15:06:14 -05:00
|
|
|
|
|
|
|
|
case SHADER_OPCODE_SEND:
|
|
|
|
|
generate_send(inst, dst, src[0], src[1], src[2],
|
|
|
|
|
inst->ex_mlen > 0 ? src[3] : brw_null_reg());
|
2022-05-24 02:44:53 -07:00
|
|
|
send_count++;
|
2018-10-29 15:06:14 -05:00
|
|
|
break;
|
|
|
|
|
|
2014-11-08 01:39:14 -08:00
|
|
|
case FS_OPCODE_DDX_COARSE:
|
|
|
|
|
case FS_OPCODE_DDX_FINE:
|
2017-06-15 15:41:40 -07:00
|
|
|
generate_ddx(inst, dst, src[0]);
|
2014-11-08 01:39:14 -08:00
|
|
|
break;
|
|
|
|
|
case FS_OPCODE_DDY_COARSE:
|
|
|
|
|
case FS_OPCODE_DDY_FINE:
|
2017-06-15 15:41:40 -07:00
|
|
|
generate_ddy(inst, dst, src[0]);
|
2011-05-24 16:34:27 -07:00
|
|
|
break;
|
|
|
|
|
|
2020-10-09 04:13:20 -05:00
|
|
|
case SHADER_OPCODE_SCRATCH_HEADER:
|
|
|
|
|
generate_scratch_header(inst, dst);
|
|
|
|
|
break;
|
|
|
|
|
|
2015-11-07 18:58:34 -08:00
|
|
|
case SHADER_OPCODE_MOV_INDIRECT:
|
|
|
|
|
generate_mov_indirect(inst, dst, src[0], src[1]);
|
|
|
|
|
break;
|
|
|
|
|
|
2020-08-08 13:56:16 -05:00
|
|
|
case SHADER_OPCODE_MOV_RELOC_IMM:
|
|
|
|
|
assert(src[0].file == BRW_IMMEDIATE_VALUE);
|
|
|
|
|
brw_MOV_reloc_imm(p, dst, dst.type, src[0].ud);
|
|
|
|
|
break;
|
|
|
|
|
|
2020-11-30 17:24:51 -06:00
|
|
|
case BRW_OPCODE_HALT:
|
|
|
|
|
generate_halt(inst);
|
2012-12-06 10:15:08 -08:00
|
|
|
break;
|
|
|
|
|
|
2020-01-17 15:07:44 -08:00
|
|
|
case SHADER_OPCODE_INTERLOCK:
|
2020-01-03 10:05:39 -08:00
|
|
|
case SHADER_OPCODE_MEMORY_FENCE: {
|
2019-05-22 12:36:17 -05:00
|
|
|
assert(src[1].file == BRW_IMMEDIATE_VALUE);
|
2019-07-10 12:02:23 -07:00
|
|
|
assert(src[2].file == BRW_IMMEDIATE_VALUE);
|
2020-01-17 15:07:44 -08:00
|
|
|
|
|
|
|
|
const enum opcode send_op = inst->opcode == SHADER_OPCODE_INTERLOCK ?
|
|
|
|
|
BRW_OPCODE_SENDC : BRW_OPCODE_SEND;
|
|
|
|
|
|
|
|
|
|
brw_memory_fence(p, dst, src[0], send_op,
|
|
|
|
|
brw_message_target(inst->sfid),
|
2021-10-27 14:11:27 -07:00
|
|
|
inst->desc,
|
2020-01-17 15:07:44 -08:00
|
|
|
/* commit_enable */ src[1].ud,
|
|
|
|
|
/* bti */ src[2].ud);
|
|
|
|
|
send_count++;
|
2018-04-27 15:06:56 +01:00
|
|
|
break;
|
2020-01-03 10:05:39 -08:00
|
|
|
}
|
2018-04-27 15:06:56 +01:00
|
|
|
|
2020-01-02 15:27:58 -08:00
|
|
|
case FS_OPCODE_SCHEDULING_FENCE:
|
2021-05-24 22:53:27 -07:00
|
|
|
if (inst->sources == 0 && swsb.regdist == 0 &&
|
|
|
|
|
swsb.mode == TGL_SBID_NULL) {
|
2020-01-17 14:52:13 -08:00
|
|
|
if (unlikely(debug_flag))
|
|
|
|
|
disasm_info->use_tail = true;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2021-03-29 14:41:58 -07:00
|
|
|
if (devinfo->ver >= 12) {
|
2020-01-17 14:52:13 -08:00
|
|
|
/* Use the available SWSB information to stall. A single SYNC is
|
|
|
|
|
          * sufficient because, if there were multiple dependencies, the
|
|
|
|
|
* scoreboard algorithm already injected other SYNCs before this
|
|
|
|
|
* instruction.
|
|
|
|
|
*/
|
|
|
|
|
brw_SYNC(p, TGL_SYNC_NOP);
|
|
|
|
|
} else {
|
|
|
|
|
for (unsigned i = 0; i < inst->sources; i++) {
|
|
|
|
|
/* Emit a MOV to force a stall until the instruction producing the
|
|
|
|
|
* registers finishes.
|
|
|
|
|
*/
|
|
|
|
|
brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
|
|
|
|
|
retype(src[i], BRW_REGISTER_TYPE_UW));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (inst->sources > 1)
|
|
|
|
|
multiple_instructions_emitted = true;
|
|
|
|
|
}
|
|
|
|
|
|
2020-01-02 15:27:58 -08:00
|
|
|
break;
|
|
|
|
|
|
2022-06-06 14:05:54 -07:00
|
|
|
case SHADER_OPCODE_FIND_LIVE_CHANNEL:
|
|
|
|
|
case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL:
|
2024-01-05 09:19:38 -08:00
|
|
|
case SHADER_OPCODE_LOAD_LIVE_CHANNELS:
|
|
|
|
|
unreachable("Should be lowered by lower_find_live_channel()");
|
|
|
|
|
break;
|
|
|
|
|
|
2020-01-23 23:01:32 -08:00
|
|
|
case FS_OPCODE_LOAD_LIVE_CHANNELS: {
|
|
|
|
|
assert(inst->force_writemask_all && inst->group == 0);
|
|
|
|
|
assert(inst->dst.file == BAD_FILE);
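      /* Capture the hardware channel enable mask (mask0) in the
       * destination flag subregister so later instructions can predicate
       * on the set of live channels.  (Comment added for clarity.)
       */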
|
|
|
|
|
brw_set_default_exec_size(p, BRW_EXECUTE_1);
|
|
|
|
|
brw_MOV(p, retype(brw_flag_subreg(inst->flag_subreg),
|
|
|
|
|
BRW_REGISTER_TYPE_UD),
|
|
|
|
|
retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD));
|
|
|
|
|
break;
|
|
|
|
|
}
|
2015-02-20 20:14:24 +02:00
|
|
|
case SHADER_OPCODE_BROADCAST:
|
2016-05-19 00:10:03 -07:00
|
|
|
assert(inst->force_writemask_all);
|
2015-02-20 20:14:24 +02:00
|
|
|
brw_broadcast(p, dst, src[0], src[1]);
|
|
|
|
|
break;
|
|
|
|
|
|
2017-08-29 09:21:32 -07:00
|
|
|
case SHADER_OPCODE_SHUFFLE:
|
|
|
|
|
generate_shuffle(inst, dst, src[0], src[1]);
|
|
|
|
|
break;
|
|
|
|
|
|
2017-08-31 21:45:30 -07:00
|
|
|
case SHADER_OPCODE_SEL_EXEC:
|
|
|
|
|
assert(inst->force_writemask_all);
|
2021-12-20 14:33:45 -08:00
|
|
|
assert(devinfo->has_64bit_float || type_sz(dst.type) <= 4);
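      /* Write src[1] to every channel with NoMask first, then let the
       * execution mask overwrite just the enabled channels with src[0],
       * i.e. a per-channel select on the execution mask.  (Comment added
       * for clarity.)
       */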
|
|
|
|
|
brw_set_default_mask_control(p, BRW_MASK_DISABLE);
|
|
|
|
|
brw_MOV(p, dst, src[1]);
|
|
|
|
|
brw_set_default_mask_control(p, BRW_MASK_ENABLE);
|
|
|
|
|
brw_set_default_swsb(p, tgl_swsb_null());
|
|
|
|
|
brw_MOV(p, dst, src[0]);
|
2017-08-31 21:45:30 -07:00
|
|
|
break;
|
|
|
|
|
|
2017-09-01 15:18:02 -07:00
|
|
|
case SHADER_OPCODE_QUAD_SWIZZLE:
|
|
|
|
|
assert(src[1].file == BRW_IMMEDIATE_VALUE);
|
|
|
|
|
assert(src[1].type == BRW_REGISTER_TYPE_UD);
|
2018-12-06 14:11:34 -08:00
|
|
|
generate_quad_swizzle(inst, dst, src[0], src[1].ud);
|
2017-09-01 15:18:02 -07:00
|
|
|
break;
|
|
|
|
|
|
2017-08-31 21:45:30 -07:00
|
|
|
case SHADER_OPCODE_CLUSTER_BROADCAST: {
|
2024-03-27 15:59:49 -07:00
|
|
|
assert((!intel_device_info_is_9lp(devinfo) &&
|
2021-12-20 14:34:13 -08:00
|
|
|
devinfo->has_64bit_float) || type_sz(src[0].type) <= 4);
|
2017-08-31 21:45:30 -07:00
|
|
|
assert(!src[0].negate && !src[0].abs);
|
|
|
|
|
assert(src[1].file == BRW_IMMEDIATE_VALUE);
|
|
|
|
|
assert(src[1].type == BRW_REGISTER_TYPE_UD);
|
|
|
|
|
assert(src[2].file == BRW_IMMEDIATE_VALUE);
|
|
|
|
|
assert(src[2].type == BRW_REGISTER_TYPE_UD);
|
|
|
|
|
const unsigned component = src[1].ud;
|
|
|
|
|
const unsigned cluster_size = src[2].ud;
|
2021-12-20 00:20:38 -08:00
|
|
|
assert(inst->src[0].file != ARF && inst->src[0].file != FIXED_GRF);
|
|
|
|
|
const unsigned s = inst->src[0].stride;
|
|
|
|
|
unsigned vstride = cluster_size * s;
|
2019-09-04 15:07:20 -07:00
|
|
|
unsigned width = cluster_size;
|
|
|
|
|
|
|
|
|
|
/* The maximum exec_size is 32, but the maximum width is only 16. */
|
|
|
|
|
if (inst->exec_size == width) {
|
|
|
|
|
vstride = 0;
|
|
|
|
|
width = 1;
|
|
|
|
|
}
|
|
|
|
|
|
2021-12-20 00:20:38 -08:00
|
|
|
struct brw_reg strided = stride(suboffset(src[0], component * s),
|
2019-09-04 15:07:20 -07:00
|
|
|
vstride, width, 0);
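      /* A <vstride; width, 0> region with hstride 0 makes every channel in
       * a cluster of cluster_size channels read the single element selected
       * by `component` within its own cluster.  (Comment added for
       * clarity.)
       */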
|
2021-12-20 14:34:13 -08:00
|
|
|
brw_MOV(p, dst, strided);
|
2017-08-31 21:45:30 -07:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2020-11-19 09:32:27 -06:00
|
|
|
case SHADER_OPCODE_HALT_TARGET:
|
2013-03-27 23:19:39 -07:00
|
|
|
/* This is the place where the final HALT needs to be inserted if
|
|
|
|
|
* we've emitted any discards. If not, this will emit no code.
|
|
|
|
|
*/
|
2020-11-19 09:32:27 -06:00
|
|
|
if (!patch_halt_jumps()) {
|
2014-05-25 10:30:13 -07:00
|
|
|
if (unlikely(debug_flag)) {
|
2017-11-15 17:08:42 -08:00
|
|
|
disasm_info->use_tail = true;
|
2014-05-25 10:30:13 -07:00
|
|
|
}
|
2014-05-19 10:20:37 -07:00
|
|
|
}
|
2013-03-27 23:19:39 -07:00
|
|
|
break;
|
|
|
|
|
|
2014-08-27 11:32:08 -07:00
|
|
|
case SHADER_OPCODE_BARRIER:
|
|
|
|
|
generate_barrier(inst, src[0]);
|
2019-09-09 18:31:41 -07:00
|
|
|
send_count++;
|
2014-08-27 11:32:08 -07:00
|
|
|
break;
|
|
|
|
|
|
2019-09-13 01:34:35 +03:00
|
|
|
case SHADER_OPCODE_RND_MODE: {
|
2017-07-01 08:12:59 +02:00
|
|
|
assert(src[0].file == BRW_IMMEDIATE_VALUE);
|
2019-09-13 01:34:35 +03:00
|
|
|
/*
|
|
|
|
|
       * Changes the floating point rounding mode by updating the control
|
|
|
|
|
* register field defined at cr0.0[5-6] bits.
|
|
|
|
|
*/
|
|
|
|
|
enum brw_rnd_mode mode =
|
|
|
|
|
(enum brw_rnd_mode) (src[0].d << BRW_CR0_RND_MODE_SHIFT);
|
|
|
|
|
brw_float_controls_mode(p, mode, BRW_CR0_RND_MODE_MASK);
|
|
|
|
|
}
|
2017-07-01 08:12:59 +02:00
|
|
|
break;
|
|
|
|
|
|
2019-09-13 01:38:06 +03:00
|
|
|
case SHADER_OPCODE_FLOAT_CONTROL_MODE:
|
|
|
|
|
assert(src[0].file == BRW_IMMEDIATE_VALUE);
|
|
|
|
|
assert(src[1].file == BRW_IMMEDIATE_VALUE);
|
|
|
|
|
brw_float_controls_mode(p, src[0].d, src[1].d);
|
|
|
|
|
break;
|
|
|
|
|
|
2021-06-18 14:10:06 +03:00
|
|
|
case SHADER_OPCODE_READ_SR_REG:
|
|
|
|
|
if (devinfo->ver >= 12) {
|
2020-10-21 14:46:50 -05:00
|
|
|
/* There is a SWSB restriction that requires that any time sr0 is
|
|
|
|
|
          * accessed, both the instruction doing the access and the next one
|
|
|
|
|
* have SWSB set to RegDist(1).
|
|
|
|
|
*/
|
|
|
|
|
if (brw_get_default_swsb(p).mode != TGL_SBID_NULL)
|
|
|
|
|
brw_SYNC(p, TGL_SYNC_NOP);
|
2021-06-18 14:10:06 +03:00
|
|
|
assert(src[0].file == BRW_IMMEDIATE_VALUE);
|
2020-10-21 14:46:50 -05:00
|
|
|
brw_set_default_swsb(p, tgl_swsb_regdist(1));
|
2021-06-18 14:10:06 +03:00
|
|
|
brw_MOV(p, dst, brw_sr0_reg(src[0].ud));
|
2020-10-21 14:46:50 -05:00
|
|
|
brw_set_default_swsb(p, tgl_swsb_regdist(1));
|
2021-06-18 14:10:06 +03:00
|
|
|
brw_AND(p, dst, dst, brw_imm_ud(0xffffffff));
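         /* The AND with all ones does not change dst; it only serves as the
          * required "next instruction" carrying the RegDist(1) annotation.
          * (Comment added for clarity.)
          */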
|
2020-10-21 14:46:50 -05:00
|
|
|
} else {
|
2021-06-18 14:10:06 +03:00
|
|
|
brw_MOV(p, dst, brw_sr0_reg(src[0].ud));
|
2020-10-21 14:46:50 -05:00
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
|
2011-05-24 16:34:27 -07:00
|
|
|
default:
|
2015-04-15 14:51:18 -07:00
|
|
|
unreachable("Unsupported opcode");
|
2014-05-27 18:47:40 -07:00
|
|
|
|
|
|
|
|
case SHADER_OPCODE_LOAD_PAYLOAD:
|
2014-06-29 14:54:01 -07:00
|
|
|
unreachable("Should be lowered by lower_load_payload()");
|
2011-05-24 16:34:27 -07:00
|
|
|
}
|
2014-05-30 16:41:32 -07:00
|
|
|
|
2014-12-30 12:56:13 -08:00
|
|
|
if (multiple_instructions_emitted)
|
|
|
|
|
continue;
|
|
|
|
|
|
2014-06-28 23:31:04 -07:00
|
|
|
if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
|
|
|
|
|
assert(p->next_insn_offset == last_insn_offset + 16 ||
|
|
|
|
|
!"conditional_mod, no_dd_check, or no_dd_clear set for IR "
|
|
|
|
|
"emitting more than 1 instruction");
|
|
|
|
|
|
2014-06-13 14:29:25 -07:00
|
|
|
brw_inst *last = &p->store[last_insn_offset / 16];
|
2014-06-28 23:31:04 -07:00
|
|
|
|
2014-11-21 12:20:53 -08:00
|
|
|
if (inst->conditional_mod)
|
2015-04-14 18:00:06 -07:00
|
|
|
brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod);
|
2021-03-29 14:41:58 -07:00
|
|
|
if (devinfo->ver < 12) {
|
2018-11-09 14:13:36 -08:00
|
|
|
brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear);
|
|
|
|
|
brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check);
|
|
|
|
|
}
|
2014-05-30 16:41:32 -07:00
|
|
|
}
|
2023-03-08 12:31:51 -08:00
|
|
|
|
|
|
|
|
      /* When enabled, insert a sync NOP after every instruction and make sure
|
|
|
|
|
       * that the current instruction depends on the previous one.
|
|
|
|
|
*/
|
|
|
|
|
if (INTEL_DEBUG(DEBUG_SWSB_STALL) && devinfo->ver >= 12) {
|
|
|
|
|
brw_set_default_swsb(p, tgl_swsb_regdist(1));
|
|
|
|
|
brw_SYNC(p, TGL_SYNC_NOP);
|
|
|
|
|
}
|
2011-05-24 16:34:27 -07:00
|
|
|
}
|
|
|
|
|
|
2016-08-29 15:57:41 -07:00
|
|
|
brw_set_uip_jip(p, start_offset);
|
2017-11-15 17:08:42 -08:00
|
|
|
|
|
|
|
|
/* end of program sentinel */
|
|
|
|
|
disasm_new_inst_group(disasm_info, p->next_insn_offset);
|
2014-05-25 10:42:32 -07:00
|
|
|
|
2022-05-24 02:44:53 -07:00
|
|
|
/* `send_count` explicitly does not include spills or fills, as we'd
|
|
|
|
|
* like to use it as a metric for intentional memory access or other
|
|
|
|
|
* shared function use. Otherwise, subtle changes to scheduling or
|
|
|
|
|
* register allocation could cause it to fluctuate wildly - and that
|
|
|
|
|
* effect is already counted in spill/fill counts.
|
|
|
|
|
*/
|
|
|
|
|
send_count -= shader_stats.spill_count;
|
|
|
|
|
send_count -= shader_stats.fill_count;
|
|
|
|
|
|
2015-06-29 14:08:51 -07:00
|
|
|
#ifndef NDEBUG
|
2017-11-16 13:35:01 -08:00
|
|
|
bool validated =
|
2015-06-29 14:08:51 -07:00
|
|
|
#else
|
|
|
|
|
if (unlikely(debug_flag))
|
2017-11-16 13:35:01 -08:00
|
|
|
#endif
|
2022-06-29 14:13:31 -07:00
|
|
|
brw_validate_instructions(&compiler->isa, p->store,
|
2017-04-28 17:05:44 -07:00
|
|
|
start_offset,
|
|
|
|
|
p->next_insn_offset,
|
2017-11-15 17:08:42 -08:00
|
|
|
disasm_info);
|
2015-06-29 14:08:51 -07:00
|
|
|
|
2014-05-25 14:56:41 -07:00
|
|
|
int before_size = p->next_insn_offset - start_offset;
|
2017-11-15 17:08:42 -08:00
|
|
|
brw_compact_instructions(p, start_offset, disasm_info);
|
2014-05-25 14:56:41 -07:00
|
|
|
int after_size = p->next_insn_offset - start_offset;
|
2014-05-25 10:42:32 -07:00
|
|
|
|
2023-10-09 12:48:02 +03:00
|
|
|
bool dump_shader_bin = brw_should_dump_shader_bin();
|
|
|
|
|
unsigned char sha1[21];
|
|
|
|
|
char sha1buf[41];
|
2019-05-23 19:05:23 +03:00
|
|
|
|
2023-10-09 12:48:02 +03:00
|
|
|
if (unlikely(debug_flag || dump_shader_bin)) {
|
2019-05-23 19:05:23 +03:00
|
|
|
_mesa_sha1_compute(p->store + start_offset / sizeof(brw_inst),
|
|
|
|
|
after_size, sha1);
|
|
|
|
|
_mesa_sha1_format(sha1buf, sha1);
|
2023-10-09 12:48:02 +03:00
|
|
|
}
|
2019-05-23 19:05:23 +03:00
|
|
|
|
2023-10-09 12:48:02 +03:00
|
|
|
if (unlikely(dump_shader_bin))
|
|
|
|
|
brw_dump_shader_bin(p->store, start_offset, p->next_insn_offset,
|
|
|
|
|
sha1buf);
|
|
|
|
|
|
|
|
|
|
if (unlikely(debug_flag)) {
|
2023-07-18 18:48:48 +00:00
|
|
|
fprintf(stderr, "Native code for %s (src_hash 0x%08x) (sha1 %s)\n"
|
2016-10-17 14:12:28 -07:00
|
|
|
"SIMD%d shader: %d instructions. %d loops. %u cycles. "
|
2019-09-09 18:31:41 -07:00
|
|
|
"%d:%d spills:fills, %u sends, "
|
2016-10-17 14:12:28 -07:00
|
|
|
"scheduled with mode %s. "
|
|
|
|
|
"Promoted %u constants. "
|
|
|
|
|
"Compacted %d to %d bytes (%.0f%%)\n",
|
2023-07-18 18:48:48 +00:00
|
|
|
shader_name, params->source_hash, sha1buf,
|
2019-05-23 19:05:23 +03:00
|
|
|
dispatch_width, before_size / 16,
|
2020-03-26 16:27:32 -07:00
|
|
|
loop_count, perf.latency,
|
2022-05-24 02:44:53 -07:00
|
|
|
shader_stats.spill_count,
|
|
|
|
|
shader_stats.fill_count,
|
|
|
|
|
send_count,
|
2016-10-17 14:12:28 -07:00
|
|
|
shader_stats.scheduler_mode,
|
|
|
|
|
shader_stats.promoted_constants,
|
|
|
|
|
before_size, after_size,
|
2014-05-25 14:56:41 -07:00
|
|
|
100.0f * (before_size - after_size) / before_size);
|
2014-05-25 10:46:55 -07:00
|
|
|
|
2019-05-23 19:05:23 +03:00
|
|
|
/* overriding the shader makes disasm_info invalid */
|
|
|
|
|
if (!brw_try_override_assembly(p, start_offset, sha1buf)) {
|
2019-06-03 14:55:23 +03:00
|
|
|
dump_assembly(p->store, start_offset, p->next_insn_offset,
|
|
|
|
|
disasm_info, perf.block_latency);
|
2019-05-23 19:05:23 +03:00
|
|
|
} else {
|
|
|
|
|
fprintf(stderr, "Successfully overrode shader with sha1 %s\n\n", sha1buf);
|
|
|
|
|
}
|
2014-05-25 10:42:32 -07:00
|
|
|
}
|
2017-11-20 10:57:17 +02:00
|
|
|
ralloc_free(disasm_info);
|
2020-09-02 14:26:41 +02:00
|
|
|
#ifndef NDEBUG
|
|
|
|
|
if (!validated && !debug_flag) {
|
|
|
|
|
fprintf(stderr,
|
|
|
|
|
"Validation failed. Rerun with INTEL_DEBUG=shaders to get more information.\n");
|
|
|
|
|
}
|
|
|
|
|
#endif
|
2015-06-29 14:08:51 -07:00
|
|
|
assert(validated);
|
2014-11-13 16:28:08 -08:00
|
|
|
|
2023-07-14 02:10:20 +03:00
|
|
|
brw_shader_debug_log(compiler, params->log_data,
|
2021-07-29 14:13:27 -07:00
|
|
|
"%s SIMD%d shader: %d inst, %d loops, %u cycles, "
|
|
|
|
|
"%d:%d spills:fills, %u sends, "
|
|
|
|
|
"scheduled with mode %s, "
|
|
|
|
|
"Promoted %u constants, "
|
2021-10-03 15:58:36 +03:00
|
|
|
"compacted %d to %d bytes.\n",
|
2021-07-29 14:13:27 -07:00
|
|
|
_mesa_shader_stage_to_abbrev(stage),
|
2024-02-13 00:29:29 -08:00
|
|
|
dispatch_width,
|
|
|
|
|
before_size / 16 - nop_count - sync_nop_count,
|
2021-07-29 14:13:27 -07:00
|
|
|
loop_count, perf.latency,
|
2022-05-24 02:44:53 -07:00
|
|
|
shader_stats.spill_count,
|
|
|
|
|
shader_stats.fill_count,
|
|
|
|
|
send_count,
|
2021-07-29 14:13:27 -07:00
|
|
|
shader_stats.scheduler_mode,
|
|
|
|
|
shader_stats.promoted_constants,
|
|
|
|
|
before_size, after_size);
|
2019-04-23 23:19:56 -05:00
|
|
|
if (stats) {
|
|
|
|
|
stats->dispatch_width = dispatch_width;
|
2023-12-07 19:47:55 -08:00
|
|
|
stats->max_polygons = max_polygons;
|
2023-03-19 15:03:33 +02:00
|
|
|
stats->max_dispatch_width = dispatch_width;
|
2024-02-13 00:29:29 -08:00
|
|
|
stats->instructions = before_size / 16 - nop_count - sync_nop_count;
|
2020-04-03 13:09:41 -05:00
|
|
|
stats->sends = send_count;
|
2019-04-23 23:19:56 -05:00
|
|
|
stats->loops = loop_count;
|
2020-03-26 16:27:32 -07:00
|
|
|
stats->cycles = perf.latency;
|
2022-05-24 02:44:53 -07:00
|
|
|
stats->spills = shader_stats.spill_count;
|
|
|
|
|
stats->fills = shader_stats.fill_count;
|
2023-02-03 17:02:28 +01:00
|
|
|
stats->max_live_registers = shader_stats.max_register_pressure;
|
2019-04-23 23:19:56 -05:00
|
|
|
}
|
2014-11-14 12:46:44 -08:00
|
|
|
|
2014-11-13 16:28:08 -08:00
|
|
|
return start_offset;
|
2011-05-24 16:34:27 -07:00
|
|
|
}
|
2012-11-09 01:05:47 -08:00
|
|
|
|
2020-08-07 22:26:07 -05:00
|
|
|
void
|
|
|
|
|
fs_generator::add_const_data(void *data, unsigned size)
|
|
|
|
|
{
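   /* Append the constant data blob after the program text (32-byte aligned)
    * and record its size and offset so the driver can locate it when
    * uploading the shader.  (Comment added for clarity.)
    */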
|
|
|
|
|
assert(prog_data->const_data_size == 0);
|
|
|
|
|
if (size > 0) {
|
|
|
|
|
prog_data->const_data_size = size;
|
|
|
|
|
prog_data->const_data_offset = brw_append_data(p, data, size, 32);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2020-09-04 12:40:06 -05:00
|
|
|
void
|
|
|
|
|
fs_generator::add_resume_sbt(unsigned num_resume_shaders, uint64_t *sbt)
|
|
|
|
|
{
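   /* Append the resume-shader binding table after the program and emit one
    * relocation per 64-bit entry so the entries can be fixed up against the
    * final shader start offset at upload time.  (Comment added for
    * clarity.)
    */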
|
|
|
|
|
assert(brw_shader_stage_is_bindless(stage));
|
|
|
|
|
struct brw_bs_prog_data *bs_prog_data = brw_bs_prog_data(prog_data);
|
|
|
|
|
if (num_resume_shaders > 0) {
|
|
|
|
|
bs_prog_data->resume_sbt_offset =
|
|
|
|
|
brw_append_data(p, sbt, num_resume_shaders * sizeof(uint64_t), 32);
|
|
|
|
|
for (unsigned i = 0; i < num_resume_shaders; i++) {
|
|
|
|
|
size_t offset = bs_prog_data->resume_sbt_offset + i * sizeof(*sbt);
|
|
|
|
|
assert(offset <= UINT32_MAX);
|
|
|
|
|
brw_add_reloc(p, BRW_SHADER_RELOC_SHADER_START_OFFSET,
|
|
|
|
|
BRW_SHADER_RELOC_TYPE_U32,
|
|
|
|
|
(uint32_t)offset, (uint32_t)sbt[i]);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2012-11-09 01:05:47 -08:00
|
|
|
const unsigned *
|
2018-02-26 16:34:55 -08:00
|
|
|
fs_generator::get_assembly()
|
2012-11-09 01:05:47 -08:00
|
|
|
{
|
2020-08-08 13:56:16 -05:00
|
|
|
prog_data->relocs = brw_get_shader_relocs(p, &prog_data->num_relocs);
|
|
|
|
|
|
2018-02-26 16:34:55 -08:00
|
|
|
return brw_get_program(p, &prog_data->program_size);
|
2012-11-09 01:05:47 -08:00
|
|
|
}
|