2024-01-04 23:26:50 -08:00
|
|
|
/*
|
|
|
|
|
* Copyright © 2010 Intel Corporation
|
|
|
|
|
* SPDX-License-Identifier: MIT
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
#include "brw_fs.h"
|
|
|
|
|
#include "brw_fs_builder.h"
|
2024-11-14 10:04:26 -08:00
|
|
|
#include "util/half_float.h"
|
2024-01-04 23:26:50 -08:00
|
|
|
|
|
|
|
|
using namespace brw;
|
|
|
|
|
|
|
|
|
|
static uint64_t
|
2024-06-18 23:42:59 -07:00
|
|
|
src_as_uint(const brw_reg &src)
|
2024-01-04 23:26:50 -08:00
|
|
|
{
|
|
|
|
|
assert(src.file == IMM);
|
|
|
|
|
|
|
|
|
|
switch (src.type) {
|
2024-04-20 17:08:02 -07:00
|
|
|
case BRW_TYPE_W:
|
2024-01-04 23:26:50 -08:00
|
|
|
return (uint64_t)(int16_t)(src.ud & 0xffff);
|
|
|
|
|
|
2024-04-20 17:08:02 -07:00
|
|
|
case BRW_TYPE_UW:
|
2024-01-04 23:26:50 -08:00
|
|
|
return (uint64_t)(uint16_t)(src.ud & 0xffff);
|
|
|
|
|
|
2024-04-20 17:08:02 -07:00
|
|
|
case BRW_TYPE_D:
|
2024-01-04 23:26:50 -08:00
|
|
|
return (uint64_t)src.d;
|
|
|
|
|
|
2024-04-20 17:08:02 -07:00
|
|
|
case BRW_TYPE_UD:
|
2024-01-04 23:26:50 -08:00
|
|
|
return (uint64_t)src.ud;
|
|
|
|
|
|
2024-04-20 17:08:02 -07:00
|
|
|
case BRW_TYPE_Q:
|
2024-01-04 23:26:50 -08:00
|
|
|
return src.d64;
|
|
|
|
|
|
2024-04-20 17:08:02 -07:00
|
|
|
case BRW_TYPE_UQ:
|
2024-01-04 23:26:50 -08:00
|
|
|
return src.u64;
|
|
|
|
|
|
|
|
|
|
default:
|
|
|
|
|
unreachable("Invalid integer type.");
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2024-11-14 10:04:26 -08:00
|
|
|
static double
|
|
|
|
|
src_as_float(const brw_reg &src)
|
|
|
|
|
{
|
|
|
|
|
assert(src.file == IMM);
|
|
|
|
|
|
|
|
|
|
switch (src.type) {
|
|
|
|
|
case BRW_TYPE_HF:
|
|
|
|
|
return _mesa_half_to_float((uint16_t)src.d);
|
|
|
|
|
|
|
|
|
|
case BRW_TYPE_F:
|
|
|
|
|
return src.f;
|
|
|
|
|
|
|
|
|
|
case BRW_TYPE_DF:
|
|
|
|
|
return src.df;
|
|
|
|
|
|
|
|
|
|
default:
|
|
|
|
|
unreachable("Invalid float type.");
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
/**
 * Builds an immediate register of the requested integer type.
 *
 * \param value  Value of the immediate; it is handed to the type-specific
 *               brw_imm_* constructor, which narrows it as needed.
 * \param type   Integer register type: W, UW, D, UD, Q or UQ.
 * \return The immediate produced by the constructor matching \p type.
 */
static brw_reg
brw_imm_for_type(uint64_t value, enum brw_reg_type type)
{
   switch (type) {
   case BRW_TYPE_W:
      return brw_imm_w(value);

   case BRW_TYPE_UW:
      return brw_imm_uw(value);

   case BRW_TYPE_D:
      return brw_imm_d(value);

   case BRW_TYPE_UD:
      return brw_imm_ud(value);

   case BRW_TYPE_Q:
      /* Use the 64-bit signed constructor so the result is a Q immediate
       * rather than the D immediate brw_imm_d() would build.
       */
      return brw_imm_q(value);

   case BRW_TYPE_UQ:
      return brw_imm_uq(value);

   default:
      unreachable("Invalid integer type.");
   }
}
|
|
|
|
|
|
2024-11-14 10:04:26 -08:00
|
|
|
/**
|
|
|
|
|
* Converts a MAD to an ADD by folding the multiplicand sources.
|
|
|
|
|
*/
|
|
|
|
|
static void
|
|
|
|
|
fold_multiplicands_of_MAD(fs_inst *inst)
|
|
|
|
|
{
|
|
|
|
|
assert(inst->opcode == BRW_OPCODE_MAD);
|
|
|
|
|
assert (inst->src[1].file == IMM &&
|
|
|
|
|
inst->src[2].file == IMM &&
|
|
|
|
|
!brw_type_is_vector_imm(inst->src[1].type) &&
|
|
|
|
|
!brw_type_is_vector_imm(inst->src[2].type));
|
|
|
|
|
|
|
|
|
|
if (brw_type_is_int(inst->src[1].type)) {
|
|
|
|
|
const uint64_t imm1 = src_as_uint(inst->src[1]);
|
|
|
|
|
const uint64_t imm2 = src_as_uint(inst->src[2]);
|
|
|
|
|
|
|
|
|
|
brw_reg product = brw_imm_ud(imm1 * imm2);
|
|
|
|
|
|
|
|
|
|
inst->src[1] = retype(product,
|
|
|
|
|
brw_type_larger_of(inst->src[1].type,
|
|
|
|
|
inst->src[2].type));
|
|
|
|
|
} else {
|
|
|
|
|
const double product = src_as_float(inst->src[1]) *
|
|
|
|
|
src_as_float(inst->src[2]);
|
|
|
|
|
|
|
|
|
|
switch (brw_type_larger_of(inst->src[1].type,
|
|
|
|
|
inst->src[2].type)) {
|
|
|
|
|
case BRW_TYPE_HF:
|
|
|
|
|
inst->src[1] = retype(brw_imm_w(_mesa_float_to_half(product)),
|
|
|
|
|
BRW_TYPE_HF);
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case BRW_TYPE_F:
|
|
|
|
|
inst->src[1] = brw_imm_f(product);
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case BRW_TYPE_DF:
|
|
|
|
|
unreachable("float64 should be impossible.");
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
default:
|
|
|
|
|
unreachable("Invalid float type.");
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
inst->opcode = BRW_OPCODE_ADD;
|
|
|
|
|
inst->resize_sources(2);
|
|
|
|
|
}
|
|
|
|
|
|
2024-10-10 14:07:04 -07:00
|
|
|
bool
|
|
|
|
|
brw_constant_fold_instruction(const intel_device_info *devinfo, fs_inst *inst)
|
|
|
|
|
{
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
|
|
|
|
switch (inst->opcode) {
|
|
|
|
|
case BRW_OPCODE_ADD:
|
2024-12-10 13:22:21 -08:00
|
|
|
if (inst->src[0].file != IMM || inst->src[1].file != IMM)
|
2024-10-10 14:07:04 -07:00
|
|
|
break;
|
|
|
|
|
|
2024-12-10 13:22:21 -08:00
|
|
|
if (brw_type_is_int(inst->src[0].type)) {
|
|
|
|
|
const uint64_t src0 = src_as_uint(inst->src[0]);
|
|
|
|
|
const uint64_t src1 = src_as_uint(inst->src[1]);
|
2024-10-10 14:07:04 -07:00
|
|
|
|
2024-12-10 13:22:21 -08:00
|
|
|
inst->src[0] = brw_imm_for_type(src0 + src1, inst->dst.type);
|
|
|
|
|
} else {
|
2024-10-10 14:07:04 -07:00
|
|
|
assert(inst->src[0].type == BRW_TYPE_F);
|
|
|
|
|
inst->src[0].f += inst->src[1].f;
|
|
|
|
|
}
|
|
|
|
|
|
2024-12-10 13:22:21 -08:00
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
inst->resize_sources(1);
|
|
|
|
|
progress = true;
|
2024-10-10 14:07:04 -07:00
|
|
|
break;
|
|
|
|
|
|
2023-06-22 17:15:00 -07:00
|
|
|
case BRW_OPCODE_ADD3:
|
|
|
|
|
if (inst->src[0].file == IMM &&
|
|
|
|
|
inst->src[1].file == IMM &&
|
|
|
|
|
inst->src[2].file == IMM) {
|
|
|
|
|
const uint64_t src0 = src_as_uint(inst->src[0]);
|
|
|
|
|
const uint64_t src1 = src_as_uint(inst->src[1]);
|
|
|
|
|
const uint64_t src2 = src_as_uint(inst->src[2]);
|
|
|
|
|
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
inst->src[0] = brw_imm_for_type(src0 + src1 + src2,
|
|
|
|
|
inst->dst.type);
|
|
|
|
|
inst->resize_sources(1);
|
|
|
|
|
progress = true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
break;
|
2024-10-10 14:07:04 -07:00
|
|
|
|
|
|
|
|
case BRW_OPCODE_AND:
|
|
|
|
|
if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
|
|
|
|
|
const uint64_t src0 = src_as_uint(inst->src[0]);
|
|
|
|
|
const uint64_t src1 = src_as_uint(inst->src[1]);
|
|
|
|
|
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
inst->src[0] = brw_imm_for_type(src0 & src1, inst->dst.type);
|
|
|
|
|
inst->resize_sources(1);
|
|
|
|
|
progress = true;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
break;
|
|
|
|
|
|
2024-11-14 10:04:26 -08:00
|
|
|
case BRW_OPCODE_MAD:
|
|
|
|
|
if (inst->src[1].file == IMM &&
|
|
|
|
|
inst->src[2].file == IMM &&
|
|
|
|
|
inst->src[3].file == IMM &&
|
|
|
|
|
!brw_type_is_vector_imm(inst->src[1].type) &&
|
|
|
|
|
!brw_type_is_vector_imm(inst->src[2].type) &&
|
|
|
|
|
!brw_type_is_vector_imm(inst->src[3].type)) {
|
|
|
|
|
fold_multiplicands_of_MAD(inst);
|
|
|
|
|
assert(inst->opcode == BRW_OPCODE_ADD);
|
|
|
|
|
|
|
|
|
|
ASSERTED bool folded = brw_constant_fold_instruction(devinfo, inst);
|
|
|
|
|
assert(folded);
|
|
|
|
|
|
|
|
|
|
progress = true;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
break;
|
|
|
|
|
|
2024-10-10 14:07:04 -07:00
|
|
|
case BRW_OPCODE_MUL:
|
2024-12-10 14:41:28 -08:00
|
|
|
if (brw_type_is_float(inst->src[1].type))
|
2024-10-10 14:07:04 -07:00
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
/* From the BDW PRM, Vol 2a, "mul - Multiply":
|
|
|
|
|
*
|
|
|
|
|
* "When multiplying integer datatypes, if src0 is DW and src1
|
|
|
|
|
* is W, irrespective of the destination datatype, the
|
|
|
|
|
* accumulator maintains full 48-bit precision."
|
|
|
|
|
* ...
|
|
|
|
|
* "When multiplying integer data types, if one of the sources
|
|
|
|
|
* is a DW, the resulting full precision data is stored in
|
|
|
|
|
* the accumulator."
|
|
|
|
|
*
|
|
|
|
|
* There are also similar notes in earlier PRMs.
|
|
|
|
|
*
|
|
|
|
|
* The MOV instruction can copy the bits of the source, but it
|
|
|
|
|
* does not clear the higher bits of the accumulator. So, because
|
|
|
|
|
* we might use the full accumulator in the MUL/MACH macro, we
|
|
|
|
|
* shouldn't replace such MULs with MOVs.
|
|
|
|
|
*/
|
|
|
|
|
if ((brw_type_size_bytes(inst->src[0].type) == 4 ||
|
|
|
|
|
brw_type_size_bytes(inst->src[1].type) == 4) &&
|
|
|
|
|
(inst->dst.is_accumulator() ||
|
|
|
|
|
inst->writes_accumulator_implicitly(devinfo)))
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
if (inst->src[0].is_zero() || inst->src[1].is_zero()) {
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
inst->src[0] = brw_imm_d(0);
|
|
|
|
|
inst->resize_sources(1);
|
|
|
|
|
progress = true;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2024-12-10 14:41:28 -08:00
|
|
|
if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
|
|
|
|
|
const uint64_t src0 = src_as_uint(inst->src[0]);
|
|
|
|
|
const uint64_t src1 = src_as_uint(inst->src[1]);
|
2024-10-10 14:07:04 -07:00
|
|
|
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
2024-12-10 14:41:28 -08:00
|
|
|
inst->src[0] = brw_imm_for_type(src0 * src1, inst->dst.type);
|
2024-10-10 14:07:04 -07:00
|
|
|
inst->resize_sources(1);
|
|
|
|
|
progress = true;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case BRW_OPCODE_OR:
|
|
|
|
|
if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
|
|
|
|
|
const uint64_t src0 = src_as_uint(inst->src[0]);
|
|
|
|
|
const uint64_t src1 = src_as_uint(inst->src[1]);
|
|
|
|
|
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
inst->src[0] = brw_imm_for_type(src0 | src1, inst->dst.type);
|
|
|
|
|
inst->resize_sources(1);
|
|
|
|
|
progress = true;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case BRW_OPCODE_SHL:
|
|
|
|
|
if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
|
|
|
|
|
/* It's not currently possible to generate this, and this constant
|
|
|
|
|
* folding does not handle it.
|
|
|
|
|
*/
|
|
|
|
|
assert(!inst->saturate);
|
|
|
|
|
|
|
|
|
|
brw_reg result;
|
|
|
|
|
|
|
|
|
|
switch (brw_type_size_bytes(inst->src[0].type)) {
|
|
|
|
|
case 2:
|
|
|
|
|
result = brw_imm_uw(0x0ffff & (inst->src[0].ud << (inst->src[1].ud & 0x1f)));
|
|
|
|
|
break;
|
|
|
|
|
case 4:
|
|
|
|
|
result = brw_imm_ud(inst->src[0].ud << (inst->src[1].ud & 0x1f));
|
|
|
|
|
break;
|
|
|
|
|
case 8:
|
|
|
|
|
result = brw_imm_uq(inst->src[0].u64 << (inst->src[1].ud & 0x3f));
|
|
|
|
|
break;
|
|
|
|
|
default:
|
|
|
|
|
/* Just in case a future platform re-enables B or UB types. */
|
|
|
|
|
unreachable("Invalid source size.");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
inst->src[0] = retype(result, inst->dst.type);
|
|
|
|
|
inst->resize_sources(1);
|
|
|
|
|
|
|
|
|
|
progress = true;
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
default:
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#ifndef NDEBUG
|
|
|
|
|
/* The function is only intended to do constant folding, so the result of
|
|
|
|
|
* progress must be a MOV of an immediate value.
|
|
|
|
|
*/
|
|
|
|
|
if (progress) {
|
|
|
|
|
assert(inst->opcode == BRW_OPCODE_MOV);
|
|
|
|
|
assert(inst->src[0].file == IMM);
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
}
|
|
|
|
|
|
2024-01-04 23:26:50 -08:00
|
|
|
bool
|
|
|
|
|
brw_fs_opt_algebraic(fs_visitor &s)
|
|
|
|
|
{
|
|
|
|
|
const intel_device_info *devinfo = s.devinfo;
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
|
|
|
|
foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
|
2024-12-12 17:14:36 -08:00
|
|
|
if (brw_constant_fold_instruction(devinfo, inst)) {
|
|
|
|
|
progress = true;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
2024-01-04 23:26:50 -08:00
|
|
|
switch (inst->opcode) {
|
2024-12-10 13:22:21 -08:00
|
|
|
case BRW_OPCODE_ADD:
|
2024-12-12 17:14:36 -08:00
|
|
|
if (brw_type_is_int(inst->src[1].type) &&
|
2024-12-10 13:22:21 -08:00
|
|
|
inst->src[1].is_zero()) {
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
inst->resize_sources(1);
|
|
|
|
|
progress = true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
break;
|
|
|
|
|
|
2023-06-22 17:15:00 -07:00
|
|
|
case BRW_OPCODE_ADD3: {
|
|
|
|
|
const unsigned num_imm = (inst->src[0].file == IMM) +
|
|
|
|
|
(inst->src[1].file == IMM) +
|
|
|
|
|
(inst->src[2].file == IMM);
|
|
|
|
|
|
|
|
|
|
/* If there is more than one immediate value, fold the values and
|
|
|
|
|
* convert the instruction to either ADD or MOV.
|
|
|
|
|
*/
|
2024-12-12 17:14:36 -08:00
|
|
|
assert(num_imm < 3);
|
|
|
|
|
if (num_imm == 2) {
|
2023-06-22 17:15:00 -07:00
|
|
|
uint64_t sum = 0;
|
|
|
|
|
brw_reg src;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < 3; i++) {
|
|
|
|
|
if (inst->src[i].file == IMM) {
|
|
|
|
|
sum += src_as_uint(inst->src[i]);
|
|
|
|
|
} else {
|
|
|
|
|
assert(src.file == BAD_FILE);
|
|
|
|
|
src = inst->src[i];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
assert(src.file != BAD_FILE);
|
|
|
|
|
|
|
|
|
|
if (uint32_t(sum) == 0) {
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
inst->src[0] = src;
|
|
|
|
|
inst->resize_sources(1);
|
|
|
|
|
} else {
|
|
|
|
|
inst->opcode = BRW_OPCODE_ADD;
|
|
|
|
|
inst->src[0] = src;
|
|
|
|
|
inst->src[1] = brw_imm_ud(sum);
|
|
|
|
|
inst->resize_sources(2);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
progress = true;
|
|
|
|
|
} else if (num_imm == 1) {
|
|
|
|
|
/* If there is a single constant, and that constant is zero,
|
|
|
|
|
* convert the instruction to regular ADD.
|
|
|
|
|
*/
|
|
|
|
|
for (unsigned i = 0; i < 3; i++) {
|
|
|
|
|
if (inst->src[i].is_zero()) {
|
|
|
|
|
inst->opcode = BRW_OPCODE_ADD;
|
|
|
|
|
inst->src[i] = inst->src[2];
|
|
|
|
|
inst->resize_sources(2);
|
|
|
|
|
progress = true;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2024-01-04 23:26:50 -08:00
|
|
|
case BRW_OPCODE_MOV:
|
|
|
|
|
if ((inst->conditional_mod == BRW_CONDITIONAL_Z ||
|
|
|
|
|
inst->conditional_mod == BRW_CONDITIONAL_NZ) &&
|
|
|
|
|
inst->dst.is_null() &&
|
|
|
|
|
(inst->src[0].abs || inst->src[0].negate)) {
|
|
|
|
|
inst->src[0].abs = false;
|
|
|
|
|
inst->src[0].negate = false;
|
|
|
|
|
progress = true;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (inst->src[0].file != IMM)
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
if (inst->saturate) {
|
|
|
|
|
/* Full mixed-type saturates don't happen. However, we can end up
|
|
|
|
|
* with things like:
|
|
|
|
|
*
|
|
|
|
|
* mov.sat(8) g21<1>DF -1F
|
|
|
|
|
*
|
|
|
|
|
* Other mixed-size-but-same-base-type cases may also be possible.
|
|
|
|
|
*/
|
|
|
|
|
if (inst->dst.type != inst->src[0].type &&
|
2024-04-20 17:08:02 -07:00
|
|
|
inst->dst.type != BRW_TYPE_DF &&
|
|
|
|
|
inst->src[0].type != BRW_TYPE_F)
|
2024-01-04 23:26:50 -08:00
|
|
|
assert(!"unimplemented: saturate mixed types");
|
|
|
|
|
|
2024-06-19 09:55:00 -07:00
|
|
|
if (brw_reg_saturate_immediate(&inst->src[0])) {
|
2024-01-04 23:26:50 -08:00
|
|
|
inst->saturate = false;
|
|
|
|
|
progress = true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
|
2024-12-10 14:41:28 -08:00
|
|
|
case BRW_OPCODE_MUL:
|
2024-12-12 17:14:36 -08:00
|
|
|
if (brw_type_is_int(inst->src[0].type)){
|
2024-12-10 14:41:28 -08:00
|
|
|
/* From the BDW PRM, Vol 2a, "mul - Multiply":
|
|
|
|
|
*
|
|
|
|
|
* "When multiplying integer datatypes, if src0 is DW and src1
|
|
|
|
|
* is W, irrespective of the destination datatype, the
|
|
|
|
|
* accumulator maintains full 48-bit precision."
|
|
|
|
|
* ...
|
|
|
|
|
* "When multiplying integer data types, if one of the sources
|
|
|
|
|
* is a DW, the resulting full precision data is stored in the
|
|
|
|
|
* accumulator."
|
|
|
|
|
*
|
|
|
|
|
* There are also similar notes in earlier PRMs.
|
|
|
|
|
*
|
|
|
|
|
* The MOV instruction can copy the bits of the source, but it
|
|
|
|
|
* does not clear the higher bits of the accumulator. So, because
|
|
|
|
|
* we might use the full accumulator in the MUL/MACH macro, we
|
|
|
|
|
* shouldn't replace such MULs with MOVs.
|
|
|
|
|
*/
|
|
|
|
|
if ((brw_type_size_bytes(inst->src[0].type) == 4 ||
|
|
|
|
|
brw_type_size_bytes(inst->src[1].type) == 4) &&
|
|
|
|
|
(inst->dst.is_accumulator() ||
|
|
|
|
|
inst->writes_accumulator_implicitly(devinfo)))
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < 2; i++) {
|
|
|
|
|
/* a * 1 = a */
|
|
|
|
|
if (inst->src[i].is_one()) {
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
} else if (inst->src[i].is_negative_one()) {
|
|
|
|
|
/* a * -1 = -a */
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
|
|
|
|
|
/* If the source other than the -1 is immediate, just
|
|
|
|
|
* toggling the negation flag will not work. Due to the
|
|
|
|
|
* previous call to brw_constant_fold_instruction, this
|
|
|
|
|
* should not be possible.
|
|
|
|
|
*/
|
|
|
|
|
assert(inst->src[1 - i].file != IMM);
|
|
|
|
|
inst->src[1 - i].negate = !inst->src[1 - i].negate;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (inst->opcode == BRW_OPCODE_MOV) {
|
|
|
|
|
/* If the literal 1 was src0, put the old src1 in src0. */
|
|
|
|
|
if (i == 0)
|
|
|
|
|
inst->src[0] = inst->src[1];
|
|
|
|
|
|
|
|
|
|
inst->resize_sources(1);
|
|
|
|
|
progress = true;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
break;
|
|
|
|
|
|
2024-01-04 23:26:50 -08:00
|
|
|
case BRW_OPCODE_OR:
|
2024-12-12 17:14:36 -08:00
|
|
|
if (inst->src[0].equals(inst->src[1]) || inst->src[1].is_zero()) {
|
2024-01-04 23:26:50 -08:00
|
|
|
/* On Gfx8+, the OR instruction can have a source modifier that
|
|
|
|
|
* performs logical not on the operand. Cases of 'OR r0, ~r1, 0'
|
|
|
|
|
* or 'OR r0, ~r1, ~r1' should become a NOT instead of a MOV.
|
|
|
|
|
*/
|
|
|
|
|
if (inst->src[0].negate) {
|
|
|
|
|
inst->opcode = BRW_OPCODE_NOT;
|
|
|
|
|
inst->src[0].negate = false;
|
|
|
|
|
} else {
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
}
|
2024-05-08 09:37:24 -07:00
|
|
|
inst->resize_sources(1);
|
2024-01-04 23:26:50 -08:00
|
|
|
progress = true;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
case BRW_OPCODE_CMP:
|
|
|
|
|
if ((inst->conditional_mod == BRW_CONDITIONAL_Z ||
|
|
|
|
|
inst->conditional_mod == BRW_CONDITIONAL_NZ) &&
|
|
|
|
|
inst->src[1].is_zero() &&
|
|
|
|
|
(inst->src[0].abs || inst->src[0].negate)) {
|
|
|
|
|
inst->src[0].abs = false;
|
|
|
|
|
inst->src[0].negate = false;
|
|
|
|
|
progress = true;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
case BRW_OPCODE_SEL:
|
|
|
|
|
if (inst->src[0].equals(inst->src[1])) {
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
inst->predicate = BRW_PREDICATE_NONE;
|
|
|
|
|
inst->predicate_inverse = false;
|
2024-05-08 09:37:24 -07:00
|
|
|
inst->resize_sources(1);
|
2024-01-04 23:26:50 -08:00
|
|
|
progress = true;
|
|
|
|
|
} else if (inst->saturate && inst->src[1].file == IMM) {
|
|
|
|
|
switch (inst->conditional_mod) {
|
|
|
|
|
case BRW_CONDITIONAL_LE:
|
|
|
|
|
case BRW_CONDITIONAL_L:
|
|
|
|
|
switch (inst->src[1].type) {
|
2024-04-20 17:08:02 -07:00
|
|
|
case BRW_TYPE_F:
|
2024-01-04 23:26:50 -08:00
|
|
|
if (inst->src[1].f >= 1.0f) {
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
inst->conditional_mod = BRW_CONDITIONAL_NONE;
|
2024-05-08 09:37:24 -07:00
|
|
|
inst->resize_sources(1);
|
2024-01-04 23:26:50 -08:00
|
|
|
progress = true;
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
default:
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
case BRW_CONDITIONAL_GE:
|
|
|
|
|
case BRW_CONDITIONAL_G:
|
|
|
|
|
switch (inst->src[1].type) {
|
2024-04-20 17:08:02 -07:00
|
|
|
case BRW_TYPE_F:
|
2024-01-04 23:26:50 -08:00
|
|
|
if (inst->src[1].f <= 0.0f) {
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
inst->conditional_mod = BRW_CONDITIONAL_NONE;
|
2024-05-08 09:37:24 -07:00
|
|
|
inst->resize_sources(1);
|
2024-01-04 23:26:50 -08:00
|
|
|
progress = true;
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
default:
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
default:
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
2024-04-05 10:16:40 -07:00
|
|
|
break;
|
|
|
|
|
case BRW_OPCODE_CSEL:
|
|
|
|
|
if (brw_type_is_float(inst->dst.type)) {
|
|
|
|
|
/* This transformation can both clean up spurious modifiers
|
|
|
|
|
* (making assembly dumps easier to read) and convert GE with -abs
|
|
|
|
|
* to LE with abs. See abs handling below.
|
|
|
|
|
*/
|
|
|
|
|
if (inst->src[2].negate) {
|
|
|
|
|
inst->conditional_mod = brw_swap_cmod(inst->conditional_mod);
|
|
|
|
|
inst->src[2].negate = false;
|
|
|
|
|
progress = true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (inst->src[2].abs) {
|
|
|
|
|
switch (inst->conditional_mod) {
|
|
|
|
|
case BRW_CONDITIONAL_Z:
|
|
|
|
|
case BRW_CONDITIONAL_NZ:
|
|
|
|
|
inst->src[2].abs = false;
|
|
|
|
|
progress = true;
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case BRW_CONDITIONAL_LE:
|
|
|
|
|
/* Converting to Z can help constant propagation into src0
|
|
|
|
|
* and src1.
|
|
|
|
|
*/
|
|
|
|
|
inst->conditional_mod = BRW_CONDITIONAL_Z;
|
|
|
|
|
inst->src[2].abs = false;
|
|
|
|
|
progress = true;
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
default:
|
|
|
|
|
/* GE or L conditions with absolute value could be used to
|
|
|
|
|
* implement isnan(x) in CSEL. Transforming G with absolute
|
|
|
|
|
* value to NZ is **not** NaN safe.
|
|
|
|
|
*/
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
} else if (brw_type_is_sint(inst->src[2].type)) {
|
|
|
|
|
/* Integer transformations are more challenging than floating
|
|
|
|
|
* point transformations due to INT_MIN == -(INT_MIN) ==
|
|
|
|
|
* abs(INT_MIN).
|
|
|
|
|
*/
|
|
|
|
|
if (inst->src[2].negate && inst->src[2].abs) {
|
|
|
|
|
switch (inst->conditional_mod) {
|
|
|
|
|
case BRW_CONDITIONAL_GE:
|
|
|
|
|
inst->src[2].negate = false;
|
|
|
|
|
inst->src[2].abs = false;
|
|
|
|
|
inst->conditional_mod = BRW_CONDITIONAL_Z;
|
|
|
|
|
progress = true;
|
|
|
|
|
break;
|
|
|
|
|
case BRW_CONDITIONAL_L:
|
|
|
|
|
inst->src[2].negate = false;
|
|
|
|
|
inst->src[2].abs = false;
|
|
|
|
|
inst->conditional_mod = BRW_CONDITIONAL_NZ;
|
|
|
|
|
progress = true;
|
|
|
|
|
break;
|
|
|
|
|
case BRW_CONDITIONAL_G:
|
|
|
|
|
/* This is a contradiction. -abs(x) cannot be > 0. */
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
inst->src[0] = inst->src[1];
|
|
|
|
|
inst->resize_sources(1);
|
|
|
|
|
progress = true;
|
|
|
|
|
break;
|
|
|
|
|
case BRW_CONDITIONAL_LE:
|
|
|
|
|
/* This is a tautology. -abs(x) must be <= 0. */
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
inst->resize_sources(1);
|
|
|
|
|
progress = true;
|
|
|
|
|
break;
|
|
|
|
|
case BRW_CONDITIONAL_Z:
|
|
|
|
|
case BRW_CONDITIONAL_NZ:
|
|
|
|
|
inst->src[2].negate = false;
|
|
|
|
|
inst->src[2].abs = false;
|
|
|
|
|
progress = true;
|
|
|
|
|
break;
|
|
|
|
|
default:
|
|
|
|
|
unreachable("Impossible icsel condition.");
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
2024-01-04 23:26:50 -08:00
|
|
|
break;
|
|
|
|
|
case BRW_OPCODE_MAD:
|
2024-11-14 10:04:26 -08:00
|
|
|
if (inst->src[1].file == IMM &&
|
|
|
|
|
inst->src[2].file == IMM &&
|
|
|
|
|
!brw_type_is_vector_imm(inst->src[1].type) &&
|
|
|
|
|
!brw_type_is_vector_imm(inst->src[2].type)) {
|
|
|
|
|
fold_multiplicands_of_MAD(inst);
|
|
|
|
|
|
|
|
|
|
/* This could result in (x + 0). For floats, we want to leave this
|
|
|
|
|
* as an ADD so that a subnormal x will get flushed to zero.
|
|
|
|
|
*/
|
|
|
|
|
assert(inst->opcode == BRW_OPCODE_ADD);
|
|
|
|
|
progress = true;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2024-01-04 23:26:50 -08:00
|
|
|
if (inst->src[1].is_one()) {
|
|
|
|
|
inst->opcode = BRW_OPCODE_ADD;
|
|
|
|
|
inst->src[1] = inst->src[2];
|
2024-05-08 09:37:24 -07:00
|
|
|
inst->resize_sources(2);
|
2024-01-04 23:26:50 -08:00
|
|
|
progress = true;
|
|
|
|
|
} else if (inst->src[2].is_one()) {
|
|
|
|
|
inst->opcode = BRW_OPCODE_ADD;
|
2024-05-08 09:37:24 -07:00
|
|
|
inst->resize_sources(2);
|
2024-01-04 23:26:50 -08:00
|
|
|
progress = true;
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case SHADER_OPCODE_BROADCAST:
|
|
|
|
|
if (is_uniform(inst->src[0])) {
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
inst->force_writemask_all = true;
|
brw/build: Use SIMD8 temporaries in emit_uniformize
The fossil-db results are very different from v1. This is now mostly
helpful on older platforms.
v2: When optimizing BROADCAST or FIND_LIVE_CHANNEL to a simple MOV,
adjust the exec_size to match the size allocated for the destination
register. Fixes EU validation failures in some piglit OpenCL tests
(e.g., atomic_add-global-return.cl).
v3: Use component_size() in emit_uniformize and BROADCAST to properly
account for UQ vs UD destination. This doesn't matter for
emit_uniformize because the type is always UD, but it is technically
more correct.
v4: Update trace checksums. Now amly expects the same checksum as
several other platforms.
v5: Use xbld.dispatch_width() in the builder for when scalar_group()
eventually becomes SIMD1. Suggested by Lionel.
shader-db:
Lunar Lake, Meteor Lake, DG2, and Tiger Lake had similar results. (Lunar Lake shown)
total instructions in shared programs: 18091701 -> 18091586 (<.01%)
instructions in affected programs: 29616 -> 29501 (-0.39%)
helped: 28 / HURT: 18
total cycles in shared programs: 919250494 -> 919123828 (-0.01%)
cycles in affected programs: 12201102 -> 12074436 (-1.04%)
helped: 124 / HURT: 108
LOST: 0
GAINED: 1
Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20480808 -> 20480624 (<.01%)
instructions in affected programs: 58465 -> 58281 (-0.31%)
helped: 61 / HURT: 20
total cycles in shared programs: 874860168 -> 874960312 (0.01%)
cycles in affected programs: 18240986 -> 18341130 (0.55%)
helped: 113 / HURT: 158
total spills in shared programs: 4557 -> 4555 (-0.04%)
spills in affected programs: 93 -> 91 (-2.15%)
helped: 1 / HURT: 0
total fills in shared programs: 5247 -> 5243 (-0.08%)
fills in affected programs: 224 -> 220 (-1.79%)
helped: 1 / HURT: 0
fossil-db:
Lunar Lake
Totals:
Instrs: 220486064 -> 220486959 (+0.00%); split: -0.00%, +0.00%
Subgroup size: 14102592 -> 14102624 (+0.00%)
Cycle count: 31602733838 -> 31604733270 (+0.01%); split: -0.01%, +0.02%
Max live registers: 65371025 -> 65355084 (-0.02%)
Totals from 12130 (1.73% of 702392) affected shaders:
Instrs: 5162700 -> 5163595 (+0.02%); split: -0.06%, +0.08%
Subgroup size: 388128 -> 388160 (+0.01%)
Cycle count: 751721956 -> 753721388 (+0.27%); split: -0.54%, +0.81%
Max live registers: 1538550 -> 1522609 (-1.04%)
Meteor Lake and DG2 had similar results. (Meteor Lake shown)
Totals:
Instrs: 241601142 -> 241599114 (-0.00%); split: -0.00%, +0.00%
Subgroup size: 9631168 -> 9631216 (+0.00%)
Cycle count: 25101781573 -> 25097909570 (-0.02%); split: -0.03%, +0.01%
Max live registers: 41540611 -> 41514296 (-0.06%)
Max dispatch width: 6993456 -> 7000928 (+0.11%); split: +0.15%, -0.05%
Totals from 16852 (2.11% of 796880) affected shaders:
Instrs: 6303937 -> 6301909 (-0.03%); split: -0.11%, +0.07%
Subgroup size: 323592 -> 323640 (+0.01%)
Cycle count: 625455880 -> 621583877 (-0.62%); split: -1.20%, +0.58%
Max live registers: 1072491 -> 1046176 (-2.45%)
Max dispatch width: 76672 -> 84144 (+9.75%); split: +14.04%, -4.30%
Tiger Lake
Totals:
Instrs: 235190395 -> 235193286 (+0.00%); split: -0.00%, +0.00%
Cycle count: 23130855720 -> 23128936334 (-0.01%); split: -0.02%, +0.01%
Max live registers: 41644106 -> 41620052 (-0.06%)
Max dispatch width: 6959160 -> 6981512 (+0.32%); split: +0.34%, -0.02%
Totals from 15102 (1.90% of 793371) affected shaders:
Instrs: 5771042 -> 5773933 (+0.05%); split: -0.06%, +0.11%
Cycle count: 371062226 -> 369142840 (-0.52%); split: -1.04%, +0.52%
Max live registers: 989858 -> 965804 (-2.43%)
Max dispatch width: 61344 -> 83696 (+36.44%); split: +38.42%, -1.98%
Ice Lake and Skylake had similar results. (Ice Lake shown)
Totals:
Instrs: 236063150 -> 236063242 (+0.00%); split: -0.00%, +0.00%
Cycle count: 24516187174 -> 24516027518 (-0.00%); split: -0.00%, +0.00%
Spill count: 567071 -> 567049 (-0.00%)
Fill count: 701323 -> 701273 (-0.01%)
Max live registers: 41914047 -> 41913281 (-0.00%)
Max dispatch width: 7042608 -> 7042736 (+0.00%); split: +0.00%, -0.00%
Totals from 3904 (0.49% of 798473) affected shaders:
Instrs: 2809690 -> 2809782 (+0.00%); split: -0.02%, +0.03%
Cycle count: 182114259 -> 181954603 (-0.09%); split: -0.34%, +0.25%
Spill count: 1696 -> 1674 (-1.30%)
Fill count: 2523 -> 2473 (-1.98%)
Max live registers: 341695 -> 340929 (-0.22%)
Max dispatch width: 32752 -> 32880 (+0.39%); split: +0.44%, -0.05%
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32097>
2024-10-15 15:51:22 -07:00
|
|
|
inst->exec_size = 8 * reg_unit(devinfo);
|
|
|
|
|
assert(inst->size_written == inst->dst.component_size(inst->exec_size));
|
2024-05-08 09:37:24 -07:00
|
|
|
inst->resize_sources(1);
|
2024-01-04 23:26:50 -08:00
|
|
|
progress = true;
|
|
|
|
|
} else if (inst->src[1].file == IMM) {
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
/* It's possible that the selected component will be too large and
|
|
|
|
|
* overflow the register. This can happen if someone does a
|
|
|
|
|
* readInvocation() from GLSL or SPIR-V and provides an OOB
|
|
|
|
|
* invocationIndex. If this happens and we somehow manage
|
|
|
|
|
* to constant fold it in and get here, then component() may cause
|
|
|
|
|
* us to start reading outside of the VGRF which will lead to an
|
|
|
|
|
* assert later. Instead, just let it wrap around if it goes over
|
|
|
|
|
* exec_size.
|
|
|
|
|
*/
|
|
|
|
|
const unsigned comp = inst->src[1].ud & (inst->exec_size - 1);
|
|
|
|
|
inst->src[0] = component(inst->src[0], comp);
|
|
|
|
|
inst->force_writemask_all = true;
|
brw/build: Use SIMD8 temporaries in emit_uniformize
The fossil-db results are very different from v1. This is now mostly
helpful on older platforms.
v2: When optimizing BROADCAST or FIND_LIVE_CHANNEL to a simple MOV,
adjust the exec_size to match the size allocated for the destination
register. Fixes EU validation failures in some piglit OpenCL tests
(e.g., atomic_add-global-return.cl).
v3: Use component_size() in emit_uniformize and BROADCAST to properly
account for UQ vs UD destination. This doesn't matter for
emit_uniformize because the type is always UD, but it is technically
more correct.
v4: Update trace checksums. Now amly expects the same checksum as
several other platforms.
v5: Use xbld.dispatch_width() in the builder for when scalar_group()
eventually becomes SIMD1. Suggested by Lionel.
shader-db:
Lunar Lake, Meteor Lake, DG2, and Tiger Lake had similar results. (Lunar Lake shown)
total instructions in shared programs: 18091701 -> 18091586 (<.01%)
instructions in affected programs: 29616 -> 29501 (-0.39%)
helped: 28 / HURT: 18
total cycles in shared programs: 919250494 -> 919123828 (-0.01%)
cycles in affected programs: 12201102 -> 12074436 (-1.04%)
helped: 124 / HURT: 108
LOST: 0
GAINED: 1
Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20480808 -> 20480624 (<.01%)
instructions in affected programs: 58465 -> 58281 (-0.31%)
helped: 61 / HURT: 20
total cycles in shared programs: 874860168 -> 874960312 (0.01%)
cycles in affected programs: 18240986 -> 18341130 (0.55%)
helped: 113 / HURT: 158
total spills in shared programs: 4557 -> 4555 (-0.04%)
spills in affected programs: 93 -> 91 (-2.15%)
helped: 1 / HURT: 0
total fills in shared programs: 5247 -> 5243 (-0.08%)
fills in affected programs: 224 -> 220 (-1.79%)
helped: 1 / HURT: 0
fossil-db:
Lunar Lake
Totals:
Instrs: 220486064 -> 220486959 (+0.00%); split: -0.00%, +0.00%
Subgroup size: 14102592 -> 14102624 (+0.00%)
Cycle count: 31602733838 -> 31604733270 (+0.01%); split: -0.01%, +0.02%
Max live registers: 65371025 -> 65355084 (-0.02%)
Totals from 12130 (1.73% of 702392) affected shaders:
Instrs: 5162700 -> 5163595 (+0.02%); split: -0.06%, +0.08%
Subgroup size: 388128 -> 388160 (+0.01%)
Cycle count: 751721956 -> 753721388 (+0.27%); split: -0.54%, +0.81%
Max live registers: 1538550 -> 1522609 (-1.04%)
Meteor Lake and DG2 had similar results. (Meteor Lake shown)
Totals:
Instrs: 241601142 -> 241599114 (-0.00%); split: -0.00%, +0.00%
Subgroup size: 9631168 -> 9631216 (+0.00%)
Cycle count: 25101781573 -> 25097909570 (-0.02%); split: -0.03%, +0.01%
Max live registers: 41540611 -> 41514296 (-0.06%)
Max dispatch width: 6993456 -> 7000928 (+0.11%); split: +0.15%, -0.05%
Totals from 16852 (2.11% of 796880) affected shaders:
Instrs: 6303937 -> 6301909 (-0.03%); split: -0.11%, +0.07%
Subgroup size: 323592 -> 323640 (+0.01%)
Cycle count: 625455880 -> 621583877 (-0.62%); split: -1.20%, +0.58%
Max live registers: 1072491 -> 1046176 (-2.45%)
Max dispatch width: 76672 -> 84144 (+9.75%); split: +14.04%, -4.30%
Tiger Lake
Totals:
Instrs: 235190395 -> 235193286 (+0.00%); split: -0.00%, +0.00%
Cycle count: 23130855720 -> 23128936334 (-0.01%); split: -0.02%, +0.01%
Max live registers: 41644106 -> 41620052 (-0.06%)
Max dispatch width: 6959160 -> 6981512 (+0.32%); split: +0.34%, -0.02%
Totals from 15102 (1.90% of 793371) affected shaders:
Instrs: 5771042 -> 5773933 (+0.05%); split: -0.06%, +0.11%
Cycle count: 371062226 -> 369142840 (-0.52%); split: -1.04%, +0.52%
Max live registers: 989858 -> 965804 (-2.43%)
Max dispatch width: 61344 -> 83696 (+36.44%); split: +38.42%, -1.98%
Ice Lake and Skylake had similar results. (Ice Lake shown)
Totals:
Instrs: 236063150 -> 236063242 (+0.00%); split: -0.00%, +0.00%
Cycle count: 24516187174 -> 24516027518 (-0.00%); split: -0.00%, +0.00%
Spill count: 567071 -> 567049 (-0.00%)
Fill count: 701323 -> 701273 (-0.01%)
Max live registers: 41914047 -> 41913281 (-0.00%)
Max dispatch width: 7042608 -> 7042736 (+0.00%); split: +0.00%, -0.00%
Totals from 3904 (0.49% of 798473) affected shaders:
Instrs: 2809690 -> 2809782 (+0.00%); split: -0.02%, +0.03%
Cycle count: 182114259 -> 181954603 (-0.09%); split: -0.34%, +0.25%
Spill count: 1696 -> 1674 (-1.30%)
Fill count: 2523 -> 2473 (-1.98%)
Max live registers: 341695 -> 340929 (-0.22%)
Max dispatch width: 32752 -> 32880 (+0.39%); split: +0.44%, -0.05%
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32097>
2024-10-15 15:51:22 -07:00
|
|
|
inst->exec_size = 8 * reg_unit(devinfo);
|
|
|
|
|
assert(inst->size_written == inst->dst.component_size(inst->exec_size));
|
2024-05-08 09:37:24 -07:00
|
|
|
inst->resize_sources(1);
|
2024-01-04 23:26:50 -08:00
|
|
|
progress = true;
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case SHADER_OPCODE_SHUFFLE:
|
|
|
|
|
if (is_uniform(inst->src[0])) {
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
2024-05-08 09:37:24 -07:00
|
|
|
inst->resize_sources(1);
|
2024-01-04 23:26:50 -08:00
|
|
|
progress = true;
|
|
|
|
|
} else if (inst->src[1].file == IMM) {
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
inst->src[0] = component(inst->src[0],
|
|
|
|
|
inst->src[1].ud);
|
2024-05-08 09:37:24 -07:00
|
|
|
inst->resize_sources(1);
|
2024-01-04 23:26:50 -08:00
|
|
|
progress = true;
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
default:
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Ensure that the correct source has the immediate value. 2-source
|
|
|
|
|
* instructions must have the immediate in src[1]. On Gfx12 and later,
|
|
|
|
|
* some 3-source instructions can have the immediate in src[0] or
|
|
|
|
|
* src[2]. It's complicated, so don't mess with 3-source instructions
|
|
|
|
|
* here.
|
|
|
|
|
*/
|
|
|
|
|
if (progress && inst->sources == 2 && inst->is_commutative()) {
|
|
|
|
|
if (inst->src[0].file == IMM) {
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg tmp = inst->src[1];
|
2024-01-04 23:26:50 -08:00
|
|
|
inst->src[1] = inst->src[0];
|
|
|
|
|
inst->src[0] = tmp;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (progress)
|
|
|
|
|
s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
|
|
|
|
|
DEPENDENCY_INSTRUCTION_DETAIL);
|
|
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
}
|